In [1]:
from scraper import Scraper

# Target site and the search query appended to it; the '{}' placeholder is
# presumably filled with the result-page number by the scraper (verify there).
to_scrape = 'https://www.immobiliare.it/'
query = 'vendita-case/roma/?criterio=rilevanza&pag={}'

# Directory holding the TSV datasets, and the two dataset files inside it.
data_path = './data'
descr_ds_path = '/'.join((data_path, 'description-dataset-raw.tsv'))
info_ds_path = '/'.join((data_path, 'information-dataset.tsv'))

We created a reusable Python module that streams the scraped data to a file as it is collected, so that nothing is lost even if a network exception occurs. Thanks to this, we were able to scrape just over 20k items in only two scraping sessions.

Then, using terminal tools (awk, cut, wc, cat, ...) we processed and merged our datasets.

In [12]:
# Start a scraping session against the configured site and query.
# One keyword argument per line so each setting is easy to tweak between sessions.
Scraper(
    to_scrape,
    query_params=query,
    start_from=1,
    sleep=0.15,
    n_ads=20000,
    onfile=True,
    verbose=True
).init()

The first matrix will have this format: <img src="https://latex.codecogs.com/gif.latex?$m_{ij}&space;=&space;value$" title="$m_{ij} = value$" /> where <img src="https://latex.codecogs.com/gif.latex?$i&space;\in&space;\{announcement_1,&space;...,&space;announcement_n\}$" title="$i \in \{announcement_1, ..., announcement_n\}$" /> and <img src="https://latex.codecogs.com/gif.latex?$j&space;\in&space;\{price,&space;locali,&space;superficie,&space;bagni,&space;piano&space;\}$" title="$j \in \{price, locali, superficie, bagni, piano \}$" />. *n* is the number of announcements. Some announcements may not have all of the fields mentioned above; if that is the case, do not take that announcement into account.

The second matrix will have this format: <img src="https://latex.codecogs.com/gif.latex?$m_{ij}&space;=&space;tfIdf_{ij}$" title="$m_{ij} = tfIdf_{ij}$" /> where <img src="https://latex.codecogs.com/gif.latex?$i&space;\in&space;\{announcement_1,&space;...,&space;announcement_n\}$" title="$i \in \{announcement_1, ..., announcement_n\}$" /> and <img src="https://latex.codecogs.com/gif.latex?$j&space;\in&space;\{word_1,&space;...,word_m\}$" title="$j \in \{word_1, ...,word_m\}$" />. *n* is the number of the announcements and *m* is the cardinality of the vocabulary.

Processing the most complex dataset.

In [14]:
# need to process the description_dataset.tsv
# and create a vocabulary
import csv
from utils import preprocessing_nltk

# Three views of the description dataset, all filled in a single pass:
vocabulary = set()  # every distinct preprocessed word
doc2voc = {}        # ad id -> preprocessed words of its description
voc2doc = {}        # word  -> set of ad ids whose description contains it

# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for files passed to csv.reader.
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm it matches the encoding the scraper wrote the file with.
with open(descr_ds_path, 'r', newline='') as descr_ds:
    # Tab-separated rows: column 0 is the ad id, column 1 the description.
    reader = csv.reader(descr_ds, delimiter='\t', quotechar=None)

    for elems in reader:
        # Blank or truncated rows have no description column; skip them
        # instead of aborting the whole pass with an IndexError.
        if len(elems) < 2:
            continue

        ad_id, descr = elems[0], elems[1]

        # Normalise the raw description into its words.
        words = preprocessing_nltk(descr)
        vocabulary.update(words)

        # Forward index.
        doc2voc[ad_id] = words

        # Inverted index: setdefault creates the posting set on first sight
        # of a word and reuses it afterwards.
        for w in words:
            voc2doc.setdefault(w, set()).add(ad_id)

In [15]:
# Vocabulary size on the first line, then 20 of its words on the second
# (a set has no defined order, so the sample is arbitrary).
print(len(vocabulary), list(vocabulary)[:20], sep='\n')

28086
['urano', 'numeros', 'modici', 'inverterrilevator', 'tubazioni', 'scritt', 'consultazion', 'intersezioni', 'ringhier', 'chiediamo', 'provvigionezona', 'advisor', 'nn', 'reciproca', 'crocco', 'direttoinfo', 'riorganizzabil', 'dichiarazion', 'appl', 'cucionotto']


In [16]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs, so edits to local modules such as `utils` are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# need to calculate tfidf for each word
from utils import tfidf_inverse_index

# Compute the tf-idf index from both views of the corpus, then unpack the
# two mappings it returns under their respective keys.
inv_index = tfidf_inverse_index(voc2doc, doc2voc)
voc2doc_tfidf, doc2voc_tfidf = (
    inv_index[key] for key in ('voc2doc_tfidf', 'doc2voc_tfidf')
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Sanity check: each tf-idf mapping should have exactly as many keys as the
# plain mapping it was derived from (printed pairwise, one count per line).
for mapping in (voc2doc_tfidf, voc2doc, doc2voc_tfidf, doc2voc):
    print(len(mapping.keys()))

28086
28086
20013
20013


In [19]:
# Spot-check the postings stored for one sample word.
albero_postings = voc2doc_tfidf['albero']
print(albero_postings)

[('67801957', 0.016224790368615277), ('70397474', 0.04020926308743786), ('70238502', 0.028455786184956026), ('65868305', 0.014797008816177133)]


Dumping the processed dataset into a file.

In [23]:
SEP = '\t'
NL = '\n'

# Dump the dense document x vocabulary tf-idf matrix, one line per ad:
#     <doc_id> SEP <value for word_1> SEP ... <value for word_m> SEP NL
# Columns follow the iteration order of voc2doc; words absent from a
# document get 0.0. The trailing SEP before NL is kept so the file layout
# stays identical to what downstream tools already consume.

# Column position of every vocabulary word.
column_of = {w: col for col, w in enumerate(voc2doc.keys())}

# Re-index the postings once as doc_id -> {word: tfidf}. Scanning a word's
# whole posting list for every (document, word) pair is what made the dump
# slow; with this index each lookup is a dict access.
tfidf_by_doc = {}
for w, postings in voc2doc_tfidf.items():
    for posting in postings:
        # setdefault keeps the first score listed for a (doc, word) pair.
        tfidf_by_doc.setdefault(posting[0], {}).setdefault(w, posting[1])

zero = str(0.0)

# NOTE(review): the file is written to the working directory, not under
# data_path like the input datasets -- confirm this is intended.
with open('description-dataset.tsv', 'w') as descr_out:
    for doc_id, content in doc2voc.items():
        # Start from an all-zero row and fill in only the words this
        # document actually contains.
        row = [zero] * len(column_of)
        for w in set(content):
            col = column_of.get(w)
            if col is not None:
                # A missing score means the tf-idf index is inconsistent
                # with doc2voc; let the lookup fail loudly rather than
                # writing a wrong value.
                row[col] = str(tfidf_by_doc[doc_id][w])

        # One write per document instead of one per matrix cell.
        descr_out.write(doc_id + SEP + ''.join(cell + SEP for cell in row) + NL)