In [30]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from nltk import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import inflect
import math
import numpy as np 

In [31]:
informations = []

i = 1
while len(informations) <= 10000:
    temp = []
    page = requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="+str(i))
    soup = BeautifulSoup(page.content, 'html.parser')
    features = soup.find_all('ul', attrs={'class': 'listing-features list-piped'})
    for annuncio in features:
        row = []
        for tag in annuncio:
            if tag != '\n':
                text = tag.get_text().strip().split()
                temp.append(text)
        if temp[0][0] != 'da' and len(temp) == 6 and temp[0][0] != 'SU':
            row.append(temp[0][1].replace('.','').replace('€',''))
            row.append(temp[1][0])
            row.append(temp[2][0])
            row.append(temp[3][0])
            row.append(temp[4][0])
            informations.append(row)   
        temp = []
    i+=1
    break


In [32]:
informations[1:10]

[['229000', '2', '70', '1', '5'],
 ['690000', '5', '140', '2', '2'],
 ['520000', '3', '105', '2', '1'],
 ['419000', '3', '100', '2', '2'],
 ['1150000', '5', '160', '2', '4'],
 ['1080000', '5+', '250', '3+', '3'],
 ['2500000', '5+', '300', '3', '2'],
 ['940000', '5+', '200', '3', 'A'],
 ['650000', '5+', '193', '3', '4']]

In [33]:
d_informations = pd.DataFrame(informations)

In [34]:
d_informations.rename(columns={0 : "Prezzo", 1 : "Locali", 2 :"Superficie", 3 :"Bagni", 4 :"Piano"}).head(25)

Unnamed: 0,Prezzo,Locali,Superficie,Bagni,Piano
0,225000,2,50,1,1
1,229000,2,70,1,5
2,690000,5,140,2,2
3,520000,3,105,2,1
4,419000,3,100,2,2
5,1150000,5,160,2,4
6,1080000,5+,250,3+,3
7,2500000,5+,300,3,2
8,940000,5+,200,3,A
9,650000,5+,193,3,4


# Create Description dataset only for the announcements for first page 

In [35]:
# In this part I create a list where there are all the announcements in the fist page 
descriptions = []
link = requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=1")
soup = BeautifulSoup(link.content, 'html.parser')
features = soup.find_all('p', attrs={'class': 'titolo text-primary'})
for elem in features:
    link = elem.find('a').get('href')
    if 'https://www.immobiliare.it' in link:
        annuncio = requests.get(link)
        soup = BeautifulSoup(annuncio.content, 'html.parser')
        des = soup.find('div', attrs={'class': 'description-text'}).get_text().strip()
        descriptions.append(des)
    
 


In [36]:
print(descriptions)

['PAPILLO EUR in elegante complesso residenziale rifinitissimo bilocale composto da soggiorno con angolo cottura, stanza da letto bagno e ampio balcone . con Rifiniture di pregio, pavimenti in parquet / grees, infissi in legno con vetro camera e porte in noce, grate nel salone, riscaldamento termoautonomo con caldaia centralizzata, aria condizionata, videocitofono, porta blindata, serramenti elettrici con chiusura centralizzata, antenna satellitare, isolamento termo acustico, pannelli solari e fotovoltaici , rilevatori elettronici di gas. Tutte le camere sono fornite di impianto antifurto, presa antenna satellitare e presa telefonica.\n\nORARI lunedi chiusi\n martedi 10:00-17:00\n mercoledi 10:00-17:00\ngiovedi 10:00-17:00\nvenrdi 10:00-17:00\nsabato 10:00-17:00\n domenica 10:00-13:00', 'Prenestina, Appartamento in Vendita adiacente Largo Preneste, piano alto luminoso ingresso soggiorno camera (possibilità seconda camera) cucina bagno e due balconi. Dilazioni - Permute. Rif. 1070S - Te

# Preprocessing all the article in one page. 

In [37]:
class Preprocessing(): 
    def __init__(self,language):
        self.tokenizer = None
        self.stopwords = None
        self.stemmer = None
        self.number_to_words = None
        self.language = language
    
    def setupNltk(self):
        #Lazy initialization of objects needed to preprocess strings
        if self.tokenizer == None:
            self.tokenizer = RegexpTokenizer(r'\w+')
        if self.stopwords == None:
            self.stopwords = set(stopwords.words(self.language))
        if self.stemmer == None:
            self.stemmer = SnowballStemmer(self.language)
        if self.number_to_words == None:
            self.number_to_words = inflect.engine()
    
    def nltkProcess(self, string):
        #Transform all words to lowercase
        string = string.lower()
        #Setup nltk objects to perform preprocessing
        self.setupNltk()
        #Tokenize the string removing puntuactions
        tokens = self.tokenizer.tokenize(string)
        #Create new sentence
        new_sentence = []
        #Scroll through each word and stemming it
        for word in tokens:
            word = self.stemmer.stem(word)
            #exclude the word if it is a stopword
            if not word in self.stopwords:
                #if the word has length greater than one, it has sufficient information
                #value to be added
                if len(word) > 1:
                    new_sentence.append(word)
                #if the word length is equal to one and it is numeric
                #then the string representation of the number is added
                elif word.isnumeric():
                    new_sentence.append(self.number_to_words.number_to_words(word))           
        # return a list of words
        return new_sentence
        
        
        

In [38]:
# I create an object called preprocessing 
preprocessing = Preprocessing('italian')

In [39]:
# Save in descriptions all the processed words of each ad for one page. 
for i in range(len(descriptions)): 
    descriptions[i] = preprocessing.nltkProcess(descriptions[i])
    

In [40]:
descriptions

[['papill',
  'eur',
  'eleg',
  'compless',
  'residenzial',
  'rifinitissim',
  'bilocal',
  'compost',
  'soggiorn',
  'angol',
  'cottur',
  'stanz',
  'lett',
  'bagn',
  'ampi',
  'balcon',
  'rifinitur',
  'preg',
  'pav',
  'parquet',
  'grees',
  'infiss',
  'legn',
  'vetr',
  'camer',
  'port',
  'noc',
  'grat',
  'salon',
  'riscald',
  'termoautonom',
  'caldai',
  'centralizz',
  'ari',
  'condizion',
  'videocitof',
  'port',
  'blind',
  'serr',
  'elettr',
  'chiusur',
  'centralizz',
  'antenn',
  'satellit',
  'isol',
  'term',
  'acust',
  'pannell',
  'solar',
  'fotovolt',
  'rilev',
  'elettron',
  'gas',
  'tutt',
  'cam',
  'son',
  'forn',
  'impiant',
  'antifurt',
  'pres',
  'antenn',
  'satellit',
  'pres',
  'telefon',
  'orar',
  'luned',
  'chius',
  'marted',
  '10',
  '00',
  '17',
  '00',
  'mercoled',
  '10',
  '00',
  '17',
  '00',
  'gioved',
  '10',
  '00',
  '17',
  '00',
  'venrd',
  '10',
  '00',
  '17',
  '00',
  'sab',
  '10',
  '00',
  '17

In [43]:
# I create a vocabulary where for all words in the announcements is associated an id. 
vocabulary = {}
i = 0 
for ann in descriptions: 
    for word in ann:
        if word not in vocabulary: 
            vocabulary[word] = i
            i += 1
        

In [44]:
vocabulary

{'papill': 0,
 'eur': 1,
 'eleg': 2,
 'compless': 3,
 'residenzial': 4,
 'rifinitissim': 5,
 'bilocal': 6,
 'compost': 7,
 'soggiorn': 8,
 'angol': 9,
 'cottur': 10,
 'stanz': 11,
 'lett': 12,
 'bagn': 13,
 'ampi': 14,
 'balcon': 15,
 'rifinitur': 16,
 'preg': 17,
 'pav': 18,
 'parquet': 19,
 'grees': 20,
 'infiss': 21,
 'legn': 22,
 'vetr': 23,
 'camer': 24,
 'port': 25,
 'noc': 26,
 'grat': 27,
 'salon': 28,
 'riscald': 29,
 'termoautonom': 30,
 'caldai': 31,
 'centralizz': 32,
 'ari': 33,
 'condizion': 34,
 'videocitof': 35,
 'blind': 36,
 'serr': 37,
 'elettr': 38,
 'chiusur': 39,
 'antenn': 40,
 'satellit': 41,
 'isol': 42,
 'term': 43,
 'acust': 44,
 'pannell': 45,
 'solar': 46,
 'fotovolt': 47,
 'rilev': 48,
 'elettron': 49,
 'gas': 50,
 'tutt': 51,
 'cam': 52,
 'son': 53,
 'forn': 54,
 'impiant': 55,
 'antifurt': 56,
 'pres': 57,
 'telefon': 58,
 'orar': 59,
 'luned': 60,
 'chius': 61,
 'marted': 62,
 '10': 63,
 '00': 64,
 '17': 65,
 'mercoled': 66,
 'gioved': 67,
 'venrd': 68,

# Compute the TF 

In [45]:
# Compute the term frequency of each words in each documents. 
tf = {}
for i in range(len(descriptions)): 
    set_descriptions = set(descriptions[i])
    for j in set_descriptions: 
        if j not in tf: 
            tf[j] = [(i , descriptions[i].count(j))]
        else: 
            tf[j].append((i,descriptions[i].count(j)))
          

In [46]:
tf

{'elettr': [(0, 1), (8, 1), (10, 1)],
 'balcon': [(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (7, 1),
  (9, 2),
  (11, 2),
  (12, 1),
  (13, 1),
  (15, 2),
  (16, 1),
  (21, 1)],
 'lett': [(0, 1),
  (5, 1),
  (6, 1),
  (8, 2),
  (9, 2),
  (10, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (20, 3),
  (21, 1)],
 'term': [(0, 1)],
 'grees': [(0, 1)],
 'preg': [(0, 1), (3, 1), (4, 1), (7, 1), (8, 1), (9, 1)],
 'angol': [(0, 1), (4, 1), (19, 1), (21, 1)],
 'infiss': [(0, 1), (2, 1), (3, 1), (4, 1), (8, 1), (10, 1)],
 'vetr': [(0, 1), (2, 1), (8, 1), (23, 1)],
 'acust': [(0, 1)],
 'soggiorn': [(0, 1), (1, 1), (2, 2), (4, 1), (14, 1), (19, 1), (20, 3)],
 'gas': [(0, 1)],
 'tutt': [(0, 1),
  (2, 2),
  (3, 1),
  (5, 1),
  (7, 1),
  (8, 2),
  (9, 1),
  (14, 3),
  (16, 1),
  (17, 1),
  (18, 1)],
 'forn': [(0, 1), (2, 1)],
 'chius': [(0, 1), (23, 1)],
 'ampi': [(0, 1),
  (2, 2),
  (3, 1),
  (5, 1),
  (6, 2),
  (7, 3),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (13, 1),
  (15, 3),
  (18, 2),
  (19,

# Compute the IDF 

In [47]:
n_ads= len(descriptions)
idf = {}
for k in tf: 
    idf[k] = math.log10(n_ads/len(tf[k]))+1



In [48]:
idf

{'elettr': 1.9030899869919435,
 'balcon': 1.2662678894047692,
 'lett': 1.338818556553381,
 'term': 2.380211241711606,
 'grees': 2.380211241711606,
 'preg': 1.6020599913279625,
 'angol': 1.7781512503836436,
 'infiss': 1.6020599913279625,
 'vetr': 1.7781512503836436,
 'acust': 2.380211241711606,
 'soggiorn': 1.5351132016973492,
 'gas': 2.380211241711606,
 'tutt': 1.338818556553381,
 'forn': 2.079181246047625,
 'chius': 2.079181246047625,
 'ampi': 1.149762320333332,
 'eleg': 1.4259687322722812,
 '10': 2.079181246047625,
 'parquet': 1.5351132016973492,
 'condizion': 1.6812412373755872,
 'papill': 2.380211241711606,
 'videocitof': 2.380211241711606,
 'noc': 2.380211241711606,
 'sab': 2.380211241711606,
 'ari': 2.079181246047625,
 'antifurt': 2.380211241711606,
 'luned': 2.380211241711606,
 'centralizz': 2.079181246047625,
 'gioved': 2.380211241711606,
 'residenzial': 1.7781512503836436,
 'salon': 1.101457640758777,
 'serr': 2.380211241711606,
 'son': 1.380211241711606,
 '17': 2.380211241711

# Compute the TF-IDF

In [49]:
tf_idf = {}
for k in tf:
    for i in tf[k]:
        if k not in tf_idf:
            tf_idf[k] = [(i[0], i[1] * idf[k])]
        else: 
            tf_idf[k].append((i[0], i[1] * idf[k]))

    

In [50]:
tf_idf

{'elettr': [(0, 1.9030899869919435),
  (8, 1.9030899869919435),
  (10, 1.9030899869919435)],
 'balcon': [(0, 1.2662678894047692),
  (1, 1.2662678894047692),
  (2, 1.2662678894047692),
  (3, 1.2662678894047692),
  (4, 1.2662678894047692),
  (7, 1.2662678894047692),
  (9, 2.5325357788095384),
  (11, 2.5325357788095384),
  (12, 1.2662678894047692),
  (13, 1.2662678894047692),
  (15, 2.5325357788095384),
  (16, 1.2662678894047692),
  (21, 1.2662678894047692)],
 'lett': [(0, 1.338818556553381),
  (5, 1.338818556553381),
  (6, 1.338818556553381),
  (8, 2.677637113106762),
  (9, 2.677637113106762),
  (10, 1.338818556553381),
  (16, 1.338818556553381),
  (17, 1.338818556553381),
  (18, 1.338818556553381),
  (20, 4.016455669660143),
  (21, 1.338818556553381)],
 'term': [(0, 2.380211241711606)],
 'grees': [(0, 2.380211241711606)],
 'preg': [(0, 1.6020599913279625),
  (3, 1.6020599913279625),
  (4, 1.6020599913279625),
  (7, 1.6020599913279625),
  (8, 1.6020599913279625),
  (9, 1.6020599913279625

# Execute the Normalization

In [51]:
# Create a dictionary ads that has for key the ads and for values the tf-idf for each word of the ads. 
ads = {}
norm_tfidf = {}
for k in tf_idf: 
    for j in tf_idf[k]: 
        if j[0] not in ads: 
            ads[j[0]] = j[1]**2
        else: 
            ads[j[0]] += j[1]**2
for word in tf_idf:
    for j in tf_idf[word]: 
        if word not in norm_tfidf: 
            norm_tfidf[word] = [(j[0], j[1]/math.sqrt(ads[j[0]]))]
        else: 
            norm_tfidf[word].append((j[0], j[1]/math.sqrt(ads[j[0]])))              

In [52]:
norm_tfidf

{'elettr': [(0, 0.049664143898952746),
  (8, 0.07223582174850657),
  (10, 0.10514789521345264)],
 'balcon': [(0, 0.03304526380984415),
  (1, 0.06425872048860162),
  (2, 0.05870504432908806),
  (3, 0.07230577877325577),
  (4, 0.07719114112593908),
  (7, 0.04150604952922139),
  (9, 0.09355059090234503),
  (11, 0.10075683281929039),
  (12, 0.06886706711005273),
  (13, 0.09465792038360717),
  (15, 0.11745691773369916),
  (16, 0.06303466732427898),
  (21, 0.11584734990632503)],
 'lett': [(0, 0.03493858824424407),
  (5, 0.07283285994461593),
  (6, 0.07470608938239104),
  (8, 0.10163540270383684),
  (9, 0.09891056080989943),
  (10, 0.07397125425309527),
  (16, 0.06664623104324603),
  (17, 0.06610190633803878),
  (18, 0.055767947939623676),
  (20, 0.14494796422237102),
  (21, 0.12248480995204519)],
 'term': [(0, 0.06211537784669697)],
 'grees': [(0, 0.06211537784669697)],
 'preg': [(0, 0.04180828993264192),
  (3, 0.091480006942998),
  (4, 0.09766088196467515),
  (7, 0.05251272807691554),
  (8,

In [59]:
n_ann = len(descriptions)
print(n_ann)

24


In [60]:
# I create an empty matrix in which there are as columns all words in the announcements without duplicates and as rows the announcements. 
m = []
for i in range(n_ann): 
    m.append(np.zeros(len(words)))

In [61]:
m

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

In [56]:
# Insert tf-idf values inside the matrix and to do this I use the id associated to the words in vocabulary (as a id of columns 
# in m) and the id of announcements (as id of rows in m).
for k in norm_tfidf: 
    for elem in norm_tfidf[k]: 
        m[elem[0]][vocabulary[k]] = elem[1]



In [58]:
pd.DataFrame(m, columns = list(vocabulary.keys()))

Unnamed: 0,papill,eur,eleg,compless,residenzial,rifinitissim,bilocal,compost,soggiorn,angol,...,ballatoi,uso,suddivis,mobil,pellegrin,usi,div,ricett,professional,02c1bw
0,0.062115,0.05426,0.037213,0.049664,0.046404,0.062115,0.05426,0.036019,0.040061,0.046404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077902,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.088229,0.164873,0.0,0.0,0.063988,0.142338,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.081425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.116012,0.0,0.0,0.0,0.084137,0.09358,0.108395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.077574,0.0,0.0,0.0,0.0,0.075085,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154032,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.046741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.108251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.065684,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
