# Homework 4 - Group 1

## First task

In this notebook we want to verify if the basic house informations reflect the house description.

First of all we imported all the libraries needed for this purpose.

In [29]:
import pickle
import pandas as pd
import numpy as np

import re
from bs4 import BeautifulSoup
import requests
import csv
import threading

import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from collections import Counter
from math import log

from sklearn.cluster import KMeans

### Web scrapping

In this first part we perfomed the web scrapping from the web page www.immobiliare.it.

We defined a function to get all the information needed.

In [None]:
def page_scrapping(web_link):
    
    source = requests.get(web_link).text
    soup = BeautifulSoup (source, 'lxml')
    info_matrix = []
    
    for announcement in soup.find_all('div', class_ = 'listing-item_body'):
        try:
            title = announcement.a.text
            price = announcement.li.text
        
            v=[]
            i=0
        
            for dati in announcement.find_all('div', class_ = 'lif__data'):
                a = dati.text
                v.append(a)
                i = i+1
            
            locali = v[0]
            superficie = v[1]
            bagni = v[2]
            piano = v[3]
        
            link =announcement.a['href']
            if link.find('https://www.immobiliare.it') == -1:
                link = 'https://www.immobiliare.it' + link
         
            source_desc = requests.get(link).text
            soup1 = BeautifulSoup (source_desc, 'lxml')
            description = soup1.find('div', class_ = 'description-text').text
       
            announce = [title, price, locali, superficie, bagni, piano, link, description]
        
            info_matrix.append(announce)
        except:
            continue
          
    return info_matrix
    

In [None]:
A = page_scrapping('https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=1')
matrix = np.array([np.array(xi) for xi in A])

In [None]:
for i in range(2, 600):
    try:
        A = page_scrapping('https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=' + str(i))
        newMatrix = np.array([np.array(xi) for xi in A])
        print('Download' + str(i))
        matrix = np.concatenate((matrix, newMatrix), axis = 0)
    except:
        continue

Once we have all the data, we created a dataframe for them. And we save it in a pickle file.

In [None]:
df= pd.DataFrame(matrix)
df.columns = ['Title', 'Price', 'Locali', 'Superficie', 'Bagni', 'Piano', 'Link', 'Description']
df.to_pickle('data_immobiliare')

In [4]:
df = pd.read_pickle("data_immobiliare")

### Text mining

Once we have our dataframe, we need to clean our data:

First we cleaned the informations as 'Price', 'Piano', 'Locali' in order to have all float numbers.

In [5]:
for i in range(len(df)):
    df.Price.loc[i] = re.sub('[€]', '' , df.Price.loc[i])
    df.Price.loc[i] = df.Price.loc[i].replace("PREZZO SU RICHIESTA", "0")
    df.Price.loc[i] = df['Price'].loc[i].split()[0]
    df.Price.loc[i] = df.Price.loc[i].replace(".", "")
    
    df.Piano.loc[i] = re.sub('T', '0', df.Piano.loc[i])
    df.Piano.loc[i] = re.sub('R', '0.5', df.Piano.loc[i])
    df.Piano.loc[i] = re.sub('A', '100', df.Piano.loc[i])
    df.Piano.loc[i] = re.sub('S', '-1', df.Piano.loc[i])
    df.Piano.loc[i] = re.sub(' \n', '', df.Piano.loc[i])
    df.Piano.loc[i] = df.Piano.loc[i].replace("11+", "11.5")
    
    
    df["Bagni"][i] = df["Bagni"][i].replace("+", ".5")
    
    df["Locali"][i] = df["Locali"][i].replace("+", ".5")
    
    df.replace(df.Superficie.loc[i], df.Superficie.loc[i].split()[0], inplace = True)

In [6]:
for i in range(len(df)):
    try:
        df.replace(df.Price.loc[i], float(df.Price.loc[i]), inplace = True)
        df.replace(df.Locali.loc[i], float(df.Locali.loc[i]), inplace = True)
        df.replace(df.Superficie.loc[i], float(df.Superficie.loc[i]), inplace = True)
        df.replace(df.Bagni.loc[i], float(df.Bagni.loc[i]), inplace = True)
        df.replace(df.Piano.loc[i], float(df.Piano.loc[i]), inplace = True)
        
    except:
        print('errore ' + str(i))
        continue

Then we can create two matrixes: one for the informations, and one for the description

In [11]:
df_info = df[['Price', 'Locali', 'Superficie', 'Bagni', 'Piano']].copy()
df_desc = df.filter(['Description'], axis=1)

df_info.to_pickle('df_info')
df_desc.to_pickle('df_desc')

We start to work on the description matrix: it's defined here a data cleaning function, used to clean all the reviews' texts and later on the queries.
A number of steps are performed for the task:
- replace $ with 'dollar', to avoid losing relevant information from the text
- replace new line symbols with whitespace
- remove puntuation
- separate numbers from words
- remove stopwords
- tokenize the text
- stem of the words, just keeping the root

In [17]:
def string_cleaning(string_raw):
    m = string_raw
    #removing website url
    m = re.sub('http.* *', '', string_raw, flags=re.MULTILINE)
    m = re.sub(r'\d+', '', string_raw)
    m = m.replace('\\r', ' ').replace('\\n', ' ')
    m = re.sub('[%s]' % re.escape(string.punctuation), ' ', m)
    m = re.sub('[%s]' % re.escape('“”€'), ' ', m)
    m = re.sub('[%s]' % re.escape('”'), ' ', m)
    m = re.sub(r'(?<=[\d+])(?=[a-zA-Z_])', r' ', m)
    m = nltk.tokenize.word_tokenize(m)
    m = [word for word in m if word.lower() not in stopwords.words('italian')]
    sno = nltk.stem.SnowballStemmer('italian')
    string_new = [sno.stem(word) for word in m]
    
    return string_new
            

Then we create a full corpus with all the words appeared in the descriptions

In [13]:
corpus = []
for i in range(len(df_desc)):
    corpus.append(df_desc.iloc[i][0])

For each word in the corpus it's performed the data cleaning and tokenization and the corpus has been flattened for the words' occurrence.

In [18]:
lista = []
for i in range(len(corpus)):
    lista.append(string_cleaning(corpus[i]))
    

lista_flatten = [y for x in lista for y in x]

The next step is to create a dictionary with all the words counted and then sorted by frequency.

In [21]:
c = Counter(lista_flatten)
c = dict(c.most_common())

final_dict = {str(i+1):x for i,x in enumerate(c)}
final_dict_inv = {str(x):(i+1) for i,x in enumerate(c)}

In the final_dict we assigned a number to each word, and the reverse in the final_dict_inv, that we are using to build the cosine similarity index.

In [22]:
indx = {}
for n,document in enumerate(lista):
    new_document = []
    for word in document:
        new_document.append(final_dict_inv[str(word)])
    indx[str(n+1)] = new_document 
    
inv_indx = {str(i):[] for i in list(final_dict.keys())}
for word in list(final_dict.keys()):
    for i in range(len(indx)):
        if int(word) in indx[str(i+1)]:
            inv_indx[word].append(i+1)

In the indx dictionary for each announce we reported the list of words that appear in it whit the correspondent number.

### Cosine similarity

To use the Cosine similarity it's needed to calcolate for each term:
- the frequency in each document: TF
- the IDF$_{i}$ defined has: $$IDF_{i}=\log{\frac{n}{N_{i}}}$$
where i is refered to term i

In [26]:
file_count= len(df_desc)

inv_indx_cosine = {i:[] for i in list(final_dict.keys())}
for word in list(final_dict.keys()):
    for i,l in enumerate(list(indx.values())):
        if int(word) in l:
            tf = l.count(int(word))/len(l)
            id_f = log(file_count/len(inv_indx[word]))
            inv_indx_cosine[word].append((i+1,tf*id_f))

In [None]:
raws = len(indx) + 2
cols = len(inv_indx)
matrix = np.zeros((raws, cols))

We are able now to build a matrix with the announments on the raws and the words on the columns, were are recorded the tf-idf indexes.

In [None]:
for i in range(len(indx)):
    for j in range(len(a[i])):
        col = int (a[i][j][0])
        print('i ' + str(i))
        print(j)
        print(col)
        
        matrix[i][col] = a[i][j][1]

In [None]:
df_tfidf = pd.DataFrame(matrix)

We are saving that dataframe so we can use it later.

In [None]:
df_tfidf.to_pickle('df_tfidf')

In [28]:
df_tfidf = pd.read_pickle("df_tfidf")

## Clustering

We are going to apply the K-means++ clustering to our info matrix first and than to the tf-idf matrix, related to the description.


In [35]:
kmeans_info = KMeans(n_clusters = 3).fit(df_info)

cls_info = dict()

for i in range(1, len(kmeans_info.labels_)):
    if kmeans_info.labels_[i] not in cls_info:
        cls_info[kmeans_info.labels_[i]] = [i]
    elif kmeans_info.labels_[i] in cls_info:
        cls_info[kmeans_info.labels_[i]].append(i)

In [31]:
kmeans_desc = KMeans(n_clusters = 6).fit(df_tfidf)

cls_desc = dict()

for i in range(1, len(kmeans_desc.labels_)):
    if kmeans_desc.labels_[i] not in cls_desc:
        cls_desc[kmeans_desc.labels_[i]] = [i]
    elif kmeans_desc.labels_[i] in cls_desc:
        cls_desc[kmeans_desc.labels_[i]].append(i)

In [32]:
def jaccard(a, b):
   c = set(a).intersection(b)
   return float(len(c)) / (len(a) + len(b) - len(c))

In [36]:
A=[]
for i in range(6):
    v = []
    for j in range(3):
        jac = jaccard(cls_desc.get(i), cls_info.get(j))
        v.append(jac)
    A.append(v)

In [37]:
A

[[0.017250284061563887, 0.002857142857142857, 0.016683518705763397],
 [0.07667698658410732, 0.0021413276231263384, 0.01604696673189824],
 [0.01695440918019229, 0.0, 0.013171225937183385],
 [0.5968183826778612, 0.01660538488909975, 0.17530155083285467],
 [0.17742755465175394, 0.002888781896966779, 0.05110923897781522],
 [0.007772020725388601, 0.004405286343612335, 0.0010604453870625664]]

In [41]:
cls_desc.get(3)

8421

In [42]:
cls_info.get(0)

9647