#### Import libraries and packages

In [1]:
import os
from difflib import SequenceMatcher
from getpass import getpass

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras as extras
import seaborn as sns

#### Set connection with postgres database

In [2]:
# Connection settings for the PostgreSQL server that hosts the exploitation zone.
host = 'postgresfib.fib.upc.edu'
dbname = 'ADSDBjordi.cluet'
user = 'jordi.cluet'
# The password is a secret and must not live in the notebook: take it from the
# ADSDB_PASSWORD environment variable, or prompt for it when that is not set.
pwd = os.environ.get('ADSDB_PASSWORD') or getpass('PostgreSQL password: ')
port = 6433
sslmode = 'require'

# Keyword arguments let psycopg2 build the connection string, so values that
# contain spaces or quotes cannot break it, and sslmode is actually applied.
conn = psycopg2.connect(
    host=host,
    port=port,
    dbname=dbname,
    user=user,
    password=pwd,
    sslmode=sslmode,
)
cursor = conn.cursor()

#### Get dataframe from exploitation zone

In [3]:
# Read every row of the exploitation-zone housing view into a DataFrame.
sql = "SELECT * from exploitation_zone.housing_view;"
df = pd.read_sql_query(sql=sql, con=conn)

In [4]:
df

Unnamed: 0,id,bathrooms,building_subtype,conservation_state,discount,floor_elevator,price,rooms,sq_meters,neighbourhood,...,agressio_sex,conviv_veinal,vigilancia_poli,molesties_espai_pub,contra_prop_priv,incendis,estupefaents,agressions,proves_alcohol,proves_droga
0,1,1,Flat,Nearly new,0,True,1250.0,3,93.0,Sant Antoni,...,32,11474,4721,4445,2178,793,451,477,3770,326
1,62,1,Flat,Good,0,False,1200.0,3,80.0,Sant Antoni,...,32,11474,4721,4445,2178,793,451,477,3770,326
2,359,2,Flat,Good,0,True,1200.0,4,125.0,Sant Antoni,...,32,11474,4721,4445,2178,793,451,477,3770,326
3,214,2,Attic,New construction,0,True,1800.0,2,155.0,Sant Antoni,...,32,11474,4721,4445,2178,793,451,477,3770,326
4,228,1,Flat,New construction,150,False,1100.0,4,80.0,Sant Antoni,...,32,11474,4721,4445,2178,793,451,477,3770,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9393,9023,1,Flat,New construction,45,False,780.0,3,72.0,la Font d'en Fargues,...,24,4784,3062,2827,663,349,195,262,1568,106
9394,10925,1,Flat,New construction,50,False,650.0,2,50.0,la Font d'en Fargues,...,24,4784,3062,2827,663,349,195,262,1568,106
9395,2389,1,Flat,New construction,50,True,850.0,3,65.0,la Vall d'Hebron,...,24,4784,3062,2827,663,349,195,262,1568,106
9396,5538,1,Flat,Good,0,True,775.0,3,75.0,la Vall d'Hebron,...,24,4784,3062,2827,663,349,195,262,1568,106


In [5]:
df.columns

Index(['id', 'bathrooms', 'building_subtype', 'conservation_state', 'discount',
       'floor_elevator', 'price', 'rooms', 'sq_meters', 'neighbourhood',
       'price_per_sqm', 'districte', 'superficie', 'poblacio', 'furt',
       'estafes', 'danys', 'rob_viol_intim', 'rob_en_vehicle', 'rob_força',
       'lesions', 'aprop_indeg', 'amenaces', 'rob_de_vehicle', 'ocupacions',
       'salut_pub', 'abusos_sex', 'entrada_domicili', 'agressio_sex',
       'conviv_veinal', 'vigilancia_poli', 'molesties_espai_pub',
       'contra_prop_priv', 'incendis', 'estupefaents', 'agressions',
       'proves_alcohol', 'proves_droga'],
      dtype='object')

#### Feature engineering

##### Remove some variables that are not useful for our modelling

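We drop `id` because it is an identifier rather than a feature, `price_per_sqm` because it was computed from the target, and `discount` because it does not seem logical to take it into account for modelling. (`neighbourhood` is kept at this stage: it has many levels, so it is one-hot encoded below and only used in the last dataframe.)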

In [6]:
# Remove the identifier, the target-derived price_per_sqm and the discount.
# Rebinding `df` with errors='ignore' keeps the cell idempotent: running it a
# second time does not raise KeyError on the columns that are already gone.
df = df.drop(columns=['id', 'price_per_sqm', 'discount'], errors='ignore')
df.columns

Index(['bathrooms', 'building_subtype', 'conservation_state', 'floor_elevator',
       'price', 'rooms', 'sq_meters', 'neighbourhood', 'districte',
       'superficie', 'poblacio', 'furt', 'estafes', 'danys', 'rob_viol_intim',
       'rob_en_vehicle', 'rob_força', 'lesions', 'aprop_indeg', 'amenaces',
       'rob_de_vehicle', 'ocupacions', 'salut_pub', 'abusos_sex',
       'entrada_domicili', 'agressio_sex', 'conviv_veinal', 'vigilancia_poli',
       'molesties_espai_pub', 'contra_prop_priv', 'incendis', 'estupefaents',
       'agressions', 'proves_alcohol', 'proves_droga'],
      dtype='object')

##### Add categories that may be missing if dataset is not exhaustive

In [12]:
# Every building subtype the encoding should expose, even if this extract of
# the data does not contain it.
building_subtypes = ['Flat', 'Apartment', 'Attic', 'Duplex', 'Loft', 'Study', 'House_Chalet', 'GroundFloorWithGarden', 'SemidetachedHouse', 'SemiDetached']
df.building_subtype = df.building_subtype.astype('category')
# Append only the levels that are not registered yet, in list order. A set
# difference would make the order of the new levels (and therefore of the
# dummy columns) depend on string hashing, and re-adding existing levels on a
# second run would raise ValueError.
missing_building_subtypes = [level for level in building_subtypes if level not in df.building_subtype.cat.categories]
df.building_subtype = df.building_subtype.cat.add_categories(missing_building_subtypes)

In [13]:
# Every neighbourhood level the encoding should expose, even if this extract
# of the data does not contain it.
neighbourhoods = ['el Raval', 'el Barri Gòtic', 'la Barceloneta', 'Sant Pere, Santa Caterina i la Ribera', 'el Fort Pienc', 'la Sagrada Família', "la Dreta de l'Eixample", "l'Antiga Esquerra de l'Eixample", "la Nova Esquerra de l'Eixample", 'Sant Antoni', 'el Poble Sec', 'la Marina del Prat Vermell', 'la Marina de Port', 'la Font de la Guatlla', 'Hostafrancs', 'la Bordeta', 'Sants - Badal', 'Sants', 'les Corts', 'la Maternitat i Sant Ramon', 'Pedralbes', 'Vallvidrera, el Tibidabo i les Planes', 'Sarrià', 'les Tres Torres', 'Sant Gervasi - la Bonanova', 'Sant Gervasi - Galvany', 'el Putxet i el Farró', 'Vallcarca i els Penitents', 'el Coll', 'la Salut', 'la Vila de Gràcia', "el Camp d'en Grassot i Gràcia Nova", 'el Baix Guinardó', 'Can Baró', 'el Guinardó', "la Font d'en Fargues", 'el Carmel', 'la Teixonera', 'Sant Genís dels Agudells', 'Montbau', "la Vall d'Hebron", 'la Clota', 'Horta', 'Vilapicina i la Torre Llobeta', 'Porta', 'el Turó de la Peira', 'Can Peguera', 'la Guineueta', 'Canyelles', 'les Roquetes', 'Verdun', 'la Prosperitat', 'la Trinitat Nova', 'Torre Baró', 'Ciutat Meridiana', 'Vallbona', 'la Trinitat Vella', 'Baró de Viver', 'el Bon Pastor', 'Sant Andreu', 'la Sagrera', 'el Congrés i els Indians', 'Navas', "el Camp de l'Arpa del Clot", 'el Clot', 'el Parc i la Llacuna del Poblenou', 'la Vila Olímpica del Poblenou', 'el Poblenou', 'Diagonal Mar i el Front Marítim del Poblenou', 'el Besòs i el Maresme', 'Provençals del Poblenou', 'Sant Martí de Provençals', 'la Verneda i la Pau']
df.neighbourhood = df.neighbourhood.astype('category')
# Append only the levels that are not registered yet, in list order. A set
# difference would make the order of the new levels (and therefore of the
# n_* dummy columns the fitted models see) depend on string hashing, and
# re-adding existing levels on a second run would raise ValueError.
missing_neighbourhoods = [level for level in neighbourhoods if level not in df.neighbourhood.cat.categories]
df.neighbourhood = df.neighbourhood.cat.add_categories(missing_neighbourhoods)

In [14]:
# Every conservation state the encoding should expose, even if this extract
# of the data does not contain it.
conservation_states = ['New construction', 'Nearly new', 'Very good', 'Good', 'Renovated', 'To renovate']
df.conservation_state = df.conservation_state.astype('category')
# Append only the levels that are not registered yet, in list order, so the
# resulting column order is deterministic and the cell can be re-run without
# add_categories raising ValueError on levels that already exist.
missing_conservation_states = [level for level in conservation_states if level not in df.conservation_state.cat.categories]
df.conservation_state = df.conservation_state.cat.add_categories(missing_conservation_states)

In [15]:
# Every district the encoding should expose, even if this extract of the data
# does not contain it.
districtes = ['Ciutat Vella', 'Eixample', 'Sants-Montjuïc', 'Les Corts', 'Sarrià-Sant Gervasi', 'Gràcia', 'Horta-Guinardó', 'Nou Barris', 'Sant Andreu', 'Sant Martí']
df.districte = df.districte.astype('category')
# Append only the levels that are not registered yet, in list order, so the
# resulting column order is deterministic and the cell can be re-run without
# add_categories raising ValueError on levels that already exist.
missing_districtes = [level for level in districtes if level not in df.districte.cat.categories]
df.districte = df.districte.cat.add_categories(missing_districtes)

##### One-hot encoding of some variables

In [16]:
# Build one block of dummy columns per categorical variable, each with its own
# short prefix (bs, cs, d, n).
ohe_bs, ohe_cs, ohe_d, ohe_n = (
    pd.get_dummies(df[column], prefix=prefix)
    for column, prefix in [
        ('building_subtype', 'bs'),
        ('conservation_state', 'cs'),
        ('districte', 'd'),
        ('neighbourhood', 'n'),
    ]
)
# Append the dummy blocks to the frame and discard the original categorical columns.
df = pd.concat([df, ohe_bs, ohe_cs, ohe_d, ohe_n], axis=1).drop(
    columns=['building_subtype', 'conservation_state', 'districte', 'neighbourhood']
)
df.columns

Index(['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters',
       'superficie', 'poblacio', 'furt', 'estafes', 'danys',
       ...
       'n_les Roquetes', 'n_les Tres Torres', 'n_Torre Baró', 'n_Porta',
       'n_Sants - Badal', 'n_Baró de Viver', 'n_la Marina del Prat Vermell',
       'n_la Vila Olímpica del Poblenou', 'n_la Clota',
       'n_Provençals del Poblenou'],
      dtype='object', length=130)

In [17]:
# Map each column position to its column name, for reference when picking subsets.
num = iter(range(len(df.columns)))
dictio = {position: column for position, column in zip(num, df.columns)}
dictio

{0: 'bathrooms',
 1: 'floor_elevator',
 2: 'price',
 3: 'rooms',
 4: 'sq_meters',
 5: 'superficie',
 6: 'poblacio',
 7: 'furt',
 8: 'estafes',
 9: 'danys',
 10: 'rob_viol_intim',
 11: 'rob_en_vehicle',
 12: 'rob_força',
 13: 'lesions',
 14: 'aprop_indeg',
 15: 'amenaces',
 16: 'rob_de_vehicle',
 17: 'ocupacions',
 18: 'salut_pub',
 19: 'abusos_sex',
 20: 'entrada_domicili',
 21: 'agressio_sex',
 22: 'conviv_veinal',
 23: 'vigilancia_poli',
 24: 'molesties_espai_pub',
 25: 'contra_prop_priv',
 26: 'incendis',
 27: 'estupefaents',
 28: 'agressions',
 29: 'proves_alcohol',
 30: 'proves_droga',
 31: 'bs_Apartment',
 32: 'bs_Attic',
 33: 'bs_Duplex',
 34: 'bs_Flat',
 35: 'bs_GroundFloorWithGarden',
 36: 'bs_House_Chalet',
 37: 'bs_Loft',
 38: 'bs_SemiDetached',
 39: 'bs_SemidetachedHouse',
 40: 'bs_Study',
 41: 'cs_Good',
 42: 'cs_Nearly new',
 43: 'cs_New construction',
 44: 'cs_Renovated',
 45: 'cs_To renovate',
 46: 'cs_Very good',
 47: 'd_Ciutat Vella',
 48: 'd_Eixample',
 49: 'd_Gràcia

#### Create several dataframes with different columns

In [18]:
# Without crime variables, without district, without neighbourhood:
# flat properties + building-subtype dummies + conservation-state dummies.
df_cn_dn = df[
    ['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters']
    + ['bs_Apartment', 'bs_Attic', 'bs_Duplex', 'bs_Flat', 'bs_GroundFloorWithGarden',
       'bs_House_Chalet', 'bs_Loft', 'bs_SemiDetached', 'bs_SemidetachedHouse', 'bs_Study']
    + ['cs_Good', 'cs_Nearly new', 'cs_New construction', 'cs_Renovated',
       'cs_To renovate', 'cs_Very good']
]
df_cn_dn.columns

Index(['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters',
       'bs_Apartment', 'bs_Attic', 'bs_Duplex', 'bs_Flat',
       'bs_GroundFloorWithGarden', 'bs_House_Chalet', 'bs_Loft',
       'bs_SemiDetached', 'bs_SemidetachedHouse', 'bs_Study', 'cs_Good',
       'cs_Nearly new', 'cs_New construction', 'cs_Renovated',
       'cs_To renovate', 'cs_Very good'],
      dtype='object')

In [19]:
# Without crime variables, with district, without neighbourhood:
# flat properties + subtype/state dummies + district area, population and dummies.
df_cn_dy = df[
    ['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters']
    + ['bs_Apartment', 'bs_Attic', 'bs_Duplex', 'bs_Flat', 'bs_GroundFloorWithGarden',
       'bs_House_Chalet', 'bs_Loft', 'bs_SemiDetached', 'bs_SemidetachedHouse', 'bs_Study']
    + ['cs_Good', 'cs_Nearly new', 'cs_New construction', 'cs_Renovated',
       'cs_To renovate', 'cs_Very good']
    + ['superficie', 'poblacio']
    + ['d_Ciutat Vella', 'd_Eixample', 'd_Gràcia', 'd_Horta-Guinardó', 'd_Les Corts',
       'd_Nou Barris', 'd_Sant Andreu', 'd_Sant Martí', 'd_Sants-Montjuïc',
       'd_Sarrià-Sant Gervasi']
]
df_cn_dy.columns

Index(['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters',
       'bs_Apartment', 'bs_Attic', 'bs_Duplex', 'bs_Flat',
       'bs_GroundFloorWithGarden', 'bs_House_Chalet', 'bs_Loft',
       'bs_SemiDetached', 'bs_SemidetachedHouse', 'bs_Study', 'cs_Good',
       'cs_Nearly new', 'cs_New construction', 'cs_Renovated',
       'cs_To renovate', 'cs_Very good', 'superficie', 'poblacio',
       'd_Ciutat Vella', 'd_Eixample', 'd_Gràcia', 'd_Horta-Guinardó',
       'd_Les Corts', 'd_Nou Barris', 'd_Sant Andreu', 'd_Sant Martí',
       'd_Sants-Montjuïc', 'd_Sarrià-Sant Gervasi'],
      dtype='object')

In [20]:
# With crime variables, without district, without neighbourhood:
# flat properties + subtype/state dummies + the crime counters.
df_cy_dn = df[
    ['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters']
    + ['bs_Apartment', 'bs_Attic', 'bs_Duplex', 'bs_Flat', 'bs_GroundFloorWithGarden',
       'bs_House_Chalet', 'bs_Loft', 'bs_SemiDetached', 'bs_SemidetachedHouse', 'bs_Study']
    + ['cs_Good', 'cs_Nearly new', 'cs_New construction', 'cs_Renovated',
       'cs_To renovate', 'cs_Very good']
    + ['furt', 'estafes', 'danys', 'rob_viol_intim', 'rob_en_vehicle', 'rob_força',
       'lesions', 'aprop_indeg', 'amenaces', 'rob_de_vehicle', 'ocupacions', 'salut_pub',
       'abusos_sex', 'entrada_domicili', 'agressio_sex', 'conviv_veinal',
       'vigilancia_poli', 'molesties_espai_pub', 'contra_prop_priv', 'incendis',
       'estupefaents', 'agressions', 'proves_alcohol', 'proves_droga']
]
df_cy_dn.columns

Index(['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters',
       'bs_Apartment', 'bs_Attic', 'bs_Duplex', 'bs_Flat',
       'bs_GroundFloorWithGarden', 'bs_House_Chalet', 'bs_Loft',
       'bs_SemiDetached', 'bs_SemidetachedHouse', 'bs_Study', 'cs_Good',
       'cs_Nearly new', 'cs_New construction', 'cs_Renovated',
       'cs_To renovate', 'cs_Very good', 'furt', 'estafes', 'danys',
       'rob_viol_intim', 'rob_en_vehicle', 'rob_força', 'lesions',
       'aprop_indeg', 'amenaces', 'rob_de_vehicle', 'ocupacions', 'salut_pub',
       'abusos_sex', 'entrada_domicili', 'agressio_sex', 'conviv_veinal',
       'vigilancia_poli', 'molesties_espai_pub', 'contra_prop_priv',
       'incendis', 'estupefaents', 'agressions', 'proves_alcohol',
       'proves_droga'],
      dtype='object')

In [21]:
# With crime variables, with district, without neighbourhood (i.e. with everything except for neighbourhood)
# Select by prefix rather than by a label slice ending at 'd_Sarrià-Sant Gervasi':
# the slice silently drops any district dummy that is positioned after that
# column (e.g. a district level appended because it was absent from the data).
# Only the neighbourhood dummies carry the 'n_' prefix.
df_cy_dy = df.loc[:, [column for column in df.columns if not column.startswith('n_')]]
df_cy_dy.columns

Index(['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters',
       'superficie', 'poblacio', 'furt', 'estafes', 'danys', 'rob_viol_intim',
       'rob_en_vehicle', 'rob_força', 'lesions', 'aprop_indeg', 'amenaces',
       'rob_de_vehicle', 'ocupacions', 'salut_pub', 'abusos_sex',
       'entrada_domicili', 'agressio_sex', 'conviv_veinal', 'vigilancia_poli',
       'molesties_espai_pub', 'contra_prop_priv', 'incendis', 'estupefaents',
       'agressions', 'proves_alcohol', 'proves_droga', 'bs_Apartment',
       'bs_Attic', 'bs_Duplex', 'bs_Flat', 'bs_GroundFloorWithGarden',
       'bs_House_Chalet', 'bs_Loft', 'bs_SemiDetached', 'bs_SemidetachedHouse',
       'bs_Study', 'cs_Good', 'cs_Nearly new', 'cs_New construction',
       'cs_Renovated', 'cs_To renovate', 'cs_Very good', 'd_Ciutat Vella',
       'd_Eixample', 'd_Gràcia', 'd_Horta-Guinardó', 'd_Les Corts',
       'd_Nou Barris', 'd_Sant Andreu', 'd_Sant Martí', 'd_Sants-Montjuïc',
       'd_Sarrià-Sant Gervasi'],
      dty

In [22]:
# With crime variables, with district, with neighbourhood (i.e. with everything)
# Note: this binds a second name to the same DataFrame object as `df`; no copy is made.
df_neigh = df
df_neigh.columns

Index(['bathrooms', 'floor_elevator', 'price', 'rooms', 'sq_meters',
       'superficie', 'poblacio', 'furt', 'estafes', 'danys',
       ...
       'n_les Roquetes', 'n_les Tres Torres', 'n_Torre Baró', 'n_Porta',
       'n_Sants - Badal', 'n_Baró de Viver', 'n_la Marina del Prat Vermell',
       'n_la Vila Olímpica del Poblenou', 'n_la Clota',
       'n_Provençals del Poblenou'],
      dtype='object', length=130)

### Modelling

#### Dataframe 1
Without crime variables, without district, without neighbourhood

##### Divide between train and validation sets

In [23]:
# Separate the target ('price') from the predictors of dataframe 1.
y = df_cn_dn['price']
X = df_cn_dn.drop(columns='price')

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [25]:
# Sanity check: sizes of the train and validation parts (X and y of each).
print(*(len(part) for part in (X_train, y_train, X_val, y_val)))

7518 7518 1880 1880


##### SVR model

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [27]:
# Standardise the features, then fit a support vector regressor on the scaled values.
clf = make_pipeline(StandardScaler(), SVR(gamma='auto')).fit(X_train, y_train)
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(gamma='auto'))])

In [28]:
# Validation-set score of the SVR pipeline (presumably R^2, the default
# regressor metric in scikit-learn - confirm).
score1 = clf.score(X_val, y_val)
score1

0.18556884376704064

##### Linear regression model

In [29]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model fitted on the raw (unscaled) training features.
reg = LinearRegression()
reg.fit(X_train, y_train);

In [30]:
# Validation-set score of the linear regression (presumably R^2 - confirm).
score2 = reg.score(X_val, y_val)
score2

0.559740570882381

##### K Neighbors regressor

In [31]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): unlike the SVR pipeline, the features are not standardised
# before this distance-based model is fitted - confirm this is intended.
neigh = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)
neigh

KNeighborsRegressor(n_neighbors=8)

In [32]:
# Validation-set score of the K-neighbours regressor (presumably R^2 - confirm).
score3 = neigh.score(X_val, y_val)
score3

0.5574049807391436

##### Store scores

In [33]:
# Start the results dict: dataframe name -> [SVR, linear regression, KNN] validation scores.
scores = dict(df_cn_dn=[score1, score2, score3])

---

#### Dataframe 2
Without crime variables, with district, without neighbourhood

##### Divide between train and validation sets

In [34]:
# Separate the target ('price') from the predictors of dataframe 2.
y = df_cn_dy['price']
X = df_cn_dy.drop(columns='price')

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [36]:
# Sanity check: sizes of the train and validation parts (X and y of each).
print(*(len(part) for part in (X_train, y_train, X_val, y_val)))

7518 7518 1880 1880


##### SVR model

In [37]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [38]:
# Standardise the features, then fit a support vector regressor on the scaled values.
clf = make_pipeline(StandardScaler(), SVR(gamma='auto')).fit(X_train, y_train)
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(gamma='auto'))])

In [39]:
# Validation-set score of the SVR pipeline (presumably R^2, the default
# regressor metric in scikit-learn - confirm).
score1 = clf.score(X_val, y_val)
score1

0.13126077658777424

##### Linear regression model

In [40]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model fitted on the raw (unscaled) training features.
reg = LinearRegression()
reg.fit(X_train, y_train);

In [41]:
# Validation-set score of the linear regression (presumably R^2 - confirm).
score2 = reg.score(X_val, y_val)
score2

0.5779942269756002

##### K Neighbors regressor

In [42]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): unlike the SVR pipeline, the features are not standardised
# before this distance-based model is fitted - confirm this is intended.
neigh = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)
neigh

KNeighborsRegressor(n_neighbors=8)

In [43]:
# Validation-set score of the K-neighbours regressor (presumably R^2 - confirm).
score3 = neigh.score(X_val, y_val)
score3

0.5757711231714306

##### Store scores

In [44]:
# Record this dataframe's [SVR, linear regression, KNN] validation scores.
scores.update(df_cn_dy=[score1, score2, score3])
scores

{'df_cn_dn': [0.18556884376704064, 0.559740570882381, 0.5574049807391436],
 'df_cn_dy': [0.13126077658777424, 0.5779942269756002, 0.5757711231714306]}

---

#### Dataframe 3
With crime variables, without district, without neighbourhood

##### Divide between train and validation sets

In [45]:
# Separate the target ('price') from the predictors of dataframe 3.
y = df_cy_dn['price']
X = df_cy_dn.drop(columns='price')

In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [47]:
# Sanity check: sizes of the train and validation parts (X and y of each).
print(*(len(part) for part in (X_train, y_train, X_val, y_val)))

7518 7518 1880 1880


##### SVR model

In [48]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [49]:
# Standardise the features, then fit a support vector regressor on the scaled values.
clf = make_pipeline(StandardScaler(), SVR(gamma='auto')).fit(X_train, y_train)
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(gamma='auto'))])

In [50]:
# Validation-set score of the SVR pipeline (presumably R^2, the default
# regressor metric in scikit-learn - confirm).
score1 = clf.score(X_val, y_val)
score1

0.11040112825122961

##### Linear regression model

In [51]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model fitted on the raw (unscaled) training features.
reg = LinearRegression()
reg.fit(X_train, y_train);

In [52]:
# Validation-set score of the linear regression (presumably R^2 - confirm).
score2 = reg.score(X_val, y_val)
score2

0.5779942269755974

##### K Neighbors regressor

In [53]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): unlike the SVR pipeline, the features are not standardised
# before this distance-based model is fitted - confirm this is intended.
neigh = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)
neigh

KNeighborsRegressor(n_neighbors=8)

In [54]:
# Validation-set score of the K-neighbours regressor (presumably R^2 - confirm).
score3 = neigh.score(X_val, y_val)
score3

0.5747284544236343

##### Store scores

In [55]:
# Record this dataframe's [SVR, linear regression, KNN] validation scores.
scores.update(df_cy_dn=[score1, score2, score3])
scores

{'df_cn_dn': [0.18556884376704064, 0.559740570882381, 0.5574049807391436],
 'df_cn_dy': [0.13126077658777424, 0.5779942269756002, 0.5757711231714306],
 'df_cy_dn': [0.11040112825122961, 0.5779942269755974, 0.5747284544236343]}

---

#### Dataframe 4
With crime variables, with district, without neighbourhood (i.e. with everything except for neighbourhood)

##### Divide between train and validation sets

In [56]:
# Separate the target ('price') from the predictors of dataframe 4.
y = df_cy_dy['price']
X = df_cy_dy.drop(columns='price')

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [58]:
# Sanity check: sizes of the train and validation parts (X and y of each).
print(*(len(part) for part in (X_train, y_train, X_val, y_val)))

7518 7518 1880 1880


##### SVR model

In [59]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [60]:
# Standardise the features, then fit a support vector regressor on the scaled values.
clf = make_pipeline(StandardScaler(), SVR(gamma='auto')).fit(X_train, y_train)
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(gamma='auto'))])

In [61]:
# Validation-set score of the SVR pipeline (presumably R^2, the default
# regressor metric in scikit-learn - confirm).
score1 = clf.score(X_val, y_val)
score1

0.0899753409979892

##### Linear regression model

In [62]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model fitted on the raw (unscaled) training features.
reg = LinearRegression()
reg.fit(X_train, y_train);

In [63]:
# Validation-set score of the linear regression (presumably R^2 - confirm).
score2 = reg.score(X_val, y_val)
score2

0.5779942269756129

##### K Neighbors regressor

In [64]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): unlike the SVR pipeline, the features are not standardised
# before this distance-based model is fitted - confirm this is intended.
neigh = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)
neigh

KNeighborsRegressor(n_neighbors=8)

In [65]:
# Validation-set score of the K-neighbours regressor (presumably R^2 - confirm).
score3 = neigh.score(X_val, y_val)
score3

0.5757711231714306

##### Store scores

In [66]:
# Record this dataframe's [SVR, linear regression, KNN] validation scores.
scores.update(df_cy_dy=[score1, score2, score3])
scores

{'df_cn_dn': [0.18556884376704064, 0.559740570882381, 0.5574049807391436],
 'df_cn_dy': [0.13126077658777424, 0.5779942269756002, 0.5757711231714306],
 'df_cy_dn': [0.11040112825122961, 0.5779942269755974, 0.5747284544236343],
 'df_cy_dy': [0.0899753409979892, 0.5779942269756129, 0.5757711231714306]}

---

#### Dataframe 5
With crime variables, with district, with neighbourhood (i.e. with everything)

##### Divide between train and validation sets

In [67]:
# Separate the target ('price') from the predictors of dataframe 5.
y = df_neigh['price']
X = df_neigh.drop(columns='price')

In [68]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [69]:
# Sanity check: sizes of the train and validation parts (X and y of each).
print(*(len(part) for part in (X_train, y_train, X_val, y_val)))

7518 7518 1880 1880


##### SVR model

In [70]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [71]:
# Standardise the features, then fit a support vector regressor on the scaled values.
clf = make_pipeline(StandardScaler(), SVR(gamma='auto')).fit(X_train, y_train)
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(gamma='auto'))])

In [72]:
# Validation-set score of the SVR pipeline (presumably R^2, the default
# regressor metric in scikit-learn - confirm).
score1 = clf.score(X_val, y_val)
score1

0.05379930798696131

##### Linear regression model

In [73]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model fitted on the raw (unscaled) training features.
reg = LinearRegression()
reg.fit(X_train, y_train);

In [74]:
# Validation-set score of the linear regression (presumably R^2 - confirm).
score2 = reg.score(X_val, y_val)
score2

0.5951750863009462

##### K Neighbors regressor

In [75]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): unlike the SVR pipeline, the features are not standardised
# before this distance-based model is fitted - confirm this is intended.
neigh = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)
neigh

KNeighborsRegressor(n_neighbors=8)

In [76]:
# Validation-set score of the K-neighbours regressor (presumably R^2 - confirm).
score3 = neigh.score(X_val, y_val)
score3

0.5860647421092329

##### Store scores

In [77]:
# Record this dataframe's [SVR, linear regression, KNN] validation scores.
scores.update(df_neigh=[score1, score2, score3])
scores

{'df_cn_dn': [0.18556884376704064, 0.559740570882381, 0.5574049807391436],
 'df_cn_dy': [0.13126077658777424, 0.5779942269756002, 0.5757711231714306],
 'df_cy_dn': [0.11040112825122961, 0.5779942269755974, 0.5747284544236343],
 'df_cy_dy': [0.0899753409979892, 0.5779942269756129, 0.5757711231714306],
 'df_neigh': [0.05379930798696131, 0.5951750863009462, 0.5860647421092329]}

---

### Final scores

In [78]:
# Turn the {dataframe name: [SVR, linear, KNN]} dict into a table with one row
# per dataframe and one column per model. The isinstance guard keeps the cell
# safe to re-run: once `scores` is a DataFrame it is left as is, instead of
# being transposed again and failing on the three-name column assignment.
if isinstance(scores, dict):
    scores = pd.DataFrame.from_dict(
        scores,
        orient='index',
        columns=['SVR model', 'Linear Regressor', 'K Neighbors Regressor'],
    )
scores

Unnamed: 0,SVR model,Linear Regressor,K Neighbors Regressor
df_cn_dn,0.185569,0.559741,0.557405
df_cn_dy,0.131261,0.577994,0.575771
df_cy_dn,0.110401,0.577994,0.574728
df_cy_dy,0.089975,0.577994,0.575771
df_neigh,0.053799,0.595175,0.586065


---

### Dump final model in pickle file

In [79]:
import pickle

# Serialise `reg` (the linear regression fitted most recently above) to disk.
pkl_filename = "final_model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(reg, model_file)

---

## Conclusions
We have observed that the best model for our dataset is the Linear Regression model. However, it performs very similarly to the K Neighbors Regressor. The third model presented (SVR), as well as some others that we tried, performs equal to or worse than the former two.

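We can also see that the most relevant variables are in fact those in the original dataset (i.e. the flat properties such as the number of rooms, bathrooms, square meters...). Neither the neighbourhood and district where the flat is located, nor their characteristics such as criminality, add any significant amount of information to the model (the validation score rises from around 0.56 to around 0.60 in the best case). Note that the crime variables appear to be recorded per district, so for the linear model they carry the same information as the district dummies; this is why dataframes 2, 3 and 4 obtain the same Linear Regression score.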

Therefore, we can conclude that, surprisingly, the district (or the neighbourhood) does not significantly affect the price of the flat. Or, alternatively, that the district differences are implicitly included in the main characteristics of the flat.