In [20]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the raw listings export and preview the first rows

csv_path = 'LandRegistry.csv'
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,Link,Title,indirizzo,riferimento e Data annuncio,contratto,tipologia,superficie,locali,piano,totale piani edificio,...,prezzo,spese condominio,anno di costruzione,stato,riscaldamento,Efficienza energetica,Climatizzatore,immobile garantito,Posti Auto,Owner
0,https://www.immobiliare.it/annunci/91836408/,"Quadrilocale via Pietro Panzeri 15, Ticinese, ...","Milano,Ticinese,Via Pietro Panzeri, 15",EK-91836408 - 25/11/2021,Vendita,Appartamento,167 m²,"4 (3 camere da letto, 1 altro), 2 bagni, cucin...","Piano terra, con ascensore",6 piani,...,€ 560.000,€ 250/mese,1960.0,Ottimo / Ristrutturato,"Autonomo, ad aria, alimentazione elettrica",G ≥ 175 kWh/m² anno,"Autonomo, freddo/caldo",,,Karen Moore
1,https://www.immobiliare.it/annunci/92317086/,"Quadrilocale via Francesco Primaticcio 140, Ba...","Milano,Bande Nere,Via Francesco Primaticcio 140",00324 - 29/11/2021,Vendita,Appartamento,130 m²,"4 (3 camere da letto, 1 altro), 2 bagni, cucin...","7°, con ascensore, con accesso disabili",8 piani,...,€ 564.000,€ 333/mese,1980.0,Buono / Abitabile,"Centralizzato, a radiatori, alimentato a metano","G 175,00 kWh/m² anno","Autonomo, freddo","Dati certificabili, documentazione completa",1 in garage/box,Dwight Hardaway
2,https://www.immobiliare.it/annunci/86462824/,"Trilocale via Ruggiero Settimo, Washington, Mi...","Milano,Washington,Via Ruggiero Settimo",T3212I - 11/11/2021,Vendita,Appartamento,60 m²,"3 (2 camere da letto, 1 altro), 1 bagno, cucin...","1°, con ascensore",8 piani,...,€ 420.000,€ 120/mese,1970.0,Ottimo / Ristrutturato,"Centralizzato, a radiatori, alimentato a metano","G 224,41 kWh/m² anno","Autonomo, freddo/caldo","Dati certificabili, documentazione completa",,Marcus Hunter


In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Link                         451 non-null    object 
 1   Title                        451 non-null    object 
 2   indirizzo                    451 non-null    object 
 3   riferimento e Data annuncio  451 non-null    object 
 4   contratto                    451 non-null    object 
 5   tipologia                    451 non-null    object 
 6   superficie                   451 non-null    object 
 7   locali                       451 non-null    object 
 8   piano                        451 non-null    object 
 9   totale piani edificio        450 non-null    object 
 10  disponibilità                384 non-null    object 
 11  Tipo proprietà               451 non-null    object 
 12  altre caratteristiche        450 non-null    object 
 13  prezzo              

In [4]:
# Discard the listing fields that are of no use or that our registry does not provide

unused_columns = [
    'Link', 'Title', 'indirizzo', 'riferimento e Data annuncio', 'contratto', 'tipologia',
    'altre caratteristiche', 'spese condominio', 'stato', 'totale piani edificio',
    'disponibilità', 'Tipo proprietà', 'anno di costruzione', 'riscaldamento',
    'Efficienza energetica', 'Climatizzatore', 'immobile garantito', 'Posti Auto', 'Owner',
]

# Italian -> English names for the columns that are kept
english_names = {'superficie': 'Surface', 'locali': 'Rooms', 'piano': 'Floor', 'prezzo': 'Price'}

data = df.drop(columns=unused_columns).rename(columns=english_names)
data.head(3)

Unnamed: 0,Surface,Rooms,Floor,Price
0,167 m²,"4 (3 camere da letto, 1 altro), 2 bagni, cucin...","Piano terra, con ascensore",€ 560.000
1,130 m²,"4 (3 camere da letto, 1 altro), 2 bagni, cucin...","7°, con ascensore, con accesso disabili",€ 564.000
2,60 m²,"3 (2 camere da letto, 1 altro), 1 bagno, cucin...","1°, con ascensore",€ 420.000


In [5]:
# Check the reduced frame before cleaning: non-null counts and dtypes of the kept columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Surface  451 non-null    object
 1   Rooms    451 non-null    object
 2   Floor    451 non-null    object
 3   Price    451 non-null    object
dtypes: object(4)
memory usage: 14.2+ KB


In [6]:
# Clean Surface
#
# Raw values are text such as "167 m²"; keep the first number as an int.

def _parse_surface(raw):
    """Return the first integer found in a surface description.

    A '.' followed by exactly three digits is treated as a thousands
    separator ("1.200 m²" -> 1200). The value is passed through ``str`` first,
    so an already cleaned integer parses to itself and the cell can be re-run.

    Raises:
        ValueError: if the text contains no number at all, naming the
            offending value instead of failing with a bare IndexError.
    """
    match = re.search(r'\d+(?:\.\d{3}(?!\d))*', str(raw))
    if match is None:
        raise ValueError(f"No surface area found in {raw!r}")
    return int(match.group().replace('.', ''))

# Series.map works on values, so it does not depend on the index being 0..n-1
data['Surface'] = data['Surface'].map(_parse_surface)

In [7]:
# Clean Rooms
#
# Raw values look like "4 (3 camere da letto, 1 altro), 2 bagni, ..."; the
# leading number is the room count and is the only part kept.

def _parse_rooms(raw):
    """Return the first run of digits in a rooms description as an int.

    Matching digits directly (rather than whole whitespace-separated tokens)
    means a count fused with punctuation, e.g. "5+", is read as 5 instead of
    being skipped in favour of a later number in the text.
    NOTE(review): "5+" style values are assumed from the listing format and
    are not visible in the preview - confirm against the raw data.

    The value is passed through ``str`` first, so an already cleaned count
    parses to itself and the cell can be re-run.

    Raises:
        ValueError: if the text contains no number at all.
    """
    match = re.search(r'\d+', str(raw))
    if match is None:
        raise ValueError(f"No room count found in {raw!r}")
    return int(match.group())

# Series.map works on values, so it does not depend on the index being 0..n-1
data['Rooms'] = data['Rooms'].map(_parse_rooms)
    

In [8]:
# Clean Floor
#
# Raw values look like "7°, con ascensore" or "Piano terra, con ascensore".

def _parse_floor(raw):
    """Map a floor description to its floor number, 0, or 'Others'.

    * text containing digits       -> the first number, as an int
    * text containing "Piano terra" -> 0 (ground floor)
    * anything else                 -> 'Others' (basement / others)

    The ground-floor test must be positive: the earlier negated check sent
    every non-"Piano terra" description to 0 and the real ground floors to
    'Others'. The value is passed through ``str`` first, so already cleaned
    values (ints or 'Others') map to themselves and the cell can be re-run.
    """
    text = str(raw)
    number = re.search(r'\d+', text)
    if number:
        return int(number.group())
    if re.search(r'Piano terra', text):
        return 0  # Ground floor
    return 'Others'  # Basement / others

# Series.map works on values, so it does not depend on the index being 0..n-1
data['Floor'] = data['Floor'].map(_parse_floor)

In [9]:
# Clean Price
#
# Raw prices are text such as "€ 560.000" ('.' is the thousands separator).
# Rows whose price is the literal 'Prezzo su richiesta' carry no number and
# are dropped; any other non-numeric text still raises in float().
#
# The numeric-dtype guard makes the cell safe to re-run: turning an already
# cleaned float back into text gives e.g. '560000.0', and stripping that '.'
# as a thousands separator would silently multiply every price by 10.

if not pd.api.types.is_numeric_dtype(data['Price']):
    price_text = (
        data['Price']
        .apply(str)
        .str.replace('€', '', regex=False)
        .str.replace('.', '', regex=False)  # literal dot, not a regex wildcard
        .str.strip()
    )
    has_price = price_text != 'Prezzo su richiesta'
    # Filter once with a boolean mask instead of dropping rows inside a loop
    data = data.loc[has_price].assign(Price=price_text[has_price].apply(float))

data = data.reset_index(drop=True)

In [10]:
# Fix column types: Surface becomes an integer, Rooms and Floor become text labels

data = data.assign(
    Surface=lambda frame: frame['Surface'].map(int),
    Rooms=lambda frame: frame['Rooms'].map(str),
    Floor=lambda frame: frame['Floor'].map(str),
)

In [11]:
# Verify the cleaned frame: row count after dropping rows and the final dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449 entries, 0 to 448
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Surface  449 non-null    int64  
 1   Rooms    449 non-null    object 
 2   Floor    449 non-null    object 
 3   Price    449 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 14.2+ KB


In [12]:
# Separate the regressors from the target; the target is modelled on a log scale

X = data.drop(columns=['Price'])
y = np.log(data['Price'].to_numpy())  # y = log(€)

In [13]:
# Hold out 20% of the rows for evaluation (shuffled, fixed seed)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [14]:
# Column selector
#
# Columns are routed by dtype: numeric columns are standardised, object
# (text) columns are one-hot encoded.
#
# handle_unknown='ignore' makes a category that was not present at fit time
# encode as an all-zero block; with the default ('error') a single unseen
# Rooms/Floor label makes predict() raise.

encoder = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)), # num_columns
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))) # cat_columns


In [15]:
class DenseTransformer():
    """Pipeline step that turns the previous step's output into a dense array.

    The step is stateless: ``fit`` learns nothing and ``transform`` only
    changes the container type, never the values.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a plain ``numpy.ndarray``.

        Sparse input is converted with ``toarray()`` rather than ``todense()``:
        ``todense()`` yields a ``numpy.matrix``, which newer scikit-learn
        releases refuse as estimator input. Input that is already dense (no
        ``toarray`` method) is passed through ``np.asarray`` instead of
        failing with AttributeError.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

In [16]:
# Pipeline for HistGradientBoostingRegressor: encode -> densify -> boosted trees

pipeline_steps = [
    ('encoder', encoder),
    ('to_dense', DenseTransformer()),
    ('estimator', HistGradientBoostingRegressor()),
]
hgbr = Pipeline(steps=pipeline_steps)

hgbr.fit(X_train, y_train)

Pipeline(steps=[('encoder',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EAB925D1C0>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EAB925DA00>)])),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x000001EAB925DC10>),
                ('estimator', HistGradientBoostingRegressor())])

In [17]:
# Performance on the held-out split (y is log-price, so the error is in log units)

y_pred = np.array(hgbr.predict(X_test))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("Root mean squared error: %.2f" % rmse)
print("Coefficient of determination: %.2f" % r2)


Root mean squared error: 0.30
Coefficient of determination: 0.78


In [18]:
# Hyperparameters tuning for HGBR

### Commented because no improvements
# The search is disabled by wrapping it in a string literal rather than with
# '#' comments; because that literal is the last expression of the cell, the
# notebook echoes it back as the cell output.
# The grid spans 3 * 9 * 7 * 1 * 3 = 567 parameter combinations.

""" param_grid = {'estimator__learning_rate' : [0.075, 0.1, 0.125],
              'estimator__max_leaf_nodes' : [i for i in range(24, 41, 2)],
              'estimator__min_samples_leaf' : [i for i in range(10, 23, 2)],
              'estimator__max_iter' : [500],
              'estimator__l2_regularization' : [0, 0.05, 0.1]
              }

hgbr_tuned = GridSearchCV(hgbr, param_grid, verbose = 1, n_jobs=-1)
best_hgbr = hgbr_tuned.fit(X_train, y_train)

y_pred = np.array(best_hgbr.predict(X_test))
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred)) """

' param_grid = {\'estimator__learning_rate\' : [0.075, 0.1, 0.125],\n              \'estimator__max_leaf_nodes\' : [i for i in range(24, 41, 2)],\n              \'estimator__min_samples_leaf\' : [i for i in range(10, 23, 2)],\n              \'estimator__max_iter\' : [500],\n              \'estimator__l2_regularization\' : [0, 0.05, 0.1]\n              }\n\nhgbr_tuned = GridSearchCV(hgbr, param_grid, verbose = 1, n_jobs=-1)\nbest_hgbr = hgbr_tuned.fit(X_train, y_train)\n\ny_pred = np.array(best_hgbr.predict(X_test))\nprint("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))\nprint("Coefficient of determination: %.2f" % r2_score(y_test, y_pred)) '

In [19]:
### FINAL MODEL
# Refit the same `hgbr` pipeline object on every row (X, y) instead of only the training split.
# NOTE(review): the metrics printed earlier come from the fit on X_train; after
# this refit no rows are held out, so they are not a test score for this fitted object.

hgbr.fit(X, y)

Pipeline(steps=[('encoder',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EAB925D1C0>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EAB925DA00>)])),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x000001EAB925DC10>),
                ('estimator', HistGradientBoostingRegressor())])

In [21]:
# Save model
#
# The pickled pipeline contains a DenseTransformer instance, so whatever loads
# this file must be able to resolve that class under the same module path it
# had here before unpickling.
filename = 'hgbr_final.pickle'

# The context manager closes (and flushes) the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(hgbr, model_file)