In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('LandRegistry.csv')

In [3]:
data = df.drop(['Title','Link','riscaldamento','Climatizzatore','anno di costruzione','Tipo proprietà','Title','indirizzo', 'riferimento e Data annuncio', 'contratto', 'tipologia','Owner','immobile garantito','Efficienza energetica', 'disponibilità','spese condominio' ],axis=1)

In [4]:
# clean locali
for i in range(len(data['locali'])):
    a_string = data['locali'][i]
    numbers = []
    for word in a_string.split():
       if word.isdigit():
          numbers.append(int(word))
    data['locali'][i] = numbers[-(len(numbers))]
    

In [5]:
# clean superficie
for i in range(len(data['superficie'])):
    a_string = data['superficie'][i]
    numbers = []
    for word in a_string.split():
       if word.isdigit():
          numbers.append(int(word))
    data['superficie'][i] = numbers[-(len(numbers))]

In [6]:
# transform cat values in dummy vars (altre car, tipo proprietà, stato, riscaldamento, Climatizzatore, Posti Auto)
data=pd.get_dummies(data, columns= ['altre caratteristiche'])
data=pd.get_dummies(data, columns= ['stato'])
data=pd.get_dummies(data, columns= ['totale piani edificio'])
data=pd.get_dummies(data, columns= ['Posti Auto'])


In [7]:
data = data[data.prezzo != 'Prezzo su richiesta']

In [8]:
data=pd.get_dummies(data, columns= ['piano'])

In [9]:
X = data.drop(['prezzo'], axis=1)
#y = np.array(data['prezzo'].str.replace('\.','').astype(int)) / np.array(data['superficie'])  'y = €/m2'
y = np.log(np.array(data['prezzo'].str.replace('\.','').astype(int))) # 'y = log(€)' (BEST)
#y = np.array(data['prezzo'].str.replace('\.','').astype(int))  'y = €'

In [10]:
model = linear_model.LinearRegression()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [12]:
fit=model.fit(X_train, y_train)

In [13]:
y_pred = np.array(model.predict(X_test))

In [14]:
from sklearn import metrics

print(metrics.mean_absolute_error(y_test,y_pred))

0.23207882991372572


In [15]:
y.mean()

13.143708957416122

In [16]:
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [17]:
# The coefficients
print("Coefficients: \n", model.coef_)
# The mean squared error
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))


Coefficients: 
 [ 8.77016452e-03  8.49358896e-02 -1.52186299e-01  2.65132227e-01
  1.68104277e-01  1.42575015e-01  4.08810753e-01  1.51262844e-01
  2.18868741e-01  4.28648682e-01  2.11723614e-01  1.20126125e-01
  1.88242280e-01 -1.01243182e-01 -1.43773882e-14  2.07111409e-01
  1.73491634e-01 -2.22494226e-01 -1.69605607e-01  5.64328802e-02
 -3.29031529e-02 -1.53186484e-01 -2.92926211e-01 -5.30551668e-01
 -9.04161717e-02  4.38538095e-15 -9.57702362e-02 -1.30093650e-01
 -4.25866684e-01 -2.69229083e-15  6.65455179e-03  4.56817738e-02
  3.91942175e-02 -8.85675894e-03 -5.73757511e-02 -1.30001531e-01
  1.94508707e-01 -7.84640639e-02 -8.15692521e-02 -9.99570629e-02
  3.74860902e-01 -4.30274173e-01  7.03792214e-02 -1.11524878e-01
 -2.68357114e+00  2.43272783e-01  3.51592515e-01 -1.71348174e-02
 -6.10232449e-01 -2.04841615e-01 -1.98258923e-01  1.30594570e-01
  4.28027590e-02  8.54714735e-02  2.54343491e-01 -2.16493490e-15
  1.84638643e-01  3.51393231e-01 -1.68233692e-01  2.15836513e-01
  2.88985

In [18]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 17)
# Train the model on training data
rf.fit(X_train, y_train);


In [19]:
y_predrf = rf.predict(X_test)

In [20]:
# The mean squared error
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_predrf)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_predrf))


Root mean squared error: 0.29
Coefficient of determination: 0.79


In [21]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [22]:
HGBR = HistGradientBoostingRegressor().fit(X, y)

In [23]:
HGBR.score(X, y)

0.8691776799066354

In [24]:
y_pred = np.array(HGBR.predict(X_test))

In [25]:
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))


Root mean squared error: 0.19
Coefficient of determination: 0.91


In [None]:
# HGBR best performance