In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from the CSV file (path is relative to the working directory).
fotocasa = pd.read_csv(filepath_or_buffer="fotocasa_eda.csv")

In [3]:
# Target is the "Price" column; the features are every other column.
y = fotocasa["Price"]
X = fotocasa.drop(columns=["Price"])

# Modeling

### Simple model

In [4]:
import sklearn_dummies as skdm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [5]:
# Estimator used by the pipelines below. Alternatives that can be swapped in:
#   LinearRegression()
#   Ridge(random_state=1492)
#   Lasso(random_state=1492)
model = RandomForestRegressor(
    n_estimators=300,
    oob_score=True,
    random_state=1492,
)

In [6]:
# Hold out 20% of the rows as a test set; the rest is used for CV and fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1492
)

# Encoder step followed by the estimator selected above.
pipeline = Pipeline(steps=[
    ("encoder", skdm.DataFrameDummies()),
    ("model", model),
])

# 5-fold cross-validated negative mean absolute error on the training split.
cv_score = cross_val_score(
    pipeline, X_train, y_train, cv=5, scoring="neg_mean_absolute_error"
)

# Fit on the full training split; the fitted pipeline is the cell's displayed value.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('encoder',
                 <sklearn_dummies.base.DataFrameDummies object at 0x7fa522a295b0>),
                ('model',
                 RandomForestRegressor(n_estimators=300, oob_score=True,
                                       random_state=1492))])

In [7]:
# Cross-validated error relative to the mean training price: 2.5% / 97.5%
# quantiles across the folds, with the mean in parentheses. cv_score holds
# negative mean absolute errors, hence the sign flip. (The label is printed
# as "MEA"; the quantity is the mean absolute error.)
mean_train_price = np.mean(y_train)
print("MEA: ", np.quantile(-cv_score / mean_train_price, [0.025, 0.975]),
      '(', np.mean(-cv_score) / mean_train_price, ')')

# Signed relative error on the held-out test set. The prediction is computed
# once and reused for both the quantiles and the mean absolute value, instead
# of running the forest over X_test twice.
test_rel_err = (y_test - pipeline.predict(X_test)) / y_test
print("Relative error: ", np.quantile(test_rel_err, [0.025, 0.975]),
      '(', np.mean(np.abs(test_rel_err)), ')')

MEA:  [0.14633446 0.15217487] ( 0.14897659984240147 )
Relative error:  [-0.48251111  0.31503356] ( 0.14615696531880262 )


### Model per district

In [8]:
# One pipeline per district: cross-validate and fit on that district's
# training split, and collect its relative residuals on the test split.
models = {}
cv = []
test_resids = []
for district in X["District"].unique():
    # Rows of this district; the "District" column is dropped from the features.
    in_district = X["District"] == district
    X_dis = X[in_district].drop(["District"], axis=1)
    y_dis = y[in_district]
    X_traind, X_testd, y_traind, y_testd = train_test_split(
        X_dis, y_dis, test_size=0.2, random_state=1492)
    # clone() gives each district its own unfitted copy of the estimator.
    # Pipeline.fit fits its steps in place, so passing the shared `model`
    # object would leave every entry of `models` holding the same estimator,
    # refitted on whichever district was processed last.
    pipeline = Pipeline([('encoder', skdm.DataFrameDummies()),
                         ('model', clone(model))])
    cv_score = cross_val_score(pipeline, X_traind, y_traind,
                               scoring="neg_mean_absolute_error", cv=5)
    # Fold scores (negative MAE) scaled by the district's mean training target.
    cv.extend(cv_score / np.mean(y_traind))
    pipeline.fit(X_traind, y_traind)
    # Signed relative residuals on the test split, stored as one list per district.
    test_resids.append(((y_testd.values - pipeline.predict(X_testd)) / y_testd.values).tolist())
    models[district] = pipeline

In [9]:
# The per-district CV entries are negative-MAE ratios, so take absolute values
# once and reuse them for both the quantiles and the mean.
cv_abs = np.abs(cv)
print("MEA: ", np.quantile(cv_abs, [0.025, 0.975]),
      '(', np.mean(cv_abs), ')')

# Flatten the per-district residual lists a single time, in linear time;
# sum(test_resids, []) re-copies the accumulated list for every district
# and was being evaluated twice.
all_resids = [resid for district_resids in test_resids for resid in district_resids]
print("Relative error: ", np.quantile(all_resids, [0.025, 0.975]),
      '(', np.mean(np.abs(all_resids)), ')')

MEA:  [0.0616974  0.20656791] ( 0.1284434058422505 )
Relative error:  [-0.49555216  0.3075642 ] ( 0.1426323768943182 )


Random forest performs better (\~14–15% error) than the other three methods (\~18% error). Fitting a separate model per district seems to work slightly better than the single model.

### Saving the model:

In [10]:
# Fit one pipeline per district on its training split; `models` maps each
# district value to its fitted pipeline and is what gets pickled below.
models = {}
for district in X["District"].unique():
    # Rows of this district; the "District" column is dropped from the features.
    in_district = X["District"] == district
    X_dis = X[in_district].drop(["District"], axis=1)
    y_dis = y[in_district]
    # Same test_size / random_state as the per-district evaluation, so the
    # saved models are trained on the same rows as the ones that were scored.
    X_traind, X_testd, y_traind, y_testd = train_test_split(
        X_dis, y_dis, test_size=0.2, random_state=1492)
    # clone() gives each district its own unfitted copy of the estimator.
    # Pipeline.fit fits its steps in place, so reusing the shared `model`
    # object would make every saved pipeline predict with the estimator
    # fitted on the last district of the loop.
    pipeline = Pipeline([('encoder', skdm.DataFrameDummies()),
                         ('model', clone(model))])
    pipeline.fit(X_traind, y_traind)
    models[district] = pipeline

In [11]:
import pickle

# Persist the per-district pipelines under the 'model' key. The context
# manager closes the file, so the data is flushed to disk even if
# pickle.dump raises.
pickl = {'model': models}
with open('mrm_rf' + ".p", "wb") as model_file:
    pickle.dump(pickl, model_file)