# Data

## Import libraries and Set Options

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, ParameterGrid
from sklearn.preprocessing import (StandardScaler, OneHotEncoder,
                                   PolynomialFeatures)
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error

from daftpy.daftmodel import split_data, scores_statistics, metrics_regression, plot_learning_curves, compare_models

import joblib

In [2]:
# Display every row and column when a DataFrame is rendered (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# The bundled 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6
# and the old name was dropped in later releases, where plt.style.use('seaborn')
# raises. Pick whichever name this matplotlib installation ships.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## Load Data

In [3]:
# Read the sale listings used for modelling and report (rows, columns).
sale_data = pd.read_csv(
    'data_available/sale_data_post_out.csv',
    sep=',',
)
sale_data.shape  # TODO: change this

(7387, 34)

In [4]:
# Columns kept for modelling: the target ('price') followed by its predictors.
# Candidate columns that are currently switched off:
#   bedroom, sale_type, postcode, state_district, county,
#   city_district, road, place, admin1, cities
features = [
    'price',
    'floor_area', 'views', 'latitude', 'longitude', 'bathroom',
    'type_house', 'code',
]

# Take an independent copy so later changes never write back into sale_data.
data = sale_data.loc[:, features].copy()
data.shape

(7387, 8)

In [5]:
# Split the predictors by dtype. The target is filtered out of the numeric list
# (filtering instead of list.remove() avoids a ValueError if 'price' is ever
# not detected as numeric).
num_features = [col for col in data.select_dtypes('number').columns if col != 'price']
cat_features = list(data.select_dtypes('object').columns)

num_feat_df = pd.DataFrame({'numerical': num_features})
cat_feat_df = pd.DataFrame({'categorical': cat_features})

# Side-by-side overview of both lists. concat aligns on the union of the two
# indexes, so no name is lost whichever list is longer (a left merge on the
# numeric frame dropped categorical names beyond the numeric list's length).
feat_df = pd.concat([num_feat_df, cat_feat_df], axis=1).fillna(' ')
feat_df

Unnamed: 0,numerical,categorical
0,floor_area,type_house
1,views,code
2,latitude,
3,longitude,
4,bathroom,


In [6]:
# Distinct values of each categorical column, taken from the full data set.
# They are handed to OneHotEncoder(categories=...) in the model pipelines below.
levels_type_house = data['type_house'].unique()
levels_code = data['code'].unique()

# Cardinality and a few example levels per categorical column.
# NOTE: the two lists assume cat_feat_df holds exactly [type_house, code], in that order.
cat_feat_df['n_levels'] = [
    data['type_house'].nunique(),
    data['code'].nunique(),
]
cat_feat_df['sample'] = [levels_type_house, levels_code[:5]]
cat_feat_df  # TODO: improve

Unnamed: 0,categorical,n_levels,sample
0,type_house,2,"[house, apartment]"
1,code,137,"[D09, T23, F91, V95, R32]"


## Split Data

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
# split_data is a project helper (daftpy) that also prints the resulting shapes.
(X_train, X_test,
 y_train, y_test) = split_data(
    data=data,
    target='price',
    test_size=.2,
    output='X_y_train_test',
    random_state=42,
)

X_train: (5909, 7) 
X_test: (1478, 7) 
y_train: (5909,) 
y_test: (1478,) 



------------

# Metrics

In [8]:
# Display name -> scikit-learn scorer string used for cross-validation.
# The error metrics use the "neg_" scorers, so their reported values are negative.
scoring = dict(
    r2='r2',
    MAE='neg_mean_absolute_error',
    MAPE='neg_mean_absolute_percentage_error',
    RMSE='neg_root_mean_squared_error',
)

# Models

## Multiple Linear Regression

En estadística, el **coeficiente de determinación**, denominado R² y pronunciado R cuadrado, es un estadístico usado en el contexto de un modelo estadístico cuyo principal propósito es predecir futuros resultados o probar una hipótesis. El coeficiente determina la calidad del modelo para replicar los resultados, y la proporción de variación de los resultados que puede explicarse por el modelo.

Es el porcentaje de la variación en la variable de respuesta que es explicado por un modelo lineal. Es decir:

R-cuadrado = Variación explicada / variación total

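En los datos de entrenamiento de un modelo lineal con intercepto, el R-cuadrado está entre 0 y 100 %. Evaluado sobre datos no vistos (por ejemplo, en validación cruzada) puede ser negativo si el modelo predice peor que la media. Más información sobre validación cruzada: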

https://scikit-learn.org/stable/modules/cross_validation.html

In [9]:
# Numeric columns: standardise only (the polynomial expansion is switched off
# for the plain linear model).
num_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
])

# Categorical columns: one-hot encode against the level lists collected from
# the full data set, rather than relying on handle_unknown='ignore'.
# Author's notes: the encoder "does nothing if the columns are already
# transformed"; the levels were fixed beforehand to avoid problems with the
# variables when predicting on the test set.
cat_pipe = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(categories=[levels_type_house, levels_code])),
])

# Route each group of columns through its own sub-pipeline
# (remainder is left at its default; 'passthrough' was considered).
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
])

lr_pipe_estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Author's note: this imputer could also be added to the other pipes.
    ('imputer', SimpleImputer(strategy='constant', fill_value=None)),
    ('regressor', LinearRegression()),
])

In [10]:
# 10-fold cross-validation of the linear pipeline on the training split only.
# scores_statistics is a project helper (daftpy); it prints mean/std per metric.
scores_lr, scores_resume_lr = scores_statistics(
    estimator=lr_pipe_estimator,
    scoring_dict=scoring,
    X_train=X_train,
    y_train=y_train,
    cv=10,
    return_train_score=False,
)

r2 mean: 0.6581505970522035
r2 std: 0.03247492814306588 

MAE mean: -108719.58221709491
MAE std: 5568.4947907050655 

MAPE mean: -0.32069981130761466
MAPE std: 0.015889144852600887 

RMSE mean: -176983.98634043717
RMSE std: 12945.409397794008 



### Overfitting-Underfitting Analysis

#### Evaluating on the Training and Test Sets

We can identify if a machine learning model has overfit by first evaluating the model on the training dataset and then evaluating the same model on a holdout test dataset.

If the performance of the model on the training dataset is significantly better than the performance on the test dataset, then the model may have overfit the training dataset.

In [11]:
# Fit once on the training split, then score the fitted pipeline on both splits.
# A large gap between the two reports would point to overfitting.
lr = lr_pipe_estimator.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
y_pred = lr.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

Performance on the training set:
 ----------
R²: 0.6739892726197527
MAE: 106081.66588347829
MAPE: 0.31183387356867376
RMSE: 173716.41014659725

Performance on the test set:
 ----------
R²: 0.6468123069342815
MAE: 111099.76275851484
MAPE: 0.321239513969634
RMSE: 183180.30027983472



#### Learning Curves

The model is underfitting the training data, so we need to use more complex models or come up with better features.

-------------

------------

In [12]:
#joblib.dump(lr, 'models/linear_regression_01-11-2021.plk')

-------------------

## Polynomial Regression

In [13]:
# Numeric columns: expand to degree-3 polynomial terms first, then standardise
# the expanded columns.
num_pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: one-hot encode against the precomputed level lists
# (handle_unknown='ignore' was the alternative considered).
cat_pipe = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(categories=[levels_type_house, levels_code])),
])

# remainder is left at its default; 'passthrough' was considered.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
])

# Same linear regressor as before, now fed polynomial features.
poly_pipe_estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant', fill_value=None)),
    ('regressor', LinearRegression()),
])

In [14]:
#pd.DataFrame(grid_search.cv_results_) # plotearlo

In [15]:
# 10-fold cross-validation of the polynomial pipeline on the training split.
scores = scores_statistics(
    estimator=poly_pipe_estimator,
    scoring_dict=scoring,
    X_train=X_train,
    y_train=y_train,
    cv=10,
    return_train_score=False,
)

r2 mean: 0.7509266721418995
r2 std: 0.029110140197970813 

MAE mean: -94476.7902938415
MAE std: 3679.6981417813968 

MAPE mean: -0.2738010265729388
MAPE std: 0.010822070061438843 

RMSE mean: -150821.55756749702
RMSE std: 10517.670372640698 



In [16]:
# Fit the polynomial pipeline and compare training vs. hold-out performance.
poly = poly_pipe_estimator.fit(X_train, y_train)
y_pred_train = poly.predict(X_train)
y_pred = poly.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)


Performance on the training set:
 ----------
R²: 0.7737738943989698
MAE: 90784.73634486872
MAPE: 0.2629339442693284
RMSE: 144709.23924104206

Performance on the test set:
 ----------
R²: 0.7341027244460809
MAE: 97188.98815136666
MAPE: 0.271332077376683
RMSE: 158939.89776053437



## Support Vector Machines

## K Nearest Neighbors Regressor

In [17]:
from sklearn.neighbors import KNeighborsRegressor

In [18]:
# Numeric columns: polynomial step kept at degree=1 (presumably a placeholder
# for tuning the degree), followed by standardisation.
num_pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: one-hot encode against the precomputed level lists
# (handle_unknown='ignore' was the alternative considered).
cat_pipe = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(categories=[levels_type_house, levels_code])),
])

# remainder is left at its default; 'passthrough' was considered.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
])

# Baseline KNN pipeline with the regressor's default hyper-parameters.
knnr_pipe_est = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant', fill_value=None)),
    ('regressor', KNeighborsRegressor()),
])

In [19]:
# KNN pipeline with 10 neighbours; it reuses the preprocessor built in the
# previous cell and replaces the default-parameter pipeline.
knnr_pipe_est = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant', fill_value=None)),
    ('regressor', KNeighborsRegressor(n_neighbors=10)),
])

# 5-fold cross-validation on the training split.
scores = scores_statistics(
    estimator=knnr_pipe_est,
    scoring_dict=scoring,
    X_train=X_train,
    y_train=y_train,
    cv=5,
    return_train_score=False,
)

r2 mean: 0.71811523350896
r2 std: 0.047035588574515515 

MAE mean: -94951.2568978797
MAE std: 3616.1771208504674 

MAPE mean: -0.2673478345689654
MAPE std: 0.006640880486903855 

RMSE mean: -160776.974205796
RMSE std: 12246.337737638607 



In [20]:
# Fit the KNN pipeline and compare training vs. hold-out performance.
knnr = knnr_pipe_est.fit(X_train, y_train)
y_pred_train = knnr.predict(X_train)
y_pred = knnr.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

Performance on the training set:
 ----------
R²: 0.7808931429920254
MAE: 83952.01976645795
MAPE: 0.23402768381886224
RMSE: 142414.06577198117

Performance on the test set:
 ----------
R²: 0.7149744015727351
MAE: 94532.48254397836
MAPE: 0.25724751266987317
RMSE: 164557.59012118593



## Decision Tree Regressor

In [21]:
# Numeric columns: standardise, then a degree=1 polynomial step (presumably a
# placeholder for tuning the degree).
num_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),
])

# Categorical columns: one-hot encode against the precomputed level lists
# (handle_unknown='ignore' was the alternative considered).
cat_pipe = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(categories=[levels_type_house, levels_code])),
])

# remainder is left at its default; 'passthrough' was considered.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
])

# Baseline tree with default hyper-parameters.
# Author's note: max_depth=9 (spoiler).
dtr_pipe_estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant', fill_value=None)),
    ('regressor', DecisionTreeRegressor()),
])

In [22]:
#pd.DataFrame(grid_search.cv_results_)

In [23]:
# Regularised decision tree; it reuses the preprocessor built in the previous
# cell and replaces the default-parameter pipeline.
dtr_pipe_estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant',
                              fill_value=None)),
    ('regressor', DecisionTreeRegressor(max_depth=10,
                                        min_samples_leaf=16,
                                        min_samples_split=43,
                                        max_leaf_nodes=68,
                                        # The tree's tie-breaking is random unless seeded, so
                                        # scores drift between runs. Same seed as the data split.
                                        random_state=42,
                                       ))
                           ])

# 5-fold cross-validation on the training split.
scores = scores_statistics(estimator=dtr_pipe_estimator,
                           scoring_dict=scoring,
                           X_train=X_train,
                           y_train=y_train,
                           cv=5,
                           return_train_score=False)

r2 mean: 0.7347439806116778
r2 std: 0.024814399630088618 

MAE mean: -95341.41736197876
MAE std: 1686.2968105662758 

MAPE mean: -0.27692435189721787
MAPE std: 0.006257078986299098 

RMSE mean: -156514.0008343089
RMSE std: 7832.77912950293 



In [24]:
# Fit the decision-tree pipeline and compare training vs. hold-out performance.
dtr = dtr_pipe_estimator.fit(X_train, y_train)
y_pred_train = dtr.predict(X_train)
y_pred = dtr.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

Performance on the training set:
 ----------
R²: 0.7927341211787164
MAE: 86542.06835968455
MAPE: 0.25752620175637614
RMSE: 138512.4486203107

Performance on the test set:
 ----------
R²: 0.7304601782182829
MAE: 96912.11046384604
MAPE: 0.27164761682651445
RMSE: 160024.8593006069



## Random Forest Regressor

In [25]:
# Numeric columns: standardise, then a degree=1 polynomial step (presumably a
# placeholder for tuning the degree).
num_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),
])

# Categorical columns: one-hot encode against the precomputed level lists
# (handle_unknown='ignore' was the alternative considered).
cat_pipe = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(categories=[levels_type_house, levels_code])),
])

# remainder is left at its default; 'passthrough' was considered.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
])

# Baseline random forest with default hyper-parameters.
rfr_pipe_est = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant', fill_value=None)),
    ('regressor', RandomForestRegressor()),
])

In [26]:
#pd.DataFrame(grid_search.cv_results_).tail()

In [27]:
# Random forest with 93 trees; it reuses the preprocessor built in the previous
# cell and replaces the default-parameter pipeline.
# Regularisation settings considered but currently switched off:
#   max_depth=10, min_samples_split=43, min_samples_leaf=16, max_leaf_nodes=68
rfr_pipe_est = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='constant',
                              fill_value=None)),
    ('regressor', RandomForestRegressor(
        n_estimators=93,
        n_jobs=-1,
        # Bootstrap sampling and feature selection are random; without a seed
        # every run reports different scores. Same seed as the data split.
        random_state=42,
                                       ))
                           ])

# 10-fold cross-validation on the training split.
scores = scores_statistics(estimator=rfr_pipe_est,
                           scoring_dict=scoring,
                           X_train=X_train,
                           y_train=y_train,
                           cv=10,
                           return_train_score=False)

r2 mean: 0.8084185025882459
r2 std: 0.03533087756712576 

MAE mean: -76529.52083543413
MAE std: 3230.2443584497014 

MAPE mean: -0.21597816094047123
MAPE std: 0.010471981238357719 

RMSE mean: -131567.36330133112
RMSE std: 9881.895461758715 



In [28]:
# Fit the random-forest pipeline and compare training vs. hold-out performance.
rfr = rfr_pipe_est.fit(X_train, y_train)
y_pred_train = rfr.predict(X_train)
y_pred = rfr.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

Performance on the training set:
 ----------
R²: 0.9740265692445357
MAE: 28274.738223267956
MAPE: 0.07939757600171692
RMSE: 49033.12353117212

Performance on the test set:
 ----------
R²: 0.808566936260323
MAE: 77531.87912319758
MAPE: 0.20983086774383677
RMSE: 134860.40530476318



The model is overfitting the training data (training R² ≈ 0.97 vs. test R² ≈ 0.81), so it needs to be regularized.

## Voting Regressor

In [29]:
from sklearn.ensemble import VotingRegressor

In [30]:
# Reduced metric set used when comparing models side by side.
scoring_comp = dict(
    r2='r2',
    MAPE='neg_mean_absolute_percentage_error',
)

In [35]:
# Short aliases for the candidate pipelines.
polyr, knnr, dtr, rfr = (poly_pipe_estimator, knnr_pipe_est,
                         dtr_pipe_estimator, rfr_pipe_est)

# Unweighted ensemble of the three simpler models
# (the random forest, ('rfr', rfr), is left out of the vote).
voting_reg = VotingRegressor(
    estimators=[
        ('poly', polyr),
        ('knn', knnr),
        ('dt', dtr),
    ],
)

# Models to compare; the random forest entry is switched off here as well.
models_dict = {
    'Polynomial Regression': polyr,
    'K Nearest Neighbors Regressor': knnr,
    'Decission Tree Regressor': dtr,
    'Voting Regressor': voting_reg,
}

# 10-fold cross-validated comparison on the training split.
for model_name, model in models_dict.items():
    print(model_name)
    scores = compare_models(
        estimator=model,
        scoring_dict=scoring_comp,
        X_train=X_train,
        y_train=y_train,
        cv=10,
        return_train_score=False,
    )

Polynomial Regression
r2 mean: 0.7509266721418995
MAPE mean: -0.2738010265729388
----------
K Nearest Neighbors Regressor
r2 mean: 0.723478756178525
MAPE mean: -0.263912331770578
----------
Decission Tree Regressor
r2 mean: 0.7193170788329176
MAPE mean: -0.28016911561435026
----------
Voting Regressor
r2 mean: 0.7832744953168376
MAPE mean: -0.23493532523244634
----------


Si se añade el random forest al ensamble, el Voting Regressor mejora un poco, pero el Random Forest Regressor por sí solo sigue siendo el mejor modelo.

In [36]:
# Fit the unweighted ensemble and compare training vs. hold-out performance.
voting_reg.fit(X_train, y_train)
y_pred_train = voting_reg.predict(X_train)
y_pred = voting_reg.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

Performance on the training set:
 ----------
R²: 0.8250862917671474
MAE: 76360.16018358443
MAPE: 0.21690479282084155
RMSE: 127243.85680175616

Performance on the test set:
 ----------
R²: 0.7779982353388047
MAE: 83711.91493403589
MAPE: 0.22538889007518018
RMSE: 145229.28488682045



In [40]:
# R² (in %) of each base model, typed in by hand.
# NOTE(review): these look like the hold-out (test set) R² values; weighting the
# ensemble by them leaks test information into the model. Cross-validated
# scores from the training split would be a cleaner basis. Confirm and revisit.
models_r2 = {'poly': 73.41, 'knn': 71.49, 'dt': 73.04}

# Running total in dict order.
tot = 0
for r2_pct in models_r2.values():
    tot += r2_pct

# Each model's weight is its share of the total, so the weights sum to 1.
# The list keeps the same order as the dict (and as the estimators list).
models_weigth = {name: r2_pct / tot for name, r2_pct in models_r2.items()}
models_weigth_list = list(models_weigth.values())
models_weigth

{'poly': 0.33683582637423143,
 'knn': 0.3280260622189593,
 'dt': 0.33513811140680927}

In [41]:
# Short aliases for the candidate pipelines (the random forest is not used here).
polyr, knnr, dtr = poly_pipe_estimator, knnr_pipe_est, dtr_pipe_estimator

# Same ensemble as before, now weighted. The weights in models_weigth_list
# must follow the order of the estimators list (poly, knn, dt).
voting_reg = VotingRegressor(
    estimators=[
        ('poly', polyr),
        ('knn', knnr),
        ('dt', dtr),
    ],
    weights=models_weigth_list,
)

# Models to compare; the random forest entry stays switched off.
models_dict = {
    'Polynomial Regression': polyr,
    'K Nearest Neighbors Regressor': knnr,
    'Decission Tree Regressor': dtr,
    'Voting Regressor': voting_reg,
}

# 10-fold cross-validated comparison on the training split.
for model_name, model in models_dict.items():
    print(model_name)
    scores = compare_models(
        estimator=model,
        scoring_dict=scoring_comp,
        X_train=X_train,
        y_train=y_train,
        cv=10,
        return_train_score=False,
    )

Polynomial Regression
r2 mean: 0.7509266721418995
MAPE mean: -0.2738010265729388
----------
K Nearest Neighbors Regressor
r2 mean: 0.723478756178525
MAPE mean: -0.263912331770578
----------
Decission Tree Regressor
r2 mean: 0.7193170788329176
MAPE mean: -0.28016911561435026
----------
Voting Regressor
r2 mean: 0.7833601411843516
MAPE mean: -0.2349531052600688
----------


In [42]:
# Fit the weighted ensemble and compare training vs. hold-out performance.
voting_reg.fit(X_train, y_train)
y_pred_train = voting_reg.predict(X_train)
y_pred = voting_reg.predict(X_test)

print('Performance on the training set:\n', '-'*10)
metrics_regression(y_test=y_train, y_pred=y_pred_train, squared=False)

print('Performance on the test set:\n', '-'*10)
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

Performance on the training set:
 ----------
R²: 0.8250917470379119
MAE: 76384.77001301781
MAPE: 0.2170076639656279
RMSE: 127241.87252301836

Performance on the test set:
 ----------
R²: 0.77813329415568
MAE: 83715.56093234899
MAPE: 0.22537719887227378
RMSE: 145185.10172562115

