In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.feature_selection import SequentialFeatureSelector

from get_data import *

### Data load

In [10]:
ds = get_data_model_v1()
ds.info()

Database object created
<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 0 to 636
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  637 non-null    int64  
 1   titulo              637 non-null    object 
 2   descripcion         625 non-null    object 
 3   extra_info          633 non-null    object 
 4   n_habitaciones      637 non-null    int64  
 5   tamano              637 non-null    int64  
 6   precio              637 non-null    float64
 7   municipio           637 non-null    object 
 8   n_banos             637 non-null    int64  
 9   n_plazas_garaje     637 non-null    int64  
 10  direccion           637 non-null    object 
 11  landmarks_cercanos  637 non-null    object 
 12  piscina             637 non-null    bool   
 13  valoracion          637 non-null    int64  
 14  densidad            637 non-null    int64  
 15  pib_capita          637 non-null 

  df = pd.read_sql_query(query, self.connection)


In [11]:
object_columns = ds.select_dtypes(include=['object']).columns.tolist()
ds_object = ds[object_columns]
ds_object.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 0 to 636
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   titulo              637 non-null    object
 1   descripcion         625 non-null    object
 2   extra_info          633 non-null    object
 3   municipio           637 non-null    object
 4   direccion           637 non-null    object
 5   landmarks_cercanos  637 non-null    object
dtypes: object(6)
memory usage: 34.8+ KB


In [12]:
cols = ds.columns
cols_exclude = ['id', 'titulo', 'descripcion', 'extra_info', 'direccion', 'landmarks_cercanos', 'municipio']
cols_check_nas = ['n_banos', 'n_plazas_garaje', 'valoracion']
for col in cols_check_nas:
    if sum(ds[col]==-1)/len(ds) > 0.25:
        print(f'Column {col} has more than 20% of missing values.')
        cols_exclude.append(col)

cols_model = [col for col in cols if col not in cols_exclude]

#convert int and bool columns to float
for col in cols:
    if ds[col].dtype == 'int64':
        ds[col] = ds[col].astype('float64')
    elif ds[col].dtype == 'object':
        cols_exclude.append(col)
    elif ds[col].dtype == 'bool':
        ds[col] = ds[col].astype('float64')

Column n_banos has more than 20% of missing values.
Column n_plazas_garaje has more than 20% of missing values.


### Data visualization

### Deal with NAs

In [13]:
for col in cols_check_nas:
    ds[col] = ds[col].fillna(np.mean(ds[col]))

### Municipios column

In [14]:
ds['coruna'] = ds["municipio"].apply(lambda x: 1.0 if 'coru' in x else 0.0)
ds['oleiros'] = ds["municipio"].apply(lambda x: 1.0 if 'oleiros' in x else 0.0)
ds['art_berg_camb'] = ds["municipio"].apply(lambda x: 1.0 if ('artei' in x or 'berg' in x or 'cambr' in x) else 0.0)

cols_model.append('coruna')
cols_model.append('oleiros')
cols_model.append('art_berg_camb')

### Data split

In [22]:
x = ds[cols_model].drop('precio', axis=1)
y = ds['precio']

# slit data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

### Model

In [23]:
model = linear_model.Lasso(alpha=5)

rkf = RepeatedKFold(n_splits=5, n_repeats=10)
scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=rkf)
scores = np.sqrt(np.abs(scores))

scores_mean = np.mean(scores)
scores_std = np.std(scores)

print(f'RMSE--->\t mean:{scores_mean}\t std:{scores_std}')

RMSE--->	 mean:411.50122909748103	 std:78.05898030484792


In [44]:
cols_selected.to_list()

['tamano', 'densidad', 'playa', 'vistas', 'es_casa', 'oleiros']

In [57]:
lambda_vals = [0.1, 0.5, 1, 2, 4, 5, 7, 9 ,10, 15, 20, 30, 40, 50, 100]
scores_record= []
models = []

for val in lambda_vals:
    model = linear_model.Lasso(alpha=val, max_iter=10000)

    rkf = RepeatedKFold(n_splits=5, n_repeats=10)
    scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=rkf)
    scores = np.sqrt(np.abs(scores))

    scores_mean = np.mean(scores)
    scores_record.append(scores_mean)

    model.fit(x_train, y_train)
    models.append(model)
    cols_selected = x_train.columns[model.coef_ > 0.0001]
    print("lambda val: ", val, '\t', "RMSE: ", scores_mean, '\t', len(cols_selected), cols_selected.to_list())

  model = cd_fast.enet_coordinate_descent(


lambda val:  0.1 	 RMSE:  419.96103898058834 	 12 ['tamano', 'piscina', 'densidad', 'pib_capita', 'parking', 'playa', 'balcon', 'vacacional', 'vistas', 'profesores', 'es_casa', 'oleiros']
lambda val:  0.5 	 RMSE:  417.03366166206354 	 11 ['tamano', 'piscina', 'densidad', 'parking', 'playa', 'balcon', 'vacacional', 'vistas', 'profesores', 'es_casa', 'oleiros']
lambda val:  1 	 RMSE:  413.5578102522577 	 10 ['tamano', 'piscina', 'densidad', 'parking', 'playa', 'balcon', 'vacacional', 'vistas', 'es_casa', 'oleiros']
lambda val:  2 	 RMSE:  417.5982410008651 	 10 ['tamano', 'piscina', 'densidad', 'parking', 'playa', 'balcon', 'vacacional', 'vistas', 'es_casa', 'oleiros']
lambda val:  4 	 RMSE:  413.7685678186555 	 8 ['tamano', 'piscina', 'densidad', 'playa', 'balcon', 'vistas', 'es_casa', 'oleiros']
lambda val:  5 	 RMSE:  416.1748015094862 	 6 ['tamano', 'densidad', 'playa', 'vistas', 'es_casa', 'oleiros']
lambda val:  7 	 RMSE:  417.4087129672812 	 6 ['tamano', 'densidad', 'playa', 'vist

In [133]:
def get_step_score(current_cols, new_col, x_train, y_train):
    model = LinearRegression()
    rkf = RepeatedKFold(n_splits=5, n_repeats=10)
    cols = current_cols + [new_col]
    scores = cross_val_score(model, x_train[cols], y_train, scoring='neg_mean_squared_error', cv=rkf)
    scores = np.sqrt(np.abs(scores))
    scores_mean = np.mean(scores)
    return scores_mean, model

In [138]:
current_cols = []
models = []
scores = []
cols = []
all_columns = cols_model[:]
all_columns.remove('precio')

for k in range(len(all_columns)-1):
    best_col = None
    best_score = 1e20
    best_model = None
    for col in set(all_columns)-set(current_cols):
        score, model = get_step_score(current_cols, col, x_train, y_train)
        if score < best_score:
            best_score = score
            best_col = col
            best_model = model
    current_cols.append(best_col)
    models.append(best_model)
    scores.append(best_score)
    cols.append(current_cols[:])
    print(f'Best column: {best_col}\t score: {best_score}')

Best column: tamano	 score: 436.66021290349397
Best column: es_casa	 score: 429.0434762856611
Best column: art_berg_camb	 score: 423.5560468448748
Best column: estudiantes	 score: 416.3899806460782
Best column: playa	 score: 413.16563718353393
Best column: sin_ascensor	 score: 409.4622944628043
Best column: vistas	 score: 407.87192096681554
Best column: trastero	 score: 407.81949938727684
Best column: pib_capita	 score: 406.1802710494341
Best column: vacacional	 score: 409.1944766158256
Best column: valoracion	 score: 407.9332809964294
Best column: coruna	 score: 407.48485230165915
Best column: oleiros	 score: 408.08400333663826
Best column: parking	 score: 408.5682266233929
Best column: balcon	 score: 406.47989847273874
Best column: piscina	 score: 409.7519125616878
Best column: amueblado	 score: 410.63413882835476
Best column: profesores	 score: 410.7149208282941
Best column: densidad	 score: 412.5010212053801


In [15]:
model = LinearRegression()
model.fit(x_train, y_train)
r2_scored = model.score(x_test, y_test)
print("R2 Scored: ", r2_scored)

R2 Scored:  0.3828136716884596


In [11]:
abs(x.corrwith(y)).sort_values(ascending=False)

tamano            0.656707
es_casa           0.508830
n_habitaciones    0.430743
oleiros           0.259161
sin_ascensor      0.233355
amueblado         0.228033
vistas            0.201227
piscina           0.188358
estudiantes       0.147420
playa             0.136011
pib_capita        0.132718
coruna            0.117547
densidad          0.105291
art_berg_camb     0.063401
profesores        0.042951
trastero          0.031509
parking           0.026320
balcon            0.023787
valoracion        0.018504
vacacional        0.016390
dtype: float64

### Evaluate model

In [16]:
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Training RMSE score: {}.'.format(rmse_train))
print('Testing RMSE score: {}\n'.format(rmse_test))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)  
print('Training MAE score: {}'.format(mae_train))
print('Testing MAE score: {}'.format(mae_test))

Training RMSE score: 375.5887441083849.
Testing RMSE score: 399.44957624450495

Training MAE score: 219.40739463830664
Testing MAE score: 265.8617295896231


### Feature selection

In [20]:
model = LinearRegression()
sfs = SequentialFeatureSelector(model, n_features_to_select=5)
sfs.fit(x, y)

idx = np.where(sfs.get_support())[0].tolist()
[col for col in x.columns[idx]]

['tamano', 'playa', 'sin_ascensor', 'es_casa', 'art_berg_camb']

In [19]:
idx = np.where(sfs.get_support())[0].tolist()
[col for col in x.columns[idx]]

['tamano', 'es_casa', 'art_berg_camb']