In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SequentialFeatureSelector

from get_data import *

### Data load

In [2]:
ds = get_data_model_v1()
ds.info()

Database object created
<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 0 to 636
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  637 non-null    int64  
 1   titulo              637 non-null    object 
 2   descripcion         625 non-null    object 
 3   extra_info          633 non-null    object 
 4   n_habitaciones      637 non-null    int64  
 5   tamano              637 non-null    int64  
 6   precio              637 non-null    float64
 7   municipio           637 non-null    object 
 8   n_banos             637 non-null    int64  
 9   n_plazas_garaje     637 non-null    int64  
 10  direccion           637 non-null    object 
 11  landmarks_cercanos  637 non-null    object 
 12  piscina             637 non-null    bool   
 13  valoracion          637 non-null    int64  
 14  densidad            637 non-null    int64  
 15  pib_capita          637 non-null 

  df = pd.read_sql_query(query, self.connection)


In [3]:
object_columns = ds.select_dtypes(include=['object']).columns.tolist()
ds_object = ds[object_columns]
ds_object.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 0 to 636
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   titulo              637 non-null    object
 1   descripcion         625 non-null    object
 2   extra_info          633 non-null    object
 3   municipio           637 non-null    object
 4   direccion           637 non-null    object
 5   landmarks_cercanos  637 non-null    object
dtypes: object(6)
memory usage: 34.8+ KB


In [4]:
cols = ds.columns
cols_exclude = ['id', 'titulo', 'descripcion', 'extra_info', 'direccion', 'landmarks_cercanos', 'municipio']
cols_check_nas = ['n_banos', 'n_plazas_garaje', 'valoracion']
for col in cols_check_nas:
    if sum(ds[col]==-1)/len(ds) > 0.25:
        print(f'Column {col} has more than 20% of missing values.')
        cols_exclude.append(col)

cols_model = [col for col in cols if col not in cols_exclude]

#convert int and bool columns to float
for col in cols:
    if ds[col].dtype == 'int64':
        ds[col] = ds[col].astype('float64')
    elif ds[col].dtype == 'object':
        cols_exclude.append(col)
    elif ds[col].dtype == 'bool':
        ds[col] = ds[col].astype('float64')

Column n_banos has more than 20% of missing values.
Column n_plazas_garaje has more than 20% of missing values.


### Data visualization

### Deal with NAs

In [5]:
for col in cols_check_nas:
    ds[col] = ds[col].fillna(np.mean(ds[col]))

### Municipios column

In [6]:
ds['coruna'] = ds["municipio"].apply(lambda x: 1.0 if 'coru' in x else 0.0)
ds['oleiros'] = ds["municipio"].apply(lambda x: 1.0 if 'oleiros' in x else 0.0)
ds['art_berg_camb'] = ds["municipio"].apply(lambda x: 1.0 if ('artei' in x or 'berg' in x or 'cambr' in x) else 0.0)

cols_model.append('coruna')
cols_model.append('oleiros')
cols_model.append('art_berg_camb')

### Data split

In [51]:
x = ds[cols_model].drop('precio', axis=1)
y = ds['precio']

# slit data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# scaler = StandardScaler()

### Model

In [52]:
model = linear_model.Lasso()
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(np.abs(scores))

np.mean(scores)

362.2975833055367

In [44]:
model = linear_model.Lasso(alpha=10)
model.fit(x_train, y_train)
r2_scored = model.score(x_test, y_test)
print("R2 Scored: ", r2_scored)

R2 Scored:  0.40761241229932943


In [46]:
# get name columns coef > 0
cols_coef = x_train.columns[model.coef_ > 0]
cols_coef

Index(['tamano', 'densidad', 'playa', 'vistas', 'oleiros'], dtype='object')

In [47]:
abs(x.corrwith(y)).sort_values(ascending=False)

tamano            0.656707
es_casa           0.508830
n_habitaciones    0.430743
oleiros           0.259161
sin_ascensor      0.233355
amueblado         0.228033
vistas            0.201227
piscina           0.188358
estudiantes       0.147420
playa             0.136011
pib_capita        0.132718
coruna            0.117547
densidad          0.105291
art_berg_camb     0.063401
profesores        0.042951
trastero          0.031509
parking           0.026320
balcon            0.023787
valoracion        0.018504
vacacional        0.016390
dtype: float64

### Evaluate model

In [48]:
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Training RMSE score: {}.'.format(rmse_train))
print('Testing RMSE score: {}\n'.format(rmse_test))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)  
print('Training MAE score: {}'.format(mae_train))
print('Testing MAE score: {}'.format(mae_test))

Training RMSE score: 389.7758310777012.
Testing RMSE score: 417.4312671568933

Training MAE score: 222.80474055073597
Testing MAE score: 232.93766504224644


### Feature selection

In [20]:
model = LinearRegression()
sfs = SequentialFeatureSelector(model, n_features_to_select=5)
sfs.fit(x, y)

idx = np.where(sfs.get_support())[0].tolist()
[col for col in x.columns[idx]]

['tamano', 'playa', 'sin_ascensor', 'es_casa', 'art_berg_camb']

In [19]:
idx = np.where(sfs.get_support())[0].tolist()
[col for col in x.columns[idx]]

['tamano', 'es_casa', 'art_berg_camb']