In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from get_data import *

IndentationError: expected an indented block after function definition on line 57 (bd.py, line 58)

### Data load

In [3]:
src_folder = os.path.join('data', 'idealista')
src_folder = 'bd_sql'

# df is created
ds = pd.DataFrame()

for file in os.listdir(src_folder):
    file = os.path.join(src_folder, file)
    ds = pd.concat([ds, pd.read_csv(file)], ignore_index=True)
    ds = ds.drop_duplicates(subset=['id'])

ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 656 entries, 0 to 655
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                656 non-null    int64  
 1   titulo            656 non-null    object 
 2   descripcion       644 non-null    object 
 3   extra_info        652 non-null    object 
 4   n_habitaciones    656 non-null    int64  
 5   tamano            656 non-null    int64  
 6   precio            656 non-null    int64  
 7   precio_modalidad  656 non-null    object 
 8   municipio         656 non-null    object 
 9   ascensor          656 non-null    bool   
 10  tipo_vivienda     656 non-null    object 
 11  es_casa           656 non-null    bool   
 12  para_estudiantes  656 non-null    bool   
 13  parking           656 non-null    bool   
 14  valoracion        637 non-null    float64
 15  n_banos           637 non-null    float64
dtypes: bool(4), float64(2), int64(4), object(6)


In [8]:
cols = ds.columns
cols_exclude = ['id']

#convert int columns to float
for col in cols:
    if ds[col].dtype == 'int64':
        ds[col] = ds[col].astype('float64')
    elif ds[col].dtype == 'object':
        cols_exclude.append(col)

cols_model = [col for col in cols if col not in cols_exclude]

### Deal with NAs

In [9]:
ds["valoracion"] = ds["valoracion"].replace("-1", np.nan)
ds["n_banos"] = ds["n_banos"].replace("-1", np.nan)

In [13]:
ds['valoracion'] = ds['valoracion'].fillna(ds['valoracion'].mean())
ds['n_banos'] = ds['n_banos'].fillna(ds['n_banos'].mean())

In [14]:
ds[cols_model].drop(['precio'], axis=1).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 656 entries, 0 to 655
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   n_habitaciones    656 non-null    float64
 1   tamano            656 non-null    float64
 2   ascensor          656 non-null    bool   
 3   es_casa           656 non-null    bool   
 4   para_estudiantes  656 non-null    bool   
 5   parking           656 non-null    bool   
 6   valoracion        656 non-null    float64
 7   n_banos           656 non-null    float64
dtypes: bool(4), float64(4)
memory usage: 28.2 KB


### Data split

In [20]:
x = ds[cols_model].drop(['precio'], axis=1)
y_original = ds['precio']
y = y_original/ds['tamano']

# slit data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

### Model

In [21]:
model = LinearRegression()
model.fit(x_train, y_train)
r2_scored = model.score(x_test, y_test)
print("R2 Scored: ", r2_scored)

R2 Scored:  -0.15940921067071745


In [22]:
x.corrwith(y).sort_values(ascending=False)

es_casa             0.067188
ascensor            0.057443
parking             0.021823
valoracion         -0.047876
n_banos            -0.077735
para_estudiantes   -0.122910
tamano             -0.232770
n_habitaciones     -0.291157
dtype: float64

### Evaluate model

y is price/m2

In [24]:
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Training RMSE score: {}.'.format(rmse_train))
print('Testing RMSE score: {}\n'.format(rmse_test))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)  
print('Training MAE score: {}'.format(mae_train))
print('Testing MAE score: {}'.format(mae_test))

Training RMSE score: 4.927834694120737.
Testing RMSE score: 2.777246416794873

Training MAE score: 2.7895739004083233
Testing MAE score: 2.2924990250898745


y is price

In [26]:
y_train_pred = model.predict(x_train)*x_train['tamano']
y_test_pred = model.predict(x_test)*x_test['tamano']

rmse_train = np.sqrt(mean_squared_error(y_original[y_train.index], y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_original[y_test.index], y_test_pred))
#print rmse and mae
print('Training RMSE score: {}.'.format(rmse_train))
print('Testing RMSE score: {}\n'.format(rmse_test))

mae_train = mean_absolute_error(y_original[y_train.index], y_train_pred)
mae_test = mean_absolute_error(y_original[y_test.index], y_test_pred)  
print('Training MAE score: {}'.format(mae_train))
print('Testing MAE score: {}'.format(mae_test))

Training RMSE score: 491.99348453737457.
Testing RMSE score: 406.33038115290316

Training MAE score: 297.99291420872385
Testing MAE score: 246.00931123530796
