# Load Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import joblib

In [2]:
# Notebook-wide display configuration.
# Passing None removes pandas' truncation limits for displayed rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# 3.8 removed the old names, so use whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [3]:
# Read the sale listings CSV into a DataFrame.
sale_data = pd.read_csv('data_available/sale_data_post_out.csv', sep=',')
sale_data.shape  # TODO: change this

(7387, 34)

In [4]:
# Summary of the loaded frame: column names, non-null counts and dtypes.
sale_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7387 entries, 0 to 7386
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   daft_id            7387 non-null   int64  
 1   url                7387 non-null   object 
 2   name               7387 non-null   object 
 3   price              7387 non-null   float64
 4   sale_type          7387 non-null   object 
 5   floor_area         7387 non-null   int64  
 6   entered_renewed    7387 non-null   object 
 7   views              7387 non-null   float64
 8   type_house         7387 non-null   object 
 9   type               7387 non-null   object 
 10  scraping_date      7387 non-null   object 
 11  latitude           7387 non-null   float64
 12  longitude          7387 non-null   float64
 13  bedroom            7387 non-null   int64  
 14  bathroom           7387 non-null   int64  
 15  country_code       7387 non-null   object 
 16  country            7387 

In [5]:
# Columns used as model inputs ('price' is not part of this list).
# Candidate columns currently left out: bedroom, sale_type, postcode,
# state_district, county, city_district, road, place, admin1, cities.
features = [
    'floor_area',
    'views',
    'latitude',
    'longitude',
    'bathroom',
    'type_house',
    'code',
]

# Independent copy of the selected columns, so later edits to `data` do not
# touch `sale_data`. (A dropna() on sale_data used to sit here and is disabled.)
data = sale_data.loc[:, features].copy()
data.shape

(7387, 7)

In [6]:
# Number of missing values in each selected column.
data.isna().sum()

floor_area       0
views            0
latitude         0
longitude        0
bathroom         0
type_house       0
code          1328
dtype: int64

In [7]:
# Column labels of `data` grouped by dtype: numeric vs. object-dtype columns.
num_features = data.select_dtypes(include='number').columns
cat_features = data.select_dtypes(include='object').columns

-----------

# Split Data

In [8]:
# Target vector: an independent copy of the 'price' column.
y = sale_data['price'].copy()
y.shape

(7387,)

In [9]:
print(data.shape)
# One-hot encode the non-numeric columns of `data`.
# NOTE(review): pd.get_dummies adds no indicator column for missing values by
# default (dummy_na=False), so rows whose 'code' is missing end up with all-zero
# 'code_*' dummies — confirm this is the intended treatment.
# Earlier variants left disabled here: X = data.copy(), and data[num_features].
X = pd.get_dummies(data).copy()
print(X.shape)

(7387, 7)
(7387, 144)


In [10]:
# Hold out 15% of the rows for the final evaluation. The seed is fixed so that
# the split, and every metric computed from it, is the same on each re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6278, 144), (1109, 144), (6278,), (1109,))

# Train and Metrics

En estadística, el **coeficiente de determinación**, denominado R² y pronunciado R cuadrado, es un estadístico usado en el contexto de un modelo estadístico cuyo principal propósito es predecir futuros resultados o probar una hipótesis. El coeficiente determina la calidad del modelo para replicar los resultados, y la proporción de variación de los resultados que puede explicarse por el modelo.

Es el porcentaje de la variación en la variable de respuesta que es explicado por un modelo lineal. Es decir:

$$R^2 = \frac{\text{variación explicada}}{\text{variación total}}$$

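Para un modelo lineal con término independiente, ajustado por mínimos cuadrados y evaluado sobre los mismos datos de entrenamiento, el R-cuadrado está siempre entre 0 y 1 (0 % y 100 %). Evaluado sobre datos nuevos (por ejemplo, el conjunto de test) puede ser negativo si el modelo predice peor que la media de la variable de respuesta.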

In [11]:
from sklearn import metrics
#metrics.SCORERS

In [12]:
def metrics_regression(y_test, y_pred, squared=False):
    """Print R², MAE, MAPE and (R)MSE for a set of regression predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    squared : bool, default False
        If False, the last line reports the root mean squared error (RMSE);
        if True, it reports the plain mean squared error (MSE).

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    r2 = metrics.r2_score(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)

    # Take the root here instead of forwarding ``squared=`` to scikit-learn:
    # that keyword was deprecated in 1.4 and removed in 1.6. It also avoids
    # computing the squared error twice.
    error = mse if squared else mse ** 0.5
    # Label the value for what it actually is.
    label = 'MSE' if squared else 'RMSE'

    print(f'R²: {r2}')
    print(f'MAE: {mae}')
    print(f'MAPE: {mape}')
    print(f'{label}: {error}')
    

Referencia: [guía de validación cruzada de scikit-learn](https://scikit-learn.org/stable/modules/cross_validation.html)

In [13]:
from sklearn.model_selection import cross_validate

def cross_validate_custom(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False):
    """Cross-validate ``estimator`` and return the raw result mapping.

    Thin wrapper around ``cross_validate``: every argument is forwarded as
    given, the keys of the resulting mapping are printed, and the mapping is
    returned unchanged.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``cross_validate``.
    scoring_dict : object
        Forwarded as the ``scoring`` argument.
    X_train, y_train : array-like
        Forwarded as ``X`` and ``y``.
    cv : int, default 10
        Forwarded as ``cv``.
    return_train_score : bool, default False
        Forwarded as ``return_train_score``.

    Returns
    -------
    Whatever ``cross_validate`` returns.
    """
    cv_results = cross_validate(
        estimator,
        X=X_train,
        y=y_train,
        scoring=scoring_dict,
        cv=cv,
        return_train_score=return_train_score,
    )
    print(cv_results.keys())
    return cv_results

In [14]:
def scores_statistics(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False, time_info=False):
    """Cross-validate ``estimator`` and print mean/std of each test score.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``cross_validate``.
    scoring_dict : dict
        Maps a display name to a scorer; forwarded as ``scoring``. For each
        name, the values stored under ``'test_<name>'`` are summarised.
    X_train, y_train : array-like
        Forwarded as ``X`` and ``y``.
    cv : int, default 10
        Forwarded as ``cv``.
    return_train_score : bool, default False
        Forwarded as ``return_train_score``.
    time_info : bool, default False
        If True, also print mean/std of ``'fit_time'`` and ``'score_time'``.

    Returns
    -------
    The mapping returned by ``cross_validate``, unchanged.

    Notes
    -----
    Scores are printed exactly as returned; no sign is flipped, so negated
    scorers (``'neg_*'``) show up as negative numbers.
    """
    scores = cross_validate(estimator,
                            X=X_train, y=y_train,
                            scoring=scoring_dict,
                            cv=cv,
                            return_train_score=return_train_score)

    if time_info:
        # Same print order as before: fit_time mean/std, then score_time mean/std.
        for timing in ('fit_time', 'score_time'):
            print(f'{timing} mean:', np.mean(scores[timing]))
            print(f'{timing} std:', np.std(scores[timing]))

    for key in scoring_dict:
        result_key = f'test_{key}'
        # Skip metrics that have no result entry, without hiding unrelated
        # errors the way a bare ``except`` would.
        if result_key not in scores:
            continue
        print(key, 'mean:', np.mean(scores[result_key]))
        print(key, 'std:', np.std(scores[result_key]), '\n')
    return scores

-------------

------------

In [15]:
# Re-display the shapes of the train/test splits before training.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6278, 144), (1109, 144), (6278,), (1109,))

In [16]:
print(X_train.shape)
# Disabled check: missing values per training column.
#X_train.isna().sum()

(6278, 144)


In [17]:
# Get dummy data
X_train = pd.get_dummies(X_train).copy()
print(X_train.shape)
# esto me esta dando resultados muy malos, por que? -> ya no
#from sklearn.preprocessing import LabelEncoder
#
#le = LabelEncoder()
#
#for column in cat_features:
 #   X_train[column] = le.fit_transform(X_train[column])

(6278, 144)


In [18]:
#print(X_train.shape)
#X_train.isna().sum()

In [19]:
from sklearn import metrics

# Display name -> scikit-learn scorer string. The error metrics use the
# 'neg_*' scorers, so their cross-validated values come back as negative numbers.
scoring = {'r2': 'r2', 
           'MAE': 'neg_mean_absolute_error', 
           'MAPE': 'neg_mean_absolute_percentage_error', 
           'RMSE': 'neg_root_mean_squared_error'}
print(X_train.shape)
# 10-fold cross-validation of a plain LinearRegression on the training split;
# the returned score mapping is the cell's displayed output.
scores_statistics(estimator=LinearRegression(), 
                  scoring_dict=scoring, 
                  X_train=X_train, 
                  y_train=y_train, 
                  cv=10, 
                  return_train_score=False)

(6278, 144)
r2 mean: 0.6587611305066721
r2 std: 0.03517679127895557 

MAE mean: -107850.82693683557
MAE std: 5129.985607460678 

MAPE mean: -0.3171449077544576
MAPE std: 0.011619781144053657 

RMSE mean: -174596.7853236304
RMSE std: 12612.673623854978 



{'fit_time': array([0.05124664, 0.0228467 , 0.01930833, 0.02088499, 0.0195477 ,
        0.01676369, 0.01694822, 0.01664162, 0.01620865, 0.01598287]),
 'score_time': array([0.00477767, 0.00414991, 0.0047729 , 0.00406289, 0.00409317,
        0.00350189, 0.00340915, 0.00351954, 0.00333381, 0.00346804]),
 'test_r2': array([0.66491897, 0.58516617, 0.66505855, 0.67411618, 0.71837922,
        0.68437437, 0.63132709, 0.64612149, 0.63149244, 0.68665683]),
 'test_MAE': array([-108275.36275876, -114335.23365346, -108532.46991317,
        -115688.07083499, -100545.33066282, -104749.92070561,
        -114786.52486813, -103434.77369875, -103962.97655004,
        -104197.60572263]),
 'test_MAPE': array([-0.32095181, -0.33014757, -0.31072275, -0.32474615, -0.29417467,
        -0.30518902, -0.33555384, -0.31286578, -0.32278502, -0.31431246]),
 'test_RMSE': array([-168545.31288794, -198511.22855113, -173586.97063944,
        -185271.98381194, -158795.39402639, -165580.01996221,
        -193149.96145729,

In [20]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Report the regression metrics of the fitted model on the held-out test split.
metrics_regression(y_test=y_test, y_pred=lr.predict(X_test), squared=False)

R²: 0.6459638415138771
MAE: 113418.05850569207
MAPE: 0.3300826513842755
RMSE: 198091.79105672828


In [21]:
# Persist the fitted model to disk with joblib.
# NOTE(review): the '.plk' extension may be a typo for '.pkl' — confirm before
# renaming, since anything that loads this file depends on the exact path.
joblib.dump(lr, 'models/linear_regression_01-11-2021.plk')

['models/linear_regression_01-11-2021.plk']