# Load Data

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide display configuration.
# Passing None removes pandas' row/column display limits, so displayed frames are not truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# NOTE(review): recent matplotlib releases appear to have renamed this style
# (to 'seaborn-v0_8') — confirm against the installed version.
plt.style.use('seaborn')

In [3]:
# Read the sales dataset from a comma-separated file and show its (rows, columns) shape.
sale_data = pd.read_csv('data_available/sale_data_post_out.csv', sep=',')
sale_data.shape  # TODO: change this

(7387, 15)

In [4]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
sale_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7387 entries, 0 to 7386
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   daft_id          7387 non-null   int64  
 1   url              7387 non-null   object 
 2   name             7387 non-null   object 
 3   price            7387 non-null   float64
 4   sale_type        7387 non-null   object 
 5   floor_area       7387 non-null   int64  
 6   entered_renewed  7387 non-null   object 
 7   views            7387 non-null   float64
 8   type_house       7387 non-null   object 
 9   type             7387 non-null   object 
 10  scraping_date    7387 non-null   object 
 11  latitude         7387 non-null   float64
 12  longitude        7387 non-null   float64
 13  bedroom          7387 non-null   int64  
 14  bathroom         7387 non-null   int64  
dtypes: float64(4), int64(4), object(7)
memory usage: 865.8+ KB


-----------

# Split Data

In [5]:
# Target vector: an independent copy of the 'price' column, followed by a shape check.
y = sale_data['price'].copy()
y.shape

(7387,)

In [6]:
# Feature matrix: keep only the three numeric predictors used by the model.
# The two prints show the frame shape before and after the column selection.
print(sale_data.shape)
X = sale_data[['floor_area', 'bedroom', 'bathroom']]
print(X.shape)

(7387, 15)
(7387, 3)


In [7]:
# Hold out 15% of the rows for testing. A fixed random_state makes the split,
# and every metric derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6278, 3), (1109, 3), (6278,), (1109,))

In [8]:
# Fit an ordinary least-squares model on the training split, then display
# the learned intercept and one coefficient per feature column.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.intercept_, lr.coef_

(125383.3335334807, array([  2017.52528504, -19308.42811438,  27338.55891913]))

In [9]:
# Predict the target for the held-out test rows with the fitted model.
y_pred = lr.predict(X_test)

In [10]:
# Sanity check: one prediction per test row. Reuses the predictions already
# stored in `y_pred` instead of running the model a second time.
len(y_pred)

1109

# Train and Metrics

En estadística, el **coeficiente de determinación**, denominado R² y pronunciado R cuadrado, es un estadístico usado en el contexto de un modelo estadístico cuyo principal propósito es predecir futuros resultados o probar una hipótesis. El coeficiente determina la calidad del modelo para replicar los resultados, y la proporción de variación de los resultados que puede explicarse por el modelo.

Es el porcentaje de la variación en la variable de respuesta que es explicado por un modelo lineal. Es decir:

R-cuadrado = Variación explicada / variación total

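Sobre los datos de entrenamiento de una regresión lineal con intercepto, el R-cuadrado está entre 0 y 100 %. Evaluado sobre datos no vistos (como hace `r2_score` de scikit-learn con el conjunto de prueba), puede ser negativo cuando el modelo predice peor que la media de la variable de respuesta.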

In [19]:
# Import the metrics module and display the registry of scorer names
# (the same names are used as `scoring=` values in the cross-validation cells below).
# NOTE(review): `metrics.SCORERS` appears to have been removed in recent scikit-learn
# releases, with `metrics.get_scorer_names()` as the replacement — confirm against the installed version.
from sklearn import metrics
metrics.SCORERS

{'explained_variance': make_scorer(explained_variance_score),
 'r2': make_scorer(r2_score),
 'max_error': make_scorer(max_error, greater_is_better=False),
 'neg_median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
 'neg_mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
 'neg_mean_absolute_percentage_error': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
 'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
 'neg_mean_squared_log_error': make_scorer(mean_squared_log_error, greater_is_better=False),
 'neg_root_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False, squared=False),
 'neg_mean_poisson_deviance': make_scorer(mean_poisson_deviance, greater_is_better=False),
 'neg_mean_gamma_deviance': make_scorer(mean_gamma_deviance, greater_is_better=False),
 'accuracy': make_scorer(accuracy_score),
 'top_k_accuracy': make_scorer(top_k_accuracy_score, ne

In [32]:
def metrics_regression(y_test, y_pred, squared=False):
    """Print R², MAE, MAPE and the (root) mean squared error of a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    squared : bool, default=False
        If False, the last printed line is the root mean squared error (RMSE).
        If True, it is the plain mean squared error and is labelled ``MSE``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    r2 = metrics.r2_score(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)

    # Derive the RMSE from the MSE computed above rather than calling
    # mean_squared_error a second time with the `squared=` keyword, which
    # newer scikit-learn releases no longer accept.
    if squared:
        error_label, error_value = 'MSE', mse
    else:
        error_label, error_value = 'RMSE', np.sqrt(mse)

    print(f'R²: {r2}')
    print(f'MAE: {mae}')
    print(f'MAPE: {mape}')
    print(f'{error_label}: {error_value}')
    

In [33]:
# Report the hold-out metrics for the fitted model; squared=False requests the RMSE.
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

R²: 0.25612480853651065
MAE: 163855.79449242458
MAPE: 0.5529567800747028
RMSE: 244605.6808316629


https://scikit-learn.org/stable/modules/cross_validation.html

In [46]:
def cross_validate_custom(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False):
    """Cross-validate ``estimator`` and print the keys of the resulting score dict.

    Parameters
    ----------
    estimator : estimator object
        Unfitted estimator handed to ``cross_validate``.
    scoring_dict : dict
        Scorers forwarded as the ``scoring`` argument of ``cross_validate``.
    X_train, y_train : array-like
        Training features and targets to cross-validate on.
    cv : int, default=10
        Forwarded as the ``cv`` argument of ``cross_validate``.
    return_train_score : bool, default=False
        Forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The object returned by ``cross_validate``.
    """
    # `cross_validate` comes from the module-level imports; the previous
    # function-scope import and the no-op self-assignments were dropped.
    scores = cross_validate(
        estimator,
        X=X_train,
        y=y_train,
        scoring=scoring_dict,
        cv=cv,
        return_train_score=return_train_score,
    )
    print(scores.keys())
    return scores

In [66]:
# Map short display names to scikit-learn scorer identifiers.
scoring = dict(
    r2='r2',
    MAE='neg_mean_absolute_error',
    MAPE='neg_mean_absolute_percentage_error',
    RMSE='neg_root_mean_squared_error',
)

# 5-fold cross-validation of a fresh linear model on the training split.
scores = cross_validate_custom(
    estimator=LinearRegression(),
    scoring_dict=scoring,
    X_train=X_train,
    y_train=y_train,
    cv=5,
    return_train_score=False,
)

dict_keys(['fit_time', 'score_time', 'test_r2', 'test_MAE', 'test_MAPE', 'test_RMSE'])


In [64]:
def scores_statistics(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False, time_info=False):
    """Cross-validate ``estimator`` and print the mean and std of each test score.

    Parameters
    ----------
    estimator : estimator object
        Unfitted estimator handed to ``cross_validate``.
    scoring_dict : dict
        Scorers forwarded as the ``scoring`` argument; for every key ``k`` the
        ``'test_' + k`` entry of the result is summarised.
    X_train, y_train : array-like
        Training features and targets to cross-validate on.
    cv : int, default=10
        Forwarded as the ``cv`` argument of ``cross_validate``.
    return_train_score : bool, default=False
        Forwarded to ``cross_validate``.
    time_info : bool, default=False
        If True, also print the mean and std of the fit and score times.

    Returns
    -------
    dict
        The object returned by ``cross_validate``.
    """
    scores = cross_validate(estimator, X=X_train, y=y_train, scoring=scoring_dict, cv=cv,
                            return_train_score=return_train_score)

    if time_info:
        # Same output order as before: fit_time mean/std, then score_time mean/std.
        for timing_key in ('fit_time', 'score_time'):
            print(f'{timing_key} mean:', np.mean(scores[timing_key]))
            print(f'{timing_key} std:', np.std(scores[timing_key]))

    for key in scoring_dict:
        result_key = f'test_{key}'
        # Skip scorers with no matching result entry, but let genuine errors
        # surface instead of hiding them behind a bare `except`.
        if result_key not in scores:
            continue
        print(key, 'mean:', np.mean(scores[result_key]))
        print(key, 'std:', np.std(scores[result_key]), '\n')
    return scores

In [65]:
# Reuse the `scoring` mapping defined in the earlier cross-validation cell instead of
# re-declaring an identical copy here.
# The returned scores dict is the cell's last expression, so it is displayed
# after the printed mean/std summary.
scores_statistics(estimator=LinearRegression(), scoring_dict=scoring,
                  X_train=X_train, y_train=y_train, cv=5, return_train_score=False)

r2 mean: 0.28213891967775284
r2 std: 0.028825670744816942 

MAE mean: -168927.75887667557
MAE std: 3188.0910054992414 

MAPE mean: -0.5353545179960923
MAPE std: 0.010160211793787616 

RMSE mean: -260917.70382416985
RMSE std: 9646.858055290568 



{'fit_time': array([0.00338578, 0.00248647, 0.00237679, 0.00219226, 0.00217009]),
 'score_time': array([0.00284076, 0.00253391, 0.00238419, 0.00237727, 0.00236821]),
 'test_r2': array([0.25085074, 0.32644945, 0.2928558 , 0.2904633 , 0.2500753 ]),
 'test_MAE': array([-166785.49861016, -172291.35292877, -173257.33364059,
        -165533.81175592, -166770.79744793]),
 'test_MAPE': array([-0.53527614, -0.51866855, -0.54428586, -0.54724833, -0.53129371]),
 'test_RMSE': array([-259079.74939786, -271722.70815766, -267641.4472675 ,
        -243668.21408528, -262476.40021255])}

-------------

In [14]:
# Persist the fitted linear model to disk with joblib.
# NOTE(review): the '.plk' extension looks like a typo for '.pkl'; left unchanged
# because the load cell further down reads the same path — confirm before renaming.
joblib.dump(lr, 'models/linear_regression.plk')
# this is great <3

['models/linear_regression.plk']

In [15]:
# Reload the model that was just saved, to check the save/load round trip.
# joblib.load unpickles the file, so only use it on files from a trusted source.
testing_load = joblib.load('models/linear_regression.plk')

In [16]:
# Display the reloaded object to confirm its type.
testing_load

LinearRegression()

In [17]:
# Show the reloaded intercept and coefficients, for comparison with the values displayed after fitting `lr`.
testing_load.intercept_, testing_load.coef_

(125383.3335334807, array([  2017.52528504, -19308.42811438,  27338.55891913]))