# Load Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import joblib

In [None]:
# Show every row and column when a frame is displayed (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old name was removed in later releases. Prefer the
# new name and fall back to the old one on older matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Read the sale listings CSV into a DataFrame and show its (rows, columns).
sale_data = pd.read_csv('data_available/sale_data_post_out.csv', sep=',')
sale_data.shape  # TODO: change this

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
sale_data.info()

In [None]:
# Columns used as model inputs. Commented-out entries are columns that are
# currently excluded; 'price' is left out here because it is read separately
# from `sale_data` as the target.
features = [#'price',
            'floor_area',
            'views',
            'latitude',
            'longitude',
      #      'bedroom',
            'bathroom',
            #'sale_type',
            'type_house',
#            'postcode',
 #           'state_district',
  #          'county',
  #          'city_district',
   #         'road',
      #      'place',
            'code',
  #          'admin1',
  #          'cities'
]

#sale_data = sale_data.dropna().copy()
# Work on an independent copy restricted to the selected columns.
data = sale_data[features].copy()
data.shape

In [None]:
# Number of missing values per selected column.
data.isna().sum()

In [None]:
# Column names split by dtype: numeric columns vs. object (string-like) columns.
num_features = data.select_dtypes('number').columns
cat_features =  data.select_dtypes('object').columns

-----------

# Split Data

In [None]:
# Target vector: the sale price, copied so later edits do not touch sale_data.
y = sale_data['price'].copy()
y.shape

In [None]:
# Shape before and after one-hot encoding the non-numeric columns.
print(data.shape)
#X = data.copy()
# pd.get_dummies expands the object columns into indicator columns.
X = pd.get_dummies(data).copy() #[num_features]
print(X.shape)

In [None]:
# Hold out 15% of the rows for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Train and Metrics

En estadística, el **coeficiente de determinación**, denominado R² y pronunciado R cuadrado, es un estadístico usado en el contexto de un modelo estadístico cuyo principal propósito es predecir futuros resultados o probar una hipótesis. El coeficiente determina la calidad del modelo para replicar los resultados, y la proporción de variación de los resultados que puede explicarse por el modelo.

Es el porcentaje de la variación en la variable de respuesta que es explicado por un modelo lineal. Es decir:

R-cuadrado = Variación explicada / variación total

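El R-cuadrado suele expresarse entre 0 y 100 %: en los datos de entrenamiento de una regresión lineal con término independiente siempre cae en ese rango, pero sobre datos de prueba puede ser negativo si el modelo predice peor que la media.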

In [None]:
from sklearn import metrics
#metrics.SCORERS

In [None]:
def metrics_regression(y_test, y_pred, squared=False):
    """Print R², MAE, MAPE and the (root) mean squared error of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    squared : bool, default False
        If False, the root mean squared error is printed under the label
        ``RMSE``. If True, the plain mean squared error is printed under the
        label ``MSE``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    r2 = metrics.r2_score(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)

    # Take the square root here instead of passing `squared=` to
    # mean_squared_error: that keyword was deprecated in scikit-learn 1.4 and
    # removed in later releases. The label follows the value actually shown.
    if squared:
        error_label, error_value = 'MSE', mse
    else:
        error_label, error_value = 'RMSE', np.sqrt(mse)

    print(f'R²: {r2}')
    print(f'MAE: {mae}')
    print(f'MAPE: {mape}')
    print(f'{error_label}: {error_value}')
    

https://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
from sklearn.model_selection import cross_validate

def cross_validate_custom(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False):
    """Cross-validate ``estimator`` on the training data.

    Forwards all arguments to ``cross_validate``, prints the keys of the
    resulting dictionary and returns that dictionary.
    """
    cv_results = cross_validate(
        estimator,
        X=X_train,
        y=y_train,
        scoring=scoring_dict,
        cv=cv,
        return_train_score=return_train_score,
    )
    print(cv_results.keys())
    return cv_results

In [None]:
def scores_statistics(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False, time_info=False):
    """Cross-validate ``estimator`` and print mean/std of each test metric.

    Parameters
    ----------
    estimator : estimator object
        Model (or pipeline) passed to ``cross_validate``.
    scoring_dict : dict
        Maps a display name to a scorer; each name is looked up as
        ``'test_<name>'`` in the cross-validation results.
    X_train, y_train : array-like
        Training features and target.
    cv : int, default 10
        Forwarded to ``cross_validate``.
    return_train_score : bool, default False
        Forwarded to ``cross_validate``.
    time_info : bool, default False
        If True, also print mean/std of the fit and score times.

    Returns
    -------
    dict
        The raw dictionary returned by ``cross_validate``.
    """
    scores = cross_validate(estimator,
                            X=X_train, y=y_train,
                            scoring=scoring_dict,
                            cv=cv,
                            return_train_score=return_train_score)

    if time_info:
        # Same four lines as before, in the same order; the previous version
        # raised NameError here because of a misspelled variable name.
        for timing in ('fit_time', 'score_time'):
            print(f'{timing} mean:', np.mean(scores[timing]))
            print(f'{timing} std:', np.std(scores[timing]))

    for key in scoring_dict:
        try:
            fold_scores = scores[f'test_{key}']
        except KeyError:
            # Best effort: skip names that have no matching result entry,
            # but let any other error surface instead of hiding it.
            continue
        print(key, 'mean:', np.mean(fold_scores))
        print(key, 'std:', np.std(fold_scores), '\n')
    return scores

-------------

------------

In [None]:
# Sanity check: shapes of the train/test features and targets.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Shape of the training features before the next encoding step.
print(X_train.shape)
#X_train.isna().sum()

In [None]:
# Get dummy data
# NOTE(review): X appears to be one-hot encoded already before the split, so
# this call is presumably a no-op on X_train (X_test is not touched) - confirm.
X_train = pd.get_dummies(X_train).copy()
print(X_train.shape)
# this was giving me very bad results, why? -> not anymore
#from sklearn.preprocessing import LabelEncoder
#
#le = LabelEncoder()
#
#for column in cat_features:
 #   X_train[column] = le.fit_transform(X_train[column])

In [None]:
#print(X_train.shape)
#X_train.isna().sum()

In [None]:
from sklearn import metrics

# Display name -> scikit-learn scorer string. The error metrics use the
# `neg_*` scorers, so their reported values are negated errors.
scoring = {'r2': 'r2', 
           'MAE': 'neg_mean_absolute_error', 
           'MAPE': 'neg_mean_absolute_percentage_error', 
           'RMSE': 'neg_root_mean_squared_error'}
print(X_train.shape)
# 10-fold cross-validation of a plain linear regression on the training data.
scores_statistics(estimator=LinearRegression(), 
                  scoring_dict=scoring, 
                  X_train=X_train, 
                  y_train=y_train, 
                  cv=10, 
                  return_train_score=False)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the full training split, then report metrics on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
#lr.intercept_, lr.coef_
#lr.score(X_train, y_train)

#X_test = pd.get_dummies(X_test).copy()

# squared=False asks for the root mean squared error.
metrics_regression(y_test=y_test, 
                   y_pred=lr.predict(X_test), 
                   squared=False)

In [None]:
# Persist the fitted model to disk.
# NOTE(review): the extension is '.plk' - presumably '.pkl' was intended;
# confirm against whatever loads this file before renaming.
joblib.dump(lr, 'models/linear_regression_01-11-2021.plk')

---------------

-------------------

# Polynomial Regression

In [246]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, 
                                   PolynomialFeatures)
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.impute import SimpleImputer

In [247]:
def split_train_test(df, test_ratio=.15, random_state=None):
    """Randomly split ``df`` into a train frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected positionally with ``iloc``.
    test_ratio : float, default 0.15
        Fraction of rows placed in the test frame; the test size is
        ``int(len(df) * test_ratio)``.
    random_state : int or None, default None
        Seed for the shuffle. With ``None`` the shuffle draws from NumPy's
        global random state, so ``np.random.seed`` still controls it.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f'test_ratio must be between 0 and 1, got {test_ratio!r}')

    n_rows = len(df)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # A dedicated generator keeps a seeded split from disturbing (or
        # depending on) the global random state.
        shuffled_indices = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return df.iloc[train_indices], df.iloc[test_indices]

In [248]:
def split_x_y(train_set, test_set, features, target='price'):
    """Separate the feature columns from the target column in both splits.

    Copies ``features`` and ``target`` out of ``train_set`` and ``test_set``,
    prints the shape of each of the four resulting objects and returns them
    as ``(X_train, X_test, y_train, y_test)``.
    """
    y_train = train_set[target].copy()
    X_train = train_set[features].copy()
    y_test = test_set[target].copy()
    X_test = test_set[features].copy()

    labelled_parts = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    # One "name: shape" entry per line, laid out exactly like the original
    # multi-argument print (trailing space before every newline).
    shape_report = ' \n'.join(f'{label}: {part.shape}' for label, part in labelled_parts)
    print(shape_report, '\n')
    return X_train, X_test, y_train, y_test

In [249]:
def scores_statistics(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False, time_info=False):
    """Cross-validate ``estimator`` and print mean/std of each test metric.

    NOTE(review): a function with this same name and signature is also
    defined earlier in the notebook; this later definition is the one in
    effect once the cell runs.

    Parameters
    ----------
    estimator : estimator object
        Model (or pipeline) passed to ``cross_validate``.
    scoring_dict : dict
        Maps a display name to a scorer; each name is looked up as
        ``'test_<name>'`` in the cross-validation results.
    X_train, y_train : array-like
        Training features and target.
    cv : int, default 10
        Forwarded to ``cross_validate``.
    return_train_score : bool, default False
        Forwarded to ``cross_validate``.
    time_info : bool, default False
        If True, also print mean/std of the fit and score times.

    Returns
    -------
    dict
        The raw dictionary returned by ``cross_validate``.
    """
    scores = cross_validate(estimator,
                            X=X_train, y=y_train,
                            scoring=scoring_dict,
                            cv=cv,
                            return_train_score=return_train_score)

    if time_info:
        # Same four lines as before, in the same order; the previous version
        # raised NameError here because of a misspelled variable name.
        for timing in ('fit_time', 'score_time'):
            print(f'{timing} mean:', np.mean(scores[timing]))
            print(f'{timing} std:', np.std(scores[timing]))

    for key in scoring_dict:
        try:
            fold_scores = scores[f'test_{key}']
        except KeyError:
            # Best effort: skip names that have no matching result entry,
            # but let any other error surface instead of hiding it.
            continue
        print(key, 'mean:', np.mean(fold_scores))
        print(key, 'std:', np.std(fold_scores), '\n')
    return scores

In [250]:
# Columns kept for this experiment. 'price' (the target) is included here so
# it travels with the rows through the train/test split; commented-out
# entries are columns that are currently excluded.
features = ['price',
            'floor_area',
            'views',
            'latitude',
            'longitude',
      #      'bedroom',
            'bathroom',
            #'sale_type',
            'type_house',
#            'postcode',
 #           'state_district',
  #          'county',
  #          'city_district',
   #         'road',
      #      'place',
            'code',
  #          'admin1',
  #          'cities'
           ]

In [251]:
# Reload the sale listings from disk and report the raw shape.
sale_data = pd.read_csv('data_available/sale_data_post_out.csv', sep=',')
print(sale_data.shape)  # TODO: change this

# Keep only the selected columns, on an independent copy.
data = sale_data[features].copy()
print(data.shape)

(7387, 34)
(7387, 8)


In [252]:
# Number of missing values per selected column.
print(data.isna().sum())

price            0
floor_area       0
views            0
latitude         0
longitude        0
bathroom         0
type_house       0
code          1328
dtype: int64


In [253]:
# Seed NumPy's global RNG first: split_train_test shuffles with
# np.random.permutation, so without a seed every run yields a different
# split and different downstream metrics.
np.random.seed(42)
train_set, test_set = split_train_test(data, .15)
print(train_set.shape, test_set.shape)
#print(train_set.isna().sum())

(6279, 8) (1108, 8)


In [254]:
# Input columns = every column of the train split except the target.
# list.remove raises ValueError if 'price' is not present.
features = list(train_set.columns)
features.remove('price')
features

['floor_area',
 'views',
 'latitude',
 'longitude',
 'bathroom',
 'type_house',
 'code']

In [263]:
# Separate inputs and target for both splits (the helper prints the shapes).
X_train, X_test, y_train, y_test = split_x_y(train_set=train_set, 
                                             test_set=test_set, 
                                             features=features, 
                                             target='price')

#print(X_train.isna().sum())

X_train: (6279, 7) 
X_test: (1108, 7) 
y_train: (6279,) 
y_test: (1108,) 



In [262]:
# Numeric branch: standardize, then expand into polynomial terms up to
# degree 3 (no bias column).
num_pipe = Pipeline([
    ('std_scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ])

#cat_features = train_set.select_dtypes('object').columns
# Category levels are read from the whole training split and handed to the
# encoder explicitly.
# NOTE(review): the missing-value count for `code` is non-zero on the full
# data, so code_levels presumably contains NaN - confirm the encoder treats
# it as intended.
type_house_levels = train_set.type_house.unique()
code_levels = train_set.code.unique()

# NOTE(review): handle_unknown='ignore' is commented out, so a level that
# only occurs outside train_set presumably raises at transform time - confirm.
cat_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder(categories=[type_house_levels, code_levels]))  #code_levels
                                      #handle_unknown='ignore'
    ])

#num_features = data.select_dtypes('number').columns
#cat_features =  data.select_dtypes('object').columns
# Explicit column lists routed to each branch of the preprocessor.
num_features = ['floor_area', 
                'views', 
                'latitude', 
                'longitude', 
                'bathroom']
cat_features = ['type_house',
                'code']

# Apply the numeric and categorical pipelines to their respective columns.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
    ]) #, remainder='passthrough'
#preprocessor


# Full model: preprocessing -> constant imputation -> linear regression.
# NOTE(review): the imputer runs after the preprocessor, not before it;
# confirm it is still needed at this position and what fill_value=None
# resolves to for the preprocessed matrix.
estimator = Pipeline(steps=[('preprocessor', preprocessor),
                     ('imputer', SimpleImputer(strategy='constant', 
                                               fill_value=None)),
                     ('regressor', LinearRegression())
                    ])
#estimator

In [264]:
# Display name -> scikit-learn scorer string. The error metrics use the
# `neg_*` scorers, so their means are printed as negative numbers.
scoring = {'r2': 'r2', 
           'MAE': 'neg_mean_absolute_error', 
           'MAPE': 'neg_mean_absolute_percentage_error', 
           'RMSE': 'neg_root_mean_squared_error'}
print(X_train.shape)

# 10-fold cross-validation of the full preprocessing + regression pipeline.
scores = scores_statistics(estimator=estimator, 
                           scoring_dict=scoring, 
                           X_train=X_train, 
                           y_train=y_train, 
                           cv=10, 
                           return_train_score=False)

(6279, 7)
r2 mean: 0.7409140436903824
r2 std: 0.04372488177613465 

MAE mean: -95497.44682321073
MAE std: 3504.089447897079 

MAPE mean: -0.2746810244943
MAPE std: 0.013699581137712797 

RMSE mean: -153433.27998466874
RMSE std: 9163.199683532202 

