# Modeling
Comenzaremos con un modelo baseline para luego poder comparar la performance de modelos más complejos.

## Import libraries

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

## Load dataset

In [2]:
df = pd.read_csv('../data/processed/kc_house_data_clean_with_outliers.csv')
df.head()

Unnamed: 0,price,sqft_living,grade,sqft_above,sqft_living15,bathrooms,view,sqft_basement,bedrooms,lat,waterfront,floors,renovated,sqft_lot,sqft_lot15,yr_built,condition,long,zipcode,house_age
7129300520,221900,1180,7,1180,1340,1,0,0,3,47.5112,0,1,0,5650,5650,1955,3,-122.257,98178,59
6414100192,538000,2570,7,2170,1690,2,0,400,3,47.721,0,2,1,7242,7639,1951,3,-122.319,98125,63
5631500400,180000,770,6,770,2720,1,0,0,2,47.7379,0,1,0,10000,8062,1933,3,-122.233,98028,82
2487200875,604000,1960,7,1050,1360,3,0,910,4,47.5208,0,1,0,5000,5000,1965,5,-122.393,98136,49
1954400510,510000,1680,8,1680,1800,2,0,0,3,47.6168,0,1,0,8080,7503,1987,3,-122.045,98074,28


## Baseline

In [3]:
y = df['price']
X = df.drop('price', axis=1)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split (X, y, test_size = 0.20, random_state=42)

In [10]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))
pipelines.append(('ScaledRFR', Pipeline([('Scaler', StandardScaler()),('RFR', RandomForestRegressor())])))
pipelines.append(('ScaledBR', Pipeline([('Scaler', StandardScaler()),('BR', BaggingRegressor())])))
pipelines.append(('ScaledABR', Pipeline([('Scaler', StandardScaler()),('ABR', AdaBoostRegressor())])))
pipelines.append(('ScaledETR', Pipeline([('Scaler', StandardScaler()),('ETR', ExtraTreesRegressor())])))
pipelines.append(('ScaledXGB', Pipeline([('Scaler', StandardScaler()),('XGB', XGBRegressor())])))
pipelines.append(('ScaledLGBM', Pipeline([('Scaler', StandardScaler()),('LGBM', LGBMRegressor())])))
#pipelines.append(('ScaledCB', Pipeline([('Scaler', StandardScaler()),('GBM', CatBoostRegressor())])))

results = []
names = []
r_scuared = []
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=21)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='r2')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: 0.698482 (0.013185)
ScaledLASSO: 0.697675 (0.012840)
ScaledEN: 0.670505 (0.013958)
ScaledKNN: 0.785186 (0.027530)
ScaledCART: 0.735533 (0.069105)
ScaledGBM: 0.865195 (0.019668)
ScaledRFR: 0.856599 (0.042510)
ScaledBR: 0.851772 (0.040377)
ScaledABR: 0.130887 (0.176382)
ScaledETR: 0.860495 (0.024206)
ScaledXGB: 0.864449 (0.016561)
ScaledLGBM: 0.878307 (0.018618)
