## 2. ENTRENAMIENTOS

In [243]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import matplotlib.pyplot as plt

In [154]:
from sklearn.model_selection import train_test_split

import statsmodels.formula.api as smf 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

En este notebook se realizarán entrenamientos sobre el dataset de train limpio mediante los siguientes modelos:

#### Possible Regression Models
    
    - LinearRegression ✓
    - DecisionTreeRegressor ✓
    - KNeighborsRegressor ✓
    - GradientBoostingRegressor ✓
    - RandomForestRegressor ✓

In [3]:
data_train_clean = pd.read_csv("../data/data_train_clean.csv")
data_train_clean

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_num,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,color_E,color_F,color_G,color_H,color_I,color_J
0,0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,3,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0.41,63.0,56.0,4.80,4.75,3.01,6.824,5,...,0,0,0,0,0,0,0,0,0,0
2,2,2,0.32,61.6,56.0,4.37,4.39,2.70,6.107,5,...,0,1,0,0,0,0,0,0,1,0
3,3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.390,5,...,0,0,0,1,0,0,0,1,0,0
4,4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,4,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,40450,0.52,61.2,58.0,5.16,5.20,3.17,7.508,4,...,0,1,0,0,0,0,0,0,0,0
40451,40451,40451,0.52,62.0,55.0,5.14,5.17,3.19,7.232,5,...,0,0,0,0,0,1,0,0,0,0
40452,40452,40452,0.73,63.5,58.0,5.68,5.72,3.62,8.065,3,...,0,1,0,0,0,0,0,0,0,0
40453,40453,40453,0.31,56.9,59.0,4.45,4.48,2.54,6.629,1,...,0,0,0,1,0,1,0,0,0,0


#### Train-Test-Split

In [7]:
X = data_train_clean.drop(['price'], axis=1)
y= data_train_clean["price"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                  y, 
                                                  test_size=0.20, 
                                                  random_state=123)

#### 1. Lineal Regression

In [10]:
lr = LinearRegression()

In [12]:
lr.fit(X_train, y_train)

LinearRegression()

In [89]:
linear_pred = lr.predict(X_test)
linear_pred

array([9.65340894, 6.66176601, 9.92630902, ..., 6.50188276, 7.48498783,
       8.582722  ])

In [90]:
mean_squared_error(y_test, linear_pred)

0.03179958502065602

#### 2. RandomForestRegressor

In [17]:
rf = RandomForestRegressor()

In [18]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [91]:
forest_pred = rf.predict(X_test)
forest_pred

array([9.49925, 6.83187, 9.73558, ..., 6.38659, 7.53571, 8.55804])

In [92]:
mean_squared_error(y_test, forest_pred)

0.011947044382091213

#### 3. Descission TreeRegressor

A) Árbol con 1 rama:

In [221]:
model = DecisionTreeRegressor(max_depth=1)

In [222]:
model.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=1)

In [223]:
tree_pred = model.predict(X_test)

In [224]:
mean_squared_error(y_test, tree_pred)

0.2813794378171035

B) Árbol con 3 ramas:

In [225]:
# Arbol con 3 ramas (menos error que con solo una rama)
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)

mean_squared_error(y_test, tree_pred)

0.07523495702590297

C) Árbol con 10 ramas:

In [226]:
# Arbol con 10 ramas (menos error que con 3 ramas)
model = DecisionTreeRegressor(max_depth=10)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)

mean_squared_error(y_test, tree_pred)

0.027368998456566717

D) Árbol con 20 ramas --> ÁRBOL MÁS PRECISO

In [227]:
# Arbol con 20 ramas (menos error que con 10 ramas) ---> NOS QUEDAMOS CON ESTE!!!!! EL MÁS PRECISO
model = DecisionTreeRegressor(max_depth=20)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)

mean_squared_error(y_test, tree_pred)

0.02197230365157715

E) Árbol con 30 ramas --> OVERFITTING

In [111]:
'''
Arbol con 30 ramas (menos error que con 20 ramas) --> OVERFITTING 
(nos encontramos en un overfitting porque hemos llegado a un arbol tan complejo 
que empieza a subir nuestro mean squared error)

'''
model = DecisionTreeRegressor(max_depth=30)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)

mean_squared_error(y_test, tree_pred)

0.022868849071690314

#### 4. KNeighborsRegressor

In [262]:
n_neighbors = np.arange(1, 100)

In [263]:
model = KNeighborsRegressor()
parameter_space = {'n_neighbors': n_neighbors
                   }

grid_search = GridSearchCV(model,
                           param_grid=parameter_space,
                           cv=7)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=7, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])})

In [272]:
grid_search.best_score_

0.036331508207816485

In [277]:
best_knr = grid_search.best_estimator_
best_knr.score(X_test, y_test)

0.09216582107459104

In [280]:
knr_pred = best_knr.predict(X_test)

In [281]:
mean_squared_error(y_test, knr_pred)

0.9344860159127425

#### 5. GridSearch

In [124]:
model = Ridge()
parameter_space = {'alpha': np.linspace(0, 100, 1000),
                   }

grid_search = GridSearchCV(model,
                           param_grid=parameter_space,
                           cv=3)

grid_search.fit(X_train, y_train)

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


GridSearchCV(cv=3, estimator=Ridge(),
             param_grid={'alpha': array([  0.        ,   0.1001001 ,   0.2002002 ,   0.3003003 ,
         0.4004004 ,   0.5005005 ,   0.6006006 ,   0.7007007 ,
         0.8008008 ,   0.9009009 ,   1.001001  ,   1.1011011 ,
         1.2012012 ,   1.3013013 ,   1.4014014 ,   1.5015015 ,
         1.6016016 ,   1.7017017 ,   1.8018018 ,   1.9019019 ,
         2.002002  ,   2.1021021 ,   2.2022022 ,   2.3023023 ,
         2.4024024 ,   2.5025025 ,   2.6026026 ,   2.7027027 ,
         2.8028028 ,   2.9029...
        96.4964965 ,  96.5965966 ,  96.6966967 ,  96.7967968 ,
        96.8968969 ,  96.996997  ,  97.0970971 ,  97.1971972 ,
        97.2972973 ,  97.3973974 ,  97.4974975 ,  97.5975976 ,
        97.6976977 ,  97.7977978 ,  97.8978979 ,  97.997998  ,
        98.0980981 ,  98.1981982 ,  98.2982983 ,  98.3983984 ,
        98.4984985 ,  98.5985986 ,  98.6986987 ,  98.7987988 ,
        98.8988989 ,  98.998999  ,  99.0990991 ,  99.1991992 ,
        99.29

In [126]:
best_model = grid_search.best_estimator_
best_model

Ridge(alpha=0.0)

In [127]:
y_pred = best_model.predict(X_test)

In [129]:
mean_squared_error(y_test, y_pred)

0.03179291686843309

#### 6. GradientBoostingRegressor

In [156]:
model = GradientBoostingRegressor(n_estimators = 100)

params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
          'max_depth': [4, 6],
          'min_samples_leaf': [3, 10, 17],
          'max_features': [3, 1, 5]}
grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=2,
                           n_jobs=3,
                           verbose=1)
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.6s
[Parallel(n_jobs=3)]: Done 144 out of 144 | elapsed:   43.2s finished


GridSearchCV(cv=2, estimator=GradientBoostingRegressor(), n_jobs=3,
             param_grid={'learning_rate': [0.1, 0.05, 0.02, 0.01],
                         'max_depth': [4, 6], 'max_features': [3, 1, 5],
                         'min_samples_leaf': [3, 10, 17]},
             verbose=1)

In [165]:
grid_search.best_score_

0.9851537613313581

In [172]:
best_gb = grid_search.best_estimator_
best_gb.score(X_test, y_test)

0.9863562821108569

In [167]:
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 6,
 'max_features': 5,
 'min_samples_leaf': 3}

In [238]:
boosting_pred = best_gb.predict(X_test)

In [239]:
mean_squared_error(y_test, boosting_pred)

0.01404426476601105