## 3. MODELS

In [34]:
import pandas as pd
import numpy as np
import seaborn as sns

In [48]:
from sklearn.model_selection import train_test_split

import statsmodels.formula.api as smf 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

#### Possible Regression Models
    
    - LinearRegression ✓
    - RandomForestRegressor ✓
    - DecisionTreeRegressor ✓
    - GradientBoostingRegressor ✓
    - KNeighborsRegressor ✓

### TRAIN DATAFRAME

In [30]:
data_train_clean = pd.read_csv("../data/data_train_clean.csv")
data_train_clean

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_num,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,color_E,color_F,color_G,color_H,color_I,color_J
0,0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,3,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0.41,63.0,56.0,4.80,4.75,3.01,6.824,5,...,0,0,0,0,0,0,0,0,0,0
2,2,2,0.32,61.6,56.0,4.37,4.39,2.70,6.107,5,...,0,1,0,0,0,0,0,0,1,0
3,3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.390,5,...,0,0,0,1,0,0,0,1,0,0
4,4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,4,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,40450,0.52,61.2,58.0,5.16,5.20,3.17,7.508,4,...,0,1,0,0,0,0,0,0,0,0
40451,40451,40451,0.52,62.0,55.0,5.14,5.17,3.19,7.232,5,...,0,0,0,0,0,1,0,0,0,0
40452,40452,40452,0.73,63.5,58.0,5.68,5.72,3.62,8.065,3,...,0,1,0,0,0,0,0,0,0,0
40453,40453,40453,0.31,56.9,59.0,4.45,4.48,2.54,6.629,1,...,0,0,0,1,0,1,0,0,0,0


### TEST DATAFRAME

In [31]:
data_test_clean = pd.read_csv("../data/data_test_clean.csv")
data_test_clean

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,cut_num,clarity_IF,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,color_E,color_F,color_G,color_H,color_I,color_J
0,0,0,0.33,61.9,55.0,4.44,4.42,2.74,5,1,...,0,0,0,0,0,0,0,1,0,0
1,1,1,0.41,61.8,54.0,4.79,4.76,2.95,5,0,...,0,1,0,0,1,0,0,0,0,0
2,2,2,0.91,62.5,59.0,6.16,6.23,3.87,3,0,...,0,0,0,0,1,0,0,0,0,0
3,3,3,0.42,62.6,57.0,4.76,4.80,2.99,3,0,...,0,1,0,0,0,0,1,0,0,0
4,4,4,0.54,61.5,56.0,5.28,5.25,3.24,5,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,5,0,...,0,0,0,0,0,1,0,0,0,0
13481,13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,4,0,...,0,1,0,0,0,0,0,1,0,0
13482,13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,5,0,...,0,0,0,0,0,0,0,0,0,0
13483,13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,2,0,...,0,0,0,0,1,0,0,0,0,0


In [36]:
X = data_train_clean.drop(['price'], axis=1)
y= data_train_clean["price"]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=123)

#### 1. Linear Regression

In [39]:
lr = LinearRegression()

In [40]:
lr.fit(X, y)

LinearRegression()

In [41]:
linear_pred = lr.predict(data_test_clean)
linear_pred

array([6.80038957, 7.02236445, 8.11225314, ..., 6.77175396, 7.26510509,
       8.13410327])

In [42]:
linear_pred = pd.DataFrame(linear_pred, columns=["price"])
linear_pred = linear_pred.reset_index()
linear_pred = linear_pred.rename(columns={'index':'id'})

In [44]:
linear_pred

Unnamed: 0,id,price
0,0,6.800390
1,1,7.022364
2,2,8.112253
3,3,6.906468
4,4,7.778775
...,...,...
13480,13480,7.350483
13481,13481,8.749096
13482,13482,6.771754
13483,13483,7.265105


In [46]:
# GUARDAMOS (para subirlo a kaggle)
linear_pred.to_csv("../data/LinearRegression.csv", header= True, index=False)

#### 2. RandomForestRegressor

In [49]:
rf = RandomForestRegressor()

In [50]:
rf.fit(X, y)

RandomForestRegressor()

In [51]:
forest_pred = rf.predict(data_test_clean)
forest_pred

array([6.82784, 6.91809, 8.24527, ..., 6.70305, 7.2753 , 8.12209])

In [52]:
forest_pred = pd.DataFrame(forest_pred, columns=["price"])
forest_pred = forest_pred.reset_index()
forest_pred = forest_pred.rename(columns={'index':'id'})

In [53]:
forest_pred

Unnamed: 0,id,price
0,0,6.82784
1,1,6.91809
2,2,8.24527
3,3,6.73466
4,4,7.72144
...,...,...
13480,13480,7.27865
13481,13481,8.63965
13482,13482,6.70305
13483,13483,7.27530


In [54]:
# GUARDAMOS (para subirlo a kaggle)
forest_pred.to_csv("../data/RandomForestRegressor.csv", header= True, index=False)

#### 3. Descission TreeRegressor

In [56]:
model = DecisionTreeRegressor(max_depth=20)
model.fit(X, y)
tree_pred = model.predict(data_test_clean)

In [57]:
tree_pred = pd.DataFrame(tree_pred, columns=["price"])
tree_pred = tree_pred.reset_index()
tree_pred = tree_pred.rename(columns={'index':'id'})

In [58]:
tree_pred

Unnamed: 0,id,price
0,0,7.016000
1,1,6.996000
2,2,8.297053
3,3,6.604000
4,4,7.736000
...,...,...
13480,13480,7.436000
13481,13481,8.667286
13482,13482,6.773000
13483,13483,7.212000


In [59]:
# GUARDAMOS (para subirlo a kaggle)
tree_pred.to_csv("../data/DescissionTreeRegressor.csv", header= True, index=False)

#### 4. GradientBoostingRegressor

In [64]:
model = GradientBoostingRegressor(n_estimators = 100)

params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
          'max_depth': [4, 6],
          'min_samples_leaf': [3, 10, 17],
          'max_features': [3, 1, 5]}
grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=2,
                           n_jobs=3,
                           verbose=1)
grid_search.fit(X, y)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   14.9s
[Parallel(n_jobs=3)]: Done 144 out of 144 | elapsed:   57.3s finished


GridSearchCV(cv=2, estimator=GradientBoostingRegressor(), n_jobs=3,
             param_grid={'learning_rate': [0.1, 0.05, 0.02, 0.01],
                         'max_depth': [4, 6], 'max_features': [3, 1, 5],
                         'min_samples_leaf': [3, 10, 17]},
             verbose=1)

In [65]:
best_gb = grid_search.best_estimator_
best_gb.score(X, y)

0.9870129723757362

In [67]:
boosting_pred = best_gb.predict(data_test_clean)

In [68]:
boosting_pred = pd.DataFrame(boosting_pred, columns=["price"])
boosting_pred = boosting_pred.reset_index()
boosting_pred = boosting_pred.rename(columns={'index':'id'})

In [69]:
boosting_pred

Unnamed: 0,id,price
0,0,6.764118
1,1,6.941861
2,2,8.255597
3,3,6.784380
4,4,7.751058
...,...,...
13480,13480,7.317853
13481,13481,8.678702
13482,13482,6.634077
13483,13483,7.344144


In [70]:
# GUARDAMOS (para subirlo a kaggle)
boosting_pred.to_csv("../data/GradientBoostingRegressor.csv", header= True, index=False)

#### 5. KNeighborsRegressor

In [72]:
n_neighbors = np.arange(1, 100)

In [73]:
model = KNeighborsRegressor()
parameter_space = {'n_neighbors': n_neighbors
                   }

grid_search = GridSearchCV(model,
                           param_grid=parameter_space,
                           cv=7)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=7, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])})

In [75]:
best_knr = grid_search.best_estimator_
best_knr.score(X_test, y_test)

0.09216582107459104

In [78]:
knr_pred = best_knr.predict(data_test_clean)

In [80]:
knr_pred = pd.DataFrame(knr_pred, columns=["price"])
knr_pred = knr_pred.reset_index()
knr_pred = knr_pred.rename(columns={'index':'id'})

In [81]:
knr_pred

Unnamed: 0,id,price
0,0,6.5820
1,1,6.5820
2,2,7.4240
3,3,6.2485
4,4,7.3355
...,...,...
13480,13480,7.2945
13481,13481,8.0740
13482,13482,8.0740
13483,13483,8.0655


In [82]:
# GUARDAMOS (para subirlo a kaggle)
knr_pred.to_csv("../data/KNeighborsRegressor.csv", header= True, index=False)