In [9]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv("csv preparado/train_ready.csv", index_col = 0)
train_df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.12,3,4,2,61.6,59.0,6.67,6.63,4.1,5363
1,1.14,4,1,3,60.0,54.0,6.74,6.97,4.11,5593
2,0.9,4,0,3,60.3,63.0,6.12,6.22,3.72,3534
3,0.71,2,1,4,61.9,54.0,5.74,5.76,3.56,3212
4,0.34,4,2,3,60.0,62.0,4.51,4.55,2.72,447


In [3]:
columnas = [i for i in train_df.columns if i not in ["price"]]
X = train_df[columnas]
y = train_df["price"]

In [92]:
"""
pipeline = [
    StandardScaler()
]

transformer = make_pipeline(*pipeline)

X_data = transformer.fit_transform(X)
"""

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

## Segundo modelo: RandomForestRegressor

In [5]:
forest = RandomForestRegressor(n_estimators = 600, n_jobs=-1)

forest.fit(X, y)

RandomForestRegressor(n_estimators=600, n_jobs=-1)

In [6]:
y_pred = forest.predict(X_test)

In [7]:
print(f"r2: {round(r2_score(y_pred, y_test),3)}")
print(f"RMSE: {round(sqrt(mean_squared_error(y_pred, y_test)),3)}")

r2: 0.997
RMSE: 211.791


## Hyperparameter Tuning

In [22]:
parameters = {
 'max_depth': [20, 40, 60, None],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [200, 400, 600]}

In [23]:
forest = RandomForestRegressor()

forest.fit(X_train, y_train)

RandomForestRegressor()

In [24]:
grid = GridSearchCV(forest,parameters,verbose=1, n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 11.2min finished


GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [20, 40, 60, None],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [200, 400, 600]},
             verbose=1)

In [25]:
print(grid.best_params_)

{'max_depth': None, 'max_features': 'auto', 'n_estimators': 400}


In [26]:
params = {
 'n_estimators': [200, 400, 600, 800, 1000, 1200]}

In [27]:
forest = RandomForestRegressor()

forest.fit(X_train, y_train)

RandomForestRegressor()

In [28]:
grid = GridSearchCV(forest,params,verbose=1, n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.1min finished


GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'n_estimators': [200, 400, 600, 800, 1000, 1200]},
             verbose=1)

In [29]:
print(grid.best_params_)

{'n_estimators': 800}


## Entrenando modelo con todo el df

In [30]:
forest = RandomForestRegressor(n_estimators = 800, n_jobs=-1)

forest.fit(X, y)

RandomForestRegressor(n_estimators=800, n_jobs=-1)

In [31]:
y_pred = forest.predict(X_test)

In [32]:
print(f"r2: {round(r2_score(y_pred, y_test),3)}")
print(f"RMSE: {round(sqrt(mean_squared_error(y_pred, y_test)),3)}")

r2: 0.997
RMSE: 209.441


## Predicción

In [33]:
predict_df = pd.read_csv("csv preparado/predict_ready.csv", index_col = 0)
predict_df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.33,4,5,1,62.0,58.0,4.44,4.46,2.76
1,1.21,4,0,3,62.4,58.0,6.77,6.83,4.24
2,1.06,4,0,2,59.3,60.0,6.64,6.71,3.96
3,0.36,2,1,6,61.4,57.0,4.64,4.61,2.54
4,0.7,2,1,4,62.3,54.0,5.67,5.72,3.55


In [73]:
"""
pipeline2 = [
    StandardScaler(),
    Normalizer()
]

transformer2 = make_pipeline(*pipeline2)

X_data = transformer2.fit_transform(predict_df)
"""

In [34]:
y_test_pred = forest.predict(predict_df)

In [35]:
y_test_pred

array([ 800.41125   , 5945.1725    , 5800.69708333, ..., 2528.3875    ,
       1271.7025    , 1070.00625   ])

In [36]:
sol_forest = pd.DataFrame(y_test_pred, columns=["price"])

In [37]:
sol_forest.to_csv("resultados/sol_forest_lunes_tun.csv", index_label="id")