In [3]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.metrics import *

In [4]:
data = pd.read_csv("../output/clean_diamonds.csv")

In [5]:
data.drop(["id", "x", "y", "z"],  axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.78,4,0,4,61.5,58.0,3446
1,0.31,5,0,3,60.8,56.0,732
2,0.3,5,0,3,62.3,54.0,475
3,1.04,5,0,5,62.0,58.0,9552
4,0.65,5,1,3,61.4,55.0,1276


## Estandarización y normalización

In [7]:
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

In [8]:
X = data.drop("price", axis=1)
y = data["price"]

In [9]:
# No normalizo como sugerencia de Eli por la media

pipeline = [
    StandardScaler(),
    Normalizer(),
]

tr = make_pipeline(*pipeline)

Xpr = tr.fit_transform(X)
Xpr = pd.DataFrame(Xpr,columns=X.columns)

In [10]:
Xpr.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,-0.033799,0.076382,-0.935572,0.206895,-0.158423,0.223217
1,-0.498364,0.475403,-0.500639,-0.268626,-0.322409,-0.314449
2,-0.429466,0.401448,-0.422758,-0.226837,0.157747,-0.63193
3,0.276896,0.531653,-0.559875,0.548034,0.095017,0.13358
4,-0.164913,0.519452,0.512696,-0.293515,-0.129723,-0.580635


## Modelización Random Forest básico

In [9]:
rf_reg = RandomForestRegressor(n_estimators=500, max_depth=60, min_samples_leaf=3, random_state=111)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=60, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=111, verbose=0, warm_start=False)

In [12]:
print('METRIC SUMMARY')
print('MSE', mean_squared_error(y_test, rf_reg.predict(X_test)))
print('RMSE', np.sqrt(mean_squared_error(y_test, rf_reg.predict(X_test))))
print('MSLE', mean_squared_log_error(y_test, rf_reg.predict(X_test)))
print('MAE', mean_absolute_error(y_test, rf_reg.predict(X_test)))
print('R2', r2_score(y_test, rf_reg.predict(X_test)))

METRIC SUMMARY
MSE 601769.2177303312
RMSE 775.737853743345
MSLE 0.018307867643151576
MAE 416.3017975353854
R2 0.9615940114837317


In [13]:
## ME DA ERROR

from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

### Random Forest "vitaminado"

In [20]:

rf = RandomForestRegressor(n_estimators=200,
 min_samples_split=5,
 min_samples_leaf=4,
 max_features='auto',
 max_depth=10,
 bootstrap=Tru

In [21]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [22]:
print('METRIC SUMMARY')
print('MSE', mean_squared_error(y_test, rf.predict(X_test)))
print('RMSE', np.sqrt(mean_squared_error(y_test, rf.predict(X_test))))
print('MSLE', mean_squared_log_error(y_test, rf.predict(X_test)))
print('MAE', mean_absolute_error(y_test, rf.predict(X_test)))
print('R2', r2_score(y_test, rf.predict(X_test)))

METRIC SUMMARY
MSE 584687.7906106163
RMSE 764.6488021376979
MSLE 0.018491118194274646
MAE 416.0963677158166
R2 0.9626841787346848


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
gsc = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid={
            'C': [0.1, 1, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
            'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

In [12]:
grid_result = gsc.fit(X, y)
best_params = grid_result.best_params_
best_svr = SVR(kernel='rbf', C=best_params["C"], epsilon=best_params["epsilon"], gamma=best_params["gamma"],
                   coef0=0.1, shrinking=True,
                   tol=0.001, cache_size=200, verbose=False, max_iter=3)

KeyboardInterrupt: 