In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.metrics import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
diamonds = pd.read_csv('../inputs/diamonds_train.csv')

In [3]:
#Función::

def clean (df):

    #Creo dict para reemplazar los valores de cut a formato numérico:

    punt_cut = {'Premium': 5, 'Ideal': 4, 'Very Good': 3, 'Good':2, 'Fair': 1}
    df['cut'] = df['cut'].map(punt_cut)

    #Creo dict para reemplazar los valores de cut a formato numérico:
    punt_color = {'F':2, 'D':7, 'E':3, 'J':4, 'H':5, 'I':6, 'G':4}

    df['color'] = df['color'].map(punt_color)

    #Creo dict para reemplazar los valores de cut a formato numérico:
    punt_clarity = {'VS1':4, 'SI1':2, 'VVS2':7, 'SI2':3, 'VS2':5, 'IF':8, 'VVS1':6, 'I1':1}

    df['clarity'] = df['clarity'].map(punt_clarity)
    
    return df


In [4]:
diamonds = clean(diamonds)

In [5]:
X = diamonds.drop(columns=["price"])
y = diamonds['price']

X_copy = X.copy()
y_copy = y.copy()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32276, 10) (8069, 10) (32276,) (8069,)


In [6]:
forest_model = RandomForestRegressor(n_estimators=15, max_depth=3, min_samples_leaf=3, random_state=111)
selector = RFECV(forest_model, step=1, cv=5)

selector.fit(X, y)

RFECV(cv=5,
      estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                      criterion='mse', max_depth=3,
                                      max_features='auto', max_leaf_nodes=None,
                                      max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=3, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=15, n_jobs=None,
                                      oob_score=False, random_state=111,
                                      verbose=0, warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring=None, step=1, verbose=0)

In [7]:
selector.n_features_

4

In [8]:
pd.Series(X.columns)[selector.support_.tolist()]

1      carat
4    clarity
7          x
8          y
dtype: object

In [9]:
X.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'],
      dtype='object')

In [10]:
#Elimino las columnas que ha desechado el selector:
X.drop(['id','cut','color','depth','table','z'], axis =1, inplace = True)

In [11]:
X.head()

Unnamed: 0,carat,clarity,x,y
0,0.78,4,5.93,5.98
1,0.31,2,4.37,4.32
2,0.3,2,4.3,4.34
3,1.04,7,6.54,6.46
4,0.65,2,5.58,5.62


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
forest_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=15, n_jobs=None, oob_score=False,
                      random_state=111, verbose=0, warm_start=False)

In [13]:
print(forest_model.score(X_train, y_train))
print(forest_model.score(X_test, y_test))

0.8925484576875466
0.8914728973613338


In [14]:
def errors (y, X, model):

    print(f'METRIC SUMMARY')
    print(f'MSE: {mean_squared_error(y, model.predict(X))}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y, model.predict(X)))}')
    print(f'MSLE:  {mean_squared_log_error(y, model.predict(X))}')
    print(f'MAE:  {mean_absolute_error(y, model.predict(X))}')
    print(f'R2:  {r2_score(y, model.predict(X))}')


In [15]:
errors(y_test,X_test, forest_model)

METRIC SUMMARY
MSE: 1700470.9989366832
RMSE: 1304.0210883788204
MSLE:  0.06813815211998303
MAE:  742.040370628593
R2:  0.8914728973613338


In [16]:
forest_model.fit(X, y)

print(forest_model.score(X, y))

0.8933358936031671


In [17]:
errors(y_test, X_test, forest_model)

METRIC SUMMARY
MSE: 1672144.4720836752
RMSE: 1293.1142532984761
MSLE:  0.06742575704147705
MAE:  736.1849290796216
R2:  0.8932807470036362


In [18]:
test = pd.read_csv('../inputs/diamonds_test.csv')

test_copy = test.copy()
test = clean(test)

#Elimino las columnas que ha desechado el selector anteriormente:
test.drop(['id','cut','color','depth','table','z'], axis =1, inplace = True)

test.head()

Unnamed: 0,carat,clarity,x,y
0,1.1,3,6.69,6.6
1,0.51,2,5.07,5.1
2,2.03,2,8.14,8.09
3,1.21,2,6.96,6.91
4,0.55,2,5.27,5.22


In [19]:
y_final_pred = forest_model.predict(test)

y_final_pred

array([ 4966.12430614,  1617.31763675, 14916.41682877, ...,
         787.62642439,   787.62642439,  1711.66712544])

file name:  randomForest1

In [20]:
test_copy['randomForest'] = y_final_pred

final = test_copy[['id', 'randomForest']]

final.rename(columns={'randomForest': "price"}, inplace = True)

final.to_csv('outputs/predict1.csv', header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


## Validación de resultados aplicando el modelo al dataset completo, sin eliminar columnas

name_file: randomForest2_min_params


In [21]:
X_copy_train, X_copy_test, y_copy_train, y_copy_test = train_test_split(X_copy, y_copy, test_size=0.2, random_state=42)

forest_model_copy = RandomForestRegressor(n_estimators=100)

forest_model_copy.fit(X_copy, y_copy)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [22]:
errors(y_copy_test, X_copy_test, forest_model_copy)

METRIC SUMMARY
MSE: 68134.76687817574
RMSE: 261.0263719974971
MSLE:  0.0019134603516933742
MAE:  128.91970008675176
R2:  0.9956515172308888


In [23]:
# Al cambiar los parámetros de RandomForest se reduce en general el error. Lo exportamos como solución:

test = pd.read_csv('../inputs/diamonds_test.csv')

test_copy = test.copy()
test = clean(test)

test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.1,5,5,3,62.2,58.0,6.69,6.6,4.13
1,1,0.51,4,6,2,62.5,57.0,5.07,5.1,3.18
2,2,2.03,5,4,2,61.9,59.0,8.14,8.09,5.02
3,3,1.21,5,2,2,60.0,60.0,6.96,6.91,4.16
4,4,0.55,4,2,2,61.8,55.0,5.27,5.22,3.24


In [24]:
y_pred_copy = forest_model_copy.predict(test)

In [25]:
test_copy['randomForest'] = y_pred_copy

final_copy = test_copy[['id', 'randomForest']]

final_copy.rename(columns={'randomForest': "price"}, inplace = True)

final_copy.to_csv('outputs/predict2.csv', header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [26]:
test_copy.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,randomForest
0,0,1.1,Premium,H,SI2,62.2,58.0,6.69,6.6,4.13,4725.91
1,1,0.51,Ideal,I,SI1,62.5,57.0,5.07,5.1,3.18,1021.32
2,2,2.03,Premium,G,SI1,61.9,59.0,8.14,8.09,5.02,14745.06
3,3,1.21,Premium,F,SI1,60.0,60.0,6.96,6.91,4.16,6721.86
4,4,0.55,Ideal,F,SI1,61.8,55.0,5.27,5.22,3.24,1536.94


## Sin eliminar columnas y variando parametros de randomForest

En model3, se valida que con 200 y mejora la predicción.


In [27]:
#A partir de X_copy y y_copy

X_train, X_test, y_train, y_test = train_test_split(X_copy, y_copy, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32276, 10) (8069, 10) (32276,) (8069,)


In [28]:
forest_model_1 = RandomForestRegressor(n_estimators=200, max_depth=3, min_samples_leaf=3, random_state=111)

forest_model_1.fit(X_train, y_train)
y_pred_1 = forest_model_1.predict(X_test)
errors(y_test, X_test, forest_model_1)

METRIC SUMMARY
MSE: 1769483.1616319786
RMSE: 1330.2192156302578
MSLE:  0.06973185979349956
MAE:  760.5432528671655
R2:  0.8922357082375852


In [29]:
forest_model_2 = RandomForestRegressor(n_estimators=200, max_depth=5, min_samples_leaf=5)

forest_model_2.fit(X_train, y_train)
y_pred_1 = forest_model_2.predict(X_test)
errors(y_test, X_test, forest_model_2)

METRIC SUMMARY
MSE: 1065615.6797098082
RMSE: 1032.2866267223499
MSLE:  0.03970311997928984
MAE:  583.0511682102359
R2:  0.9351023386348927


In [30]:
forest_model_3 = RandomForestRegressor(n_estimators=200, min_samples_leaf=5)

forest_model_3.fit(X_train, y_train)
y_pred_1 = forest_model_3.predict(X_test)
errors(y_test, X_test, forest_model_3)

METRIC SUMMARY
MSE: 567684.7720488325
RMSE: 753.4485862013629
MSLE:  0.013320241700711288
MAE:  371.0902086335245
R2:  0.9654271096042937


In [31]:
forest_model_4 = RandomForestRegressor(n_estimators=200, min_samples_leaf=3, max_features=3)

forest_model_4.fit(X_train, y_train)
y_pred_1 = forest_model_4.predict(X_test)
errors(y_test, X_test, forest_model_4)

METRIC SUMMARY
MSE: 624770.7402879094
RMSE: 790.4244051697224
MSLE:  0.015736896873314737
MAE:  399.5752732718628
R2:  0.96195048486423


In [32]:
forest_model_5 = RandomForestRegressor(n_estimators=200, min_samples_leaf=3,max_features=2, bootstrap=False)

forest_model_5.fit(X_train, y_train)
y_pred_1 = forest_model_5.predict(X_test)
errors(y_test, X_test, forest_model_5)

METRIC SUMMARY
MSE: 669294.9548947936
RMSE: 818.1044889834021
MSLE:  0.017204903229911108
MAE:  417.54273954939646
R2:  0.9592388905651561


Entreno el 4 con el total de datos:

In [33]:
forest_model_4 = RandomForestRegressor(n_estimators=200, min_samples_leaf=3, max_features=3)

forest_model_4.fit(X_copy, y_copy)
y_pred_4 = forest_model_4.predict(test)


In [34]:
test_copy['randomForest4'] = y_pred_4

final_copy = test_copy[['id', 'randomForest4']]

final_copy.rename(columns={'randomForest4': "price"}, inplace = True)

final_copy.to_csv('outputs/predictRandom4.csv', header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [35]:
final_copy.head()

Unnamed: 0,id,price
0,0,4656.882829
1,1,1124.153815
2,2,14561.142993
3,3,6389.57838
4,4,1501.445474


##  Analizando la correlación:

In [36]:
diamonds.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,5,2,4,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,4,7,2,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,4,2,2,62.3,54.0,4.3,4.34,2.69,475
3,3,1.04,4,3,7,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,4,4,2,61.4,55.0,5.58,5.62,3.44,1276


In [37]:
diamonds.drop('id', axis =1, inplace = True)

In [38]:
diamonds.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
carat,1.0,-0.017314,0.078995,-0.281975,0.023118,0.181725,0.976267,0.945757,0.968685,0.922345
cut,-0.017314,1.0,0.004712,0.065965,-0.303405,-0.062829,-0.004183,-0.011733,-0.04657,0.02975
color,0.078995,0.004712,1.0,-0.07239,0.021176,-0.001145,0.069462,0.066811,0.070828,0.042707
clarity,-0.281975,0.065965,-0.07239,1.0,-0.066128,-0.139029,-0.302599,-0.289453,-0.304629,-0.107
depth,0.023118,-0.303405,0.021176,-0.066128,1.0,-0.299534,-0.028765,-0.032894,0.092482,-0.013307
table,0.181725,-0.062829,-0.001145,-0.139029,-0.299534,1.0,0.195775,0.182559,0.154399,0.126545
x,0.976267,-0.004183,0.069462,-0.302599,-0.028765,0.195775,1.0,0.967143,0.985385,0.886168
y,0.945757,-0.011733,0.066811,-0.289453,-0.032894,0.182559,0.967143,1.0,0.96035,0.860499
z,0.968685,-0.04657,0.070828,-0.304629,0.092482,0.154399,0.985385,0.96035,1.0,0.876061
price,0.922345,0.02975,0.042707,-0.107,-0.013307,0.126545,0.886168,0.860499,0.876061,1.0


In [39]:
diamonds.drop(['x','y'], axis=1, inplace=True)

In [40]:
diamonds.shape

(40345, 8)

In [41]:
X_reduced = diamonds.drop('price', axis=1)
y_reduced = diamonds['price']

X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(X_reduced, y_reduced, test_size=0.2)


In [42]:
X_reduced.shape

(40345, 7)

In [43]:
forest_model_red = RandomForestRegressor(n_estimators=200, min_samples_leaf=3, max_features=3)

forest_model_red.fit(X_r_train, y_r_train)
y_pred_1 = forest_model_red.predict(X_r_test)
errors(y_r_test, X_r_test, forest_model_red)

METRIC SUMMARY
MSE: 553320.2119381935
RMSE: 743.8549670051236
MSLE:  0.015957512272357584
MAE:  377.7280390231063
R2:  0.9656909130824579


In [47]:
forest_model_red.fit(X_reduced, y_reduced)
test_reduced = test.drop(['id','x','y'],axis = 1)
y_pred_reduced = forest_model_red.predict(test_reduced)

In [49]:
test['randomForest_reduced'] = y_pred_reduced

final_copy_reduced = test[['id', 'randomForest_reduced']]

final_copy_reduced.rename(columns={'randomForest_reduced': "price"}, inplace = True)

final_copy_reduced.to_csv('outputs/predictRandomReduced.csv', header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [50]:
final_copy_reduced

Unnamed: 0,id,price
0,0,4582.794769
1,1,1073.149383
2,2,14916.493869
3,3,6634.786589
4,4,1473.999910
...,...,...
13444,13444,748.362959
13445,13445,2615.272437
13446,13446,512.098489
13447,13447,742.747509
