In [13]:
# imports 
import numpy as np
import pandas as pd

In [14]:
df= pd.read_csv('../data/train/diamonds_train.csv', index_col=[0])
df.head().T

Unnamed: 0,0,1,2,3,4
index_id,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,41667f6e2629360aecaf00b20f8732e3310417ebd54b24...,01f8667f50d52677bea23231a74156e4f92360d7bc3db6...,c3867352aab641358faec75d733af012dbe2259a014ea8...,0da4b104c4d8589fcb96a03aa0787549a2631935b0f499...
depth,62.4,61.6,62.3,59.6,60.2
table,58.0,58.0,58.0,60.0,62.0
x,6.83,6.4,5.86,7.58,5.4
y,6.79,6.35,5.8,7.48,5.33
z,4.25,3.93,3.63,4.49,3.23
price,4268,3513,1792,7553,1176
carat,1.21,1.02,0.77,1.51,0.57
cut,Premium,Premium,Premium,Premium,Premium
color,J,J,J,J,J


In [15]:
numerical_columns = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_columns = ['cut', 'color', 'clarity']
FEATS = numerical_columns + categorical_columns
TARGET = 'price'


In [16]:
# Apply label encoder to each column with categorical data
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
label_encoder = LabelEncoder()
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])
df.head()

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,3,6,5,Dubai
1,41667f6e2629360aecaf00b20f8732e3310417ebd54b24...,61.6,58.0,6.4,6.35,3.93,3513,1.02,3,6,5,Dubai
2,01f8667f50d52677bea23231a74156e4f92360d7bc3db6...,62.3,58.0,5.86,5.8,3.63,1792,0.77,3,6,5,Dubai
3,c3867352aab641358faec75d733af012dbe2259a014ea8...,59.6,60.0,7.58,7.48,4.49,7553,1.51,3,6,5,Dubai
4,0da4b104c4d8589fcb96a03aa0787549a2631935b0f499...,60.2,62.0,5.4,5.33,3.23,1176,0.57,3,6,5,Dubai


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [18]:
from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(df[FEATS], target, test_size=0.15, random_state=42)

data_train, data_test, target_train, target_test = train_test_split(df[FEATS], df[TARGET])

In [19]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

"""model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor())])"""

pipeline_xgb=Pipeline([("scalar5",StandardScaler()),
                     ("rf_classifier",XGBRegressor())])


In [20]:
pipeline_xgb.fit(data_train, target_train);

In [21]:
y_pred = pipeline_xgb.predict(data_test)
y_pred

array([3641.155 ,  864.7001, 7412.3257, ...,  793.8273, 1853.5234,
       3048.2727], dtype=float32)

In [22]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(target_test, y_pred)**0.5
rmse

568.9628125984176

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

grid_search = RandomizedSearchCV(pipeline_xgb, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(df[FEATS], target)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


[CV 1/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=64
[CV 1/5; 2/32] START preprocessor__num__imputer__strategy=median, regressor__max_depth=4, regressor__n_estimators=512
[CV 4/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=64
[CV 2/5; 2/32] START preprocessor__num__imputer__strategy=median, regressor__max_depth=4, regressor__n_estimators=512
[CV 3/5; 2/32] START preprocessor__num__imputer__strategy=median, regressor__max_depth=4, regressor__n_estimators=512
[CV 2/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=64[CV 5/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=64

[CV 3/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=64
[CV 4/5; 2/32] START preprocessor__num__imputer__strategy=median, regre

In [None]:
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [None]:
grid_search.best_score_

-554.2517030079183

In [65]:
#Model prediction Test Dataframe (aka real prediction):
diamonds_train=pd.read_csv('../data/test/diamonds_test.csv', index_col=[0])
diamonds_train

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...
13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [66]:
for col in categorical_columns:
    diamonds_train[col] = label_encoder.fit_transform(diamonds_train[col])
diamonds_train.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.79,4,2,2,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1.2,2,6,4,61.0,57.0,6.81,6.89,4.18,Surat
2,1.57,3,4,2,62.2,61.0,7.38,7.32,4.57,Kimberly
3,0.9,4,2,2,63.8,54.0,6.09,6.13,3.9,Kimberly
4,0.5,4,2,4,62.9,58.0,5.05,5.09,3.19,Amsterdam


In [70]:
pipeline_xgb.fit(df[FEATS],df[TARGET]); #Volvemos a entrenar el modelo con el datframe original (sin la separacion de train y test), para tener mas datos en el entreno

In [71]:
y_pred = pipeline_xgb.predict(diamonds_train[FEATS])
y_pred


array([2924.8909, 5905.8438, 9644.87  , ..., 3281.2346, 2162.077 ,
        827.3256], dtype=float32)

In [72]:
y_pred=pd.DataFrame(y_pred, columns=['price'])
y_pred.reset_index(inplace=True)
y_pred.rename(columns={"index": "id"}, inplace= True)
y_pred

Unnamed: 0,id,price
0,0,2924.890869
1,1,5905.843750
2,2,9644.870117
3,3,3760.827393
4,4,1654.788696
...,...,...
13480,13480,1700.711792
13481,13481,2331.504639
13482,13482,3281.234619
13483,13483,2162.076904


In [73]:
y_pred.to_csv('../data/prediction/price_prediction.csv', index= False)