In [96]:
import pandas as pd

In [97]:
diamonds = pd.read_csv('../data/diamonds_vol.csv')
diamonds_predict = pd.read_csv('../data/diamonds_predict_vol.csv')

In [98]:
diamonds_predict

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,volume
0,0,0.79,Very Good,F,SI1,62.7,60.0,0.225071
1,1,1.20,Ideal,J,VS1,61.0,57.0,0.341880
2,2,1.57,Premium,H,SI1,62.2,61.0,0.447293
3,3,0.90,Very Good,F,SI1,63.8,54.0,0.256410
4,4,0.50,Very Good,F,VS1,62.9,58.0,0.142450
...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,0.162393
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,0.202279
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,0.199430
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,0.199430


In [99]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [100]:
target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table','volume']
features = num_features + cat_features

In [101]:
numeric_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), 
                ('scaler', StandardScaler())])

In [102]:
categorical_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [103]:
preprocessor = \
ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                ('cat', categorical_transformer, cat_features)])

In [104]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['carat', 'depth', 'table', 'volume']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [105]:
pd.DataFrame(data=preprocessor.fit_transform(diamonds).todense()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.867006,0.452019,0.247981,0.867006,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.004557,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.184434,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.815298,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.467458,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


training simple model

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [108]:
print(diamonds_train.shape)
print(diamonds_test.shape)

(30341, 9)
(10114, 9)


In [117]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor(max_depth=16))])

In [118]:
model.fit(diamonds_train[features], diamonds_train[target]);

checking model performance...

In [119]:
from sklearn.metrics import mean_squared_error

In [112]:
y_test = model.predict(diamonds_test[features])
y_train = model.predict(diamonds_train[features])

In [113]:
print(f"test error: {mean_squared_error(y_pred=y_test, y_true=diamonds_test[target], squared=False)}")
print(f"train error: {mean_squared_error(y_pred=y_train, y_true=diamonds_train[target], squared=False)}")

test error: 559.1155002903057
train error: 213.02387696339363


cross validation

In [114]:
from sklearn.model_selection import cross_val_score

In [115]:
scores = cross_val_score(model, 
                         diamonds[features], 
                         diamonds[target], 
                         scoring='neg_root_mean_squared_error', 
                         cv=5, n_jobs=-1)

In [116]:
import numpy as np
np.mean(-scores)

559.497378060202

GRID SEARCH

In [22]:
from sklearn.model_selection import RandomizedSearchCV

In [23]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'regressor__n_estimators': [128],
    'regressor__max_depth': [2, 4, 8, 16],
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds[features], diamonds[target])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:   32.4s remaining:   48.6s
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  1.4min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed:  1.6min remaining:   40.8s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  5.8min remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.7min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer()),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table']),
                                                                              ('cat',
                            

In [24]:
grid_search.best_params_

{'regressor__n_estimators': 128,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'mean'}

In [25]:
grid_search.best_score_

-551.5807178861996

submission

In [37]:
y_pred = grid_search.predict(diamonds_predict[features])

In [27]:
diamonds_predict['features'] = y_pred

In [28]:
submission_v2_1609=diamonds_predict[['features']]

In [29]:
submission_v2_1609=pd.DataFrame({'price': y_pred})

In [30]:
submission_v2_1609.index.name='id'

In [38]:
submission_v2_1609.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,3034.964532
1,5357.355005
2,10683.07923
3,4185.333928
4,1614.943482


In [31]:
submission_v2_1609.describe()

Unnamed: 0,price
count,13485.0
mean,3954.414562
std,3955.708764
min,367.925803
25%,939.985841
50%,2457.701597
75%,5306.087216
max,18120.501988


In [32]:
#submission_v2_1609.price.clip(0, 20000, inplace=True)

In [33]:
#submission_v2_1609.to_csv('submission_v2_1609.csv')