In [1]:
import pandas as pd

Data Loading

In [2]:
diamonds = pd.read_csv('../data/diamonds_train.csv')
diamonds_predict = pd.read_csv('../data/diamonds_predict.csv')

ml_Processing

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [4]:
target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat']
features = num_features + cat_features

In [5]:
numeric_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), 
                ('scaler', StandardScaler())])

In [6]:
categorical_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [7]:
preprocessor = \
ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                ('cat', categorical_transformer, cat_features)])

In [8]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['carat']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [10]:
pd.DataFrame(data=preprocessor.fit_transform(diamonds).todense()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.867006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Training the simple model

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [13]:
print(diamonds_train.shape)
print(diamonds_test.shape)

(30341, 10)
(10114, 10)


In [14]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor())])

In [15]:
model.fit(diamonds_train[features], diamonds_train[target]);

checking model performance..

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
y_test = model.predict(diamonds_test[features])
y_train = model.predict(diamonds_train[features])

In [18]:
print(f"test error: {mean_squared_error(y_pred=y_test, y_true=diamonds_test[target], squared=False)}")
print(f"train error: {mean_squared_error(y_pred=y_train, y_true=diamonds_train[target], squared=False)}")

test error: 597.6441582058527
train error: 363.7365784490147


cross validation

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
scores = cross_val_score(model, 
                         diamonds[features], 
                         diamonds[target], 
                         scoring='neg_root_mean_squared_error', 
                         cv=5, n_jobs=-1)

In [21]:
import numpy as np
np.mean(-scores)

587.4560336141361

Grid search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [100, 500, 900, 1100, 1500],
    'regressor__max_depth': [1, 5, 10, 15]
                                }

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=50)

grid_search.fit(diamonds[features], diamonds[target])

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

Submission

In [None]:
y_pred = grid_search.predict(diamonds_predict[features])

In [None]:
submission1509 = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_pred})

In [None]:
submission_1409.head()

In [None]:
submission_1409.describe()

In [None]:
submission_1909_v1.price.clip(0, 20000, inplace=True)

In [None]:
submission_1909_v1.to_csv('Ssubmission_1909_v1.csv', index=False)