In [1]:
import pandas as pd

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

import math


from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# Read the labelled training set and the unlabelled set that will be scored
# for the submission, in that order.
diamonds, diamonds_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/diamonds_train.csv', '../data/diamonds_test.csv')
)

In [4]:
# Drop training rows whose `y` or `z` value exceeds 20 (presumably
# measurement errors -- confirm against the data description).
# The mask is negated rather than rewritten with `<=` so that rows with a
# missing `y`/`z` are kept, exactly as before.
oversized = (diamonds['y'] > 20) | (diamonds['z'] > 20)

# `.copy()` makes the result an independent frame: later cells add columns to
# `diamonds`, and doing that on a filtered slice triggers pandas'
# SettingWithCopyWarning.
diamonds = diamonds.loc[~oversized].copy()

In [5]:
# Add a `volume` column (product of the x, y and z columns) to both the
# training frame and the frame to be scored.
for frame in (diamonds, diamonds_predict):
    frame['volume'] = frame['x'].mul(frame['y']).mul(frame['z'])


In [6]:
# Add `ratio_length_width` = x / y to both frames.
#
# A row with y == 0 and x != 0 produces +/-inf. This column is one of the
# numeric model features, and scikit-learn's imputer/scaler reject infinite
# values, so infinities are mapped to NaN and handled by the imputer like any
# other missing value. (0 / 0 already yields NaN.)
_INF = float('inf')
for frame in (diamonds, diamonds_predict):
    ratio = frame['x'] / frame['y']
    frame['ratio_length_width'] = ratio.replace([_INF, -_INF], float('nan'))

In [7]:
# Add `ratio_xyz` = (x / y) / z to both frames; the division is evaluated
# left to right.
for frame in (diamonds, diamonds_predict):
    frame['ratio_xyz'] = frame['x'].div(frame['y']).div(frame['z'])




In [8]:
# Add `density` = carat / volume to both frames. Relies on the `volume`
# column created in an earlier cell.
for frame in (diamonds, diamonds_predict):
    frame['density'] = frame['carat'].div(frame['volume'])

In [9]:
# Label every training row with a shape name derived from its `table` and
# `depth` values. All range bounds are exclusive; rows matching no rule (or
# with a missing table/depth) get 'others'.
#
# Vectorised with boolean masks instead of a per-row Python loop: the loop did
# two label lookups per row per rule, which is slow on a large frame and fails
# if the index contains duplicate labels.
table = diamonds['table']
depth = diamonds['depth']

# Start from the fallback label and apply the rules from LOWEST to HIGHEST
# priority, so that where several rules match, the rule that came first in
# the original if/elif chain (Round > Oval > Princess > Cushion) wins.
shape = pd.Series('others', index=diamonds.index, dtype=object)
shape.loc[(table > 58) & (table < 63) & (depth > 58) & (depth < 66)] = 'Cushion'
shape.loc[(table > 63) & (table < 69) & (depth > 69) & (depth < 76)] = 'Princess'
shape.loc[(table > 52) & (table < 60) & (depth > 60) & (depth < 68)] = 'Oval'
shape.loc[(table > 54) & (table < 57) & (depth > 61) & (depth < 62.5)] = 'Round'

# Kept as a plain list named `xy`, in row order, because the next cell
# assigns `xy` to a column.
xy = shape.tolist()

In [10]:
# Store the per-row shape labels built in the previous cell as a new column.
# `xy` must have one entry per row of `diamonds`, in row order.
diamonds['shape'] = xy

In [11]:
# Label every row of the frame to be scored with a shape name derived from
# its `table` and `depth` values, using the same rules as for the training
# frame. All range bounds are exclusive; rows matching no rule (or with a
# missing table/depth) get 'others'.
#
# Vectorised with boolean masks instead of a per-row Python loop: the loop did
# two label lookups per row per rule, which is slow on a large frame and fails
# if the index contains duplicate labels.
table = diamonds_predict['table']
depth = diamonds_predict['depth']

# Start from the fallback label and apply the rules from LOWEST to HIGHEST
# priority, so that where several rules match, the rule that came first in
# the original if/elif chain (Round > Oval > Princess > Cushion) wins.
shape = pd.Series('others', index=diamonds_predict.index, dtype=object)
shape.loc[(table > 58) & (table < 63) & (depth > 58) & (depth < 66)] = 'Cushion'
shape.loc[(table > 63) & (table < 69) & (depth > 69) & (depth < 76)] = 'Princess'
shape.loc[(table > 52) & (table < 60) & (depth > 60) & (depth < 68)] = 'Oval'
shape.loc[(table > 54) & (table < 57) & (depth > 61) & (depth < 62.5)] = 'Round'

# Kept as a plain list named `xy`, in row order, because the next cell
# assigns `xy` to a column.
xy = shape.tolist()

In [12]:
# Store the per-row shape labels built in the previous cell as a new column.
# `xy` must have one entry per row of `diamonds_predict`, in row order.
diamonds_predict['shape'] = xy

In [13]:
import math

# Add `carat_log`, the natural logarithm of `carat`, to both frames.
# `math.log` is applied element by element, so a zero or negative carat
# raises ValueError instead of silently producing -inf/NaN.
diamonds['carat_log'] = diamonds['carat'].map(math.log)
diamonds_predict['carat_log'] = diamonds_predict['carat'].map(math.log)

In [14]:
# Column the model learns to predict.
TARGET = 'price'

# Categorical inputs (one-hot encoded by the preprocessing pipeline).
CAT_FEATS = ['cut', 'color', 'clarity']

# Numeric inputs. Note that the engineered columns `volume`, `ratio_xyz`,
# `density` and `shape` are not listed here, so the model never sees them.
NUM_FEATS = [
    'carat',
    'table',
    'depth',
    'x',
    'y',
    'z',
    'ratio_length_width',
    'carat_log',
]

# Full input column list: numeric columns first, then categorical ones.
FEATS = [*NUM_FEATS, *CAT_FEATS]

In [15]:
# Preprocessing for numeric columns: fill missing values with the column
# median, then standardise. The step names ('imputer', 'scaler') are part of
# the hyper-parameter keys used by the search later on, so they must not change.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [16]:
# Preprocessing for categorical columns: replace missing values with the
# literal category 'missing', then one-hot encode. Categories not seen during
# fitting are ignored at transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [17]:
# Route each column group through its own sub-pipeline. The names 'num' and
# 'cat' appear in hyper-parameter keys later on, so they must stay as they are.
column_routes = [
    ('num', numeric_transformer, NUM_FEATS),
    ('cat', categorical_transformer, CAT_FEATS),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out part of the labelled data for a quick sanity check of the model
# (default split proportions). The seed is fixed so the split, and therefore
# the metrics reported below, are the same on every Restart & Run All.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [20]:
from lightgbm import LGBMRegressor

In [21]:
# End-to-end estimator: column preprocessing followed by a LightGBM regressor
# with its default hyper-parameters. The step names 'preprocessor' and
# 'regressor' are referenced by the hyper-parameter search keys.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor()),
]
model = Pipeline(steps=model_steps)


In [22]:
# Fit the whole pipeline on the training split; the trailing semicolon
# suppresses the estimator repr in the cell output.
model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

In [23]:
from sklearn.metrics import r2_score

In [24]:
# Despite their names, `y_test` and `y_train` hold the model's PREDICTIONS
# for the held-out and training splits; the true prices stay in the frames.
y_train = model.predict(X=diamonds_train[FEATS])
y_test = model.predict(X=diamonds_test[FEATS])

In [25]:
# Coefficient of determination on both splits. R2 is a score (higher is
# better, 1.0 is perfect), not an error, so it is labelled as such.
r2_test = r2_score(y_true=diamonds_test[TARGET], y_pred=y_test)
r2_train = r2_score(y_true=diamonds_train[TARGET], y_pred=y_train)
print(f"test R2: {r2_test}")
print(f"train R2: {r2_train}")

test error: 0.9808016544782202
train error: 0.986494808686546


In [26]:
# Root-mean-squared error on both splits, in the units of the target.
# Computed as sqrt(MSE) instead of passing `squared=False`: that argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works
# on every version.
rmse_test = math.sqrt(mean_squared_error(y_true=diamonds_test[TARGET], y_pred=y_test))
rmse_train = math.sqrt(mean_squared_error(y_true=diamonds_train[TARGET], y_pred=y_train))
print(f"test error: {rmse_test}")
print(f"train error: {rmse_train}")

test error: 554.6539128769064
train error: 463.5206947707426


In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
# 12-fold cross-validation of the pipeline on all labelled data. The scorer
# returns NEGATED RMSE values (scikit-learn scorers are "higher is better").
# n_jobs=-2 uses all CPU cores but one.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    cv=12,
    scoring='neg_root_mean_squared_error',
    n_jobs=-2,
)

In [29]:
import numpy as np

# The fold scores are negated RMSEs; flip the sign of their mean to display
# the average cross-validated RMSE as a positive number.
-np.mean(scores)

536.4069405113943

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Hyper-parameter search space. Keys follow scikit-learn's
# `<step>__<sub-step>__<parameter>` convention and therefore depend on the
# step names used when the pipeline was built.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # NOTE(review): 1028 breaks the doubling pattern of the other values;
    # 1024 may have been intended -- confirm before changing.
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512, 1028],
    'regressor__max_depth': [2, 4, 8, 16, 32],
}

# Despite the variable name, this is a RANDOMIZED search: it samples 32 of the
# 70 possible combinations and scores each with 5-fold cross-validation.
# `random_state` fixes which combinations are sampled, so the selected model
# is reproducible across runs.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=5,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=32,
                                 random_state=42)

# Run the search on all labelled data.
grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [None]:
# Display the hyper-parameter combination that scored best in the search.
grid_search.best_params_

In [None]:
# Display the best candidate's mean cross-validated score. The search uses
# the 'neg_root_mean_squared_error' scorer, so this is a NEGATED RMSE
# (closer to zero is better).
grid_search.best_score_

In [None]:
# Predict prices for the unlabelled frame with the model selected by the
# search. The frame must already contain every engineered column in FEATS.
y_pred = grid_search.predict(X=diamonds_predict[FEATS])

In [None]:
# Two-column submission table: the row identifier from the unlabelled frame
# and the predicted price for that row.
submission_columns = {
    'id': diamonds_predict['id'],
    'price': y_pred,
}
submission_df = pd.DataFrame(data=submission_columns)

In [None]:
# Write the submission to the current working directory, without the
# DataFrame index column.
submission_df.to_csv(path_or_buf='diamonds_backtothepath3.csv', index=False)