In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

import warnings
# Silence every warning raised in this process for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn — confirm that is intended.
warnings.filterwarnings('ignore')

In [94]:
def mean_absolute_percentage_error(actual, pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    actual : array-like
        Observed target values. An entry equal to zero makes its ratio
        undefined, so the result becomes inf or nan in that case.
    pred : array-like
        Predicted values, in the same order and shape as `actual`.

    Returns
    -------
    float
        mean(|actual - pred| / |actual|) * 100
    """
    # Work on plain float arrays: this accepts lists/tuples and compares two
    # pandas Series position by position instead of aligning them on their
    # index labels (which would silently produce NaN for mismatched indexes).
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.mean(np.abs((actual - pred) / actual)) * 100

def evaluate_model(model_name, model, X, y):
    """Score a fitted regressor on (X, y), print the metrics and return them.

    Parameters
    ----------
    model_name : str
        Label used in the printed lines and as the index of the returned table.
    model : object
        Fitted estimator exposing `predict(X)`.
    X : array-like
        Feature matrix passed to `model.predict`.
    y : array-like
        True target values matching the rows of `X`.

    Returns
    -------
    pandas.DataFrame
        One row (indexed by `model_name`) with the columns 'MAE', 'MAPE' and
        'RMSE', each rounded to 3 decimals.
    """
    predictions = model.predict(X)

    MAE = mean_absolute_error(y, predictions)
    MAPE = mean_absolute_percentage_error(y, predictions)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and later removed. The square root of the MSE is the same RMSE for a
    # single-target regression and works on every scikit-learn version.
    RMSE = np.sqrt(mean_squared_error(y, predictions))

    print('MAE for', model_name, ': %1.3f' % MAE)
    print('MAPE for', model_name, ': %1.3f' % MAPE)
    print('RMSE for', model_name, ': %1.3f' % RMSE)

    metrics_table = pd.DataFrame(
        {'MAE' : [round(MAE, 3)], 'MAPE' : [round(MAPE, 3)], 'RMSE' : [round(RMSE, 3)]},
        index = [model_name],
    )

    return metrics_table

### Read in the data

In [3]:
# Load the cleaned diamonds table (path is relative to the notebook's working directory)
# and preview its first five rows.
data = pd.read_csv(
    '../data/diamonds_cleaned.csv',
)
data.head(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335


### Dealing with input types

CatBoost can take categorical features as they are, so I won't one-hot encode the data yet. Instead, for most models I will one-hot encode the train and test sets separately; for CatBoost I'll just give the regressor the features as strings.

In [4]:
# Target: the diamond price. Features: every remaining column.
y = data['price'].copy()
# `columns=` replaces the positional axis argument (`drop('price', 1)`), which pandas
# deprecated and which raises a TypeError from pandas 2.0 onwards.
X = data.drop(columns = 'price').copy()

print('Shape of original data:', data.shape)
print('Shape of y:', y.shape)
print('Shape of X:', X.shape)

Shape of original data: (53770, 7)
Shape of y: (53770,)
Shape of X: (53770, 6)


### Train - Test split

In [17]:
# Keep 75% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.75,
    random_state = 20202020,
)

print('Shape of X train:', X_train.shape)
print('Shape of X test:', X_test.shape)
print('Shape of y train:', y_train.shape)
print('Shape of y test:', y_test.shape)

Shape of X train: (40327, 6)
Shape of X test: (13443, 6)
Shape of y train: (40327,)
Shape of y test: (13443,)


### One-hot encode categoricals

In [19]:
# Need original data for CatBoost (it receives the categorical columns as strings)

X_train_original = X_train.copy()
X_test_original = X_test.copy()

# One hot encode for other models

X_train = pd.get_dummies(X_train, columns = ['cut', 'color', 'clarity'], prefix_sep = ' = ')
X_test = pd.get_dummies(X_test, columns = ['cut', 'color', 'clarity'], prefix_sep = ' = ')

# The two splits are encoded independently, so a category level that is absent from one
# split would give the matrices different columns. Align the test matrix to the training
# columns: levels missing from the test split become all-zero columns, levels unseen in
# training are dropped, and the column order matches what the models were fitted on.
X_test = X_test.reindex(columns = X_train.columns, fill_value = 0)

In [23]:
# Compare the feature count before and after one-hot encoding.
print('Original amount of Xs:', len(X_train_original.columns))
print('Column # with one-hot encoding:', len(X_train.columns))

Original amount of Xs: 6
Column # with one-hot encoding: 23


# Applying the different ML models

### 1. DummyRegressor to compare results against

Predicting the median

In [113]:
# Baseline: ignores the features and always predicts the median of the training target.
dummy = DummyRegressor(strategy = 'median')
dummy.fit(X_train, y_train)

DummyRegressor(constant=None, quantile=None, strategy='median')

In [114]:
# Score the median baseline on the held-out test set.
dummy_median = evaluate_model(
    model_name = 'DummyRegressorMedian',
    model = dummy,
    X = X_test,
    y = y_test,
)

MAE for DummyRegressorMedian : 2806.394
MAPE for DummyRegressorMedian : 110.422
RMSE for DummyRegressorMedian : 4258.833


Predicting the 25th percentile

In [124]:
# Baseline: always predicts the 0.25 quantile of the training target.
# Note: this rebinds `dummy`, replacing the median baseline fitted earlier.
dummy = DummyRegressor(strategy = 'quantile', quantile = 0.25)
dummy.fit(X_train, y_train)

DummyRegressor(constant=None, quantile=0.25, strategy='quantile')

In [125]:
# Score the quantile baseline on the held-out test set.
dummy_quantile = evaluate_model(
    model_name = 'DummyRegressorQuantile',
    model = dummy,
    X = X_test,
    y = y_test,
)

MAE for DummyRegressorQuantile : 3105.765
MAPE for DummyRegressorQuantile : 60.473
RMSE for DummyRegressorQuantile : 4964.068


Concatenate the results

In [126]:
# Stack the two baseline score tables row-wise to start the model comparison table.
# `axis` is passed by keyword: from pandas 2.0 every `concat` argument except `objs`
# is keyword-only, so the positional form `pd.concat([...], 0)` raises a TypeError.
model_comparison = pd.concat([dummy_median, dummy_quantile], axis = 0)
model_comparison.head()

Unnamed: 0,MAE,MAPE,RMSE
DummyRegressorMedian,2806.394,110.422,4258.833
DummyRegressorQuantile,3105.765,60.473,4964.068


### 2. Simple linear regression

In [127]:
# Plain linear regression with default settings, fitted on the one-hot encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [128]:
# Score the linear regression on the held-out test set.
linreg = evaluate_model(
    model_name = 'LinearRegression',
    model = lr,
    X = X_test,
    y = y_test,
)

MAE for LinearRegression : 803.143
MAPE for LinearRegression : 45.210
RMSE for LinearRegression : 1143.740


In [129]:
# Append the linear regression scores to the comparison table.
# `axis` is passed by keyword: from pandas 2.0 every `concat` argument except `objs`
# is keyword-only, so the positional form `pd.concat([...], 0)` raises a TypeError.
# NOTE: re-running this cell appends the same row again.
model_comparison = pd.concat([model_comparison, linreg], axis = 0)
model_comparison.head()

Unnamed: 0,MAE,MAPE,RMSE
DummyRegressorMedian,2806.394,110.422,4258.833
DummyRegressorQuantile,3105.765,60.473,4964.068
LinearRegression,803.143,45.21,1143.74


### Penalized regressions
#### 1. Ridge Regression

In [155]:
# Candidate penalty strengths: 25 evenly spaced alphas between 0.2 and 0.6.
ridge_params = {'alpha' : np.linspace(start = 0.2, stop = 0.6, num = 25)}

ridge = Ridge(random_state = 20202020)

# 5-fold cross-validated grid search scored by negated RMSE; n_jobs = -1 runs the fits in parallel.
GRID_RIDGE = GridSearchCV(
    estimator = ridge,
    param_grid = ridge_params,
    scoring = 'neg_root_mean_squared_error',
    cv = 5,
    n_jobs = -1,
)

GRID_RIDGE.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False,
                             random_state=20202020, solver='auto', tol=0.001),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([0.2       , 0.21666667, 0.23333333, 0.25      , 0.26666667,
       0.28333333, 0.3       , 0.31666667, 0.33333333, 0.35      ,
       0.36666667, 0.38333333, 0.4       , 0.41666667, 0.43333333,
       0.45      , 0.46666667, 0.48333333, 0.5       , 0.51666667,
       0.53333333, 0.55      , 0.56666667, 0.58333333, 0.6       ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_root_mean_squared_error', verbose=0)

In [156]:
# Alpha chosen by the cross-validated grid search.
GRID_RIDGE.best_params_

{'alpha': 0.43333333333333335}

In [157]:
# Score the best Ridge model (refitted by the grid search) on the held-out test set.
ridgereg = evaluate_model(
    model_name = 'RidgeRegression',
    model = GRID_RIDGE.best_estimator_,
    X = X_test,
    y = y_test,
)

MAE for RidgeRegression : 803.074
MAPE for RidgeRegression : 45.196
RMSE for RidgeRegression : 1143.750


In [150]:
# Append the Ridge scores to the comparison table.
# `axis` is passed by keyword: from pandas 2.0 every `concat` argument except `objs`
# is keyword-only, so the positional form `pd.concat([...], 0)` raises a TypeError.
# NOTE: re-running this cell appends the same row again.
model_comparison = pd.concat([model_comparison, ridgereg], axis = 0)
model_comparison.head()

Unnamed: 0,MAE,MAPE,RMSE
DummyRegressorMedian,2806.394,110.422,4258.833
DummyRegressorQuantile,3105.765,60.473,4964.068
LinearRegression,803.143,45.21,1143.74
RidgeRegression,803.074,45.196,1143.75


#### 2. LASSO Regression

In [167]:
# Candidate penalty strengths: 10 evenly spaced alphas between 0 and 0.03.
# NOTE(review): the grid includes alpha = 0 (no penalty); scikit-learn's documentation
# advises against alpha = 0 for Lasso — confirm this is intended.
LASSO_params = {'alpha' : np.linspace(start = 0, stop = 0.03, num = 10)}

lasso = Lasso(random_state = 20202020)

# 5-fold cross-validated grid search scored by negated RMSE; n_jobs = -1 runs the fits in parallel.
GRID_LASSO = GridSearchCV(
    estimator = lasso,
    param_grid = LASSO_params,
    scoring = 'neg_root_mean_squared_error',
    cv = 5,
    n_jobs = -1,
)

GRID_LASSO.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=20202020,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([0.        , 0.00333333, 0.00666667, 0.01      , 0.01333333,
       0.01666667, 0.02      , 0.02333333, 0.02666667, 0.03      ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_root_mean_squared_error', verbose=0)

In [168]:
# Alpha chosen by the cross-validated grid search.
GRID_LASSO.best_params_

{'alpha': 0.006666666666666666}

In [169]:
# Score the best Lasso model (refitted by the grid search) on the held-out test set.
lassoreg = evaluate_model(
    model_name = 'LassoRegression',
    model = GRID_LASSO.best_estimator_,
    X = X_test,
    y = y_test,
)

MAE for LassoRegression : 803.126
MAPE for LassoRegression : 45.207
RMSE for LassoRegression : 1143.741


In [170]:
# Append the Lasso scores to the comparison table.
# `axis` is passed by keyword: from pandas 2.0 every `concat` argument except `objs`
# is keyword-only, so the positional form `pd.concat([...], 0)` raises a TypeError.
# NOTE: re-running this cell appends the same row again.
model_comparison = pd.concat([model_comparison, lassoreg], axis = 0)
model_comparison.head()

Unnamed: 0,MAE,MAPE,RMSE
DummyRegressorMedian,2806.394,110.422,4258.833
DummyRegressorQuantile,3105.765,60.473,4964.068
LinearRegression,803.143,45.21,1143.74
RidgeRegression,803.074,45.196,1143.75
LassoRegression,803.126,45.207,1143.741


#### 3. Elastic Net Regression

In [None]:
# 11 evenly spaced values from 0 to 1 for both the penalty strength and the L1/L2 mix
# (11 x 11 = 121 combinations, each cross-validated 5 times).
# NOTE(review): the grid includes alpha = 0 and l1_ratio = 0; scikit-learn's documentation
# cautions about both settings for ElasticNet — confirm this is intended.
ElasticNet_params = {
    'alpha' : np.linspace(start = 0, stop = 1, num = 11),
    'l1_ratio' : np.linspace(start = 0, stop = 1, num = 11),
}

elasticnet = ElasticNet(random_state = 20202020)

# 5-fold cross-validated grid search scored by negated RMSE; n_jobs = -1 runs the fits in parallel.
GRID_EN = GridSearchCV(
    estimator = elasticnet,
    param_grid = ElasticNet_params,
    scoring = 'neg_root_mean_squared_error',
    cv = 5,
    n_jobs = -1,
)

GRID_EN.fit(X_train, y_train)

In [None]:
# Alpha / l1_ratio pair chosen by the cross-validated grid search.
GRID_EN.best_params_

In [None]:
# Score the best Elastic Net model (refitted by the grid search) on the held-out test set.
elasticnetreg = evaluate_model(
    model_name = 'ElasticNetRegression',
    model = GRID_EN.best_estimator_,
    X = X_test,
    y = y_test,
)