In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [73]:
def mean_absolute_percentage_error(actual, pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    actual : array-like
        True target values. A zero entry makes the division below produce
        inf/nan, so the metric is only meaningful for non-zero targets.
    pred : array-like
        Predicted values, in the same order and of the same length as `actual`.

    Returns
    -------
    float
        mean(|actual - pred| / |actual|) * 100
    """
    # Coerce to plain float arrays so that lists are accepted and so that two
    # pandas objects are compared position by position rather than being
    # aligned on their index labels (which would silently introduce NaNs).
    actual = np.asarray(actual, dtype = float)
    pred = np.asarray(pred, dtype = float)
    return np.mean(np.abs((actual - pred) / actual)) * 100

def evaluate_model(model_name, model, X, y):
    """Score a fitted regressor on (X, y), print the metrics and return them as a table.

    Parameters
    ----------
    model_name : str
        Label used in the printed lines and as the column name of the returned table.
    model : object
        Fitted estimator; only its ``predict(X)`` method is used here.
    X : array-like
        Features passed to ``model.predict``.
    y : array-like
        True target values corresponding to the rows of ``X``.

    Returns
    -------
    pandas.DataFrame
        A single column named ``model_name`` with rows 'MAE', 'MAPE' and 'RMSE',
        each rounded to 3 decimals.
    """
    predictions = model.predict(X)

    MAE = mean_absolute_error(y, predictions)
    # This is the notebook's own helper defined above, which reports percent.
    MAPE = mean_absolute_percentage_error(y, predictions)
    # Take the square root explicitly rather than passing `squared = False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE = np.sqrt(mean_squared_error(y, predictions))

    print('MAE for', model_name, ': %1.3f' % MAE)
    print('MAPE for', model_name, ': %1.3f' % MAPE)
    print('RMSE for', model_name, ': %1.3f' % RMSE)

    metrics_table = pd.DataFrame(
        {model_name : [round(MAE, 3), round(MAPE, 3), round(RMSE, 3)]},
        index = ['MAE', 'MAPE', 'RMSE'],
    )

    return metrics_table

### Read in the data

In [3]:
# Load the cleaned diamonds data (path is relative to the notebook) and preview the first rows.
data = pd.read_csv(filepath_or_buffer = '../data/diamonds_cleaned.csv')
data.head(n = 5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335


### Dealing with input types

CatBoost can take categorical features as is, so I won't one-hot encode the data yet. Instead, for most models I will one-hot encode the train and test sets separately after the split; for CatBoost I'll just give the regressor the categorical features as strings.

In [4]:
# Separate the target (price) from the explanatory variables.
y = data['price'].copy()
# Name the axis with `columns =`: passing it positionally (`data.drop('price', 1)`)
# was deprecated in pandas 1.3 and raises a TypeError in pandas 2.x.
X = data.drop(columns = 'price').copy()

print('Shape of original data:', data.shape)
print('Shape of y:', y.shape)
print('Shape of X:', X.shape)

Shape of original data: (53770, 7)
Shape of y: (53770,)
Shape of X: (53770, 6)


### Train - Test split

In [17]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.75,
    random_state = 20202020,
)

# Report the size of every piece of the split.
for shape_label, split_part in (
    ('Shape of X train:', X_train),
    ('Shape of X test:', X_test),
    ('Shape of y train:', y_train),
    ('Shape of y test:', y_test),
):
    print(shape_label, split_part.shape)

Shape of X train: (40327, 6)
Shape of X test: (13443, 6)
Shape of y train: (40327,)
Shape of y test: (13443,)


### One-hot encode categoricals

In [19]:
# Need original data for CatBoost

# Keep the un-encoded frames: CatBoost consumes the categorical columns as they are.
# NOTE: this cell overwrites X_train / X_test, so it must be run exactly once
# after the train-test split cell.
X_train_original = X_train.copy()
X_test_original = X_test.copy()

# One hot encode for other models

categorical_columns = ['cut', 'color', 'clarity']

X_train = pd.get_dummies(X_train, columns = categorical_columns, prefix_sep = ' = ')
X_test = pd.get_dummies(X_test, columns = categorical_columns, prefix_sep = ' = ')

# The two sets are encoded independently, so a category level that is missing from
# one split would leave train and test with different columns. Force the test set
# onto the train layout: levels unseen in test become all-zero columns, and levels
# unseen in train are dropped because a model fitted on X_train cannot use them.
X_test = X_test.reindex(columns = X_train.columns, fill_value = 0)

In [23]:
# Compare the feature count before and after one-hot encoding.
n_columns_before = len(X_train_original.columns)
n_columns_after = len(X_train.columns)

print('Original amount of Xs:', n_columns_before)
print('Column # with one-hot encoding:', n_columns_after)

Original amount of Xs: 6
Column # with one-hot encoding: 23


# Applying the different ML models

### 1. DummyRegressor to compare results against

In [61]:
# Baseline: always predict the median of the training target, ignoring the features.
# `fit` returns the estimator itself, so the chained call leaves `dummy` fitted.
dummy = DummyRegressor(strategy = 'median').fit(X_train, y_train)
dummy

DummyRegressor(constant=None, quantile=None, strategy='median')

In [74]:
# Score the baseline on the held-out test set; the returned table is the cell's output.
evaluate_model(
    model_name = 'DummyRegressor',
    model = dummy,
    X = X_test,
    y = y_test,
)

MAE for DummyRegressor : 2806.394
MAPE for DummyRegressor : 110.422
RMSE for DummyRegressor : 4258.833


Unnamed: 0,DummyRegressor
MAE,2806.394
MAPE,110.422
RMSE,4258.833


That should be easy to beat: predicting the train median for every diamond gives an average error of about 110%.