In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pickle

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import mean_squared_error

In [2]:
import sklearn
print(sklearn.__version__)

1.2.2


In [3]:
!python --version

Python 3.10.12


In [4]:
df = pd.read_csv('diamonds.csv')

In [5]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# df['log_price'] = np.log1p(df['price'])

In [5]:
# def split_train_test(data, test_ratio):
#     shuffled_indices = np.random.permutation(len(data))
#     test_set_size = int(len(data) * test_ratio)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]

In [6]:
def split_test_set(data, split_ratio):
    n = len(data)

    n_val = int(n * split_ratio)
    n_test = int(n * split_ratio)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    shuffled_idx = np.random.permutation(n)
    

    val_idx = shuffled_idx[:n_val]
    test_idx = shuffled_idx[n_val: n_val + n_test]
    train_idx = shuffled_idx[n_val + n_test:]
    
    return data.iloc[train_idx], data.iloc[val_idx], data.iloc[test_idx]


In [7]:
df_train, df_val, df_test = split_test_set(df, 0.2)
# df_val = df_shuffled.iloc[n_train:n_train + n_val]
# df_test = df_shuffled.iloc[n_train + n_val:]

In [8]:
len(df_train), len(df_val), len(df_test)

(30000, 10000, 10000)

In [9]:
corr_matrix = df.corr(numeric_only=True)
corr_matrix["price"].sort_values(ascending=False)

price    1.000000
carat    0.921804
x        0.884919
y        0.864393
z        0.860963
table    0.129848
depth   -0.012731
Name: price, dtype: float64

In [10]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [11]:
df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [12]:
categorical = [col for col in df.columns if df[col].dtype == 'object']
categorical


['cut', 'color', 'clarity']

In [13]:
numerical = [col for col in df.columns if df[col].dtype != 'object' and col != 'price']
numerical

['carat', 'depth', 'table', 'x', 'y', 'z']

no mising values, so no need to impute them

In [14]:
y_train = df_train['price'].values
y_val = df_val['price'].values
y_test = df_test['price'].values

del df_train['price']
del df_val['price']
del df_test['price']

In [15]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.799444,61.753006,57.45783,3944.80544,5.734403,5.737956,3.541056
std,0.475173,1.431088,2.232092,3997.938105,1.123077,1.145579,0.707065
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2410.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5351.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [16]:
dv = DictVectorizer(sparse=False)

In [17]:
dicts_train = df_train.to_dict(orient='records')
dicts_val = df_val.to_dict(orient='records')
#test_dicts = df_test.to_dict(orient='records')

In [18]:
dicts_train[10]

{'carat': 1.14,
 'cut': 'Good',
 'color': 'I',
 'clarity': 'SI2',
 'depth': 60.0,
 'table': 65.0,
 'x': 6.8,
 'y': 6.75,
 'z': 4.06}

In [17]:
X_train = dv.fit_transform(dicts_train)

In [18]:
lr = LinearRegression()

In [19]:
lr.fit(X_train, y_train)
# lr.fit(X_train, np.log1p(y_train))

In [20]:
# training performance
y_pred = lr.predict(X_train)
#y_pred = np.expm1(y_pred)
rmse = mean_squared_error(y_train, y_pred, squared=False)
print(f"Training rmse: {rmse}")

Training rmse: 1139.0786996598392


In [21]:
# val performance
X_val = dv.transform(dicts_val)
y_pred = lr.predict(X_val)
#y_pred = np.expm1(y_pred)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"Validation rmse: {rmse}")

Validation rmse: 1133.1245541918595


## Tree Regressors

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [23]:
dtr = DecisionTreeRegressor()

In [24]:
# training performance
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_train)
rmse = mean_squared_error(y_train, y_pred, squared=False)
print(f"training rmse: {rmse}")

training rmse: 6.965699294878201


In [25]:
# validation performance
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"validation rmse: {rmse}")

validation rmse: 745.9598331344657


okay so the validation performance with the tree regressor out of box is already significantly better than lr.  but the performance was way worse than the training performance which means it is overfitting and if we regularize somehow or play with the parameters we can probably improve it

In [28]:
# validation performance
dtr.fit(X_train, y_train)
#dtr.fit(X_train, np.log1p(y_train))
y_pred = dtr.predict(X_val)
#y_pred = np.expm1(y_pred)
rmse = mean_squared_error(y_val, y_pred, squared=False)

print(f"validation rmse: {rmse}")


validation rmse: 738.4336292958224


In [31]:
# validation performance
dtr.fit(X_train, np.log1p(y_train))

y_pred = dtr.predict(X_val)
y_pred = np.expm1(y_pred)
rmse = mean_squared_error(y_val, y_pred, squared=False)

print(f"validation rmse: {rmse}")


validation rmse: 779.1218305217647


takind the log of the target doesnt seem to help much and in the case of lr in made validation rmse much worse so I will not bother with it.

In [32]:
# lets play with the max_depth parameter to see how it affects the performance on val rmse

def train_dtr(max_depth=None, min_samples_leaf=1, max_leaf_nodes=None):
    
    if max_depth is not None:
        dtr = DecisionTreeRegressor(max_depth=max_depth,min_samples_leaf=min_samples_leaf, random_state=23,
                                   max_leaf_nodes=max_leaf_nodes)
    else:
        dtr = DecisionTreeRegressor(random_state=42)
    
    dv = DictVectorizer(sparse=False)
    dicts_train = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(dicts_train)
    
    dicts_val = df_val.to_dict(orient='records')
    X_val = dv.transform(dicts_val)
    
    
    dtr.fit(X_train, y_train)
    #dtr.fit(X_train, np.log1p(y_train))
    
    y_pred = dtr.predict(X_val)
    #y_pred = np.expm1(y_pred)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    return rmse
    
        

In [34]:
for d in [8, 10, 12, 14, 16]:
    rmse = train_dtr(d)
    print(f"max_depth: {d} --------> rmse: {rmse}")

max_depth: 8 --------> rmse: 921.5867489739034
max_depth: 10 --------> rmse: 767.9930913343646
max_depth: 12 --------> rmse: 685.4898361539573
max_depth: 14 --------> rmse: 683.504104506087
max_depth: 16 --------> rmse: 696.2004348255731


In [35]:
d = 14
for s in [2, 4, 6, 8, 10, 12, 14, 16, 20]:
    rmse = train_dtr(d, s)
    print(f"max_depth: {d} min_samples_leaf: {s} --------> rmse: {rmse}")

max_depth: 14 min_samples_leaf: 2 --------> rmse: 668.4038507622773
max_depth: 14 min_samples_leaf: 4 --------> rmse: 661.0163319809513
max_depth: 14 min_samples_leaf: 6 --------> rmse: 651.7743823456976
max_depth: 14 min_samples_leaf: 8 --------> rmse: 647.9475654485638
max_depth: 14 min_samples_leaf: 10 --------> rmse: 643.9642573366292
max_depth: 14 min_samples_leaf: 12 --------> rmse: 647.9374237971298
max_depth: 14 min_samples_leaf: 14 --------> rmse: 647.6586289189544
max_depth: 14 min_samples_leaf: 16 --------> rmse: 652.3892915391586
max_depth: 14 min_samples_leaf: 20 --------> rmse: 663.5673284174321


In [36]:
d = 14
s = 10
for n in [220, 400, 1000, None]:
    rmse = train_dtr(d, s, n)
    print(f"max_depth: {d} min_samples_leaf: {s}  max_leaf_nodes: {n} --------> rmse: {rmse}")

max_depth: 14 min_samples_leaf: 10  max_leaf_nodes: 220 --------> rmse: 682.2259339959656
max_depth: 14 min_samples_leaf: 10  max_leaf_nodes: 400 --------> rmse: 655.8399168674272
max_depth: 14 min_samples_leaf: 10  max_leaf_nodes: 1000 --------> rmse: 644.3223602150954
max_depth: 14 min_samples_leaf: 10  max_leaf_nodes: None --------> rmse: 643.9642573366292


In [39]:
SCORES = {"model": "DecisionTreeRegressor",
          'params': {'max_depth': 14, 'min_samples_leaf': 10, 'max_leaf_nodes': None, 'val_rmse': 643.96},
         }
                                                       

### Random forest regressor

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
def train_rfr(n_estimators=100, max_depth=None, min_samples_leaf=2, max_leaf_nodes=None, bootstrap=True):
    
    
    rfr = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, random_state=23,
                                   max_leaf_nodes=max_leaf_nodes, bootstrap=bootstrap)
#     else:
#         rfr = RandomForestRegressor(bootstrap=True, random_state=42)
    
    dv = DictVectorizer(sparse=False)
    dicts_train = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(dicts_train)
    
    dicts_val = df_val.to_dict(orient='records')
    X_val = dv.transform(dicts_val)
    
    
    rfr.fit(X_train, y_train)
    #dtr.fit(X_train, np.log1p(y_train))
    
    y_pred = rfr.predict(X_val)
    #y_pred = np.expm1(y_pred)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    return rmse

In [42]:
# out the box, with standard settings
train_rfr()

555.4104290246701

wow, that is quite a jump in performance

In [43]:
# first to tune the n_estimators
for n in [100, 200, 300]:
    rmse = train_rfr(n_estimators=n)
    print(f"n_estimators: {n} --------> rmse: {rmse}")

n_estimators: 100 --------> rmse: 555.4104290246701
n_estimators: 200 --------> rmse: 555.5721045179873
n_estimators: 300 --------> rmse: 554.9609613678379


rmse improves very marginally with more estimators but it doesnt seem worth it given the extra time needed to train.  I think I will stay with the default 100

In [44]:
# first to tune the n_estimators
n = 100
for d in [18, 25, 35, None]:
    rmse = train_rfr(n_estimators=n, max_depth=d)
    print(f"n_estimators: {n}  max_depth: {d} --------> rmse: {rmse}")

n_estimators: 100  max_depth: 18 --------> rmse: 555.7453306059564
n_estimators: 100  max_depth: 25 --------> rmse: 556.1774749264396
n_estimators: 100  max_depth: 35 --------> rmse: 555.4104290246701
n_estimators: 100  max_depth: None --------> rmse: 555.4104290246701


default value for max_depth (None) seems good here as well

In [45]:
n = 100
for d in [18, 25, 35, None]:
    rmse = train_rfr(n_estimators=n, max_depth=d, bootstrap=False)
    print(f"n_estimators: {n}  max_depth: {d} --------> rmse: {rmse}")

n_estimators: 100  max_depth: 18 --------> rmse: 680.3926869657345
n_estimators: 100  max_depth: 25 --------> rmse: 695.0703185597167
n_estimators: 100  max_depth: 35 --------> rmse: 696.0554324911
n_estimators: 100  max_depth: None --------> rmse: 696.0554324911


I just wanted to see the difference with bootstrap=False, performance declined. Keep it as True

In [46]:
# tuning max_leaf_nodes
n = 100
d = None
for l in [200, 400, 600, 800, None]:
    rmse = train_rfr(n_estimators=n, max_depth=d, max_leaf_nodes=l)
    print(f"n_estimators: {n}  max_depth: {d} max_leaf_nodes: {l} --------> rmse: {rmse}")

n_estimators: 100  max_depth: None max_leaf_nodes: 200 --------> rmse: 629.4740807188199
n_estimators: 100  max_depth: None max_leaf_nodes: 400 --------> rmse: 589.1227246745066
n_estimators: 100  max_depth: None max_leaf_nodes: 600 --------> rmse: 573.7674626249553
n_estimators: 100  max_depth: None max_leaf_nodes: 800 --------> rmse: 567.0535965363822
n_estimators: 100  max_depth: None max_leaf_nodes: None --------> rmse: 555.4104290246701


max_leaf_nodes also seems to be best at max value None

### XGBRegressor

In [47]:
from xgboost import XGBRegressor

In [48]:
# out of the box

xgb_regressor = XGBRegressor(objective='reg:squarederror',
                             random_state=42)
xgb_regressor.fit(X_train, y_train)

y_pred = xgb_regressor.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"xgb_regressor val_rmse: {rmse}")

xgb_regressor val_rmse: 565.566924933824


In [69]:
def train_xgb_regressor(eta=0.3, max_depth=6, min_child_weight=1):
    xgb_reg = XGBRegressor(eta=eta, max_depth=max_depth, min_child_weight=min_child_weight, random_state=23)
#     else:
#         rfr = RandomForestRegressor(bootstrap=True, random_state=42)
    
    dv = DictVectorizer(sparse=False)
    dicts_train = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(dicts_train)
    
    dicts_val = df_val.to_dict(orient='records')
    X_val = dv.transform(dicts_val)
    
    
    xgb_reg.fit(X_train, y_train)
    #dtr.fit(X_train, np.log1p(y_train))
    
    y_pred = xgb_reg.predict(X_val)
    #y_pred = np.expm1(y_pred)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    return rmse

In [None]:
XGBRegressor()

In [50]:
# tuning eta
for e in [0.1, 0.12, 0.14, 0.16, .165, 0.17, 0.18]:
    rmse = train_xgb_regressor(eta=e)
    print(f"eta: {e} -----> val_rmse: {rmse}")

eta: 0.1 -----> val_rmse: 558.9534684791964
eta: 0.12 -----> val_rmse: 555.3450873353094
eta: 0.14 -----> val_rmse: 556.9940414926251
eta: 0.16 -----> val_rmse: 553.2864166103369
eta: 0.165 -----> val_rmse: 557.7480574971576
eta: 0.17 -----> val_rmse: 555.8154335425044
eta: 0.18 -----> val_rmse: 557.1561571985372


In [52]:
# tuning max_depth
e = 0.16
for d in [2, 4, 6, 7, 8, 10]:
    rmse = train_xgb_regressor(eta=e, max_depth=d)
    print(f"eta: {e}  max_depth: {d} -----> val_rmse: {rmse}")

eta: 0.16  max_depth: 2 -----> val_rmse: 839.6484442933144
eta: 0.16  max_depth: 4 -----> val_rmse: 614.4013393490552
eta: 0.16  max_depth: 6 -----> val_rmse: 553.2864166103369
eta: 0.16  max_depth: 7 -----> val_rmse: 543.3631064922608
eta: 0.16  max_depth: 8 -----> val_rmse: 545.58569520175
eta: 0.16  max_depth: 10 -----> val_rmse: 553.4971179685733


In [71]:
# tuning min_child_weight
e = 0.16
d = 7
for c in [0, 0.1, 0.5, 0.7, 1, 2, 2.5, 2.7]:
    rmse = train_xgb_regressor(eta=e, max_depth=d, min_child_weight=c)
    print(f"eta: {e}  max_depth: {d} min_child_weight: {c} -----> val_rmse: {rmse}")

eta: 0.16  max_depth: 7 min_child_weight: 0 -----> val_rmse: 543.3631064922608
eta: 0.16  max_depth: 7 min_child_weight: 0.1 -----> val_rmse: 543.3631064922608
eta: 0.16  max_depth: 7 min_child_weight: 0.5 -----> val_rmse: 543.3631064922608
eta: 0.16  max_depth: 7 min_child_weight: 0.7 -----> val_rmse: 543.3631064922608
eta: 0.16  max_depth: 7 min_child_weight: 1 -----> val_rmse: 543.3631064922608
eta: 0.16  max_depth: 7 min_child_weight: 2 -----> val_rmse: 542.4844487053525
eta: 0.16  max_depth: 7 min_child_weight: 2.5 -----> val_rmse: 549.6160678045958
eta: 0.16  max_depth: 7 min_child_weight: 2.7 -----> val_rmse: 549.6160678045958


the default min_child_weight of 1 is fine, same as 0

In [62]:
def train_xgb_regressor(eta=0.3, max_depth=6, min_child_weight=1):
    xgb_reg = XGBRegressor(eta=eta, max_depth=max_depth, min_child_weight=min_child_weight, random_state=23)
#     else:
#         rfr = RandomForestRegressor(bootstrap=True, random_state=42)
    
    dv = DictVectorizer(sparse=False)
    dicts_train = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(dicts_train)
    
    dicts_val = df_val.to_dict(orient='records')
    X_val = dv.transform(dicts_val)
    
    
    xgb_reg.fit(X_train, y_train)
    #dtr.fit(X_train, np.log1p(y_train))
    
    y_pred = xgb_reg.predict(X_val)
    #y_pred = np.expm1(y_pred)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    print("val_rmse: %f" % rmse)
    
    return dv, xgb_reg

In [63]:
dv, xgb_reg = train_xgb_regressor(eta=0.16, max_depth=7)

val_rmse: 543.363106


In [64]:
# saving the model and dict Vecorizer
with open('models/xgb_reg.bin', 'wb') as f_out:
    pickle.dump((dv, xgb_reg), f_out)

In [67]:

print(sklearn.__version__)

1.2.2


In [None]:
['eta', 'max_depth', 'leanrning_rate', 'n_estimators, min_child_weight', 'gamma', 'subsample', 'colsample_bytree',
'reg_alpha', 'reg_lambda', 'random_state']

In [78]:
param = {
        'eta': trial.suggest_float('eta', 0.1, 0.3),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_float('min_child_weight', 0, 2.5),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }

NameError: name 'trial' is not defined