In [3]:
import numpy as np 
import pandas as pd 
import math

from gradient_boosting import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

%load_ext autoreload
%autoreload 2

In [4]:
# Load the pre-split train / validation / test CSVs. Paths are relative to
# the notebook's working directory; the files are read in this order.
df_train, df_valid, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [5]:
# Every column of the training frame except the target is used as a feature.
# The same column list is applied to the validation and test frames.
var_columns = [col for col in df_train.columns if col != 'profit_margin']

# Training split: pandas views plus plain NumPy arrays for the scratch model.
x_train, y_train = df_train[var_columns], df_train['profit_margin']
x_train_np, y_train_np = x_train.to_numpy(), y_train.to_numpy()

# Validation split.
x_valid, y_valid = df_valid[var_columns], df_valid['profit_margin']
x_valid_np, y_valid_np = x_valid.to_numpy(), y_valid.to_numpy()

# Test split.
x_test, y_test = df_test[var_columns], df_test['profit_margin']
x_test_np, y_test_np = x_test.to_numpy(), y_test.to_numpy()

In [6]:
# baseline
# Fit the from-scratch booster with the baseline settings on the training
# split and score its predictions on the test split.
# NOTE(review): the constructor arguments are positional; their meaning is
# defined in the local gradient_boosting module, which is not visible here.
scratch_baseline = GradientBoostingRegressor(2, 0.1, 2, 1e-7, 3, 'squared_error')
scratch_baseline.fit(x_train_np, y_train_np)
y_pred_base = scratch_baseline.predict(x_test_np)

y_actual = df_test['profit_margin'].to_numpy()

# Predictions next to the ground truth, ordered by the true value.
results = pd.DataFrame({"predicted": y_pred_base, "actual": y_actual}).sort_values('actual')

# Count how many predictions are below zero.
num_neg = int((results['predicted'] < 0).sum())
print(num_neg)

# Error metrics on the test split (RMSE is derived from MSE).
MSE = mean_squared_error(y_actual, y_pred_base)
RMSE = math.sqrt(MSE)
MAE = mean_absolute_error(y_actual, y_pred_base)

print(f'Baseline MSE: {MSE}')
print(f'Baseline RMSE: {RMSE}')
print(f'Baseline MAE: {MAE}')

Training: 100% [------------------------------------------------] Time: 0:04:10

0
Baseline MSE: 0.469364869357778
Baseline RMSE: 0.6851020868146426
Baseline MAE: 0.5558102993290996





In [26]:
# tuned
# Fit the from-scratch booster with the tuned settings on the training split
# and score its predictions on the test split.
# NOTE(review): the constructor arguments are positional; their meaning is
# defined in the local gradient_boosting module, which is not visible here.
scratch_tuned = GradientBoostingRegressor(60, 0.01, 2, 1e-7, 3, 'huber')
scratch_tuned.fit(x_train_np, y_train_np)
y_pred_tuned = scratch_tuned.predict(x_test_np)

y_actual = df_test['profit_margin'].to_numpy()

# Predictions next to the ground truth, ordered by the true value.
results = pd.DataFrame({"predicted": y_pred_tuned, "actual": y_actual}).sort_values('actual')

# Count the negative predictions and zero out exactly those rows in `results`.
# The previous loop wrote to the row labelled 0 on every hit; after
# sort_values the labels are shuffled, so that was an arbitrary row rather
# than the negative prediction being visited.
negative_mask = results['predicted'] < 0
num_neg = int(negative_mask.sum())
results.loc[negative_mask, 'predicted'] = 0
print(num_neg)

# The metrics below are computed on the raw model output (y_pred_tuned),
# not on the zero-floored copy held in `results`.
MSE = mean_squared_error(y_actual, y_pred_tuned)
print(f'Changed MSE: {MSE}')

RMSE = math.sqrt(MSE)
print(f'Changed RMSE: {RMSE}')

MAE = mean_absolute_error(y_actual, y_pred_tuned)
print(f'Changed MAE: {MAE}')

Training: 100% [------------------------------------------------] Time: 9:34:08
0
Changed MSE: 0.4574843080966632
Changed RMSE: 0.6776165244123673
Changed MAE: 0.5394998065916507


In [22]:
from sklearn.model_selection import KFold, cross_val_score

# Shuffled k-fold splitter with a fixed seed, shared by both models.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# No `scoring` argument is passed, so cross_val_score uses each estimator's
# own default scorer. The folds are drawn from the validation split.
base_scores = cross_val_score(estimator=scratch_baseline, X=x_valid_np, y=y_valid_np, cv=kf)
base_mean_score = np.mean(base_scores)
print('Baseline Mean Cross-validation Score: ', base_mean_score)

tuned_scores = cross_val_score(estimator=scratch_tuned, X=x_valid_np, y=y_valid_np, cv=kf)
tuned_mean_score = np.mean(tuned_scores)
print('Tuned Mean Cross-validation Score: ', tuned_mean_score)

Training: 100% [------------------------------------------------] Time: 0:00:03
Training: 100% [------------------------------------------------] Time: 0:00:03
Training: 100% [------------------------------------------------] Time: 0:00:03
Training: 100% [------------------------------------------------] Time: 0:00:03
Training: 100% [------------------------------------------------] Time: 0:00:03
Training:   0% [                                               ] ETA:  --:--:--

Baseline Mean Cross-validation Score:  0.13647207233472058


Training: 100% [------------------------------------------------] Time: 0:01:52
Training: 100% [------------------------------------------------] Time: 0:01:52
Training: 100% [------------------------------------------------] Time: 0:01:59
Training: 100% [------------------------------------------------] Time: 0:01:53
Training: 100% [------------------------------------------------] Time: 0:02:06

Tuned Mean Cross-validation Score:  0.10959225977388329





In [23]:
import statistics

# Spread of the per-fold scores (statistics.stdev is the sample standard
# deviation) for each model.
base_cv_stdev = statistics.stdev(base_scores)
tuned_cv_stdev = statistics.stdev(tuned_scores)

print("Baseline CV standard deviation: ", base_cv_stdev)
print("Tuned CV standard deviation: ", tuned_cv_stdev)

Baseline CV standard deviation:  0.06354724516044141
Tuned CV standard deviation:  0.07603195404585959


In [27]:
from sklearn.metrics import r2_score

# R² of each model's test-split predictions against the true target values.
base_r2_score = r2_score(y_true=y_actual, y_pred=y_pred_base)
tuned_r2_score = r2_score(y_true=y_actual, y_pred=y_pred_tuned)

print('Baseline R2 score: ', base_r2_score)
print('Tuned R2 score: ', tuned_r2_score)

Baseline R2 score:  0.2161918043561809
Tuned R2 score:  0.22426360998544933
