In [2]:
import numpy as np
import pandas as pd
from decision_tree import DecisionTree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score

from random_forest import RandomForest

### Read Data


In [3]:
df = pd.read_csv("../train.csv")
val_df = pd.read_csv("../validation.csv")
X_train = df.drop(columns="profit_margin")
y_train = df["profit_margin"]
X_val = val_df.drop(columns="profit_margin")
y_val = val_df["profit_margin"]

### Set Experimental Parameters


In [4]:
args = {"n_estimators": 10, "min_samples_split": 8}

### Run K-fold cross-validation with different hyperparameters


In [4]:
rf = RandomForest()
# sklearn_rf.score(X_val, y_val)
total_X = pd.concat([X_train, X_val])
total_y = pd.concat([y_train, y_val])
default_cross_val = cross_val_score(
    rf,
    total_X,
    total_y,
    cv=5,
    scoring="r2",
)
default_cross_val.mean()
# print(sklearn_rf.score(X_train, y_train), sklearn_rf.score(X_val, y_val))

100%|██████████| 10/10 [03:17<00:00, 19.79s/it]
100%|██████████| 10/10 [02:35<00:00, 15.60s/it]
100%|██████████| 10/10 [03:37<00:00, 21.70s/it]
100%|██████████| 10/10 [03:03<00:00, 18.39s/it]
100%|██████████| 10/10 [02:42<00:00, 16.24s/it]


0.07602653852569197

In [60]:
rf = RandomForest(**args)
# sklearn_rf.score(X_val, y_val)
total_X = pd.concat([X_train, X_val])
total_y = pd.concat([y_train, y_val])
default_cross_val = cross_val_score(
    rf,
    total_X,
    total_y,
    cv=5,
    scoring="r2",
)
default_cross_val.mean()

100%|██████████| 10/10 [03:28<00:00, 20.83s/it]
100%|██████████| 10/10 [02:40<00:00, 16.06s/it]
100%|██████████| 10/10 [02:38<00:00, 15.84s/it]
100%|██████████| 10/10 [03:51<00:00, 23.16s/it]
100%|██████████| 10/10 [02:53<00:00, 17.33s/it]


0.07316911134681348

### Run grid search on hyperparameters


In [None]:
results_df = pd.DataFrame(
    columns=[
        "max_depth",
        "n_estimators",
        "max_features",
        "min_samples_leaf",
        "min_samples_split",
    ]
)
i = 0
for max_depth in [3, 5, 10, None]:
    for n_estimators in [2, 5, 10]:
        for min_samples_leaf in [1, 2, 4]:
            for min_samples_split in [2, 5, 8]:
                random_forest = RandomForest(
                    max_depth=max_depth,
                    n_estimators=n_estimators,
                    min_samples_leaf=min_samples_leaf,
                    min_samples_split=min_samples_split,
                )
                params_dict = {
                    "max_depth": max_depth,
                    "n_estimators": n_estimators,
                    "min_samples_leaf": min_samples_leaf,
                    "min_samples_split": min_samples_split,
                }
                print("training with param dict:\n", params_dict)
                random_forest.fit(X_train, y_train)
                y_pred = random_forest.predict(X_val)
                rmse = mean_squared_error(y_pred, y_val, squared=False)
                mae = mean_absolute_error(y_pred, y_val)
                rmse_logged = np.sqrt(mean_squared_error(np.log(y_pred), np.log(y_val)))
                mae_logged = mean_absolute_error(np.log(y_pred), np.log(y_val))
                new_entry = pd.DataFrame(
                    {
                        "max_depth": max_depth,
                        "n_estimators": n_estimators,
                        "min_samples_leaf": min_samples_leaf,
                        "min_samples_split": min_samples_split,
                        "rmse": rmse,
                        "rmse_logged": rmse_logged,
                        "mae": mae,
                        "mae_logged": mae_logged,
                    },
                    index=[i],
                )
                i += 1
                # print(
                #     f"\tGot results: rmse: {rmse}, rmse_logged: {rmse_logged}, mae: {mae}, mae_logged: {mae_logged}"
                # )
                results_df = pd.concat([results_df, new_entry])

### Show best performing models


In [65]:
results_df.sort_values("rmse")

Unnamed: 0,max_depth,n_estimators,max_features,min_samples_leaf,min_samples_split,rmse,rmse_logged,mae,mae_logged
29,5,2,,1,8,0.700332,1.012820,0.588489,0.583753
33,5,2,,4,2,0.706232,1.018839,0.591683,0.586444
86,,2,,2,8,0.708904,1.014272,0.584969,0.579450
54,10,2,,1,2,0.712547,1.020802,0.598090,0.590215
68,10,5,,2,8,0.715012,1.022757,0.603112,0.593196
...,...,...,...,...,...,...,...,...,...
12,3,5,,2,2,0.754814,1.045324,0.641643,0.618457
81,,2,,1,2,0.755301,1.034139,0.626880,0.610618
62,10,2,,4,8,0.755601,1.047116,0.642294,0.618842
31,5,2,,2,5,0.756893,1.048821,0.644748,0.620387


### Get SKLearn implementation for comparison


In [57]:
sklearn_rf = RandomForestRegressor(**args)
total_X = pd.concat([X_train, X_val])
total_y = pd.concat([y_train, y_val])
default_cross_val = cross_val_score(
    sklearn_rf,
    total_X,
    total_y,
    cv=5,
    scoring="r2",
)
default_cross_val.mean()

0.3567610745405297

In [None]:
y_pred = rf.predict(X_val.values)

In [None]:
mean_squared_error(y_pred, y_val, squared=False)

In [None]:
y_pred = sklearn_rf.predict(X_val)
mean_squared_error(y_pred, y_val, squared=False)

### Read Data


In [12]:
df = pd.read_csv("../test.csv")
X_test = df.drop(columns="profit_margin")
y_test = df["profit_margin"]
y_pred = rf.predict(X_test)

### Fit both implementations, both baseline and tuned for statistical tests


In [6]:
rf = RandomForest(max_depth=5, n_estimators=2, min_samples_leaf=1, min_samples_split=8)
rf.fit(X_train, y_train)

100%|██████████| 2/2 [00:47<00:00, 23.64s/it]


<random_forest.RandomForest at 0x7ff47f22ace0>

In [16]:
from scipy import stats

t_stat, p = stats.ttest_ind(y_pred, y_test)
alpha = 0.05
print(f"p-value: {p}")
print(f"t-value: {t_stat}")
if p > alpha:
    print("model significantly different from test")

p-value: 0.4443126273115844
t-value: 0.7651312745607628
model significantly different from test


In [17]:
rf = RandomForest()
rf.fit(X_train, y_train)

100%|██████████| 10/10 [04:03<00:00, 24.36s/it]


<random_forest.RandomForest at 0x7ff47f22aef0>

In [21]:
from scipy import stats

y_pred = rf.predict(X_test)

t_stat, p = stats.ttest_ind(y_pred, y_test)
alpha = 0.05
print(f"p-value: {p}")
print(f"t-value: {t_stat}")
if p > alpha:
    print("model significantly different from test")

p-value: 0.5211428309107893
t-value: 0.6417343412256528
model significantly different from test


In [22]:
sklearn_rf = RandomForestRegressor()
sklearn_rf.fit(X_train, y_train)
y_pred = sklearn_rf.predict(X_test)

t_stat, p = stats.ttest_ind(y_pred, y_test)
alpha = 0.05
print(f"p-value: {p}")
print(f"t-value: {t_stat}")
if p > alpha:
    print("model significantly different from test")

p-value: 0.9658733821620628
t-value: -0.04279150019356165
model significantly different from test


In [23]:
sklearn_rf = RandomForestRegressor(
    n_estimators=10, max_depth=None, min_samples_leaf=1, min_samples_split=8
)
sklearn_rf.fit(X_train, y_train)
y_pred = sklearn_rf.predict(X_test)

t_stat, p = stats.ttest_ind(y_pred, y_test)
alpha = 0.05
print(f"p-value: {p}")
print(f"t-value: {t_stat}")
if p > alpha:
    print("model significantly different from test")

p-value: 0.9422270567070601
t-value: 0.07248306424289566
model significantly different from test
