In [40]:
import pandas as pd
import numpy as np
from decisiontree import dtree, entropy, gini, bag
from algorithms import predict, metrics, collapseToBin, predict_bag
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [3]:
# Initialize the candidate values explored by the hyperparameter search
criterions = [gini, entropy]
max_depths = list(range(1, 25))                   # depths 1 through 24
target_impurities = np.arange(20, -1, -2) / 100   # 0.20 down to 0.00 in steps of 0.02
min_examples = list(range(8, 2, -1))              # 8 down to 3

# Printable label for each impurity function
critMap = {gini: "gini", entropy: "entropy"}

accuracies = []

In [4]:
# Read the team-level game data and discard the "Unnamed: 0" column
# (presumably a row index that was saved with the CSV -- confirm against the file).
df = pd.read_csv("../data/team_data_v4.csv").drop(columns=["Unnamed: 0"])
df

Unnamed: 0,home_team,away_team,home_shots_prop,away_shots_prop,home_corsi_prop,away_corsi_prop,home_fenwick_prop,away_fenwick_prop,home_penalties_prop,away_penalties_prop,home_hits_prop,away_hits_prop,home_takeaways_prop,away_takeaways_prop,game_end
0,Toronto Maple Leafs,Ottawa Senators,0.596154,0.384615,0.511111,0.477778,0.567568,0.418919,0.375000,0.500000,0.392405,0.594937,0.727273,0.181818,home win
1,Philadelphia Flyers,Boston Bruins,0.467742,0.516129,0.446429,0.544643,0.425287,0.563218,0.500000,0.450000,0.462687,0.522388,0.500000,0.428571,home win
2,Toronto Maple Leafs,Ottawa Senators,0.450980,0.529412,0.469388,0.520408,0.493333,0.493333,0.428571,0.428571,0.397436,0.589744,0.470588,0.470588,away win
3,Arizona Coyotes,St. Louis Blues,0.400000,0.500000,0.400000,0.500000,0.400000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,away win
4,Pittsburgh Penguins,Columbus Blue Jackets,0.539683,0.444444,0.528000,0.464000,0.549451,0.439560,0.500000,0.375000,0.565789,0.421053,0.625000,0.312500,home win
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1328,Pittsburgh Penguins,Philadelphia Flyers,0.476923,0.507692,0.533333,0.458333,0.467391,0.521739,0.562500,0.375000,0.483333,0.500000,0.500000,0.461538,home win
1329,Winnipeg Jets,New Jersey Devils,0.450000,0.533333,0.504425,0.486726,0.423529,0.564706,0.375000,0.500000,0.629630,0.333333,0.562500,0.375000,home win
1330,Toronto Maple Leafs,Detroit Red Wings,0.642857,0.342857,0.544715,0.447154,0.593750,0.395833,0.333333,0.555556,0.555556,0.425926,0.500000,0.437500,away win
1331,Vancouver Canucks,Los Angeles Kings,0.490196,0.490196,0.528846,0.461538,0.513514,0.472973,0.285714,0.571429,0.465517,0.517241,0.428571,0.428571,away win


In [5]:
def _home_win_flag(outcome):
    """Pass one outcome label through collapseToBin with "home win" as the reference label."""
    return collapseToBin("home win", outcome)


# Element-wise version used on whole arrays of predicted / actual labels
apply = np.vectorize(_home_win_flag)

In [9]:
# do mini
def validate(data, crit, max_dpth=None, min_inst=2, target_imp=0.0, folds=10):
    """Estimate decision-tree accuracy with k-fold cross-validation.

    Rows are assigned to folds by ``data.index % folds``, so ``data`` must carry
    an integer index; fold ``f`` is held out while the tree is trained on the
    remaining rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data; the last column is used as the class label.
    crit : callable
        Impurity function handed to ``dtree`` (e.g. ``gini`` or ``entropy``).
    max_dpth, min_inst, target_imp
        Forwarded unchanged to ``dtree``.
    folds : int, default 10
        Number of folds. The default matches the previously hard-coded value.

    Returns
    -------
    float
        Mean of the per-fold accuracies, rounded to 4 decimal places.
    """
    class_col = data.columns[-1]
    # Compute the fold id of every row once instead of twice per iteration.
    fold_ids = np.asarray(data.index % folds)
    fold_accuracies = []

    for f in range(folds):
        held_out = fold_ids == f
        train_fold = data[~held_out]
        valid_fold = data[held_out]

        model = dtree(train_fold, crit, class_col, max_dpth, min_inst, target_imp)

        predicts = apply(predict(model, valid_fold))
        # Read the labels from the same column the tree was trained on rather
        # than a hard-coded column name.
        actual = apply(np.array(valid_fold[class_col]))

        scores = metrics(actual, predicts)
        fold_accuracies.append(scores["accuracy"])

    # Average over the actual number of folds (was a literal 10).
    return round(sum(fold_accuracies) / folds, 4)

In [7]:
def testHyperparams(df):
    """Greedy, one-parameter-at-a-time hyperparameter search using ``validate``.

    The search runs in three stages: maximum depth, then target impurity, then
    minimum instances. Each stage tries every candidate with every criterion in
    ``criterions`` while holding the other parameters at their current best
    values. ``bestScore`` carries over between stages, and because the
    comparison is ``>=`` a tie is resolved in favour of the later candidate.
    The criterion that produced the best score is not recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set passed straight through to ``validate``.

    Returns
    -------
    tuple
        ``(bestScore, bestTarget, bestDepth, bestMin)``.
    """
    bestScore = 0
    bestDepth = max_depths[0]
    bestTarget = target_impurities[0]
    bestMin = min_examples[0]

    # Stage 1: maximum depth
    for max_depth in max_depths:
        for crit in criterions:
            print(critMap[crit], "| max depth;", max_depth, "| target impurity;", bestTarget, "| minimum instances;",
                  bestMin)
            score = validate(df, crit, max_depth, bestMin, bestTarget)
            if score >= bestScore:
                bestDepth = max_depth
                bestScore = score
            print(score)

    # Stage 2: target impurity, at the depth chosen above
    for target in target_impurities:
        for crit in criterions:
            print(critMap[crit], "| max depth;", bestDepth, "| target impurity;", target, "| minimum instances;",
                  bestMin)
            score = validate(df, crit, bestDepth, bestMin, target)
            if score >= bestScore:
                bestTarget = target
                bestScore = score
            print(score)

    # Stage 3: minimum instances, at the depth and target impurity chosen above
    for min_example in min_examples:
        for crit in criterions:
            print(critMap[crit], "| max depth;", bestDepth, "| target impurity;", bestTarget, "| minimum instances;",
                  min_example)
            score = validate(df, crit, bestDepth, min_example, bestTarget)
            if score >= bestScore:
                bestMin = min_example
                bestScore = score
            print(score)
    return bestScore, bestTarget, bestDepth, bestMin

In [10]:
# Run the staged hyperparameter search on df; the returned tuple is
# (best score, best target impurity, best max depth, best minimum instances).
selected = testHyperparams(df)
print(selected)

gini | max depth; 1 | target impurity; 0.2 | minimum instances; 8
0.5544
entropy | max depth; 1 | target impurity; 0.2 | minimum instances; 8
0.5544
gini | max depth; 2 | target impurity; 0.2 | minimum instances; 8
0.5604
entropy | max depth; 2 | target impurity; 0.2 | minimum instances; 8
0.5589
gini | max depth; 3 | target impurity; 0.2 | minimum instances; 8
0.5657
entropy | max depth; 3 | target impurity; 0.2 | minimum instances; 8
0.5702
gini | max depth; 4 | target impurity; 0.2 | minimum instances; 8
0.5972
entropy | max depth; 4 | target impurity; 0.2 | minimum instances; 8
0.5777
gini | max depth; 5 | target impurity; 0.2 | minimum instances; 8
0.5717
entropy | max depth; 5 | target impurity; 0.2 | minimum instances; 8
0.5672
gini | max depth; 6 | target impurity; 0.2 | minimum instances; 8
0.5822
entropy | max depth; 6 | target impurity; 0.2 | minimum instances; 8
0.5792
gini | max depth; 7 | target impurity; 0.2 | minimum instances; 8
0.5626
entropy | max depth; 7 | target i

In [24]:
# Fit a gini tree (max depth 10, min instances 3, target impurity 0.0) on all of df
# and score it on those same rows, so the metrics below are training-set metrics.
model = dtree(df, gini, df.columns[-1], 10, 3, 0.0)
predicts = apply(predict(model, df))
actual = apply(np.array(df["game_end"]))

scores = metrics(actual, predicts)
print(scores)

{'accuracy': 0.8447111777944486, 'sensitivity': 0.7783094098883573, 'specificity': 0.9036827195467422, 'precision': 0.8776978417266187, 'f1-score': 0.8250211327134404}


In [20]:
def testHyperparams2(df, min_inst=3, target_imp=0.0):
    """Sweep the maximum depth with the other tree parameters held fixed.

    Every depth in ``max_depths`` is cross-validated with every criterion in
    ``criterions``. Ties are resolved in favour of the later candidate because
    the comparison is ``>=``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set passed straight through to ``validate``.
    min_inst : int, default 3
        Minimum-instances value used for every run.
    target_imp : float, default 0.0
        Target-impurity value used for every run.

    Returns
    -------
    tuple
        ``(bestScore, bestDepth)``. Previously the function computed these
        values but returned nothing.
    """
    bestScore = 0
    bestDepth = max_depths[0]
    for max_depth in max_depths:
        for crit in criterions:
            print(critMap[crit], "| max depth;", max_depth, "| target impurity;", target_imp, "| minimum instances;",
                  min_inst)
            score = validate(df, crit, max_depth, min_inst, target_imp)
            if score >= bestScore:
                bestDepth = max_depth
                bestScore = score
            print(score)
    return bestScore, bestDepth

In [21]:
# Sweep max depth for both criteria with target impurity 0.0 and minimum instances 3
testHyperparams2(df)

gini | max depth; 1 | target impurity; 0.0 | minimum instances; 3
0.5544
entropy | max depth; 1 | target impurity; 0.0 | minimum instances; 3
0.5544
gini | max depth; 2 | target impurity; 0.0 | minimum instances; 3
0.5604
entropy | max depth; 2 | target impurity; 0.0 | minimum instances; 3
0.5589
gini | max depth; 3 | target impurity; 0.0 | minimum instances; 3
0.5657
entropy | max depth; 3 | target impurity; 0.0 | minimum instances; 3
0.5702
gini | max depth; 4 | target impurity; 0.0 | minimum instances; 3
0.598
entropy | max depth; 4 | target impurity; 0.0 | minimum instances; 3
0.5777
gini | max depth; 5 | target impurity; 0.0 | minimum instances; 3
0.5717
entropy | max depth; 5 | target impurity; 0.0 | minimum instances; 3
0.5672
gini | max depth; 6 | target impurity; 0.0 | minimum instances; 3
0.5837
entropy | max depth; 6 | target impurity; 0.0 | minimum instances; 3
0.5792
gini | max depth; 7 | target impurity; 0.0 | minimum instances; 3
0.5604
entropy | max depth; 7 | target im

In [22]:
# Fit a gini tree (max depth 8, min instances 4, target impurity 0.0) on all of df
# and score it on those same rows, so the metrics below are training-set metrics.
model = dtree(df, gini, df.columns[-1], 8, 4, 0.0)
predicts = apply(predict(model, df))
actual = apply(np.array(df["game_end"]))

# Show the binary-encoded ground truth before scoring
print(actual)

scores = metrics(actual, predicts)
print(scores)

[0 0 1 ... 1 1 0]
{'accuracy': 0.7959489872468117, 'sensitivity': 0.7017543859649122, 'specificity': 0.8796033994334278, 'precision': 0.8380952380952381, 'f1-score': 0.7638888888888888}


In [27]:
# Same configuration as the earlier depth-10 / min-instances-3 cell: train on all of df
# and score on those same rows (training-set metrics).
model = dtree(df, gini, df.columns[-1], 10, 3, 0.0)
predicts = apply(predict(model, df))
actual = apply(np.array(df["game_end"]))

scores = metrics(actual, predicts)
print(scores)

{'accuracy': 0.8012003000750187, 'sensitivity': 0.7368421052631579, 'specificity': 0.8583569405099151, 'precision': 0.8220640569395018, 'f1-score': 0.7771236333052985}


In [28]:
# Fit a gini tree (max depth 10, min instances 3, target impurity 0.14) on all of df
# and score it on those same rows, so the metrics below are training-set metrics.
model = dtree(df, gini, df.columns[-1], 10, 3, 0.14)
predicts = apply(predict(model, df))
actual = apply(np.array(df["game_end"]))

scores = metrics(actual, predicts)
print(scores)

{'accuracy': 0.8319579894973743, 'sensitivity': 0.7575757575757576, 'specificity': 0.8980169971671388, 'precision': 0.8683729433272395, 'f1-score': 0.8091993185689947}


In [38]:
# Hold-out evaluation of a single tree: 80% of the rows train the model,
# the remaining 20% are used only for scoring.
X, y = df[df.columns[:-1]], df[df.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# dtree/predict expect the label inside the frame. assign() builds new frames
# instead of writing a column into the objects returned by train_test_split,
# which is the pattern that raises SettingWithCopyWarning.
X_train = X_train.assign(game_end=y_train)
X_test = X_test.assign(game_end=y_test)

print(y_test)

model = dtree(X_train, gini, df.columns[-1], 10, 3, 0.14)
predicts = apply(predict(model, X_test))
actual = apply(np.array(y_test))

scores = metrics(actual, predicts)
print(scores)

898     home win
1062    away win
1251    home win
298     away win
237     away win
          ...   
638     home win
534     away win
542     home win
895     home win
1090    away win
Name: game_end, Length: 267, dtype: object
{'accuracy': 0.5880149812734082, 'sensitivity': 0.5190839694656488, 'specificity': 0.6544117647058824, 'precision': 0.591304347826087, 'f1-score': 0.5528455284552846}


In [41]:
# Hold-out evaluation of the bagged model, using the same 80/20 split
# parameters (test_size=0.2, random_state=42) as the single-tree run.
X, y = df[df.columns[:-1]], df[df.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# bag/predict_bag expect the label inside the frame. assign() builds new frames
# instead of writing a column into the objects returned by train_test_split,
# which is the pattern that raises SettingWithCopyWarning.
X_train = X_train.assign(game_end=y_train)
X_test = X_test.assign(game_end=y_test)

bagModel = bag(X_train, gini, df.columns[-1], 10, 3, 0.14)
predicts = apply(predict_bag(bagModel, X_test))
actual = apply(np.array(y_test))

scores = metrics(actual, predicts)
print(scores)

completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
completed an iteration -------------
c

### Takeaways!!!

Can use a min-instances of 3 instead of 2 (optimal)

Should still use 0.14 as the target impurity threshold

The optimal depth from hyperparameter tuning is 10

optimal metric is gini

{'accuracy': 0.8012003000750187, 'sensitivity': 0.7368421052631579, 'specificity': 0.8583569405099151, 'precision': 0.8220640569395018, 'f1-score': 0.7771236333052985}
