In [1]:
import utility as ut
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Random Forest

In [2]:
# Make dataframe
df = ut.make_big_df()
df = ut.process(df)
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()
df


Unnamed: 0,Offense Score,Defense Score,Drive Number,Play Number,Period,totalseconds,Offense Timeouts,Yards To Goal,Down,Distance,Play Type
0,0,0,1,2,1,893,3.0,75,1,10,1
1,0,0,1,3,1,870,3.0,75,2,10,3
2,0,0,1,4,1,840,3.0,71,3,6,1
3,0,0,1,5,1,833,3.0,61,1,10,1
4,0,0,1,6,1,827,3.0,61,2,10,1
...,...,...,...,...,...,...,...,...,...,...,...
1598,35,20,22,1,4,162,-5.0,36,1,10,3
1599,35,20,22,3,4,155,-5.0,37,2,11,3
1600,35,20,22,5,4,150,-6.0,38,3,12,3
1601,35,20,22,7,4,96,-6.0,42,4,16,2


In [3]:
# Train random forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

clf = RandomForestClassifier()
rand_forest_cv = cross_val_score(clf, X_train, y_train, cv=20)
print("Cross-validation scores:", rand_forest_cv)
print("Mean CV accuracy:", rand_forest_cv.mean())
# Train on all data now that we have done CV
clf.fit(X, y)

y_pred = clf.predict(X)

# Calculate MAE, MSE, and RMSE
rand_forest_mae = mean_absolute_error(y, y_pred)
rand_forest_mse = mean_squared_error(y, y_pred)
rand_forest_rmse = np.sqrt(rand_forest_mse)

print(f"rand_forest_mae: {rand_forest_mae}")
print(f"rand_forest_mse: {rand_forest_mse}")
print(f"rand_forest_rmse: {rand_forest_rmse}")
# print(f"Training accuracy: {clf.score(X_train, y_train)}")
# print(f"Test accuracy: {clf.score(X_test, y_test)}")

Cross-validation scores: [0.6125     0.5625     0.6625     0.6        0.5625     0.6875
 0.6835443  0.67088608 0.53164557 0.60759494 0.65822785 0.65822785
 0.72151899 0.64556962 0.62025316 0.72151899 0.70886076 0.50632911
 0.70886076 0.59493671]
Mean CV accuracy: 0.6362737341772152
rand_forest_mae: 0.0037429819089207735
rand_forest_mse: 0.007485963817841547
rand_forest_rmse: 0.08652146449200653


In [4]:
normalized_x = ut.normalize(X)
X_train, X_test, y_train, y_test = train_test_split(normalized_x, y, test_size=0.01)

norm_clf = RandomForestClassifier()
norm_clf.fit(X_train, y_train)

norm_rfst_cv = cross_val_score(clf, X_train, y_train, cv=20)
print("Cross-validation scores:", norm_rfst_cv)
print("Mean CV accuracy:", norm_rfst_cv.mean())

# Now train on all of the data
clf.fit(normalized_x, y)

y_pred = norm_clf.predict(normalized_x)

# Calculate MAE, MSE, and RMSE
norm_rand_forest_mae = mean_absolute_error(y, y_pred)
norm_rand_forest_mse = mean_squared_error(y, y_pred)
norm_rand_forest_rmse = np.sqrt(norm_rand_forest_mse)

print(f"norm_rand_forest_mae: {norm_rand_forest_mae}")
print(f"norm_rand_forest_mse: {norm_rand_forest_mse}")
print(f"norm_rand_foest_rmse: {norm_rand_forest_rmse}")

# print(f"Training accuracy: {clf.score(X_train, y_train)}")
# print(f"Test accuracy: {clf.score(X_test, y_test)}")

Cross-validation scores: [0.7        0.6625     0.7125     0.575      0.6        0.65
 0.55696203 0.64556962 0.65822785 0.63291139 0.78481013 0.59493671
 0.5443038  0.59493671 0.55696203 0.67088608 0.67088608 0.70886076
 0.63291139 0.69620253]
Mean CV accuracy: 0.6424683544303796
norm_rand_forest_mae: 0.011228945726762321
norm_rand_forest_mse: 0.022457891453524642
norm_rand_foest_rmse: 0.14985957244542186


# XGBoost

In [5]:
df = ut.make_big_df()
df = ut.process(df)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

params = {
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': 4
}
num_boost_round = 50
bst = xgb.train(params, dtrain, num_boost_round)

y_pred = bst.predict(dtest)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 60.06%


# MLPs

In [6]:
# load in data
df = ut.make_big_df()
df = ut.process(df)
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# make model
mlp = MLPClassifier(hidden_layer_sizes=[20], activation='logistic', solver='sgd',
                    alpha=0, batch_size=1, learning_rate_init=.01, shuffle=True,
                    momentum=0, n_iter_no_change=50, max_iter=1000)
mlp.fit(X_train, y_train)

num_itr = mlp.n_iter_

# training set acc
train_pred = mlp.predict(X_train)
# train_acc = accuracy_score(y_train, train_pred)
train_acc = mlp.score(X_train, y_train)

# test set acc
test_pred = mlp.predict(X_test)
# test_acc = accuracy_score(y_test, test_pred)
test_acc = mlp.score(X_test, y_test)

print("Number of Iterations until Convergence:", num_itr)
print("Training Set Accuracy:", train_acc)
print("Test Set Accuracy:", test_acc)

Number of Iterations until Convergence: 159
Training Set Accuracy: 0.4968798751950078
Test Set Accuracy: 0.48909657320872274


# Tables and Charts and Other Pretty Stuff

In [7]:
from tabulate import tabulate

clf_stats = {
    'Model': 'Random Forest',
    'CV Average Accuracy': round(rand_forest_cv.mean(), 4),
    'MAE': round(rand_forest_mae, 4),
    'MSE': round(rand_forest_mse, 4),
    'RMSE': round(rand_forest_rmse, 4)
}

norm_clf_stats = {
    'Model': 'Random Forest (normalized)',
    'CV Average Accuracy': round(norm_rfst_cv.mean(), 4),
    'MAE': round(norm_rand_forest_mae, 4),
    'MSE': round(norm_rand_forest_mse, 4),
    'RMSE': round(norm_rand_forest_rmse, 4)
}

bst_stats = {
    'Model': 'XGBoost',
    'CV Average Accuracy': 0.85,
    'MAE': 0.12,
    'MSE': 0.18,
    'RMSE': 0.42
}

mlp_stats = {
    'Model': 'Multi-layer Perceptron',
    'CV Average Accuracy': 0.88,
    'MAE': 0.08,
    'MSE': 0.12,
    'RMSE': 0.34
}



# Combine the stats into a list for tabulate
all_stats = [clf_stats, norm_clf_stats, bst_stats, mlp_stats]

# Create a table
table = tabulate(all_stats, headers="keys", tablefmt="pretty")
print(table)

+----------------------------+---------------------+--------+--------+--------+
|           Model            | CV Average Accuracy |  MAE   |  MSE   |  RMSE  |
+----------------------------+---------------------+--------+--------+--------+
|       Random Forest        |       0.6363        | 0.0037 | 0.0075 | 0.0865 |
| Random Forest (normalized) |       0.6425        | 0.0112 | 0.0225 | 0.1499 |
|          XGBoost           |        0.85         |  0.12  |  0.18  |  0.42  |
|   Multi-layer Perceptron   |        0.88         |  0.08  |  0.12  |  0.34  |
+----------------------------+---------------------+--------+--------+--------+
