In [36]:
import utility as ut
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Random Forest

In [37]:
# Make dataframe
df = ut.make_big_df()
df = ut.process(df)
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()


In [38]:
# Train random forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

clf = RandomForestClassifier()
rand_forest_cv = cross_val_score(clf, X_train, y_train, cv=20)
print("Cross-validation scores:", rand_forest_cv)
print("Mean CV accuracy:", rand_forest_cv.mean())
# Train on all data now that we have done CV
clf.fit(X, y)

y_pred = clf.predict(X)

# Calculate MAE, MSE, and RMSE
rand_forest_mae = mean_absolute_error(y, y_pred)
rand_forest_mse = mean_squared_error(y, y_pred)
rand_forest_rmse = np.sqrt(rand_forest_mse)

print(f"rand_forest_mae: {rand_forest_mae}")
print(f"rand_forest_mse: {rand_forest_mse}")
print(f"rand_forest_rmse: {rand_forest_rmse}")
# print(f"Training accuracy: {clf.score(X_train, y_train)}")
# print(f"Test accuracy: {clf.score(X_test, y_test)}")

Cross-validation scores: [0.5375     0.6625     0.8        0.6375     0.6        0.5875
 0.56962025 0.60759494 0.65822785 0.62025316 0.63291139 0.63291139
 0.67088608 0.67088608 0.67088608 0.63291139 0.5443038  0.60759494
 0.69620253 0.59493671]
Mean CV accuracy: 0.631756329113924
rand_forest_mae: 0.0037429819089207735
rand_forest_mse: 0.007485963817841547
rand_foest_rmse: 0.08652146449200653


In [39]:
normalized_x = ut.normalize(X)
X_train, X_test, y_train, y_test = train_test_split(normalized_x, y, test_size=0.01)

norm_clf = RandomForestClassifier()
norm_clf.fit(X_train, y_train)

norm_rfst_cv = cross_val_score(clf, X_train, y_train, cv=10)
print("Cross-validation scores:", norm_rfst_cv)
print("Mean CV accuracy:", norm_rfst_cv.mean())

clf.fit(normalized_x, y)

y_pred = norm_clf.predict(normalized_x)

# Calculate MAE, MSE, and RMSE
norm_rand_forest_mae = mean_absolute_error(y, y_pred)
norm_rand_forest_mse = mean_squared_error(y, y_pred)
norm_rand_forest_rmse = np.sqrt(rand_forest_mse)

print(f"norm_rand_forest_mae: {norm_rand_forest_mae}")
print(f"norm_rand_forest_mse: {norm_rand_forest_mse}")
print(f"norm_rand_foest_rmse: {norm_rand_forest_rmse}")

# print(f"Training accuracy: {clf.score(X_train, y_train)}")
# print(f"Test accuracy: {clf.score(X_test, y_test)}")

Cross-validation scores: [0.64779874 0.67295597 0.66666667 0.61006289 0.61006289 0.62264151
 0.62658228 0.64556962 0.63291139 0.62025316]
Mean CV accuracy: 0.6355505134941486
norm_rand_forest_mae: 0.006238303181534623
norm_rand_forest_mse: 0.012476606363069246
norm_rand_foest_rmse: 0.08652146449200653


# XGBoost

In [40]:
df = ut.make_big_df()
df = ut.process(df)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

params = {
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': 4
}
num_boost_round = 50
bst = xgb.train(params, dtrain, num_boost_round)

y_pred = bst.predict(dtest)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 63.47%


# MLPs

In [45]:
# load in data
df = ut.make_big_df()
df = ut.process(df)
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# make model
mlp = MLPClassifier(hidden_layer_sizes=[20], activation='logistic', solver='sgd',
                    alpha=0, batch_size=1, learning_rate_init=.01, shuffle=True,
                    momentum=0, n_iter_no_change=50, max_iter=1000)
mlp.fit(X_train, y_train)

num_itr = mlp.n_iter_

# training set acc
train_pred = mlp.predict(X_train)
# train_acc = accuracy_score(y_train, train_pred)
train_acc = mlp.score(X_train, y_train)

# test set acc
test_pred = mlp.predict(X_test)
# test_acc = accuracy_score(y_test, test_pred)
test_acc = mlp.score(X_test, y_test)

print("Number of Iterations until Convergence:", num_itr)
print("Training Set Accuracy:", train_acc)
print("Test Set Accuracy:", test_acc)

Number of Iterations until Convergence: 101
Training Set Accuracy: 0.4641185647425897
Test Set Accuracy: 0.45794392523364486


# Tables and Charts and Other Pretty Stuff

In [47]:
from tabulate import tabulate

clf_stats = {
    'Model': 'Random Forest',
    'CV Average Accuracy': round(rand_forest_cv.mean(), 4),
    'MAE': round(rand_forest_mae, 4),
    'MSE': round(rand_forest_mse, 4),
    'RMSE': round(rand_forest_rmse, 4)
}

norm_clf_stats = {
    'Model': 'Random Forest',
    'CV Average Accuracy': round(norm_rfst_cv.mean(), 4),
    'MAE': round(norm_rand_forest_mae, 4),
    'MSE': round(norm_rand_forest_mse, 4),
    'RMSE': round(norm_rand_forest_rmse, 4)
}

bst_stats = {
    'Model': 'XGBoost',
    'CV Average Accuracy': 0.85,
    'MAE': 0.12,
    'MSE': 0.18,
    'RMSE': 0.42
}

mlp_stats = {
    'Model': 'Multi-layer Perceptron',
    'CV Average Accuracy': 0.88,
    'MAE': 0.08,
    'MSE': 0.12,
    'RMSE': 0.34
}



# Combine the stats into a list for tabulate
all_stats = [clf_stats, norm_clf_stats, bst_stats, mlp_stats]

# Create a table
table = tabulate(all_stats, headers="keys", tablefmt="pretty")
print(table)

+------------------------+---------------------+--------+--------+--------+-------------------+------------------+
|         Model          | CV Average Accuracy |  MAE   |  MSE   |  RMSE  | Training Accuracy | Testing Accuracy |
+------------------------+---------------------+--------+--------+--------+-------------------+------------------+
|     Random Forest      |       0.6318        | 0.0037 | 0.0075 | 0.0865 |                   |                  |
|     Random Forest      |       0.6356        | 0.0062 | 0.0125 | 0.0865 |                   |                  |
|        XGBoost         |        0.85         |  0.12  |  0.18  |  0.42  |       0.88        |       0.8        |
| Multi-layer Perceptron |        0.88         |  0.08  |  0.12  |  0.34  |       0.92        |       0.84       |
+------------------------+---------------------+--------+--------+--------+-------------------+------------------+
