In [1]:
import utility as ut
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Random Forest

In [2]:
# Make dataframe
df = ut.make_big_df()
df = ut.process(df)
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()
df


Unnamed: 0,Offense Score,Defense Score,Drive Number,Play Number,Period,totalseconds,Offense Timeouts,Yards To Goal,Down,Distance,Play Type,L1 Yards Gained,L2 Yards Gained,L1 Play Type,L2 Play Type,L1 Down,L2 Down,L1 Distance,L2 Distance,point diff
0,0,0,1,4,1,840,3.0,71,3,6,1,4.0,0.0,3.0,1.0,2.0,1.0,10.0,10.0,0
1,0,0,1,5,1,833,3.0,61,1,10,1,10.0,4.0,1.0,3.0,3.0,2.0,6.0,10.0,0
2,0,0,1,6,1,827,3.0,61,2,10,1,0.0,10.0,1.0,1.0,1.0,3.0,10.0,6.0,0
3,0,0,1,7,1,821,3.0,61,3,10,1,0.0,0.0,1.0,1.0,2.0,1.0,10.0,10.0,0
4,0,0,1,8,1,814,3.0,61,4,10,2,0.0,0.0,1.0,1.0,3.0,2.0,10.0,10.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,35,20,22,1,4,162,-5.0,36,1,10,3,44.0,0.0,0.0,3.0,4.0,3.0,27.0,27.0,15
1551,35,20,22,3,4,155,-5.0,37,2,11,3,1.0,44.0,3.0,0.0,1.0,4.0,10.0,27.0,15
1552,35,20,22,5,4,150,-6.0,38,3,12,3,1.0,1.0,3.0,3.0,2.0,1.0,11.0,10.0,15
1553,35,20,22,7,4,96,-6.0,42,4,16,2,1.0,1.0,3.0,3.0,3.0,2.0,12.0,11.0,15


In [3]:
# Train random forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

clf = RandomForestClassifier()
rand_forest_cv = cross_val_score(clf, X_train, y_train, cv=20)
print("Cross-validation scores:", rand_forest_cv)
print("Mean CV accuracy:", rand_forest_cv.mean())
# Train on all data now that we have done CV
clf.fit(X, y)

y_pred = clf.predict(X)

# Calculate MAE, MSE, and RMSE
rand_forest_mae = mean_absolute_error(y, y_pred)
rand_forest_mse = mean_squared_error(y, y_pred)
rand_forest_rmse = np.sqrt(rand_forest_mse)

print(f"rand_forest_mae: {rand_forest_mae}")
print(f"rand_forest_mse: {rand_forest_mse}")
print(f"rand_forest_rmse: {rand_forest_rmse}")
# print(f"Training accuracy: {clf.score(X_train, y_train)}")
# print(f"Test accuracy: {clf.score(X_test, y_test)}")

Cross-validation scores: [0.76623377 0.5974026  0.7012987  0.68831169 0.68831169 0.62337662
 0.62337662 0.62337662 0.66233766 0.58441558 0.66233766 0.7012987
 0.64935065 0.67532468 0.64935065 0.71428571 0.67532468 0.51948052
 0.53246753 0.59210526]
Mean CV accuracy: 0.6464883800410115
rand_forest_mae: 0.0
rand_forest_mse: 0.0
rand_forest_rmse: 0.0


In [4]:
normalized_x = ut.normalize(X)
X_train, X_test, y_train, y_test = train_test_split(normalized_x, y, test_size=0.01)

norm_clf = RandomForestClassifier()
norm_clf.fit(X_train, y_train)

norm_rfst_cv = cross_val_score(clf, X_train, y_train, cv=20)
print("Cross-validation scores:", norm_rfst_cv)
print("Mean CV accuracy:", norm_rfst_cv.mean())

# Now train on all of the data
clf.fit(normalized_x, y)

y_pred = norm_clf.predict(normalized_x)

# Calculate MAE, MSE, and RMSE
norm_rand_forest_mae = mean_absolute_error(y, y_pred)
norm_rand_forest_mse = mean_squared_error(y, y_pred)
norm_rand_forest_rmse = np.sqrt(norm_rand_forest_mse)

print(f"norm_rand_forest_mae: {norm_rand_forest_mae}")
print(f"norm_rand_forest_mse: {norm_rand_forest_mse}")
print(f"norm_rand_foest_rmse: {norm_rand_forest_rmse}")

# print(f"Training accuracy: {clf.score(X_train, y_train)}")
# print(f"Test accuracy: {clf.score(X_test, y_test)}")

Cross-validation scores: [0.63636364 0.7012987  0.7012987  0.67532468 0.54545455 0.7012987
 0.64935065 0.62337662 0.64935065 0.63636364 0.62337662 0.54545455
 0.63636364 0.62337662 0.67532468 0.63636364 0.62337662 0.58441558
 0.61038961 0.60526316]
Mean CV accuracy: 0.6341592617908407
norm_rand_forest_mae: 0.0038585209003215433
norm_rand_forest_mse: 0.0077170418006430866
norm_rand_foest_rmse: 0.08784669487603439


# XGBoost

In [15]:
df = ut.make_big_df()
df = ut.process(df)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

params = {
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': 4
}
num_boost_round = 50
bst = xgb.train(params, dtrain, num_boost_round)

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    nfold=20,
    metrics={'merror'},  # You can use other evaluation metrics
    early_stopping_rounds=10
)


best_num_boost_round = cv_results.shape[0]
# Train the model on the entire dataset
final_model = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=best_num_boost_round)
xgb_cv_accuracy = 1 - cv_results['test-merror-mean'].mean()


y_pred = bst.predict(xgb.DMatrix(X))


xgb_mae = mean_absolute_error(y, y_pred)
xgb_mse = mean_squared_error(y, y_pred)
xgb_rmse = np.sqrt(norm_rand_forest_mse)


# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

0.6551470044378699

In [None]:
df = ut.make_big_df()
df = ut.process(df)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()
normalized_x = ut.normalize(X)

X_train, X_test, y_train, y_test = train_test_split(normalized_x, y, test_size=0.2)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

params = {
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': 4
}
num_boost_round = 50
bst = xgb.train(params, dtrain, num_boost_round)

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    nfold=20,
    metrics={'merror'},  # You can use other evaluation metrics
    early_stopping_rounds=10
)


best_num_boost_round = cv_results.shape[0]
# Train the model on the entire dataset
final_model = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=best_num_boost_round)
norm_xgb_cv_accuracy = 1 - cv_results['test-merror-mean'].mean()


y_pred = bst.predict(xgb.DMatrix(X))


norm_xgb_mae = mean_absolute_error(y, y_pred)
norm_xgb_mse = mean_squared_error(y, y_pred)
norm_xgb_rmse = np.sqrt(norm_rand_forest_mse)

# MLPs

In [6]:
# load in data
df = ut.make_big_df()
df = ut.process(df)
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# make model
mlp = MLPClassifier(hidden_layer_sizes=[10], activation='logistic', solver='sgd',
                    alpha=0, batch_size=1, learning_rate_init=.01, shuffle=True,
                    momentum=0, n_iter_no_change=50, max_iter=1000)
# mlp.fit(X_train, y_train)

mlp_cv = cross_val_score(mlp, X, y, cv=5)
print("Cross-validation scores:", mlp_cv)
print("Mean CV accuracy:", mlp_cv.mean())

# Now train on all data
mlp.fit(X, y)

y_pred = mlp.predict(X)
mlp_mae = mean_absolute_error(y, y_pred)
mlp_mse = mean_squared_error(y, y_pred)
mlp_rmse = np.sqrt(mlp_mse)

print(f'mlp_mae: {mlp_mae}')
print(f'mlp_mse: {mlp_mse}')
print(f'mlp_rmse: {mlp_rmse}')

num_itr = mlp.n_iter_

# training set acc
train_pred = mlp.predict(X_train)
# train_acc = accuracy_score(y_train, train_pred)
# train_acc = mlp.score(X_train, y_train)

# test set acc
test_pred = mlp.predict(X_test)
# test_acc = accuracy_score(y_test, test_pred)
test_acc = mlp.score(X_test, y_test)

print("Number of Iterations until Convergence:", num_itr)
# print("Training Set Accuracy:", train_acc)
# print("Test Set Accuracy:", test_acc)

Cross-validation scores: [0.46623794 0.47588424 0.49196141 0.44694534 0.48874598]
Mean CV accuracy: 0.4739549839228296
mlp_mae: 0.940192926045016
mlp_mse: 1.8045016077170417
mlp_rmse: 1.3433173890473695
Number of Iterations until Convergence: 90


In [7]:
# make model
mlp = MLPClassifier(hidden_layer_sizes=[10], activation='logistic', solver='sgd',
                    alpha=0, batch_size=1, learning_rate_init=.01, shuffle=True,
                    momentum=0, n_iter_no_change=50, max_iter=1000)
# mlp.fit(X_train, y_train)

normalized_x = ut.normalize(X)
norm_mlp_cv = cross_val_score(mlp, normalized_x, y, cv=5)
print("Cross-validation scores:", mlp_cv)
print("Mean CV accuracy:", norm_mlp_cv.mean())

# Now train on all data
mlp.fit(normalized_x, y)

y_pred = mlp.predict(normalized_x)
norm_mlp_mae = mean_absolute_error(y, y_pred)
norm_mlp_mse = mean_squared_error(y, y_pred)
norm_mlp_rmse = np.sqrt(norm_mlp_mse)

print(f"norm_mlp_mae: {norm_mlp_mae}")
print(f"norm_mlp_mse: {norm_mlp_mse}")
print(f"norm_mlp_rmse: {norm_mlp_rmse}")



Cross-validation scores: [0.46623794 0.47588424 0.49196141 0.44694534 0.48874598]
Mean CV accuracy: 0.4739549839228296
norm_mlp_mae: 0.5209003215434084
norm_mlp_mse: 1.0366559485530547
norm_mlp_rmse: 1.0181630265105164


In [9]:
mlp.score(X_train, y_train)

0.272508038585209

# Tables and Charts and Other Pretty Stuff

In [8]:
from tabulate import tabulate

clf_stats = {
    'Model': 'Random Forest',
    'CV Average Accuracy': round(rand_forest_cv.mean(), 4),
    'MAE': round(rand_forest_mae, 4),
    'MSE': round(rand_forest_mse, 4),
    'RMSE': round(rand_forest_rmse, 4)
}

norm_clf_stats = {
    'Model': 'Random Forest (normalized)',
    'CV Average Accuracy': round(norm_rfst_cv.mean(), 4),
    'MAE': round(norm_rand_forest_mae, 4),
    'MSE': round(norm_rand_forest_mse, 4),
    'RMSE': round(norm_rand_forest_rmse, 4)
}

bst_stats = {
    'Model': 'XGBoost',
    'CV Average Accuracy': round(xgb_cv_accuracy, 4),
    'MAE': round(xgb_mae, 4),
    'MSE': round(xgb_mse, 4),
    'RMSE': round(xgb_rmse, 4)
}

norm_xgb_cv_accuracy
norm_bst_stats = {
    'Model': 'XGBoost (Normalized Inputs)',
    'CV Average Accuracy': round(xgb_cv_accuracy, 4),
    'MAE': round(xgb_mae, 4),
    'MSE': round(xgb_mse, 4),
    'RMSE': round(xgb_rmse, 4)
}

mlp_stats = {
    'Model': 'Multi-layer Perceptron',
    'CV Average Accuracy': round(mlp_cv.mean(), 4),
    'MAE': round(mlp_mae, 4),
    'MSE': round(mlp_mse, 4),
    'RMSE': round(mlp_rmse, 4)
}

norm_mlp_stats = {
    'Model': 'Multi-layer Perceptron (Normalized Inputs)',
    'CV Average Accuracy': round(norm_mlp_cv.mean(), 4),
    'MAE': round(norm_mlp_mae, 4),
    'MSE': round(norm_mlp_mse, 4),
    'RMSE': round(norm_mlp_rmse, 4)
}



# Combine the stats into a list for tabulate
all_stats = [mlp_stats, norm_mlp_stats, clf_stats, norm_clf_stats, bst_stats]

# Create a table
table = tabulate(all_stats, headers="keys", tablefmt="pretty")
print(table)

+--------------------------------------------+---------------------+--------+--------+--------+
|                   Model                    | CV Average Accuracy |  MAE   |  MSE   |  RMSE  |
+--------------------------------------------+---------------------+--------+--------+--------+
|           Multi-layer Perceptron           |        0.474        | 0.9402 | 1.8045 | 1.3433 |
| Multi-layer Perceptron (Normalized Inputs) |       0.5929        | 0.5209 | 1.0367 | 1.0182 |
|               Random Forest                |       0.6465        |  0.0   |  0.0   |  0.0   |
|         Random Forest (normalized)         |       0.6342        | 0.0039 | 0.0077 | 0.0878 |
|                  XGBoost                   |        0.85         |  0.12  |  0.18  |  0.42  |
+--------------------------------------------+---------------------+--------+--------+--------+
