In [4]:
import utility as ut
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

In [34]:
# TODO: Process data
# This is straightforward with DataFrames: pick the features we want to use, make sure they are numeric, and scale them if necessary.
# Then select those feature columns and call .to_numpy() on them, do the same for the y values, and we are ready to train.

In [35]:
# Load the combined play-by-play frame from the project's utility module.
df = ut.make_big_df()

# Binary indicators derived from the play-type text. na=False makes a missing
# 'Play Type' count as "no match"; without it str.contains yields NaN for such
# rows, and np.where treats NaN as truthy, flagging them as both run and pass.
df['run'] = np.where(df['Play Type'].str.contains('Rush', na=False), 1, 0)
df['pass'] = np.where(df['Play Type'].str.contains('Pass', na=False), 1, 0)
df['Scoring'] = df['Scoring'].astype(int)

# Clock reading collapsed into a single seconds value.
df['totalseconds'] = (df['Clock Minutes'] * 60) + df['Clock Seconds']

# Sum of both teams' scores.
df['pointsscored'] = df['Offense Score'] + df['Defense Score']

# 'Yards Gained' from the previous row of the same 'Game Id' group; the first
# row of every group gets NaN.
# NOTE(review): this assumes make_big_df() returns plays in chronological order
# within each game -- confirm.
df['Previous Play Yards'] = df.groupby('Game Id')['Yards Gained'].shift()

In [36]:
# Columns kept for modelling. 'Play Type' is the prediction target and is split
# off from the feature matrix at the end of this cell.
key_features = ['Offense Score', 'Defense Score', 'Drive Number', 'Play Number', 'Period', 'totalseconds',
                'Offense Timeouts', 'Yard Line', 'Yards To Goal', 'Down', 'Distance', 'Previous Play Yards',
                'Play Type']

# Only rushing and passing plays are modelled; rows with any other play type are dropped.
keep_play_types = ['Rush', 'Pass', 'Rushing Touchdown', 'Pass Reception', 'Pass Incompletion',
                   'Passing Touchdown', 'Pass Interception Return']

# Collapse the detailed play outcomes into the two target classes.
play_type_mapping = {
    'Rushing Touchdown': 'Rush',
    'Pass Reception': 'Pass',
    'Pass Incompletion': 'Pass',
    'Passing Touchdown': 'Pass',
    'Pass Interception Return': 'Pass'
}

# NOTE: this cell rebinds `df`. Re-run the loading cell before re-running this
# one, otherwise the filter below runs against the already-encoded frame.
# Select the wanted rows and columns in one step and take an explicit copy so
# the column assignments below write to an independent frame.
df = df.loc[df['Play Type'].isin(keep_play_types), key_features].copy()

# Apply the mapping to 'Play Type' only, instead of scanning every column of
# the frame for matching values.
df['Play Type'] = df['Play Type'].replace(play_type_mapping)

# Keep the fitted encoder so the integer labels can be translated back through
# play_type_encoder.classes_ / inverse_transform. LabelEncoder numbers classes
# in sorted order, so with the mapping above 'Pass' -> 0 and 'Rush' -> 1.
play_type_encoder = LabelEncoder()
df['Play Type'] = play_type_encoder.fit_transform(df['Play Type'])

# Drop rows with any missing feature (e.g. the NaN 'Previous Play Yards' rows
# produced by the per-game shift).
df = df.dropna()
X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

In [37]:
# TODO: Train models
# Also fairly simple: feed in the data, tune the hyperparameters (e.g. with a grid search), and let the model train.

In [38]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Seeded random forest with otherwise default hyperparameters.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Report mean accuracy on each split.
for split_name, split_X, split_y in (
    ("Training", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name} accuracy: {clf.score(split_X, split_y)}")

Training accuracy: 1.0
Test accuracy: 0.5740740740740741


# Ensemble of MLPs with boosting

In [None]:
# Reuses X_train / X_test / y_train / y_test produced by the train/test split cell.
# NOTE(review): the features are passed in unscaled; scikit-learn recommends
# standardising inputs for MLPs -- consider adding a scaler fitted on X_train.

# Single hidden layer of 20 logistic units trained with per-sample SGD
# (batch_size=1), no L2 penalty (alpha=0) and no momentum. random_state seeds
# the weight initialisation and the per-epoch shuffle so repeated runs give the
# same result, matching the seeded split and random forest.
mlp = MLPClassifier(hidden_layer_sizes=(20,), activation='logistic', solver='sgd',
                    alpha=0, batch_size=1, learning_rate_init=.01, shuffle=True,
                    momentum=0, n_iter_no_change=50, max_iter=1000,
                    random_state=42)
# fit() returns the estimator itself, so `clf` and `mlp` are the same object.
clf = mlp.fit(X_train, y_train)

# Number of iterations the solver actually ran (bounded by max_iter).
num_itr = clf.n_iter_

# Training-set accuracy
train_pred = clf.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)

# Test-set accuracy
test_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print("Number of Iterations until Convergence:", num_itr)
print("Training Set Accuracy:", train_acc)
print("Test Set Accuracy:", test_acc)