In [7]:
import utility as ut
import numpy as np
import random as random
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

In [56]:
# TODO: Process data
# This is easy with dataframes. Pick out what features we wanna use, make sure they are numerical, scale (if necessary)
# Then just pick out the features and go .to_numpy() and then our y values .to_numpy() and we're ready to train

In [85]:
def process(df):
    

    # Filter the DataFrame
    keep_play_types = ['Rush', 'Pass', 'Rushing Touchdown', 'Pass Reception', 'Pass Incompletion',
                            'Passing Touchdown', 'Pass Interception Return', 'Punt', 'Interception', 
                            'Field Goal Good', 'Field Goal Missed']
    df = df[df['Play Type'].isin(keep_play_types)]

    # Simplify play types
    play_type_mapping = {
        'Rushing Touchdown': 'Rush',
        'Pass Reception': 'Pass',
        'Pass Incompletion': 'Pass',
        'Passing Touchdown': 'Pass',
        'Pass Interception Return': 'Pass',
        'Interception': 'Pass',
        'Field Goal Good': 'Field Goal',
        'Field Goal Missed': 'Field Goal'
    }
    df = df.replace(play_type_mapping)

    # Add some useful variables
    df['run'] = np.where(df['Play Type'].str.contains('Rush'), 1, 0)
    df['pass'] = np.where(df['Play Type'].str.contains('Pass'), 1, 0)
    df['Scoring'] = df['Scoring'].astype(int)
    df['totalseconds'] = (df['Clock Minutes'] * 60) + df['Clock Seconds']
    df[['Clock Minutes', 'Clock Seconds']]
    df[['Clock Minutes', 'Clock Seconds', 'totalseconds']]
    df['pointsscored'] = df['Offense Score'] + df['Defense Score']
    df[['Offense Score', 'Defense Score', 'pointsscored']]
    df['pointsscored'] = df['Offense Score'] + df['Defense Score']
    
    # Make lagged variables
    df['L1 Play Yards'] = df.groupby('Game Id')['Yards Gained'].shift()
    df['L2 Play Yards'] = df.groupby('Game Id')['Yards Gained'].shift(2)
    df['L1 Yards Gained'] = df.groupby('Game Id')['Yards Gained'].shift()
    df['L2 Yards Gained'] = df.groupby('Game Id')['Yards Gained'].shift(2)
    df['L1 Play Type'] = df.groupby('Game Id')['Play Type'].shift()
    df['L2 Play Type'] = df.groupby('Game Id')['Play Type'].shift(2)
    df['L1 Down'] = df.groupby('Game Id')['Down'].shift()
    df['L2 Down'] = df.groupby('Game Id')['Down'].shift(2)
    df['L1 Distance'] = df.groupby('Game Id')['Distance'].shift()
    df['L2 Distance'] = df.groupby('Game Id')['Distance'].shift(2)

    key_features = ['Offense Score', 'Defense Score', 'Drive Number', 'Play Number', 'Period', 'totalseconds', 
                'Offense Timeouts', 'Yard Line', 'Yards To Goal', 'Down', 'Distance', 'L1 Play Yards', 'L2 Play Yards',
                'Play Type', 'L1 Yards Gained', 'L2 Yards Gained', 'L1 Play Type', 'L2 Play Type', 'L1 Down', 
                'L2 Down', 'L1 Distance', 'L1 Distance']
    
    # Encode categorical columns
    columns_to_encode = ['Play Type', 'L1 Play Type', 'L2 Play Type']
    label_encoder = LabelEncoder()
    
    df[columns_to_encode] = df[columns_to_encode].apply(lambda col: label_encoder.fit_transform(col))


    return df[key_features]

In [87]:
df = ut.make_big_df()
df = process(df)
df.to_csv('all.csv')

In [61]:
df = ut.make_big_df()
# df['run'] = np.where(df['Play Type'].str.contains('Rush'), 1, 0)
# df['pass'] = np.where(df['Play Type'].str.contains('Pass'), 1, 0)
# df['Scoring'] = df['Scoring'].astype(int)

df[['Clock Minutes', 'Clock Seconds']]
df['totalseconds'] = (df['Clock Minutes'] * 60) + df['Clock Seconds']
df[['Clock Minutes', 'Clock Seconds', 'totalseconds']]
df['pointsscored'] = df['Offense Score'] + df['Defense Score']
df[['Offense Score', 'Defense Score', 'pointsscored']]
df['L1 Yards Gained'] = df.groupby('Game Id')['Yards Gained'].shift()
df['L2 Yards Gained'] = df.groupby('Game Id')['Yards Gained'].shift(2)
df['L1 Play Type'] = df.groupby('Game Id')['Play Type'].shift()
df['L2 Play Type'] = df.groupby('Game Id')['Play Type'].shift(2)
df['L1 Down'] = df.groupby('Game Id')['Down'].shift()
df['L2 Down'] = df.groupby('Game Id')['Down'].shift(2)
df['L1 Down'] = df.groupby('Game Id')['Distance'].shift()
df['L2 Down'] = df.groupby('Game Id')['Distance'].shift(2)
# Down and distance from prev play
df['Play Type'].value_counts()

Play Type
Rush                          252
Pass Reception                167
Pass Incompletion              98
Penalty                        39
Timeout                        34
Kickoff                        33
Punt                           29
Passing Touchdown              22
Kickoff Return (Offense)       17
Rushing Touchdown              11
End Period                     10
Sack                            9
End of Half                     7
Field Goal Good                 7
Field Goal Missed               5
End of Game                     5
Fumble Recovery (Own)           4
Pass Interception Return        2
Fumble Recovery (Opponent)      2
Kickoff Return Touchdown        1
Interception                    1
Safety                          1
Name: count, dtype: int64

In [58]:
key_features = ['Offense Score', 'Defense Score', 'Drive Number', 'Play Number', 'Period', 'totalseconds', 
                'Offense Timeouts', 'Yard Line', 'Yards To Goal', 'Down', 'Distance', 'Previous Play Yards', 
                'Play Type']


# Filter the DataFrame
keep_play_types = ['Rush', 'Pass', 'Rushing Touchdown', 'Pass Reception', 'Pass Incompletion',
                           'Passing Touchdown', 'Pass Interception Return']
df = df[df['Play Type'].isin(keep_play_types)]

play_type_mapping = {
    'Rushing Touchdown': 'Rush',
    'Pass Reception': 'Pass',
    'Pass Incompletion': 'Pass',
    'Passing Touchdown': 'Pass',
    'Pass Interception Return': 'Pass'
}

df = df.replace(play_type_mapping)
df['Play Type'] = LabelEncoder().fit_transform(df['Play Type'])


df = df[key_features]
# df = df.dropna()
X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

In [59]:
# TODO: Train models
# Also pretty easy, just input the stuff, tweak hyperparameters (grid search baby) and set that beast loose

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

print(f"Training accuracy: {clf.score(X_train, y_train)}")
print(f"Test accuracy: {clf.score(X_test, y_test)}")

Training accuracy: 1.0
Test accuracy: 0.5185185185185185


# Ensemble of MLPs with boosting

# 

In [11]:
# load in data
df = ut.make_big_df()
df = ut.process(df)

X = df.drop(columns=['Play Type']).to_numpy()
y = df['Play Type'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=random.randint(1, 20))

# make model
mlp = MLPClassifier(hidden_layer_sizes=[20], activation='logistic', solver='sgd',
                    alpha=0, batch_size=1, learning_rate_init=.01, shuffle=True,
                    momentum=0, n_iter_no_change=50, max_iter=1000)
clf = mlp.fit(X_train, y_train)

num_itr = clf.n_iter_

# training set acc
train_pred = clf.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)

# test set acc
test_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print("Number of Iterations until Convergence:", num_itr)
print("Training Set Accuracy:", train_acc)
print("Test Set Accuracy:", test_acc)

AttributeError: module 'utility' has no attribute 'process'