# NFL PLAY PREDICTION
# --------------------------------------------------------------------------------



### Data Prep

In [1]:
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import time

#ignore warnings
warnings.filterwarnings('ignore')

#Import and Clean Data
data_df = pd.read_csv("nfl_data.csv")

#From the Original Dataset
temp_cols = ["passer_player_name", "rusher_player_name"]
columns_to_encode = ["home_team", "away_team", "side_of_field", "defteam", "game_half"]

columns_to_keep = ["posteam", "ydstogo", "down", "game_seconds_remaining", "season", "qtr", "drive",
                    "yardline_100", "goal_to_go", "ydsnet", "posteam_timeouts_remaining",
                    "defteam_timeouts_remaining", "posteam_score", "defteam_score", "score_differential", 
                    "home_wp", "away_wp", "fg_prob", "td_prob", "shotgun", "no_huddle", "wp", "def_wp",
                    "quarter_seconds_remaining"]

target = ["play_type"]

df = data_df[columns_to_keep+columns_to_encode+target]
temp_df = data_df[temp_cols]
data_df = df[(df["play_type"] == "run") | (df["play_type"] == "pass")]
data_df = data_df[data_df['down'].notnull()]
data_df = data_df[data_df['game_seconds_remaining'].notnull()]
data_df = data_df[data_df['home_wp'].notnull()]
data_df = data_df[data_df['away_wp'].notnull()]
data_df = data_df[data_df['quarter_seconds_remaining'].notnull()]

## DATA ENHANCEMENTS ##

#is the possessing team playing at home?
data_df['posteam_home'] = data_df['posteam'] == data_df['home_team']

## PREVIOUS PLAY INFO ##
#what was the outcome of previous plays (run or pass)?
data_df['one_play_ago_type'] = data_df['play_type'].shift()
data_df['two_plays_ago_type'] = data_df['play_type'].shift(2)

#were the previous plays by the same team?
data_df['one_play_ago_pos_same_team'] = data_df['posteam'].shift() == data_df['posteam']
data_df['two_plays_ago_pos_same_team'] = data_df['posteam'].shift(2) == data_df['posteam']

#was the previous passer likely to throw instead of running the ball?
data_df['prev_pocket_passer'] = temp_df['passer_player_name'].shift().isin(['M.Ryan', 'A.Rodgers', 
                                                'T.Brady', 'P.Rivers', 'D.Brees', 'M.Stafford', 
                                                'J.Flacco', 'P.Manning', 'T.Romo', 'K.Cousins'])

#was the previous passer likely to run instead of throwing the ball?
data_df['prev_rush_passer'] = temp_df['passer_player_name'].shift().isin(['C.Newton', 'L.Jackson', 
                                                'D.Watson', 'D.Prescott', 'R.Wilson', 'T.Taylor', 
                                                'R.Griffin', 'M.Vick'])

data_df['prev_top_rusher'] = temp_df['rusher_player_name'].shift().isin(['E.Elliott', 'S.Barkley', 
                                                'D.Johnson', 'T.Gurley', 'A.Peterson', 'L.Bell', 
                                                'L.Mccoy', 'M.Gordon', 'K.Hunt', 'D.Murray', 
                                                'D.Martin', 'R.Rice', 'C.Johnson', 'M.Turner'])

# back-up rushers: 'J.Conner', 'M.Lynch'

#data_df.describe
# show what our data looks like: use data_df.describe

## Encoding

In [2]:
'''SideofField OHE'''
#Encode and One Hot Encode SideofField
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_SOF = LabelEncoder()
data_df["side_of_field"] = labelencoder_SOF.fit_transform(data_df["side_of_field"].astype(str))
onehotencoder_SOF = OneHotEncoder(categorical_features=[0], sparse=False)
SideofField_OHE = onehotencoder_SOF.fit_transform(data_df["side_of_field"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["side_of_field"])

'''posteam OHE'''
#Encode and One Hot Encode postteam
labelencoder_posteam = LabelEncoder()
data_df["posteam"] = labelencoder_posteam.fit_transform(data_df["posteam"].astype(str))
onehotencoder_posteam = OneHotEncoder(categorical_features=[0], sparse=False)
posteam_OHE = onehotencoder_posteam.fit_transform(data_df["posteam"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["posteam"])

'''DefensiveTeam OHE'''
#Encode and One Hot Encode DefensiveTeam
labelencoder_defteam = LabelEncoder()
data_df["defteam"] = labelencoder_defteam.fit_transform(data_df["defteam"].astype(str))
onehotencoder_defteam = OneHotEncoder(categorical_features=[0], sparse=False)
defteam_OHE = onehotencoder_defteam.fit_transform(data_df["defteam"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["defteam"])

'''AwayTeam OHE'''
#Encode and One Hot Encode AwayTeam
labelencoder_awayteam = LabelEncoder()
data_df["away_team"] = labelencoder_awayteam.fit_transform(data_df["away_team"].astype(str))
onehotencoder_awayteam = OneHotEncoder(categorical_features=[0], sparse=False)
awayteam_OHE = onehotencoder_awayteam.fit_transform(data_df["away_team"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["away_team"])

'''HomeTeam OHE'''
#Encode and One Hot Encode HomeTeam
labelencoder_hometeam = LabelEncoder()
data_df["home_team"] = labelencoder_hometeam.fit_transform(data_df["home_team"].astype(str))
onehotencoder_hometeam = OneHotEncoder(categorical_features=[0], sparse=False)
hometeam_OHE = onehotencoder_hometeam.fit_transform(data_df["home_team"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["home_team"])

'''game_half OHE'''
#Encode and One Hot Encode game_half
labelencoder_gamehalf = LabelEncoder()
data_df["game_half"] = labelencoder_gamehalf.fit_transform(data_df["game_half"].astype(str))
onehotencoder_gamehalf = OneHotEncoder(categorical_features=[0], sparse=False)
gamehalf_OHE = onehotencoder_gamehalf.fit_transform(data_df["game_half"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["game_half"])

'''one_play_ago_type OHE'''
#Encode and One Hot Encode one_play_ago_type
labelencoder_oneplayagotype = LabelEncoder()
data_df["one_play_ago_type"] = labelencoder_oneplayagotype.fit_transform(data_df["one_play_ago_type"].astype(str))
onehotencoder_oneplayagotype = OneHotEncoder(categorical_features=[0], sparse=False)
oneplayagotype_OHE = onehotencoder_oneplayagotype.fit_transform(data_df["one_play_ago_type"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["one_play_ago_type"])

'''one_play_ago_pos_same_team OHE'''
#Encode and One Hot Encode one_play_ago_pos_same_team
labelencoder_oneplayagopossameteam = LabelEncoder()
data_df["one_play_ago_pos_same_team"] = labelencoder_oneplayagopossameteam.fit_transform(data_df["one_play_ago_pos_same_team"].astype(str))
onehotencoder_oneplayagopossameteam = OneHotEncoder(categorical_features=[0], sparse=False)
oneplayagopossameteam_OHE = onehotencoder_oneplayagopossameteam.fit_transform(data_df["one_play_ago_pos_same_team"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["one_play_ago_pos_same_team"])

'''two_plays_ago_type OHE'''
#Encode and One Hot Encode two_plays_ago_type
labelencoder_twoplaysagotype = LabelEncoder()
data_df["two_plays_ago_type"] = labelencoder_twoplaysagotype.fit_transform(data_df["two_plays_ago_type"].astype(str))
onehotencoder_twoplaysagotype = OneHotEncoder(categorical_features=[0], sparse=False)
twoplaysagotype_OHE = onehotencoder_twoplaysagotype.fit_transform(data_df["two_plays_ago_type"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["two_plays_ago_type"])

'''two_plays_ago_pos_same_team OHE'''
#Encode and One Hot Encode two_plays_ago_pos_same_team
labelencoder_twoplaysagopossameteam = LabelEncoder()
data_df["two_plays_ago_pos_same_team"] = labelencoder_twoplaysagopossameteam.fit_transform(data_df["two_plays_ago_pos_same_team"].astype(str))
onehotencoder_twoplaysagopossameteam = OneHotEncoder(categorical_features=[0], sparse=False)
twoplaysagopossameteam_OHE = onehotencoder_twoplaysagopossameteam.fit_transform(data_df["two_plays_ago_pos_same_team"].values.reshape(-1,1))
data_df = data_df.drop(columns = ["two_plays_ago_pos_same_team"])



## Re-assemble the data

In [3]:
#Concatenate all the one hot encodings
OHE_matrix = np.concatenate((SideofField_OHE, posteam_OHE, defteam_OHE, awayteam_OHE, 
                             hometeam_OHE, gamehalf_OHE, oneplayagotype_OHE,
                             oneplayagopossameteam_OHE, twoplaysagotype_OHE,
                             twoplaysagopossameteam_OHE), axis=1)

#Rearrange the columns so Playtype is the last column
cols = ["ydstogo", "down", "game_seconds_remaining", "season", "qtr", "drive", "yardline_100",
        "goal_to_go", "ydsnet", "posteam_timeouts_remaining", "defteam_timeouts_remaining",
        "posteam_score", "defteam_score", "score_differential", "home_wp", "away_wp",
        "fg_prob", "td_prob", "shotgun", "no_huddle", "wp", "def_wp", "quarter_seconds_remaining",
        "posteam_home", "prev_pocket_passer", "prev_top_rusher",
        "play_type"]

data_df = data_df[cols]

X = data_df.iloc[:,:-1].values
y = data_df.iloc[:,-1].values

#Concatenate One hot encoded variables with X
X = np.concatenate((X,OHE_matrix), axis=1)


#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3,random_state=0)

In [4]:
#K-Fold Cross Validation
from sklearn.model_selection import cross_val_score

def getModelAccuracies(classifierObj): 
    modelAccuracies = cross_val_score(estimator=classifierObj, X=X_train, y=y_train, cv=10)
    return {"score": {"mean": modelAccuracies.mean(), "std": modelAccuracies.std()}}

## Normalize the data

In [5]:

#Normalizing the features
from sklearn.preprocessing import StandardScaler 
sc_X = StandardScaler()
X_train_norm = sc_X.fit_transform(X_train)
X_test_norm = sc_X.transform(X_test)

## Machine Learning Models
# -------------------------------


### Logistic Regression Model

In [6]:
#Fitting Logistic Regression to Training Set
from sklearn.linear_model import LogisticRegression
log_start_time = time.time()

classifierObj= LogisticRegression(random_state=0)
classifierObj.fit(X_train, y_train)

#Making predictions on the Test Set
y_pred= classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_logistic = confusion_matrix(y_test, y_pred)

#Get model k-fold validated scores
print(getModelAccuracies(classifierObj))
log_end_time = time.time()
print("Time to run Logistic Regression: " + str(log_end_time - log_start_time))

{'score': {'mean': 0.7150366395911079, 'std': 0.0015638080808045866}}
Time to run Logistic Regression: 153.2670919895172




### Naive Bayes Model

In [7]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.naive_bayes import GaussianNB
nb_start_time = time.time()
classifierObj = GaussianNB()
classifierObj.fit(X_train, y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_nb = confusion_matrix(y_test, y_pred)

#Get model k-fold validated scores
print(getModelAccuracies(classifierObj))
nb_end_time = time.time()
print("Time to run Naive Bayes: " + str(nb_end_time - nb_start_time))

{'score': {'mean': 0.6165668497160193, 'std': 0.004173581116792475}}
Time to run Naive Bayes: 59.67180609703064


### Decision Tree Model

In [8]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.tree import DecisionTreeClassifier
dt_start_time = time.time()
classifierObj = DecisionTreeClassifier(criterion='entropy')
classifierObj.fit(X_train,y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the prediction using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_dt = confusion_matrix(y_test, y_pred)

#Get model k-fold validated scores
print(getModelAccuracies(classifierObj))
dt_end_time = time.time()
print("Time to run Decision Tree: " + str(dt_end_time - dt_start_time))

{'score': {'mean': 0.6704839496581917, 'std': 0.0028634609856156177}}
Time to run Decision Tree: 348.4114902019501


### Random Forest

In [9]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.ensemble import RandomForestClassifier
rf_start_time = time.time()
classifierObj = RandomForestClassifier(criterion='entropy', n_estimators=29)
classifierObj.fit(X_train,y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_rf = confusion_matrix(y_test, y_pred)

#Get model k-fold validated scores
print(getModelAccuracies(classifierObj))
rf_end_time = time.time()
print("Time to run Random Forest: " + str(rf_end_time - rf_start_time))

{'score': {'mean': 0.7342641499857095, 'std': 0.0016660626276984694}}
Time to run Random Forest: 509.4532377719879


### KNN

In [24]:
## Commented out due to processing time and low accuracy

# #Normalizing the features
# from sklearn.preprocessing import StandardScaler 
# sc_X = StandardScaler()
# X_train_norm = sc_X.fit_transform(X_train)
# X_test_norm = sc_X.transform(X_test)

# #Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj 
# from sklearn.neighbors import KNeighborsClassifier
# knn_start_time = time.time()
# classifierObj = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
# classifierObj.fit(X_train_norm, y_train)

# #Making predictions on the Test Set
# y_pred = classifierObj.predict(X_test_norm)
# knn_end_time = time.time()
# #Evaluating the predictions using a Confusion Matrix
# from sklearn.metrics import confusion_matrix
# cm_knn = confusion_matrix(y_test, y_pred)

# knn_score = classifierObj.score(X_test_norm, y_test)
# print("Score:" + str(knn_score))
# print("Time to run KNN: " + str(knn_end_time - knn_start_time))

### Kernel SVM

In [0]:
## Commented out due to processing time and low accuracy

# from sklearn.svm import SVC
# ksvm_start_time = time.time()
# classifierObj = SVC(kernel='rbf')
# classifierObj.fit(X_train_norm, y_train)

# #Making predictions on the Test Set 
# y_pred = classifierObj.predict(X_test_norm)

# #Evaluating the predictions using a Confusion Matrix 
# from sklearn.metrics import confusion_matrix
# cm_svm = confusion_matrix(y_test, y_pred)
# ksvm_end_time = time.time()

# #Get model svm validated scores
# ksvm_score = classifierObj.score(X_test_norm, y_test)
# print("Score: " + str(ksvm_score))
# print("Time to run KSVM: " + str(ksvm_end_time - ksvm_start_time))