# NFL Injury Prediction

By Kushal Gourikrishna

## Problem Statement

The NFL has always been a very violent sport, and injuries are an unfortunate byproduct of this. However, aside from injuries caused by the physicality of the game, a large number of non-contact injuries are also suffered every year. Turf type has long been suspected of having an impact on player injuries, but a variety of other factors on every play could also play a role, such as the speed and direction of the player, the weather conditions, and the stadium.

## Objective

Build and test different classification models and choose a model that can help predict injuries to NFL players based on a suite of factors present during a typical NFL game.

## Setup

In [None]:
# Import libraries
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style

import tensorflow as tf
from tensorflow import keras as kr
from keras import metrics
tf.get_logger().setLevel('INFO')

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score, classification_report,recall_score,precision_score,roc_auc_score,roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [None]:
# load the data
injury_record = pd.read_csv('InjuryRecord.csv')
player_track_data = pd.read_csv('PlayerTrackData.csv')
play_list = pd.read_csv('PlayList.csv')

In [None]:
# set the max columns to 50
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows',50)

## Data Exploration and Cleaning

### Injury Data

In [None]:
#See head of injury record data
injury_record.head()

In [None]:
#See baseline stats of injury record
injury_record.describe(include='all')

#### Fill Nulls for Injury Data

In [None]:
#Check injury data for nulls
injury_record.isnull().sum()

In [None]:
# Fill missing PlayKey values in the injury dataframe with the last play
# recorded for that game in play_list (last row per GameID, in row order).
# The GameID -> PlayKey lookup is built once instead of rescanning play_list
# for every injury row, and a game that has no rows in play_list now keeps a
# null PlayKey instead of raising IndexError on .iloc[-1].
last_play_per_game = (
    play_list.drop_duplicates(subset='GameID', keep='last')
    .set_index('GameID')['PlayKey']
)
injury_record['PlayKey'] = injury_record['PlayKey'].fillna(
    injury_record['GameID'].map(last_play_per_game)
)

In [None]:
#Check injury data for nulls after filling missing values
injury_record.isnull().sum()

### Play List Data

In [None]:
play_list.head(15)

In [None]:
play_list.describe(include='all')

#### Check Nulls for Play List Data

In [None]:
play_list.isnull().sum()

#### Clean Play List and normalize categorical variables

In [None]:
play_list['StadiumType'].unique()

In [None]:
play_list['Weather'].unique()

In [None]:
# Raw StadiumType spellings, grouped by the category they are collapsed into.
array_outdoors = ['Outdoor', 'Oudoor', 'Outdoors',
       'Ourdoor', 'Outddors', 'Heinz Field', 'Outdor', 'Outside', 'Cloudy']
array_indoors = ['Indoors', 'Indoor', 'Indoor', 'Retractable Roof']
array_open = ['Open','Outdoor Retr Roof-Open', 'Retr. Roof-Open', 'Indoor, Open Roof',
       'Domed, Open', 'Domed, open', 'Retr. Roof - Open']
array_closed = ['Closed Dome', 'Domed, closed', 'Dome', 'Domed',
       'Retr. Roof-Closed', 'Outdoor Retr Roof-Open', 'Retractable Roof', 'Indoor, Roof Closed', 'Retr. Roof - Closed', 'Bowl', 'Dome, closed',
       'Retr. Roof Closed']

# The groups are applied one after another, so a spelling that appears in two
# groups ('Retractable Roof', 'Outdoor Retr Roof-Open') is claimed by the
# group that is applied first.
stadium_groups = (
    (array_outdoors, 'Outdoors'),
    (array_indoors, 'Indoors'),
    (array_open, 'Roof Open'),
    (array_closed, 'Roof Closed'),
)
for raw_spellings, category in stadium_groups:
    play_list['StadiumType'] = play_list['StadiumType'].replace(raw_spellings, category)

In [None]:
# Raw Weather descriptions, grouped by the category they are collapsed into.
# Each description appears in exactly one group. Previously 'Cloudy, Rain'
# was listed under both cloudy and precipitation and, because the cloudy
# replacement ran first, observed rain ended up labelled 'Cloudy'.
array_clear = ['Clear and warm', 'Sunny', 'Clear',
       'Sunny and warm', 'Clear and Cool',
       'Clear and cold', 'Sunny and cold', 'Partly Sunny',
       'Mostly Sunny', 'Clear Skies', 'Partly sunny',
       'Sunny and clear', 'Clear skies',
       'Sunny Skies', 'Fair', 'Partly clear',
       'Heat Index 95', 'Sunny, highs to upper 80s',
       'Mostly sunny', 'Sunny, Windy', 'Mostly Sunny Skies',
       'Clear and Sunny', 'Clear and sunny',
       'Clear to Partly Cloudy', 'Cold']

# 'Cloudy, 50% change of rain' stays here (as it effectively did before),
# alongside 'Cloudy, chance of rain'.
array_cloudy = ['Mostly Cloudy', 'Cloudy',
       'Cloudy, fog started developing in 2nd quarter',
       'Partly Cloudy', 'Mostly cloudy', 'Cloudy and cold',
       'Cloudy and Cool', 'Partly cloudy',
       'Party Cloudy', 'Hazy', 'Partly Clouidy',
       'Overcast', 'Cloudy, 50% change of rain',
       'Mostly Coudy', 'Cloudy, chance of rain',
       'Sun & clouds',
       'cloudy', 'Coudy']

array_indoors = ['Controlled Climate','Indoor',
       'N/A (Indoors)', 'Indoors', 'N/A Indoor']

array_precip = ['Rain',
       'Snow',
       'Scattered Showers',
       'Light Rain',
       'Heavy lake effect snow', 'Cloudy, Rain',
       'Rainy',
       'Cloudy, light snow accumulating 1-3"',
       'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Rain shower', 'Rain likely, temps in low 40s.', 'Rain Chance 40%',
       '10% Chance of Rain', 'Showers', '30% Chance of Rain']

play_list['Weather'] = play_list['Weather'].replace(array_clear, 'Clear')
play_list['Weather'] = play_list['Weather'].replace(array_cloudy, 'Cloudy')
play_list['Weather'] = play_list['Weather'].replace(array_indoors, 'Indoors')
play_list['Weather'] = play_list['Weather'].replace(array_precip, 'Precipitation')

##### After cleaning, fill nulls and invalid data for Play List data

In [None]:
# Rows with no stadium type whose weather is anything other than 'Indoors'
# (a missing weather value also counts as "not Indoors") become 'Outdoors'
stadium_missing = play_list['StadiumType'].isna()
weather_not_indoors = play_list['Weather'].ne('Indoors')
play_list.loc[stadium_missing & weather_not_indoors, 'StadiumType'] = 'Outdoors'

In [None]:
# Missing weather in any stadium that is neither 'Roof Open' nor 'Outdoors'
# is recorded as 'Indoors'
weather_missing = play_list['Weather'].isna()
open_air_stadium = play_list['StadiumType'].isin(['Roof Open', 'Outdoors'])
play_list.loc[weather_missing & ~open_air_stadium, 'Weather'] = 'Indoors'

In [None]:
# Any weather value still missing at this point is set to 'Cloudy'
# (described by the author as the mode for outdoor stadiums)
play_list['Weather'] = play_list['Weather'].fillna('Cloudy')

In [None]:
# Missing play types are set to 'Pass' (described by the author as the mode)
play_list['PlayType'] = play_list['PlayType'].fillna('Pass')

In [None]:
#Check that null values of play list are gone
play_list.isnull().sum()

In [None]:
play_list['PlayType'].value_counts()

In [None]:
# Drop rows whose PlayType is the literal string '0'. Take an explicit copy so
# the later .loc assignments on play_list write to an independent frame
# instead of a slice of the original (avoids SettingWithCopyWarning).
play_list = play_list[play_list.PlayType != '0'].copy()

In [None]:
play_list['PlayType'].value_counts()

In [None]:
#Check rows with temp = -999
play_list.loc[play_list['Temperature'] == -999]

In [None]:
# Games under a closed roof that carry the -999 temperature placeholder get 67
closed_roof_no_temp = play_list['StadiumType'].eq('Roof Closed') & play_list['Temperature'].eq(-999)
play_list.loc[closed_roof_no_temp, 'Temperature'] = 67


In [None]:
# Games in an indoor stadium that carry the -999 temperature placeholder get 67
indoor_no_temp = play_list['StadiumType'].eq('Indoors') & play_list['Temperature'].eq(-999)
play_list.loc[indoor_no_temp, 'Temperature'] = 67


In [None]:
# Compute mean temp for all outdoor games ('Roof Open' or 'Outdoors').
# Rows that still hold the -999 placeholder are excluded: they have not been
# filled yet at this point and would otherwise drag the mean far below any
# real temperature.
outdoor_games = play_list['StadiumType'].isin(['Roof Open', 'Outdoors'])
known_temperature = play_list['Temperature'] != -999
outdoor_temp_mean = play_list.loc[outdoor_games & known_temperature, 'Temperature'].mean()

In [None]:
# Every row still holding the -999 placeholder receives the outdoor mean
placeholder_rows = play_list['Temperature'].eq(-999)
play_list.loc[placeholder_rows, 'Temperature'] = outdoor_temp_mean

### Player Tracking Data

In [None]:
# 'angle' is the raw per-row difference between the 'o' and 'dir' columns.
# NOTE(review): if both are headings in degrees the difference is not wrapped
# to a fixed range here — confirm whether that is intended.
player_track_data['angle'] = player_track_data['o'].sub(player_track_data['dir'])

# Per-play (PlayKey) aggregates of the tracking columns: the maximum of each
# motion column, and the mean of the same columns plus the x / y columns.
max_features = ['time', 'dir', 'dis', 'o', 's', 'angle']
avg_features = max_features + ['x', 'y']
tracking_by_play = player_track_data.groupby('PlayKey')
grouped_max = tracking_by_play[max_features].max()
grouped_average = tracking_by_play[avg_features].mean()
    

### Clean and Merge All Datasets

In [None]:
play_list = play_list.drop(['PositionGroup','Position'],axis=1)

In [None]:
# Keep every play; injury columns are null for plays with no injury record
play_inj = play_list.merge(injury_record, how='left', on='PlayKey')

In [None]:
# Plays without an injury record have null DM_* flags after the left merge;
# treat them as 0 and store the flags as integers
for dm_flag in ('DM_M1', 'DM_M7', 'DM_M28', 'DM_M42'):
    play_inj[dm_flag] = play_inj[dm_flag].fillna(0).astype(int)

# Same for the injured body part, using the string 'None' as the placeholder
play_inj['BodyPart'] = play_inj['BodyPart'].fillna('None')

In [None]:
play_inj = play_inj.drop(['PlayerKey_y','GameID_y','Surface'],axis=1)

In [None]:
# Subtract from each DM_* flag the flag of the next-longer duration.
# Order matters: each line reads the longer-duration column before that
# column is itself modified further down.
# NOTE(review): presumably the flags are cumulative (a DM_M7 injury is also
# flagged in DM_M1), which would make the results mutually exclusive
# buckets — confirm against the data dictionary.
play_inj.DM_M1 = play_inj.DM_M1 - play_inj.DM_M7
play_inj.DM_M7 = play_inj.DM_M7 - play_inj.DM_M28
play_inj.DM_M28 = play_inj.DM_M28 - play_inj.DM_M42
# Sum of the four columns after the subtractions; the differences telescope,
# so this equals the DM_M1 value the row had before this cell ran.
play_inj['Injury'] = play_inj['DM_M1'] + play_inj['DM_M7'] + play_inj['DM_M28'] + play_inj['DM_M42']

In [None]:
play_inj = play_inj.drop(['DM_M1','DM_M7','DM_M28','DM_M42'],axis=1)

In [None]:
# Attach the per-play tracking aggregates. Both merges use the default inner
# join, so plays without any tracking rows are dropped.
# The first merge brings in the maxima under their plain names (time, dir,
# dis, o, s, angle); the second merge then collides on those names, and the
# suffixes turn them into *_max (from grouped_max) and *_avg (from
# grouped_average). x and y exist only in grouped_average and keep their names.
play_inj_track = play_inj.merge(grouped_max.reset_index(), on=['PlayKey'])
play_inj_track = play_inj_track.merge(grouped_average.reset_index(), on=['PlayKey'], suffixes=('_max', '_avg'))

In [None]:
play_inj_track.head()

In [None]:
play_inj_track = play_inj_track.drop(['dir_max','o_max','dir_avg','o_avg','PlayerKey_x','GameID_x'],axis = 1)

### Merged Data with Feature Columns

In [None]:
#Merged and cleaned dataset with features
play_inj_track['PlayType'].value_counts()

### Transform Categorical Data 

In [None]:
# One-hot encode all categorical features in a single pass; each group of
# dummy columns is prefixed with its source column name and appended in the
# order listed here
categorical_features = ['RosterPosition', 'StadiumType', 'FieldType', 'Weather', 'PlayType']
injury_model_data = pd.get_dummies(play_inj_track, columns=categorical_features)

# The play identifier and the injured body part are not model inputs
injury_model_data = injury_model_data.drop(columns=['PlayKey', 'BodyPart'])


## Exploratory Data Analysis

In [None]:
# Creating numerical columns
num_cols = ['PlayerDay','PlayerGame','Temperature','PlayerGamePlay','time_max','dis_max','s_max','angle_max'
           ,'time_avg','dis_avg','s_avg','angle_avg','x','y']

# Creating categorical variables 
cat_cols = ['Injury','RosterPosition','StadiumType','FieldType','Weather','PlayType']

In [None]:
# Bar chart of the number of injury records per body part, smallest first
injuries_per_body_part = injury_record.groupby('BodyPart')['PlayerKey'].count()
injuries_per_body_part.sort_values().plot(
    kind='bar',
    figsize=(10, 3),
    title='Count of injuries by Body Part',
)
plt.show()

In [None]:
# Plotting the correlation between numerical variables
plt.figure(figsize = (15, 8))

sns.heatmap(injury_model_data[num_cols].corr(), annot = True, fmt = '0.2f', cmap = 'YlGnBu')

In [None]:
#Drop highly correlated features

injury_model_data = injury_model_data.drop(['PlayerGame','time_max','dis_avg'],axis = 1)

In [None]:
num_cols = ['PlayerDay','Temperature','PlayerGamePlay','dis_max','s_max','angle_max'
           ,'time_avg','s_avg','angle_avg','x','y']

In [None]:
# Plotting the correlation between numerical variables again
plt.figure(figsize = (15, 8))

sns.heatmap(injury_model_data[num_cols].corr(), annot = True, fmt = '0.2f', cmap = 'YlGnBu')

In [None]:
injury_model_data.head()

# Model Building

### Create Metrics Function

In [None]:
def metrics_score(actual, predicted, model, predicted_scores=None):
    """Report binary-classification metrics and return them as a one-row table.

    Prints the sklearn classification report, plots the confusion matrix as a
    heatmap, and returns the headline metrics in a DataFrame so results of
    several models can be stacked.

    Parameters
    ----------
    actual : array-like
        True labels (0 = no injury, 1 = injury).
    predicted : array-like
        Predicted hard labels. A column vector (e.g. the (n, 1) output of a
        single-unit network after thresholding) is flattened.
    model : str
        Name stored in the 'Model' column of the returned table.
    predicted_scores : array-like, optional
        Positive-class probabilities or decision scores. When given, ROC AUC
        is computed from them; when omitted, ROC AUC is computed from the hard
        labels exactly as before (a single operating point, not a full curve).

    Returns
    -------
    pandas.DataFrame
        One row with columns Model, Accuracy, FalseNegRate, Recall,
        Precision, F1 Score and ROC_AUC.
    """
    predicted = np.ravel(predicted)
    auc_input = predicted if predicted_scores is None else np.ravel(predicted_scores)

    acc = accuracy_score(actual, predicted)
    # zero_division=0 keeps the 0.0 score sklearn already returns when a
    # denominator is empty (e.g. the model never predicts an injury) but
    # without emitting a warning on every call
    prec = precision_score(actual, predicted, zero_division=0)
    rec = recall_score(actual, predicted, zero_division=0)
    f1 = f1_score(actual, predicted, zero_division=0)
    roc_auc = roc_auc_score(actual, auc_input)

    print(classification_report(actual, predicted, zero_division=0))

    cm = confusion_matrix(actual, predicted)

    plt.figure(figsize = (8, 5))

    # The cells are integer counts, so annotate them as integers
    sns.heatmap(cm, annot = True, fmt = 'd', xticklabels = ['No Injury', 'Injury'], yticklabels = ['No Injury', 'Injury'])

    plt.ylabel('Actual')

    plt.xlabel('Predicted')

    plt.show()

    # False-negative rate is the complement of recall
    model_table = pd.DataFrame([[model, acc, 1-rec, rec, prec, f1, roc_auc]],
               columns = ['Model', 'Accuracy', 'FalseNegRate', 'Recall', 'Precision', 'F1 Score','ROC_AUC'])

    return model_table

### Balancing Dataset

In [None]:
# Target is the 'Injury' column; every other column is a feature
y = injury_model_data['Injury']
X = injury_model_data.drop(columns='Injury')

In [None]:
labels, counts = np.unique(y, return_counts=True)
labels = ['No Injury','Injury']

plt.figure(figsize=(10, 5))
plt.bar(labels,counts)
plt.title('Dataset proportions')
plt.xlabel("Player Injury (Yes/No)")
plt.ylabel("Count")
plt.show()

In [None]:
print(counts)

In [None]:
from imblearn.under_sampling import NearMiss

nm = NearMiss()

X_nm, y_nm = nm.fit_resample(X, y)

In [None]:
labels, counts = np.unique(y_nm, return_counts=True)
labels = ['No Injury','Injury']

plt.figure(figsize=(10, 10))
plt.bar(labels,counts)
plt.title('Dataset proportions')
plt.xlabel("Player Injury (Yes/No)")
plt.ylabel("Count")
plt.show()

In [None]:
from imblearn.over_sampling import ADASYN 

sm = ADASYN(random_state=42)
X_a, y_a = sm.fit_resample(X, y)

In [None]:
labels, counts = np.unique(y_a, return_counts=True)
labels = ['No Injury','Injury']

plt.figure(figsize=(10, 10))
plt.bar(labels,counts)
plt.title('Dataset proportions')
plt.xlabel("Player Injury (Yes/No)")
plt.ylabel("Count")
plt.show()

### Train/Test Split the Data

#### Split Data Without Augmentation

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)

#Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Split Data with Undersampling

In [None]:
# Undersample the training split only; the test split keeps its original
# class balance.
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)

# Scale the resampled rows with the statistics of the original training
# split — the same statistics that produced X_test_scaled. Fitting a fresh
# scaler on the resampled rows (as before) put the training features on a
# different scale from the test features the models are evaluated on.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled_nm = scaler.transform(X_train_nm)

#### Split Data with Oversampling

In [None]:
# Oversample the training split only; the test split keeps its original
# class balance.
X_train_a, y_train_a = sm.fit_resample(X_train, y_train)

# Scale the resampled rows with the statistics of the original training
# split — the same statistics that produced X_test_scaled. Fitting a fresh
# scaler on the resampled rows (as before) put the training features on a
# different scale from the test features the models are evaluated on.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled_a = scaler.transform(X_train_a)

In [None]:
X_train.shape

In [None]:
X_train_nm.shape

In [None]:
X_train_a.shape

## Build Models with Undersampled Data

### KNN 

In [None]:
# Define K-NN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 4)

In [None]:
# Fitting data to the K-NN model

knn.fit(X_train_scaled_nm,y_train_nm)

In [None]:
# Checking the performance of the K-NN model on the held-out test data
y_pred_test_knn = knn.predict(X_test_scaled)

knn_test = metrics_score(y_test, y_pred_test_knn,'KNN')

In [None]:
results_testset_nm = knn_test
results_testset_nm

### Logistic Regression

In [None]:
lg = LogisticRegression(random_state=0).fit(X_train_scaled_nm, y_train_nm)
y_pred_log = lg.predict(X_test_scaled)

lg_test = metrics_score(y_test,y_pred_log,'Logistic Regression')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, lg_test], ignore_index = True)
results_testset_nm

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree classifer object
dt = DecisionTreeClassifier(random_state = 1)

# Train Decision Tree Classifer
dt = dt.fit(X_train_nm,y_train_nm)

#Predict the response for test dataset
y_pred_dt = dt.predict(X_test)

In [None]:
dt_test = metrics_score(y_test,y_pred_dt,'Decision Tree')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, dt_test], ignore_index = True)
results_testset_nm

In [None]:
# Plot the feature importance

importances = dt.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_df.Importance, y = importance_df.index)

In [None]:
from sklearn.metrics import make_scorer
# Choose the type of classifier
dtree_estimator = DecisionTreeClassifier(random_state = 1)

# Grid of parameters to choose from
parameters = {'max_depth': np.arange(2, 7), 
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': [5, 10, 20, 25]
             }

# Type of scoring used to compare parameter combinations
scorer = make_scorer(recall_score, pos_label = 1)

# Run the grid search
gridCV = GridSearchCV(dtree_estimator, parameters, scoring = scorer, cv = 10)

# Fitting the grid search on the train data
gridCV = gridCV.fit(X_train_nm, y_train_nm)

# Set the classifier to the best combination of parameters
dtree_estimator = gridCV.best_estimator_

# Fit the best estimator to the data
dtree_estimator.fit(X_train_nm, y_train_nm)

In [None]:
# Checking performance on the test dataset
y_pred_dt_tuned = dtree_estimator.predict(X_test)

dt_test_tuned = metrics_score(y_test, y_pred_dt_tuned,'Decision Tree Tuned')

In [None]:
# Plot the feature importance

importances = dtree_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_df.Importance, y = importance_df.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, dt_test_tuned], ignore_index = True)
results_testset_nm

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create Random Forest classifer object
rf = RandomForestClassifier(random_state = 1)

# Train random forest
rf.fit(X_train_nm,y_train_nm)  

#Predict the response for test dataset
y_pred_rf = rf.predict(X_test)

In [None]:
rf_test = metrics_score(y_test,y_pred_rf,'Random Forest')

In [None]:
# Plot the feature importance

importances = rf.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, rf_test], ignore_index = True)
results_testset_nm

In [None]:
# Choose the type of classifier
rf_estimator_tuned = RandomForestClassifier(random_state = 1)

# Grid of parameters to choose from.
# 'sqrt' is what max_features='auto' meant for classifiers; 'auto' itself was
# deprecated and then removed from scikit-learn, where it now raises an error.
params_rf = {
        "n_estimators": [100, 250, 500],
        "min_samples_leaf": np.arange(1, 4, 1),
        "max_features": [0.7, 0.9, 'sqrt'],
}

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = make_scorer(recall_score, pos_label = 1)

# Run the grid search on the undersampled training data
grid_obj = GridSearchCV(rf_estimator_tuned, params_rf, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(X_train_nm, y_train_nm)

# best_estimator_ has already been refit on all of X_train_nm / y_train_nm
# (GridSearchCV refit=True is the default), so no extra fit is needed
rf_estimator_tuned = grid_obj.best_estimator_

In [None]:
# Checking performance on the held-out test data
# (the variable name says "train", but the predictions are for X_test)
y_pred_train_rf_tuned = rf_estimator_tuned.predict(X_test)

rf_test_tuned = metrics_score(y_test, y_pred_train_rf_tuned,"Random Forest Tuned")

In [None]:
# Plot the feature importance

importances = rf_estimator_tuned.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, rf_test_tuned], ignore_index = True)
results_testset_nm

### XG Boost

In [None]:
# Importing the XGBClassifier from xgboost
from xgboost import XGBClassifier

# XGBoost classifier
xgb = XGBClassifier(random_state = 1, eval_metric = 'logloss')

# Fitting the model on the undersampled training data
xgb.fit(X_train_nm,y_train_nm)

# Checking performance on the held-out test data
y_pred_test_xg = xgb.predict(X_test)

xg_test = metrics_score(y_test,y_pred_test_xg,'XGBoost')

In [None]:
# Plot the feature importance

importances = xgb.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, xg_test], ignore_index = True)
results_testset_nm

### AdaBoost

In [None]:
# Importing the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

# Create the AdaBoost classifier. random_state is fixed, as for the other
# models in this notebook, so reruns give the same metrics.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=1)
# Train on the undersampled training data
abc.fit(X_train_nm, y_train_nm)

# Predict the response for the held-out test data
y_pred = abc.predict(X_test)

a_test = metrics_score(y_test,y_pred,'AdaBoost')

In [None]:
# Plot the feature importance

importances = abc.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_nm = pd.concat([results_testset_nm, a_test], ignore_index = True)
results_testset_nm

## Build Models without Data Augmentation

### KNN

In [None]:
# Define K-NN model

knn = KNeighborsClassifier(n_neighbors = 4)

In [None]:
# Fitting data to the K-NN model

knn.fit(X_train_scaled,y_train)

In [None]:
# Checking the performance of the K-NN model on the held-out test data
y_pred_test_knn = knn.predict(X_test_scaled)

knn_test = metrics_score(y_test, y_pred_test_knn,'KNN')

In [None]:
results_testset = knn_test
results_testset

### Logistic Regression

In [None]:
lg = LogisticRegression(random_state=0,class_weight='balanced').fit(X_train_scaled, y_train)
y_pred_log = lg.predict(X_test_scaled)

lg_test = metrics_score(y_test,y_pred_log,'Logistic Regression')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, lg_test], ignore_index = True)
results_testset

### Decision Tree

In [None]:
# Create Decision Tree classifer object
dt = DecisionTreeClassifier(random_state = 1,class_weight='balanced')

# Train Decision Tree Classifer
dt = dt.fit(X_train,y_train)

#Predict the response for test dataset
y_pred_dt = dt.predict(X_test)

In [None]:
dt_test = metrics_score(y_test,y_pred_dt,'Decision Tree')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, dt_test], ignore_index = True)
results_testset

In [None]:
# Choose the type of classifier
dtree_estimator = DecisionTreeClassifier(random_state = 1,class_weight='balanced')

# Grid of parameters to choose from
parameters = {'max_depth': np.arange(2, 7), 
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': [5, 10, 20, 25]
             }

# Type of scoring used to compare parameter combinations
scorer = make_scorer(recall_score, pos_label = 1)

# Run the grid search
gridCV = GridSearchCV(dtree_estimator, parameters, scoring = scorer, cv = 10)

# Fitting the grid search on the train data
gridCV = gridCV.fit(X_train, y_train)

# Set the classifier to the best combination of parameters
dtree_estimator = gridCV.best_estimator_

# Fit the best estimator to the data
dtree_estimator.fit(X_train, y_train)

In [None]:
# Checking performance on the test dataset
y_pred_dt_tuned = dtree_estimator.predict(X_test)

dt_test_tuned = metrics_score(y_test, y_pred_dt_tuned,'Decision Tree Tuned')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, dt_test_tuned], ignore_index = True)
results_testset

In [None]:
# Plot the feature importance

importances = dtree_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_df.Importance, y = importance_df.index)

### Random Forest

In [None]:
# Create Random Forest classifer object
rf = RandomForestClassifier(random_state = 1,class_weight='balanced')

# Train random forest
rf.fit(X_train,y_train)  

#Predict the response for test dataset
y_pred_rf = rf.predict(X_test)

In [None]:
rf_test = metrics_score(y_test,y_pred_rf,'Random Forest')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, rf_test], ignore_index = True)
results_testset

In [None]:
# Choose the type of classifier; class_weight='balanced' compensates for the
# class imbalance of the un-resampled training data
rf_estimator_tuned = RandomForestClassifier(random_state = 1,class_weight='balanced')

# Grid of parameters to choose from.
# 'sqrt' is what max_features='auto' meant for classifiers; 'auto' itself was
# deprecated and then removed from scikit-learn, where it now raises an error.
params_rf = {
        "n_estimators": [100, 250, 500],
        "min_samples_leaf": np.arange(1, 4, 1),
        "max_features": [0.7, 0.9, 'sqrt'],
}

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = make_scorer(recall_score, pos_label = 1)

# Run the grid search on the original training data
grid_obj = GridSearchCV(rf_estimator_tuned, params_rf, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(X_train, y_train)

# best_estimator_ has already been refit on all of X_train / y_train
# (GridSearchCV refit=True is the default), so no extra fit is needed
rf_estimator_tuned = grid_obj.best_estimator_

In [None]:
# Checking performance on the held-out test data
# (the variable name says "train", but the predictions are for X_test)
y_pred_train_rf_tuned = rf_estimator_tuned.predict(X_test)

rf_test_tuned = metrics_score(y_test, y_pred_train_rf_tuned,"Random Forest Tuned")

In [None]:
# Plot the feature importance

importances = rf_estimator_tuned.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, rf_test_tuned], ignore_index = True)
results_testset

### XGBoost

In [None]:
from sklearn.utils.class_weight import compute_sample_weight
# Per-row weights that balance the two classes of y_train; also reused by the
# AdaBoost cell further down
weights = compute_sample_weight(class_weight='balanced', y=y_train)

# XGBoost Classifier
xgb = XGBClassifier(random_state = 1, eval_metric = 'logloss')

# Fitting the model on the original training data with the balancing weights
xgb.fit(X_train,y_train,sample_weight = weights)

# Checking performance on the held-out test data
y_pred_test_xg = xgb.predict(X_test)

xg_test = metrics_score(y_test,y_pred_test_xg,'XGBoost')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, xg_test], ignore_index = True)
results_testset

### AdaBoost

In [None]:
# Importing the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

# Create the AdaBoost classifier. random_state is fixed, as for the other
# models in this notebook, so reruns give the same metrics.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=1)
# Train on the original training data with the class-balancing sample weights
abc.fit(X_train, y_train,sample_weight=weights)

# Predict the response for the held-out test data
y_pred = abc.predict(X_test)

a_test = metrics_score(y_test,y_pred,'AdaBoost')

In [None]:
# Plot the feature importance

importances = abc.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset = pd.concat([results_testset, a_test], ignore_index = True)
results_testset

## Build Models with Oversampled Data

### KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors = 4)

In [None]:
# Fitting data to the K-NN model

knn.fit(X_train_scaled_a,y_train_a)

In [None]:
# Checking the performance of the K-NN model on the held-out test data
y_pred_test_knn = knn.predict(X_test_scaled)

knn_test = metrics_score(y_test, y_pred_test_knn,'KNN')

In [None]:
results_testset_a = knn_test
results_testset_a

### Logistic Regression

In [None]:
lg = LogisticRegression(random_state=0).fit(X_train_scaled_a, y_train_a)
y_pred_log = lg.predict(X_test_scaled)

lg_test = metrics_score(y_test,y_pred_log,'Logistic Regression')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_a = pd.concat([results_testset_a, lg_test], ignore_index = True)
results_testset_a

### Decision Tree

In [None]:
# Create Decision Tree classifer object
dt = DecisionTreeClassifier(random_state = 1)

# Train Decision Tree Classifer
dt = dt.fit(X_train_a,y_train_a)

#Predict the response for test dataset
y_pred_dt = dt.predict(X_test)

In [None]:
dt_test = metrics_score(y_test,y_pred_dt,'Decision Tree')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_a = pd.concat([results_testset_a, dt_test], ignore_index = True)
results_testset_a

In [None]:
# Plot the feature importance

importances = dt.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_df.Importance, y = importance_df.index)

### Random Forest

In [None]:
# Create Random Forest classifer object
rf = RandomForestClassifier(random_state = 1)

# Train random forest
rf.fit(X_train_a,y_train_a)  

#Predict the response for test dataset
y_pred_rf = rf.predict(X_test)

In [None]:
rf_test = metrics_score(y_test,y_pred_rf,'Random Forest')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_a = pd.concat([results_testset_a, rf_test], ignore_index = True)
results_testset_a

In [None]:
# Plot the feature importance

importances = rf.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

### XGBoost

In [None]:
# XGBoost Classifier
xgb = XGBClassifier(random_state = 1, eval_metric = 'logloss')

# Fitting the model on the oversampled training data
xgb.fit(X_train_a,y_train_a)

# Checking performance on the held-out test data
y_pred_test_xg = xgb.predict(X_test)

xg_test = metrics_score(y_test,y_pred_test_xg,'XGBoost')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_a = pd.concat([results_testset_a, xg_test], ignore_index = True)
results_testset_a

### AdaBoost

In [None]:
# Importing the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

# Create the AdaBoost classifier. random_state is fixed, as for the other
# models in this notebook, so reruns give the same metrics.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=1)
# Train on the oversampled training data
abc.fit(X_train_a, y_train_a)

# Predict the response for the held-out test data
y_pred = abc.predict(X_test)

a_test = metrics_score(y_test,y_pred,'AdaBoost')

In [None]:
# Plot the feature importance

importances = abc.feature_importances_

columns = X.columns

importance_rf = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x = importance_rf.Importance, y = importance_rf.index)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
results_testset_a = pd.concat([results_testset_a, a_test], ignore_index = True)
results_testset_a

## Exploring Neural Networks

### Begin without Data Augmentation

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization

# Fully connected binary classifier. The input width is taken from the
# training features instead of a hard-coded 41, so the network keeps
# matching the data if feature columns are added or dropped upstream.
model = Sequential([
    Dense(units=16, input_dim = X_train.shape[1],activation='relu'),
    BatchNormalization(),
    Dense(units=24,activation='relu'),
    Dense(24,activation='relu'),
    Dropout(0.5),
    Dense(24,activation='relu'),
    Dense(1,activation='sigmoid'),                        # binary classification injury or not
])

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
score = model.evaluate(X_test, y_test)
print(score)

In [None]:
#Predict the response for test dataset
y_pred_nn = model.predict(X_test)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels at a 0.5 cut-off, writing the
# result back into the prediction array in place
n_test_rows = len(y_test)
above_cutoff = y_pred_nn[:n_test_rows] > 0.5
y_pred_nn[:n_test_rows] = np.where(above_cutoff, 1, 0)

In [None]:
nn_test = metrics_score(y_test,y_pred_nn,'Basic Neural Network')

In [None]:
nn_testset = nn_test
nn_testset

### Try to add weighted loss to account for imbalanced data

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" class weights, one per distinct label found in y_train.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)

In [None]:
# Re-key the weight array by position (0, 1, ...) so it can be passed to fit() as class_weight.
# NOTE(review): positions match the class labels only if the labels are 0..n-1 — confirm.
class_weights = {position: weight for position, weight in enumerate(class_weights)}
class_weights

In [None]:
# Train the class-weighted variant on a freshly initialised copy of the network.
# Calling fit() again on the already-trained model would continue from the
# unweighted run's weights, so the "weighted" metrics would reflect both runs.
from keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=15,epochs=5,class_weight=class_weights, shuffle=True,validation_split=0.1,verbose=1)

In [None]:
# Evaluate the class-weighted network on the test split (the result is stored, not printed)
score_weighted = model.evaluate(X_test, y_test)

In [None]:
# Predict the response for the test dataset with the class-weighted network (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Weighted Neural Network' label
nn_test = metrics_score(y_test,y_pred_nn,'Weighted Neural Network')

In [None]:
# Add the weighted-network metrics to the ANN results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
nn_testset = pd.concat([nn_testset, nn_test], ignore_index = True)
nn_testset

### Try ANN with Undersampled Data

In [None]:
# Train the undersampled variant on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_nm,y_train_nm,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the network trained on X_train_nm (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Undersampled Neural Network' label
nn_test = metrics_score(y_test,y_pred_nn,'Undersampled Neural Network')

In [None]:
# Add the undersampled-network metrics to the ANN results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
nn_testset = pd.concat([nn_testset, nn_test], ignore_index = True)
nn_testset

### Oversampled Neural Network

In [None]:
# Train the oversampled variant on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_a,y_train_a,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the network trained on X_train_a (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Oversampled Neural Network' label
nn_test = metrics_score(y_test,y_pred_nn,'Oversampled Neural Network')

In [None]:
# Add the oversampled-network metrics to the ANN results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
nn_testset = pd.concat([nn_testset, nn_test], ignore_index = True)
nn_testset

### Build CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten, Reshape
# BatchNormalization is taken from the same tensorflow.keras namespace as the
# other layers so the model does not mix layers from two Keras packages.
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam

# 1-D CNN: the input is declared as a length-41 sequence with a single channel.
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(41, 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D(2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
# Single sigmoid unit for the binary target.
model.add(Dense(1, activation='sigmoid'))

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
model.compile(Adam(learning_rate=.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the CNN defined above
model.summary()

In [None]:
# First CNN training run: 5 epochs, batches of 15, 10% of the training rows held out for validation.
# NOTE(review): the network was declared with input_shape=(41, 1) — confirm X_train is compatible.
model.fit(
    X_train,
    y_train,
    batch_size=15,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Predict the response for the test dataset with the CNN (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Convolutional Neural Network' label
cnn_test = metrics_score(y_test,y_pred_nn,'Convolutional Neural Network')

In [None]:
# Start the CNN results table from the baseline CNN metrics;
# the later CNN experiments append their rows to it
cnn_testset = cnn_test
cnn_testset

### Weighted CNN

In [None]:
# Train the class-weighted CNN on a freshly initialised copy of the network.
# Calling fit() again on the already-trained model would continue from the
# unweighted run's weights, so the "weighted" metrics would reflect both runs.
from tensorflow.keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
# Same optimizer settings as the baseline CNN so the two runs are comparable.
model.compile(Adam(learning_rate=.0001), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=15,epochs=5,class_weight=class_weights,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the class-weighted CNN (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Weighted Convolutional Neural Network' label
cnn_test = metrics_score(y_test,y_pred_nn,'Weighted Convolutional Neural Network')

In [None]:
# Add the weighted-CNN metrics to the CNN results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
cnn_testset = pd.concat([cnn_testset, cnn_test], ignore_index = True)
cnn_testset

### Undersampled CNN

In [None]:
# Train the undersampled CNN on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from tensorflow.keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
# NOTE(review): optimizer='adam' uses the optimizer's default learning rate, not the
# .0001 the CNN was first compiled with — confirm the difference is intended.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_nm,y_train_nm,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the CNN trained on X_train_nm (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Undersampled Convolutional Neural Network' label
cnn_test = metrics_score(y_test,y_pred_nn,'Undersampled Convolutional Neural Network')

In [None]:
# Add the undersampled-CNN metrics to the CNN results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
cnn_testset = pd.concat([cnn_testset, cnn_test], ignore_index = True)
cnn_testset

### Oversampled CNN

In [None]:
# Train the oversampled CNN on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from tensorflow.keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
# NOTE(review): optimizer='adam' uses the optimizer's default learning rate, not the
# .0001 the CNN was first compiled with — confirm the difference is intended.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_a,y_train_a,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the CNN trained on X_train_a (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Oversampled Convolutional Neural Network' label
cnn_test = metrics_score(y_test,y_pred_nn,'Oversampled Convolutional Neural Network')

In [None]:
# Add the oversampled-CNN metrics to the CNN results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
cnn_testset = pd.concat([cnn_testset, cnn_test], ignore_index = True)
cnn_testset

### Build GRU RNN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# GRU classifier: the input is declared as a length-41 sequence with one feature per step.
# Every layer comes from the `layers` module imported above so the model does not
# depend on layer classes imported from a different Keras namespace in earlier cells.
model = keras.Sequential()
model.add(layers.GRU(64, input_shape=(41,1)))
model.add(layers.BatchNormalization())
model.add(layers.Dense(64, activation='relu'))
# Single sigmoid unit for the binary target.
model.add(layers.Dense(1, activation='sigmoid'))
# summary() prints the table itself; wrapping it in print() also emitted a stray "None".
model.summary()

In [None]:
# Compile the GRU with Adam at a 1e-4 learning rate.
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
model.compile(Adam(learning_rate=.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# First GRU training run: 5 epochs, batches of 15, 10% of the training rows held out for validation.
# NOTE(review): the network was declared with input_shape=(41,1) — confirm X_train is compatible.
model.fit(
    X_train,
    y_train,
    batch_size=15,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Predict the response for the test dataset with the GRU network (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'GRU RNN' label
gru_test = metrics_score(y_test,y_pred_nn,'GRU RNN')

In [None]:
# Start the GRU results table from the baseline GRU metrics;
# the later GRU experiments append their rows to it
gru_testset = gru_test
gru_testset

### Undersampled GRU

In [None]:
# Train the undersampled GRU on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from tensorflow.keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
# NOTE(review): optimizer='adam' uses the optimizer's default learning rate, not the
# .0001 the GRU was first compiled with — confirm the difference is intended.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_nm,y_train_nm,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the GRU trained on X_train_nm (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Undersampled GRU RNN' label
gru_test = metrics_score(y_test,y_pred_nn,'Undersampled GRU RNN')

In [None]:
# Add the undersampled-GRU metrics to the GRU results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
gru_testset = pd.concat([gru_testset, gru_test], ignore_index = True)
gru_testset

### Oversampled GRU

In [None]:
# Train the oversampled GRU on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from tensorflow.keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
# NOTE(review): optimizer='adam' uses the optimizer's default learning rate, not the
# .0001 the GRU was first compiled with — confirm the difference is intended.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_a,y_train_a,batch_size=15,epochs=5,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the GRU trained on X_train_a (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Oversampled GRU RNN' label
gru_test = metrics_score(y_test,y_pred_nn,'Oversampled GRU RNN')

In [None]:
# Add the oversampled-GRU metrics to the GRU results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
gru_testset = pd.concat([gru_testset, gru_test], ignore_index = True)
gru_testset

### Weighted GRU

In [None]:
# Train the class-weighted GRU on a freshly initialised copy of the network.
# Re-compiling alone does not reset the weights, so without the clone this run
# would start from whatever the previous experiment had already learned.
from tensorflow.keras.models import clone_model

model = clone_model(model)  # same architecture, newly initialised weights
# NOTE(review): optimizer='adam' uses the optimizer's default learning rate, not the
# .0001 the GRU was first compiled with — confirm the difference is intended.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=15,epochs=5,class_weight=class_weights,validation_split=0.1,verbose=1)

In [None]:
# Predict the response for the test dataset with the class-weighted GRU (raw sigmoid outputs)
y_pred_nn = model.predict(X_test)

In [None]:
# Binarize the sigmoid outputs in place at the 0.5 decision threshold:
# values above 0.5 become 1, everything else becomes 0.
# Vectorized replacement for the element-by-element Python loop; y_pred_nn
# comes from model.predict(X_test), so it has one row per y_test entry.
y_pred_nn[:] = y_pred_nn > 0.5

In [None]:
# Score the thresholded predictions against y_test under the 'Weighted GRU RNN' label
gru_test = metrics_score(y_test,y_pred_nn,'Weighted GRU RNN')

In [None]:
# Add the weighted-GRU metrics to the GRU results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
gru_testset = pd.concat([gru_testset, gru_test], ignore_index = True)
gru_testset