In [None]:
# Basic package
import pandas as pd
import numpy as np
import datetime as dt

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Save model for future use
import pickle

# Ignore warnings
# NOTE(review): filterwarnings('ignore') suppresses every warning category for
# the rest of the kernel session, so deprecation or chained-assignment warnings
# emitted by later cells will not be shown. Consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the cell output
%matplotlib inline

# Seed NumPy's global random number generator so random draws are repeatable
np.random.seed(42)

In [None]:
# Display all results in a cell, not just the last line.
# With ast_node_interactivity = "all", every bare expression statement in a
# cell is echoed (e.g. `df.shape` followed by `df.head()` both show output).
# This is a kernel-wide setting and affects every cell run after this one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the three challenge CSV files from the current working directory.
df_1, df_2, df_3 = (
    pd.read_csv(f'NFL Analytics Challenge Data Set {set_number}.csv')
    for set_number in (1, 2, 3)
)

In [None]:
# Data cleaning and engineering
# Dataset 1
df_1['date'] = pd.to_datetime(df_1['date'])  # Change to datetime format

# Remove punctuation and whitespace from the away-team name.
# regex=False makes the '.' a literal dot regardless of the pandas version's
# default for `regex` (as a regex, '.' would match every character).
# NOTE(review): only 'away' is normalised here although 'home' is also used as
# a merge key later -- confirm 'home' needs no cleaning.
df_1['away'] = df_1['away'].str.replace('.', '', regex=False).str.strip()

# Convert text to number then split columns with composite number.
# Each of these columns holds two numbers joined by '-'.
df_1_cols_to_split = ['third_downs_away', 'third_downs_home', 'fourth_downs_away', 'fourth_downs_home',
                      'comp_att_away', 'comp_att_home', 'sacks_away', 'sacks_home',
                      'penalties_away', 'penalties_home', 'redzone_away', 'redzone_home'
                      ]

# Month abbreviation -> month number, applied as a regex substitution so that
# values containing a month name become purely numeric "a-b" strings.
# (Presumably the source file had "a-b" values auto-formatted as dates -- TODO confirm.)
dic = {'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
       'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'
       }

# Track the names of the columns created by the split so they can be converted
# by name instead of by a hard-coded positional offset into df_1.columns.
df_1_split_cols = []
for col in df_1_cols_to_split:
    df_1[col] = df_1[col].replace(dic, regex=True)
    parts = df_1[col].str.split('-', expand=True).add_prefix(col)  # -> col0, col1, ...
    df_1_split_cols.extend(parts.columns)
    # Dropping any previous copy first keeps the cell re-runnable (join would
    # otherwise raise on overlapping column names).
    df_1 = df_1.drop(columns=parts.columns, errors='ignore').join(parts)

# Convert newly created columns to correct datatype to remove leading zeros
df_1 = df_1.astype({name: 'int64' for name in df_1_split_cols})

#df_1.info() #Validate that all columns have correct data type

# Convert the two possession columns to time in seconds.
# Both columns are left-padded with '0' to one common width so the
# fixed-position slices below line up for every row. The width is the longest
# string across BOTH columns (previously only 'possession_away' was measured,
# which mis-sliced 'possession_home' whenever it held a longer value).
possession_cols = ['possession_away', 'possession_home']
m = int(max(df_1[pcol].str.len().max() for pcol in possession_cols))
for pcol in possession_cols:
    df_1[pcol] = df_1[pcol].str.rjust(m, '0')

# Characters [0:2], [3:5] and [6:] are weighted as hours, minutes and seconds.
# NOTE(review): this assumes an 8-character "HH:MM:SS"-like layout; if the first
# field is really minutes the result is inflated by 60x -- confirm against the data.
for pcol in possession_cols:
    padded = df_1[pcol]
    df_1[f'{pcol}_in_seconds'] = (padded.str[:2].astype('int64') * 3600 +
                                  padded.str[3:5].astype('int64') * 60 +
                                  padded.str[6:].astype('int64'))

In [None]:
# Data cleaning and engineering
# Dataset 2
df_2['date'] = pd.to_datetime(df_2['date'])  # Change to datetime format

# Remove punctuation and whitespace from the away-team name.
# regex=False makes the '.' a literal dot regardless of the pandas version's
# default for `regex` (as a regex, '.' would match every character).
# NOTE(review): only 'away' is normalised here although 'home' is also used as
# a merge key later -- confirm 'home' needs no cleaning.
df_2['away'] = df_2['away'].str.replace('.', '', regex=False).str.strip()

# Convert text to number then split columns with composite number.
# Each of these columns holds two numbers joined by '-'.
df_2_cols_to_split = ['third_downs_away', 'third_downs_home', 'fourth_downs_away', 'fourth_downs_home',
                      'comp_att_away', 'comp_att_home', 'sacks_away', 'sacks_home',
                      'penalties_away', 'penalties_home', 'redzone_away', 'redzone_home'
                      ]

# Track the names of the columns created by the split so they can be converted
# by name instead of by a hard-coded positional offset into df_2.columns.
# `dic` is the month-abbreviation -> month-number mapping defined in the
# Dataset 1 cell.
df_2_split_cols = []
for col in df_2_cols_to_split:
    df_2[col] = df_2[col].replace(dic, regex=True)
    parts = df_2[col].str.split('-', expand=True).add_prefix(col)  # -> col0, col1, ...
    df_2_split_cols.extend(parts.columns)
    # Dropping any previous copy first keeps the cell re-runnable (join would
    # otherwise raise on overlapping column names).
    df_2 = df_2.drop(columns=parts.columns, errors='ignore').join(parts)

# Convert newly created columns to correct datatype to remove leading zeros
df_2 = df_2.astype({name: 'int64' for name in df_2_split_cols})

#df_2.info() #Validate that all columns have correct data type

# Convert the two possession columns to time in seconds.
# Both columns are left-padded with '0' to one common width so the
# fixed-position slices below line up for every row. The width is the longest
# string across BOTH columns (previously only 'possession_away' was measured,
# which mis-sliced 'possession_home' whenever it held a longer value).
possession_cols = ['possession_away', 'possession_home']
m = int(max(df_2[pcol].str.len().max() for pcol in possession_cols))
for pcol in possession_cols:
    df_2[pcol] = df_2[pcol].str.rjust(m, '0')

# Characters [0:2], [3:5] and [6:] are weighted as hours, minutes and seconds.
# NOTE(review): this assumes an 8-character "HH:MM:SS"-like layout; if the first
# field is really minutes the result is inflated by 60x -- confirm against the data.
for pcol in possession_cols:
    padded = df_2[pcol]
    df_2[f'{pcol}_in_seconds'] = (padded.str[:2].astype('int64') * 3600 +
                                  padded.str[3:5].astype('int64') * 60 +
                                  padded.str[6:].astype('int64'))

In [None]:
# Data cleaning and engineering
# Dataset 3
df_3['date'] = pd.to_datetime(df_3['date'])  # Change to datetime format

# Remove punctuation and whitespace from the away-team name.
# regex=False makes the '.' a literal dot regardless of the pandas version's
# default for `regex` (as a regex, '.' would match every character).
df_3['away'] = df_3['away'].str.replace('.', '', regex=False).str.strip()

In [None]:
# Create final dataframe:
# stack Dataset 1 and Dataset 2 row-wise with a fresh index, then attach the
# Dataset 3 columns for games that match on (date, away, home).
# The join is an inner merge, so games without a match in df_3 are dropped.
stacked_games = pd.concat([df_1, df_2], ignore_index=True)
df = stacked_games.merge(df_3, how='inner', on=['date', 'away', 'home'])

In [None]:
# Quick sanity check of the merged frame: column dtypes / non-null counts,
# (rows, columns), and the first five rows. All three are shown because the
# notebook sets ast_node_interactivity = "all" in an earlier cell.
df.info()
df.shape
df.head()

In [None]:
# Create classifier column from the home team's point of view:
# 1 when the home team outscored the away team, 0 when it was outscored.
# Rows with equal scores match neither mask and are left as NaN.
home_ahead = df['score_home'] > df['score_away']
away_ahead = df['score_home'] < df['score_away']
df.loc[home_ahead, 'home_win'] = 1
df.loc[away_ahead, 'home_win'] = 0

# Completion ratio per side: comp_att_<side>0 divided by comp_att_<side>1
# (the two halves produced by splitting the composite comp_att column).
for side in ('away', 'home'):
    df[f'comp_att_{side}_percentage'] = df[f'comp_att_{side}0'].div(df[f'comp_att_{side}1'])

# Drop redzone columns (the raw composite column and both split halves, per side)
redzone_cols = [f'redzone_{side}{suffix}' for side in ('away', 'home') for suffix in ('', '0', '1')]
df = df.drop(columns=redzone_cols)

# Create column with SuperBowl winner by year
winner_dict = {
    2002: 'Buccaneers', 2003: 'Patriots', 2004: 'Patriots', 2005: 'Steelers',
    2006: 'Colts', 2007: 'Giants', 2008: 'Steelers', 2009: 'Saints',
    2010: 'Packers', 2011: 'Giants', 2012: 'Ravens', 2013: 'Seahawks',
    2014: 'Patriots', 2015: 'Broncos', 2016: 'Patriots', 2017: 'Eagles',
    2018: 'Patriots', 2019: 'Chiefs', 2020: 'Buccaneers', 2021: 'Rams',
}
# At this point the lookup key is the calendar year of the game date.
game_year = df['date'].dt.year
df['super_bowl_winner'] = game_year.map(winner_dict)

# Teams grouped by conference/division (AFC vs NFC, each with North, South,
# East and West), flattened into a team -> "Conference Division" lookup.
division_teams = {
    'AFC East': ['Patriots', 'Jets', 'Bills', 'Dolphins'],
    'AFC North': ['Ravens', 'Steelers', 'Bengals', 'Browns'],
    'AFC South': ['Colts', 'Titans', 'Texans', 'Jaguars'],
    'AFC West': ['Chargers', 'Chiefs', 'Broncos', 'Raiders'],
    'NFC East': ['Giants', 'Eagles', 'Cowboys', 'Washington'],
    'NFC North': ['Packers', 'Vikings', 'Lions', 'Bears'],
    'NFC South': ['Panthers', 'Saints', 'Falcons', 'Buccaneers'],
    'NFC West': ['Seahawks', '49ers', 'Cardinals', 'Rams'],
}
conf_divisions = {
    team: division
    for division, teams in division_teams.items()
    for team in teams
}
# Only the away team is looked up; names missing from the table map to NaN.
df['Conference_Division'] = df['away'].map(conf_divisions)

# Create column Super Bowl Year (calendar year of the game date for now)
df['super_bowl_year'] = game_year

# TO BE DETERMINED how to treat tie record?
#df['away_win'].unique()
#df[~df['away_win'].isin([0,1])] #11 tie record

In [None]:
# Display the engineered frame (pandas truncates the rendering of large frames)
df

In [None]:
# export to excel (the DataFrame index is written as the first column)
# NOTE(review): this export runs before the cells further down that shift
# January/February games to the previous 'super_bowl_year' and re-map
# 'super_bowl_winner', so the file holds the uncorrected values for those games.
df.to_excel('NFL_Analytics_Challenge_HA_v2.xlsx')

In [None]:
# Column dtypes of the engineered frame
df.dtypes

In [None]:
# Show only the games whose date falls in January or February
played_in_jan_feb = df['date'].dt.month.isin([1, 2])
df[played_in_jan_feb]

In [None]:
# Season year: the calendar year of the game, minus 1 for games played in
# January/February (those belong to the season that started the year before).
# Derived from 'date' on every run rather than decrementing the existing
# column in place, so re-running this cell cannot subtract a second year.
is_jan_feb = df['date'].dt.month.isin([1, 2])
df['super_bowl_year'] = df['date'].dt.year - is_jan_feb.astype('int64')

In [None]:
# check: re-display the January/February games after the year adjustment
jan_feb_rows = df['date'].dt.month.isin([1, 2])
df[jan_feb_rows]

In [None]:
# update the super bowl winner for rows changed
# Re-maps every row from 'super_bowl_year' through winner_dict, overwriting the
# earlier values that were keyed on the calendar year of the game date.
# Years that are not keys of winner_dict become NaN.
df['super_bowl_winner'] = df['super_bowl_year'].map(winner_dict)

In [None]:
# check: display the frame after re-mapping 'super_bowl_winner'
df

In [None]:
# delete the conference_division column (what value does it provide?)

In [None]:
# differentiate between playoffs and regular season???

# correct games played to correct super bowl year
# all games in February and January belong to previous year
# filter games in this range


# identify super bowl winner in each season (DONE/UPDATED)

### TO DO for monday:

In [None]:
# step 1
# aggregate data by year (sum yards, etc.)
# groupby
# how to group? sum, average

# step 2
# vertically combine all data to get FULL team season statistics

# step 3
# for each year, add column indicating 0 did not win SB, 1 won SB




In [None]:
# Keep a game only if both conditions hold:
#   * it was played before 2022 (date strictly earlier than 2022-01-01), and
#   * it has a decided result, i.e. home_win is 0 or 1 (ties are NaN).
# To be discussed: whether tie games should be excluded.
played_before_2022 = df['date'] < '2022-01-01'
has_decided_result = df['home_win'].isin([0, 1])
df = df[played_before_2022 & has_decided_result]

In [None]:
# Columns that must not be used as model features:
#   * identifiers and the raw text columns that were already split/converted,
#   * 'home_win' itself -- it is the target, so leaving it in X leaks the label.
# 'redzone_away' / 'redzone_home' are removed from df in the feature-engineering
# cell and 'away_win' is not created anywhere in the visible notebook, so a
# strict drop raises KeyError; errors='ignore' skips names that are absent.
non_feature_cols = ['date', 'away', 'home', 'third_downs_away', 'third_downs_home',
                    'fourth_downs_away', 'fourth_downs_home', 'comp_att_away', 'comp_att_home',
                    'sacks_away', 'sacks_home', 'penalties_away', 'penalties_home',
                    'redzone_away', 'redzone_home', 'possession_away', 'possession_home',
                    'away_win', 'home_win']

# NOTE(review): 'score_away' and 'score_home' stay in X and fully determine
# home_win; 'super_bowl_winner' and 'Conference_Division' are text columns.
# Decide whether to drop or encode them before fitting a model.
X = df.drop(columns=non_feature_cols, errors='ignore')

# Target: 1 = home team won, 0 = home team lost
y = df['home_win']