In [1]:
# Basic package
import pandas as pd
import numpy as np
import datetime as dt

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Save model for future use
import pickle

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

np.random.seed(42)

In [2]:
# Display all results in a cell, not just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
df_1 =pd.read_csv('NFL Analytics Challenge Data Set 1.csv')
df_2 =pd.read_csv('NFL Analytics Challenge Data Set 2.csv')
df_3 =pd.read_csv('NFL Analytics Challenge Data Set 3.csv')

In [4]:
# df1 observations

## fourth_downs_away needs to be switched
## fourth_downs_home needs to be switched

# df2 observations

## fourth_downs_away needs to be switched
## fourth_downs_home needs to be switched

# df3 observations

## OK

In [5]:
# Data cleaning and engineering
# Dataset 1
df_1['date'] =pd.to_datetime(df_1['date']) # Change to datetime format
df_1['away'] =df_1['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

# Convert text to number then split columns with composite number
df_1_cols_to_split =['third_downs_away','third_downs_home','fourth_downs_away','fourth_downs_home',
                     'comp_att_away','comp_att_home','sacks_away','sacks_home',
                     'penalties_away','penalties_home','redzone_away','redzone_home'
                    ]

dic ={'Jan':'1', 'Feb':'2', 'Mar':'3', 'Apr':'4', 'May':'5', 'Jun':'6',
      'Jul':'7', 'Aug':'8', 'Sep':'9', 'Oct':'10', 'Nov':'11', 'Dec':'12'
     }

for col in df_1_cols_to_split:
    df_1[col] =df_1[col].replace(dic, regex=True)
    df_1 =df_1.join(df_1[col].str.split('-', expand=True).add_prefix(col))
    
# Convert newly created columns to correct datatype to remove leading zeros
for col in df_1.columns[37:]:
    df_1[col] = df_1[col].astype('int64')
    
#df_1.info() #Validate that all columns have correct data type

# Convert two last columns to time in seconds
m =df_1['possession_away'].str.len().max()
df_1['possession_away'] =df_1['possession_away'].str.rjust(m, '0')
df_1['possession_home'] =df_1['possession_home'].str.rjust(m, '0')

df_1['possession_away_in_seconds'] =((df_1['possession_away'].str[:2].astype('int64'))*3600 +
                                            (df_1['possession_away'].str[3:5].astype('int64'))*60 +
                                            (df_1['possession_away'].str[6:].astype('int64')))

df_1['possession_home_in_seconds'] =((df_1['possession_home'].str[:2].astype('int64'))*3600 +
                                            (df_1['possession_home'].str[3:5].astype('int64'))*60 +
                                            (df_1['possession_home'].str[6:].astype('int64')))

In [6]:
# Data cleaning and engineering
# Dataset 2
df_2['date'] =pd.to_datetime(df_2['date']) # Change to datetime format
df_2['away'] =df_2['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

# Convert text to number then split columns with composite number
df_2_cols_to_split =['third_downs_away','third_downs_home','fourth_downs_away','fourth_downs_home',
                     'comp_att_away','comp_att_home','sacks_away','sacks_home',
                     'penalties_away','penalties_home','redzone_away','redzone_home'
                    ]

for col in df_2_cols_to_split:
    df_2[col] =df_2[col].replace(dic, regex=True)
    df_2 =df_2.join(df_2[col].str.split('-', expand=True).add_prefix(col))
    
# Convert newly created columns to correct datatype to remove leading zeros
for col in df_2.columns[37:]:
    df_2[col] = df_2[col].astype('int64')
    
#df_2.info() #Validate that all columns have correct data type

# Convert two last columns to time in seconds
m =df_2['possession_away'].str.len().max()
df_2['possession_away'] =df_2['possession_away'].str.rjust(m, '0')
df_2['possession_home'] =df_2['possession_home'].str.rjust(m, '0')

df_2['possession_away_in_seconds'] =((df_2['possession_away'].str[:2].astype('int64'))*3600 +
                                            (df_2['possession_away'].str[3:5].astype('int64'))*60 +
                                            (df_2['possession_away'].str[6:].astype('int64')))

df_2['possession_home_in_seconds'] =((df_2['possession_home'].str[:2].astype('int64'))*3600 +
                                            (df_2['possession_home'].str[3:5].astype('int64'))*60 +
                                            (df_2['possession_home'].str[6:].astype('int64')))

In [7]:
# Data cleaning and engineering
# Dataset 3
df_3['date'] =pd.to_datetime(df_3['date']) # Change to datetime format
df_3['away'] =df_3['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

In [8]:
# Create final dataframe
df = pd.concat([df_1, df_2],ignore_index=True)
df =df.merge(df_3, on=['date','away','home'])

In [9]:
df.info()
df.shape
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5088 entries, 0 to 5087
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        5088 non-null   datetime64[ns]
 1   away                        5088 non-null   object        
 2   home                        5088 non-null   object        
 3   first_downs_away            5088 non-null   int64         
 4   first_downs_home            5088 non-null   int64         
 5   third_downs_away            5088 non-null   object        
 6   third_downs_home            5088 non-null   object        
 7   fourth_downs_away           5088 non-null   object        
 8   fourth_downs_home           5088 non-null   object        
 9   passing_yards_away          5088 non-null   int64         
 10  passing_yards_home          5088 non-null   int64         
 11  rushing_yards_away          5088 non-null   int64       

(5088, 65)

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,penalties_home0,penalties_home1,redzone_away0,redzone_away1,redzone_home0,redzone_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home
0,2002-09-05,49ers,Giants,13,21,12-4,16-9,0-0,0-1,166,...,10,80,0,8,0,6,99120,116880,16,13
1,2002-09-08,Seahawks,Raiders,14,27,11-1,12-7,2-2,1-1,143,...,5,45,0,2,0,2,90540,125460,17,31
2,2002-09-08,Jets,Bills,18,26,8-2,17-7,0-0,2-2,193,...,10,82,0,9,0,8,1266,140880,37,31
3,2002-09-08,Vikings,Bears,19,20,13-5,13-7,0-0,0-0,228,...,4,33,0,7,0,6,113460,102540,23,27
4,2002-09-08,Chargers,Bengals,27,13,10-6,11-4,0-0,0-0,160,...,9,57,0,7,0,5,136080,1332,34,6


In [10]:
# Create classifier column, if home team win, then 1, if home team lose, then 0
df.loc[df['score_away'] < df['score_home'], 'home_win'] = 1 
df.loc[df['score_away'] > df['score_home'], 'home_win'] = 0 

# Convert to percentage
df['comp_att_away_percentage'] =df['comp_att_away0']/df['comp_att_away1']
df['comp_att_home_percentage'] =df['comp_att_home0']/df['comp_att_home1']

# Drop redzone columns
df = df.drop(columns=['redzone_away','redzone_home',
                      'redzone_away0', 'redzone_away1',
                      'redzone_home0','redzone_home1'])

# Create column with SuperBowl winner by year
winner_dict ={2002:'Buccaneers',
             2003:'Patriots',
             2004:'Patriots',
             2005:'Steelers',
             2006:'Colts',
             2007:'Giants',
             2008:'Steelers',
             2009:'Saints',
             2010:'Packers',
             2011:'Giants',
             2012:'Ravens',
             2013:'Seahawks',
             2014:'Patriots',
             2015:'Broncos',
             2016:'Patriots',
             2017:'Eagles',
             2018:'Patriots',
             2019:'Chiefs',
             2020:'Buccaneers',
             2021:'Rams'
             }
df['super_bowl_winner'] =df['date'].dt.year.map(winner_dict)

# create a dictionary that shows teams in conference/division
## HA 4/1: This code column no longer needed.
# AFC vs NFC Conferences
# each has North, South, East, and West Divisions
# conf_divisions = {
#     'Colts':'AFC South', 
#     'Ravens':'AFC North', 
#     'Seahawks':'NFC West', 
#     'Patriots':'AFC East', 
#     'Packers':'NFC North', 
#     'Giants':'NFC East',
#     'Steelers':'AFC North', 
#     'Jets':'AFC East', 
#     'Titans':'AFC South', 
#     'Panthers':'NFC South', 
#     'Eagles':'NFC East', 
#     'Chargers':'AFC West',
#     'Saints':'NFC South', 
#     '49ers':'NFC West', 
#     'Falcons':'NFC South', 
#     'Texans':'AFC South', 
#     'Jaguars':'AFC South', 
#     'Chiefs':'AFC West', 
#     'Vikings':'NFC North',
#     'Broncos':'AFC West', 
#     'Cowboys':'NFC East', 
#     'Cardinals':'NFC West', 
#     'Bengals':'AFC North', 
#     'Bills':'AFC East', 
#     'Rams':'NFC West', 
#     'Lions':'NFC North',
#     'Washington':'NFC East', 
#     'Raiders':'AFC West', 
#     'Browns':'AFC North', 
#     'Buccaneers':'NFC South', 
#     'Bears':'NFC North', 
#     'Dolphins':'AFC East'  
# }
# df['Conference_Division'] = df['away'].map(conf_divisions)

# Create column Super Bowl Year
df['super_bowl_year'] =df['date'].dt.year

# TO BE DETERMINED how to treat tie record?
#df['away_win'].unique()
#df[~df['away_win'].isin([0,1])] #11 tie record

In [11]:
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,penalties_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home,home_win,comp_att_away_percentage,comp_att_home_percentage,super_bowl_winner,super_bowl_year
0,2002-09-05,49ers,Giants,13,21,12-4,16-9,0-0,0-1,166,...,80,99120,116880,16,13,0.0,0.615385,0.622222,Buccaneers,2002
1,2002-09-08,Seahawks,Raiders,14,27,11-1,12-7,2-2,1-1,143,...,45,90540,125460,17,31,1.0,0.718750,0.678571,Buccaneers,2002
2,2002-09-08,Jets,Bills,18,26,8-2,17-7,0-0,2-2,193,...,82,1266,140880,37,31,0.0,0.800000,0.666667,Buccaneers,2002
3,2002-09-08,Vikings,Bears,19,20,13-5,13-7,0-0,0-0,228,...,33,113460,102540,23,27,1.0,0.571429,0.606061,Buccaneers,2002
4,2002-09-08,Chargers,Bengals,27,13,10-6,11-4,0-0,0-0,160,...,57,136080,1332,34,6,0.0,0.789474,0.580645,Buccaneers,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11-4,14-3,0-0,4-2,355,...,61,122880,93120,30,27,0.0,0.736842,0.555556,,2022
5084,2022-01-23,Bills,Chiefs,23,30,14-6,13-8,4-4,1-1,313,...,1,99420,131880,36,42,1.0,0.729730,0.750000,,2022
5085,2022-01-30,Bengals,Chiefs,21,24,14-8,12-6,0-0,0-0,243,...,2,129360,106920,27,24,0.0,0.605263,0.666667,,2022
5086,2022-01-30,49ers,Rams,16,25,9-3,18-11,0-0,0-1,232,...,2,87660,128340,17,20,1.0,0.533333,0.688889,,2022


In [12]:
# need to switch third_downs_away and third_downs_home (currently shown as opposite)

# third downs away
df[['third_downs_away','third_downs_completed_away']] = df['third_downs_away'].str.split('-',expand=True)

# third downs home
df[['third_downs_home','third_downs_completed_home']] = df['third_downs_home'].str.split('-',expand=True)

In [13]:
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,possession_home_in_seconds,score_away,score_home,home_win,comp_att_away_percentage,comp_att_home_percentage,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home
0,2002-09-05,49ers,Giants,13,21,12,16,0-0,0-1,166,...,116880,16,13,0.0,0.615385,0.622222,Buccaneers,2002,4,9
1,2002-09-08,Seahawks,Raiders,14,27,11,12,2-2,1-1,143,...,125460,17,31,1.0,0.718750,0.678571,Buccaneers,2002,1,7
2,2002-09-08,Jets,Bills,18,26,8,17,0-0,2-2,193,...,140880,37,31,0.0,0.800000,0.666667,Buccaneers,2002,2,7
3,2002-09-08,Vikings,Bears,19,20,13,13,0-0,0-0,228,...,102540,23,27,1.0,0.571429,0.606061,Buccaneers,2002,5,7
4,2002-09-08,Chargers,Bengals,27,13,10,11,0-0,0-0,160,...,1332,34,6,0.0,0.789474,0.580645,Buccaneers,2002,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0-0,4-2,355,...,93120,30,27,0.0,0.736842,0.555556,,2022,4,3
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4-4,1-1,313,...,131880,36,42,1.0,0.729730,0.750000,,2022,6,8
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0-0,0-0,243,...,106920,27,24,0.0,0.605263,0.666667,,2022,8,6
5086,2022-01-30,49ers,Rams,16,25,9,18,0-0,0-1,232,...,128340,17,20,1.0,0.533333,0.688889,,2022,3,11


In [14]:
# also need to switch fourth_downs_away and fourth_downs_home

### NOTE: need to investigate within each data file because of data issues for fourth down. Data set 1 seems to be
### correct. But dataset 2 has opposite

# fourth_downs_away
df[['fourth_downs_away','fourth_downs_completed_away']] = df['fourth_downs_away'].str.split('-',expand=True)

# fourth_downs_home
df[['fourth_downs_home','fourth_downs_completed_home']] = df['fourth_downs_home'].str.split('-',expand=True)

In [15]:
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,score_home,home_win,comp_att_away_percentage,comp_att_home_percentage,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home
0,2002-09-05,49ers,Giants,13,21,12,16,0,0,166,...,13,0.0,0.615385,0.622222,Buccaneers,2002,4,9,0,1
1,2002-09-08,Seahawks,Raiders,14,27,11,12,2,1,143,...,31,1.0,0.718750,0.678571,Buccaneers,2002,1,7,2,1
2,2002-09-08,Jets,Bills,18,26,8,17,0,2,193,...,31,0.0,0.800000,0.666667,Buccaneers,2002,2,7,0,2
3,2002-09-08,Vikings,Bears,19,20,13,13,0,0,228,...,27,1.0,0.571429,0.606061,Buccaneers,2002,5,7,0,0
4,2002-09-08,Chargers,Bengals,27,13,10,11,0,0,160,...,6,0.0,0.789474,0.580645,Buccaneers,2002,6,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,27,0.0,0.736842,0.555556,,2022,4,3,0,2
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,42,1.0,0.729730,0.750000,,2022,6,8,4,1
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,24,0.0,0.605263,0.666667,,2022,8,6,0,0
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,20,1.0,0.533333,0.688889,,2022,3,11,0,1


In [16]:
# convert third down and fourth down to decimal

# third down away
df['third_down_perc_away'] = df['third_downs_completed_away'].astype(int) / df['third_downs_away'].astype(int)

# third down home
df['third_down_home_perc'] = df['third_downs_completed_home'].astype(int) / df['third_downs_home'].astype(int)

# fourth down away (NOTE: Switched columns because of previous switching, underlying columns will be dropped)
df['fourth_down_perc_away'] = df['fourth_downs_away'].astype(int) / df['fourth_downs_completed_away'].astype(int)

# fourth down home (NOTE: Switched columns because of previous switching, underlying columns will be dropped)
df['fourth_down_perc_home'] = df['fourth_downs_home'].astype(int) / df['fourth_downs_completed_home'].astype(int)

In [17]:
df.dtypes

date                           datetime64[ns]
away                                   object
home                                   object
first_downs_away                        int64
first_downs_home                        int64
                                    ...      
fourth_downs_completed_home            object
third_down_perc_away                  float64
third_down_home_perc                  float64
fourth_down_perc_away                 float64
fourth_down_perc_home                 float64
Length: 72, dtype: object

In [18]:
df.to_excel('final_dataset_TEST.xlsx')

In [19]:
# filter for january and february
df[df['date'].dt.month.isin([1,2])]

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home
256,2005-01-02,Jaguars,Raiders,13,16,18,17,1,4,149,...,Steelers,2005,8,5,1,2,0.444444,0.294118,1.0,2.0
257,2005-01-02,Steelers,Bills,15,16,19,12,1,1,105,...,Steelers,2005,8,2,1,1,0.421053,0.166667,1.0,1.0
258,2005-01-02,Packers,Bears,17,17,11,15,0,3,327,...,Steelers,2005,4,3,1,1,0.363636,0.200000,0.0,3.0
259,2005-01-02,Lions,Titans,23,15,15,13,2,0,331,...,Steelers,2005,5,5,1,0,0.333333,0.384615,2.0,
260,2005-01-02,Jets,Rams,22,21,18,13,1,0,144,...,Steelers,2005,5,5,1,0,0.277778,0.384615,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,,2022,4,3,0,2,0.363636,0.214286,,2.0
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,,2022,6,8,4,1,0.428571,0.615385,1.0,1.0
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,,2022,8,6,0,0,0.571429,0.500000,,
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,,2022,3,11,0,1,0.333333,0.611111,,0.0


In [20]:
# update year column by subtracting 1 for games in Jan/Feb
df.loc[df['date'].dt.month.isin([1,2]), 'super_bowl_year'] = df.loc[df['date'].dt.month.isin([1,2]), 'super_bowl_year'] - 1

In [21]:
# check 
df[df['date'].dt.month.isin([1,2])]

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home
256,2005-01-02,Jaguars,Raiders,13,16,18,17,1,4,149,...,Steelers,2004,8,5,1,2,0.444444,0.294118,1.0,2.0
257,2005-01-02,Steelers,Bills,15,16,19,12,1,1,105,...,Steelers,2004,8,2,1,1,0.421053,0.166667,1.0,1.0
258,2005-01-02,Packers,Bears,17,17,11,15,0,3,327,...,Steelers,2004,4,3,1,1,0.363636,0.200000,0.0,3.0
259,2005-01-02,Lions,Titans,23,15,15,13,2,0,331,...,Steelers,2004,5,5,1,0,0.333333,0.384615,2.0,
260,2005-01-02,Jets,Rams,22,21,18,13,1,0,144,...,Steelers,2004,5,5,1,0,0.277778,0.384615,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,,2021,4,3,0,2,0.363636,0.214286,,2.0
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,,2021,6,8,4,1,0.428571,0.615385,1.0,1.0
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,,2021,8,6,0,0,0.571429,0.500000,,
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,,2021,3,11,0,1,0.333333,0.611111,,0.0


In [22]:
# update the super bowl winner for rows changed
df['super_bowl_winner'] = df['super_bowl_year'].map(winner_dict)

In [29]:
# check
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home,Match_Winner
0,2002-09-05,49ers,Giants,13,21,12,16,0,0,166,...,2002,4,9,0,1,0.333333,0.562500,,0.0,49ers
1,2002-09-08,Seahawks,Raiders,14,27,11,12,2,1,143,...,2002,1,7,2,1,0.090909,0.583333,1.0,1.0,Raiders
2,2002-09-08,Jets,Bills,18,26,8,17,0,2,193,...,2002,2,7,0,2,0.250000,0.411765,,1.0,Jets
3,2002-09-08,Vikings,Bears,19,20,13,13,0,0,228,...,2002,5,7,0,0,0.384615,0.538462,,,Bears
4,2002-09-08,Chargers,Bengals,27,13,10,11,0,0,160,...,2002,6,4,0,0,0.600000,0.363636,,,Chargers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,2021,4,3,0,2,0.363636,0.214286,,2.0,Rams
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,2021,6,8,4,1,0.428571,0.615385,1.0,1.0,Chiefs
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,2021,8,6,0,0,0.571429,0.500000,,,Bengals
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,2021,3,11,0,1,0.333333,0.611111,,0.0,Rams


In [31]:
df.columns

Index(['date', 'away', 'home', 'first_downs_away', 'first_downs_home',
       'third_downs_away', 'third_downs_home', 'fourth_downs_away',
       'fourth_downs_home', 'passing_yards_away', 'passing_yards_home',
       'rushing_yards_away', 'rushing_yards_home', 'total_yards_away',
       'total_yards_home', 'comp_att_away', 'comp_att_home', 'sacks_away',
       'sacks_home', 'rushing_attempts_away', 'rushing_attempts_home',
       'fumbles_away', 'fumbles_home', 'int_away', 'int_home',
       'turnovers_away', 'turnovers_home', 'penalties_away', 'penalties_home',
       'drives_away', 'drives_home', 'def_st_td_away', 'def_st_td_home',
       'possession_away', 'possession_home', 'third_downs_away0',
       'third_downs_away1', 'third_downs_home0', 'third_downs_home1',
       'fourth_downs_away0', 'fourth_downs_away1', 'fourth_downs_home0',
       'fourth_downs_home1', 'comp_att_away0', 'comp_att_away1',
       'comp_att_home0', 'comp_att_home1', 'sacks_away0', 'sacks_away1',
       'sa

In [28]:
df['Match_Winner'] = np.where(df['score_away'] > df['score_home'], df['away'],
                        np.where(df['score_away'] < df['score_home'], df['home'],''))

In [32]:
df_home =df[['date', 'home', 'first_downs_home', 'third_downs_home',
       'fourth_downs_home', 'passing_yards_home', 'rushing_yards_home',
       'total_yards_home', 'comp_att_home','sacks_home', 'rushing_attempts_home', 
        'fumbles_home', 'int_home', 'turnovers_home', 'penalties_home','drives_home', 
        'def_st_td_home','possession_home','third_downs_home0', 'third_downs_home1',
        'fourth_downs_home0','fourth_downs_home1','comp_att_home0', 'comp_att_home1',
       'sacks_home0', 'sacks_home1', 'penalties_home0', 'penalties_home1',
       'possession_home_in_seconds', 'score_home', 'home_win','comp_att_home_percentage',
       'super_bowl_winner', 'super_bowl_year','third_downs_completed_home',
       'fourth_downs_completed_home','third_down_home_perc',
       'fourth_down_perc_home', 'Match_Winner']]

In [45]:
df_away=df[['date', 'away', 'first_downs_away','third_downs_away', 'fourth_downs_away', 'passing_yards_away',
       'rushing_yards_away', 'total_yards_away', 'comp_att_away', 'sacks_away', 'rushing_attempts_away',
       'fumbles_away', 'int_away',
       'turnovers_away', 'penalties_away','drives_away', 'def_st_td_away','possession_away', 'third_downs_away0',
       'third_downs_away1','fourth_downs_away0', 'fourth_downs_away1', 'comp_att_away0', 'comp_att_away1',
       'sacks_away0', 'sacks_away1','penalties_away0', 'penalties_away1',
       'possession_away_in_seconds','score_away','home_win','comp_att_away_percentage',
       'super_bowl_winner', 'super_bowl_year', 'third_downs_completed_away','fourth_downs_completed_away',
       'third_down_perc_away','fourth_down_perc_away', 'Match_Winner']]

In [54]:
df_home.shape

(5088, 39)

In [55]:
df_away.shape

(5088, 40)

In [46]:
new_columns = [column.replace('away', 'team') for column in df_away.columns]
df_away.columns = new_columns

df_away['is_away']=1

new_columns = [column.replace('home', 'team') for column in df_home.columns]
df_home.columns = new_columns

In [57]:
df_final =pd.concat([df_home,df_away], axis=0, ignore_index=True)

In [70]:
df_final.groupby(by=['super_bowl_year','team']).mean().columns

Index(['first_downs_team', 'passing_yards_team', 'rushing_yards_team',
       'rushing_attempts_team', 'fumbles_team', 'int_team', 'turnovers_team',
       'drives_team', 'def_st_td_team', 'third_downs_team0',
       'third_downs_team1', 'fourth_downs_team0', 'fourth_downs_team1',
       'comp_att_team0', 'comp_att_team1', 'sacks_team0', 'sacks_team1',
       'penalties_team0', 'penalties_team1', 'possession_team_in_seconds',
       'score_team', 'team_win', 'comp_att_team_percentage',
       'third_down_team_perc', 'fourth_down_perc_team', 'home_win',
       'third_down_perc_team', 'is_away'],
      dtype='object')

In [72]:
df_final.groupby(by=['super_bowl_year','team']).agg({
                                                    'first_downs_team': 'mean',
                                                    'passing_yards_team':'mean', 
                                                    'rushing_yards_team':'mean',
                                                    'rushing_attempts_team':'mean', 
                                                    'fumbles_team':'mean', 
                                                    'int_team':'mean', 
                                                    'turnovers_team':'mean',
                                                    'drives_team':'mean', 
                                                    'def_st_td_team':'mean', 
                                                    'third_downs_team0':'mean',
                                                    'third_downs_team1':'mean', 
                                                    'fourth_downs_team0':'mean', 
                                                    'fourth_downs_team1':'mean',
                                                    'comp_att_team0':'mean', 
                                                    'comp_att_team1':'mean', 
                                                    'sacks_team0':'mean', 
                                                    'sacks_team1':'mean',
                                                    'penalties_team0':'mean', 
                                                    'penalties_team1':'mean', 
                                                    'possession_team_in_seconds':'mean',
                                                    'score_team':'mean', 
                                                    'team_win':'mean', 
                                                    'comp_att_team_percentage':'mean',
                                                    'third_down_team_perc':'mean', 
                                                    'fourth_down_perc_team':'mean', 
                                                    'home_win':'mean',
                                                    'third_down_perc_team':'mean', 
                                                    'is_away':'mean'
                                                    })

Unnamed: 0_level_0,Unnamed: 1_level_0,first_downs_team,passing_yards_team,rushing_yards_team,rushing_attempts_team,fumbles_team,int_team,turnovers_team,drives_team,def_st_td_team,third_downs_team0,...,penalties_team1,possession_team_in_seconds,score_team,team_win,comp_att_team_percentage,third_down_team_perc,fourth_down_perc_team,home_win,third_down_perc_team,is_away
super_bowl_year,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2002,49ers,21.055556,221.055556,133.111111,29.0000,0.555556,0.777778,1.333333,15.000000,0.166667,14.5000,...,37.111111,104913.944444,22.888889,0.666667,0.616331,0.503899,1.545455,0.444444,0.597572,1.0
2002,Bears,16.250000,190.687500,84.000000,23.8750,1.062500,1.125000,2.187500,17.250000,0.125000,13.1250,...,52.437500,86728.625000,17.562500,0.375000,0.663036,0.370739,1.181818,0.875000,0.305291,1.0
2002,Bengals,19.687500,217.250000,108.125000,26.6250,0.812500,1.375000,2.187500,16.750000,0.437500,13.9375,...,47.125000,84057.937500,17.437500,0.125000,0.597892,0.393524,0.937500,0.875000,0.379876,1.0
2002,Bills,22.187500,249.687500,99.750000,24.2500,1.000000,0.937500,1.937500,15.875000,0.312500,13.2500,...,62.375000,112781.250000,23.687500,0.625000,0.610519,0.419815,1.000000,0.625000,0.415499,1.0
2002,Broncos,22.312500,239.000000,141.625000,28.5625,0.437500,1.250000,1.687500,14.687500,0.375000,12.6250,...,49.250000,107268.312500,24.500000,0.625000,0.657844,0.394219,1.115385,0.500000,0.367041,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,Steelers,19.333333,181.666667,108.333333,27.0000,0.333333,0.666667,1.000000,13.666667,0.333333,17.0000,...,46.333333,122060.000000,21.000000,1.000000,0.620883,0.277778,1.333333,0.500000,0.395221,1.0
2021,Texans,16.500000,212.500000,75.000000,24.0000,0.000000,0.500000,0.500000,11.000000,0.000000,13.0000,...,66.000000,102510.000000,16.000000,0.000000,0.676610,0.363636,0.500000,1.000000,0.333333,1.0
2021,Titans,18.666667,201.333333,154.000000,33.0000,0.000000,1.000000,1.000000,10.666667,0.000000,12.0000,...,38.000000,112020.000000,26.000000,0.500000,0.688657,0.295833,0.500000,0.000000,0.615385,1.0
2021,Vikings,11.000000,203.000000,65.500000,16.5000,0.000000,0.000000,0.000000,11.000000,0.500000,12.5000,...,25.000000,1340.000000,20.500000,1.000000,0.625874,0.538462,4.000000,1.000000,0.166667,1.0


In [24]:
# delete the conference_division column (what value does it provide?)

In [None]:
# differentiate between playoffs and regular season???

# correct games played to correct super bowl year
# all games in February and January belong to previous year
# filter games in this range


# identify super bowl winner in each season (DONE/UPDATED)

### TO DO for monday:

In [None]:
# step 1
# aggregate data by year (sum yards, etc.)
# groupby
# how to group? sum, average

# step 2
# vertically combine all data to get FULL team season statistics

# step 3
# for each year, add column indicating 0 did not win SB, 1 won SB




In [None]:
# Only considered matches before 2022
df =df[df['date']<'2022-01-01']

# To be discussed
# Only considered no tie matches
df =df[df['home_win'].isin([0,1])]

In [None]:
X = df.drop(columns=['date', 'away', 'home','third_downs_away', 'third_downs_home',
                     'fourth_downs_away', 'fourth_downs_home', 'comp_att_away', 'comp_att_home',
                     'sacks_away', 'sacks_home', 'penalties_away', 'penalties_home',
                     'redzone_away', 'redzone_home','possession_away','possession_home',
                     'away_win'])

y = df['home_win']