In [1]:
# Basic package
import pandas as pd
import numpy as np
import datetime as dt

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Save model for future use
import pickle

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

np.random.seed(42)

In [2]:
# Display all results in a cell, not just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#Display all columns
pd.set_option('display.max_columns', None)

In [3]:
# read in datasets
df_1 = pd.read_csv('data/NFL Analytics Challenge Data Set 1_reformat.csv')
df_2 = pd.read_csv('data/NFL Analytics Challenge Data Set 2_reformat.csv')
df_3 = pd.read_csv('data/NFL Analytics Challenge Data Set 3.csv')

In [4]:
# preview dataset1
df_1.head()

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,passing_yards_home,rushing_yards_away,rushing_yards_home,total_yards_away,total_yards_home,comp_att_away,comp_att_home,sacks_away,sacks_home,rushing_attempts_away,rushing_attempts_home,fumbles_away,fumbles_home,int_away,int_home,turnovers_away,turnovers_home,penalties_away,penalties_home,redzone_away,redzone_home,drives_away,drives_home,def_st_td_away,def_st_td_home,possession_away,possession_home
0,9/5/2002,. 49ers,Giants,13,21,4-12,9-16,0-0,0-1,166,318,113,43,279,361,16-26,28-45,0-0,3-24,25,22,0,0,1,3,1,3,5-29,10-80,0-8,0-6,13,15,0,0,27:32:00,32:28:00
1,9/8/2002,Seahawks,Raiders,14,27,1-11,7-12,2-2,1-1,143,202,43,221,423,423,23-32,19-28,3-12,2-12,16,40,0,1,0,1,0,2,13-105,5-45,0-2,0-2,4,4,0,0,25:09:00,34:51:00
2,9/8/2002,Jets,Bills,18,26,2-8,7-17,0-0,2-2,193,242,73,142,266,384,24-30,26-39,3-17,4-29,14,32,1,1,0,2,1,3,10-90,10-82,0-9,0-8,17,18,2,0,21:06,39:08:00
3,9/8/2002,Vikings,Bears,19,20,5-13,7-13,0-0,0-0,228,288,140,80,368,368,16-28,20-33,1-6,1-9,33,26,1,1,2,1,3,2,8-52,4-33,0-7,0-6,17,17,0,0,31:31:00,28:29:00
4,9/8/2002,Chargers,Bengals,27,13,6-10,4-11,0-0,0-0,160,167,241,36,401,203,15-19,18-31,1-00,4-31,45,13,0,0,0,1,0,1,4-39,9-57,0-7,0-5,11,15,0,0,37:48:00,22:12


In [5]:
# preview dataset2
df_2.head()

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,passing_yards_home,rushing_yards_away,rushing_yards_home,total_yards_away,total_yards_home,comp_att_away,comp_att_home,sacks_away,sacks_home,rushing_attempts_away,rushing_attempts_home,fumbles_away,fumbles_home,int_away,int_home,turnovers_away,turnovers_home,penalties_away,penalties_home,redzone_away,redzone_home,drives_away,drives_home,def_st_td_away,def_st_td_home,possession_away,possession_home
0,1/4/2003,Colts,Jets,10,26,5-13,6-11,0-0,0-0,124,216,52,180,176,396,14-31,19-25,1-13,2-6,14,42,1,0,2,0,3,0,2-10,3-30,0-7,0-5,17,10,0,0,19:42,40:18:00
1,1/4/2003,Falcons,Packers,21,17,8-17,8-15,0-0,0-3,117,233,192,56,309,289,13-25,20-42,0-0,2-14,44,19,0,3,0,2,0,5,3-20,3-15,0-5,0-6,13,17,1,0,36:04:00,23:56
2,1/5/2003,Browns,Steelers,21,30,8-17,3-10,0-0,0-0,409,343,38,89,447,432,26-43,30-48,2-20,3-24,28,20,0,1,1,2,1,3,9-75,4-35,0-7,0-9,21,21,0,1,33:02:00,26:58:00
3,1/5/2003,Giants,49ers,26,23,6-13,6-13,0-0,1-2,327,356,119,90,446,446,29-44,28-45,2-15,0-0,29,20,0,1,1,1,1,2,5-50,2-20,0-9,0-9,19,18,0,0,34:39:00,25:21:00
4,1/11/2003,Steelers,Titans,21,29,4-14,12-18,0-0,1-1,257,331,67,99,324,430,21-42,27-45,1-9,1-7,20,36,0,2,1,2,1,4,6-41,8-92,0-8,0-8,19,20,0,0,24:43:00,37:32:00


In [6]:
# Data cleaning and engineering
# Dataset 1
df_1['date'] =pd.to_datetime(df_1['date']) # Change to datetime format
df_1['away'] =df_1['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

# Convert text to number then split columns with composite number
df_1_cols_to_split = ['third_downs_away','third_downs_home','fourth_downs_away','fourth_downs_home',
                     'comp_att_away','comp_att_home','sacks_away','sacks_home',
                     'penalties_away','penalties_home','redzone_away','redzone_home']

# loop through columns to split
for col in df_1_cols_to_split:
    #df_1[col] =df_1[col].replace(dic, regex=True)
    df_1[col]=df_1[col].str.replace('00000','')
    df_1 =df_1.join(df_1[col].str.split('-', expand=True).add_prefix(col))

In [7]:
# preview datset1
df_1.head()

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,passing_yards_home,rushing_yards_away,rushing_yards_home,total_yards_away,total_yards_home,comp_att_away,comp_att_home,sacks_away,sacks_home,rushing_attempts_away,rushing_attempts_home,fumbles_away,fumbles_home,int_away,int_home,turnovers_away,turnovers_home,penalties_away,penalties_home,redzone_away,redzone_home,drives_away,drives_home,def_st_td_away,def_st_td_home,possession_away,possession_home,third_downs_away0,third_downs_away1,third_downs_home0,third_downs_home1,fourth_downs_away0,fourth_downs_away1,fourth_downs_home0,fourth_downs_home1,comp_att_away0,comp_att_away1,comp_att_home0,comp_att_home1,sacks_away0,sacks_away1,sacks_home0,sacks_home1,penalties_away0,penalties_away1,penalties_home0,penalties_home1,redzone_away0,redzone_away1,redzone_home0,redzone_home1
0,2002-09-05,49ers,Giants,13,21,4-12,9-16,0-0,0-1,166,318,113,43,279,361,16-26,28-45,0-0,3-24,25,22,0,0,1,3,1,3,5-29,10-80,0-8,0-6,13,15,0,0,27:32:00,32:28:00,4,12,9,16,0,0,0,1,16,26,28,45,0,0,3,24,5,29,10,80,0,8,0,6
1,2002-09-08,Seahawks,Raiders,14,27,1-11,7-12,2-2,1-1,143,202,43,221,423,423,23-32,19-28,3-12,2-12,16,40,0,1,0,1,0,2,13-105,5-45,0-2,0-2,4,4,0,0,25:09:00,34:51:00,1,11,7,12,2,2,1,1,23,32,19,28,3,12,2,12,13,105,5,45,0,2,0,2
2,2002-09-08,Jets,Bills,18,26,2-8,7-17,0-0,2-2,193,242,73,142,266,384,24-30,26-39,3-17,4-29,14,32,1,1,0,2,1,3,10-90,10-82,0-9,0-8,17,18,2,0,21:06,39:08:00,2,8,7,17,0,0,2,2,24,30,26,39,3,17,4,29,10,90,10,82,0,9,0,8
3,2002-09-08,Vikings,Bears,19,20,5-13,7-13,0-0,0-0,228,288,140,80,368,368,16-28,20-33,1-6,1-9,33,26,1,1,2,1,3,2,8-52,4-33,0-7,0-6,17,17,0,0,31:31:00,28:29:00,5,13,7,13,0,0,0,0,16,28,20,33,1,6,1,9,8,52,4,33,0,7,0,6
4,2002-09-08,Chargers,Bengals,27,13,6-10,4-11,0-0,0-0,160,167,241,36,401,203,15-19,18-31,1-00,4-31,45,13,0,0,0,1,0,1,4-39,9-57,0-7,0-5,11,15,0,0,37:48:00,22:12,6,10,4,11,0,0,0,0,15,19,18,31,1,0,4,31,4,39,9,57,0,7,0,5


In [8]:
# Convert newly created columns to correct datatype to remove leading zeros
for col in df_1.columns[37:]:
    df_1[col] = df_1[col].astype('int64')

df_1.info() #Validate that all columns have correct data type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 61 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   1575 non-null   datetime64[ns]
 1   away                   1575 non-null   object        
 2   home                   1575 non-null   object        
 3   first_downs_away       1575 non-null   int64         
 4   first_downs_home       1575 non-null   int64         
 5   third_downs_away       1575 non-null   object        
 6   third_downs_home       1575 non-null   object        
 7   fourth_downs_away      1575 non-null   object        
 8   fourth_downs_home      1575 non-null   object        
 9   passing_yards_away     1575 non-null   int64         
 10  passing_yards_home     1575 non-null   int64         
 11  rushing_yards_away     1575 non-null   int64         
 12  rushing_yards_home     1575 non-null   int64         
 13  tot

In [9]:
# Convert last two columns to time in seconds
m = df_1['possession_away'].str.len().max()
df_1['possession_away'] = df_1['possession_away'].str.rjust(m, '0')
df_1['possession_home'] = df_1['possession_home'].str.rjust(m, '0')

df_1['possession_away_in_seconds'] = ((df_1['possession_away'].str[:2].astype('int64'))*3600 +
                                            (df_1['possession_away'].str[3:5].astype('int64'))*60 +
                                            (df_1['possession_away'].str[6:].astype('int64')))

df_1['possession_home_in_seconds'] = ((df_1['possession_home'].str[:2].astype('int64'))*3600 +
                                            (df_1['possession_home'].str[3:5].astype('int64'))*60 +
                                            (df_1['possession_home'].str[6:].astype('int64')))

In [10]:
# Data cleaning and engineering for Dataset2 (same as dataset1)
df_2['date'] = pd.to_datetime(df_2['date']) # Change to datetime format
df_2['away'] = df_2['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

# Convert text to number then split columns with composite number
df_2_cols_to_split =['third_downs_away','third_downs_home','fourth_downs_away','fourth_downs_home',
                     'comp_att_away','comp_att_home','sacks_away','sacks_home',
                     'penalties_away','penalties_home','redzone_away','redzone_home']

for col in df_2_cols_to_split:

    df_2[col]=df_2[col].str.replace('00000','')
    df_2 =df_2.join(df_2[col].str.split('-', expand=True).add_prefix(col))
    
# Convert newly created columns to correct datatype to remove leading zeros
for col in df_2.columns[37:]:
    df_2[col] = df_2[col].astype('int64')
    
#df_2.info() #Validate that all columns have correct data type

# Convert two last columns to time in seconds
m = df_2['possession_away'].str.len().max()
df_2['possession_away'] = df_2['possession_away'].str.rjust(m, '0')
df_2['possession_home'] = df_2['possession_home'].str.rjust(m, '0')

df_2['possession_away_in_seconds'] = ((df_2['possession_away'].str[:2].astype('int64'))*3600 +
                                            (df_2['possession_away'].str[3:5].astype('int64'))*60 +
                                            (df_2['possession_away'].str[6:].astype('int64')))

df_2['possession_home_in_seconds'] = ((df_2['possession_home'].str[:2].astype('int64'))*3600 +
                                            (df_2['possession_home'].str[3:5].astype('int64'))*60 +
                                            (df_2['possession_home'].str[6:].astype('int64')))

In [11]:
# Data cleaning and engineering for Dataset 3
df_3['date'] = pd.to_datetime(df_3['date']) # Change to datetime format
df_3['away'] = df_3['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

In [12]:
# Create final dataframe
df = pd.concat([df_1, df_2],ignore_index=True)
df = df.merge(df_3, on=['date','away','home'])

In [13]:
df.info()
df.shape
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5088 entries, 0 to 5087
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        5088 non-null   datetime64[ns]
 1   away                        5088 non-null   object        
 2   home                        5088 non-null   object        
 3   first_downs_away            5088 non-null   int64         
 4   first_downs_home            5088 non-null   int64         
 5   third_downs_away            5088 non-null   object        
 6   third_downs_home            5088 non-null   object        
 7   fourth_downs_away           5088 non-null   object        
 8   fourth_downs_home           5088 non-null   object        
 9   passing_yards_away          5088 non-null   int64         
 10  passing_yards_home          5088 non-null   int64         
 11  rushing_yards_away          5088 non-null   int64       

(5088, 65)

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,passing_yards_home,rushing_yards_away,rushing_yards_home,total_yards_away,total_yards_home,comp_att_away,comp_att_home,sacks_away,sacks_home,rushing_attempts_away,rushing_attempts_home,fumbles_away,fumbles_home,int_away,int_home,turnovers_away,turnovers_home,penalties_away,penalties_home,redzone_away,redzone_home,drives_away,drives_home,def_st_td_away,def_st_td_home,possession_away,possession_home,third_downs_away0,third_downs_away1,third_downs_home0,third_downs_home1,fourth_downs_away0,fourth_downs_away1,fourth_downs_home0,fourth_downs_home1,comp_att_away0,comp_att_away1,comp_att_home0,comp_att_home1,sacks_away0,sacks_away1,sacks_home0,sacks_home1,penalties_away0,penalties_away1,penalties_home0,penalties_home1,redzone_away0,redzone_away1,redzone_home0,redzone_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home
0,2002-09-05,49ers,Giants,13,21,4-12,9-16,0-0,0-1,166,318,113,43,279,361,16-26,28-45,0-0,3-24,25,22,0,0,1,3,1,3,5-29,10-80,0-8,0-6,13,15,0,0,27:32:00,32:28:00,4,12,9,16,0,0,0,1,16,26,28,45,0,0,3,24,5,29,10,80,0,8,0,6,99120,116880,16,13
1,2002-09-08,Seahawks,Raiders,14,27,1-11,7-12,2-2,1-1,143,202,43,221,423,423,23-32,19-28,3-12,2-12,16,40,0,1,0,1,0,2,13-105,5-45,0-2,0-2,4,4,0,0,25:09:00,34:51:00,1,11,7,12,2,2,1,1,23,32,19,28,3,12,2,12,13,105,5,45,0,2,0,2,90540,125460,17,31
2,2002-09-08,Jets,Bills,18,26,2-8,7-17,0-0,2-2,193,242,73,142,266,384,24-30,26-39,3-17,4-29,14,32,1,1,0,2,1,3,10-90,10-82,0-9,0-8,17,18,2,0,00021:06,39:08:00,2,8,7,17,0,0,2,2,24,30,26,39,3,17,4,29,10,90,10,82,0,9,0,8,1266,140880,37,31
3,2002-09-08,Vikings,Bears,19,20,5-13,7-13,0-0,0-0,228,288,140,80,368,368,16-28,20-33,1-6,1-9,33,26,1,1,2,1,3,2,8-52,4-33,0-7,0-6,17,17,0,0,31:31:00,28:29:00,5,13,7,13,0,0,0,0,16,28,20,33,1,6,1,9,8,52,4,33,0,7,0,6,113460,102540,23,27
4,2002-09-08,Chargers,Bengals,27,13,6-10,4-11,0-0,0-0,160,167,241,36,401,203,15-19,18-31,1-00,4-31,45,13,0,0,0,1,0,1,4-39,9-57,0-7,0-5,11,15,0,0,37:48:00,00022:12,6,10,4,11,0,0,0,0,15,19,18,31,1,0,4,31,4,39,9,57,0,7,0,5,136080,1332,34,6


In [14]:
# Create win column, if home team win, then 1, if away team win, then 1
df.loc[df['score_away'] < df['score_home'], 'home_win'] = 1 
df.loc[df['score_away'] > df['score_home'], 'away_win'] = 1

# Create column with SuperBowl winner by year
winner_dict = {2002:'Buccaneers',
             2003:'Patriots',
             2004:'Patriots',
             2005:'Steelers',
             2006:'Colts',
             2007:'Giants',
             2008:'Steelers',
             2009:'Saints',
             2010:'Packers',
             2011:'Giants',
             2012:'Ravens',
             2013:'Seahawks',
             2014:'Patriots',
             2015:'Broncos',
             2016:'Patriots',
             2017:'Eagles',
             2018:'Patriots',
             2019:'Chiefs',
             2020:'Buccaneers',
             2021:'Rams'}

# map winner_dict
df['super_bowl_winner'] = df['date'].dt.year.map(winner_dict)

# Create column Super Bowl Year
df['super_bowl_year'] = df['date'].dt.year

In [15]:
# filter for january and february due to playoff games 'belonging' to prior year
df[df['date'].dt.month.isin([1,2])]

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,passing_yards_home,rushing_yards_away,rushing_yards_home,total_yards_away,total_yards_home,comp_att_away,comp_att_home,sacks_away,sacks_home,rushing_attempts_away,rushing_attempts_home,fumbles_away,fumbles_home,int_away,int_home,turnovers_away,turnovers_home,penalties_away,penalties_home,redzone_away,redzone_home,drives_away,drives_home,def_st_td_away,def_st_td_home,possession_away,possession_home,third_downs_away0,third_downs_away1,third_downs_home0,third_downs_home1,fourth_downs_away0,fourth_downs_away1,fourth_downs_home0,fourth_downs_home1,comp_att_away0,comp_att_away1,comp_att_home0,comp_att_home1,sacks_away0,sacks_away1,sacks_home0,sacks_home1,penalties_away0,penalties_away1,penalties_home0,penalties_home1,redzone_away0,redzone_away1,redzone_home0,redzone_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home,home_win,away_win,super_bowl_winner,super_bowl_year
256,2005-01-02,Jaguars,Raiders,13,16,8-18,5-17,1-1,2-4,149,134,93,147,242,281,15-28,15-39,1-00,2-8,34,28,1,1,1,3,2,4,7-87,8-72,0-6,0-6,15,16,0,0,30:53:00,29:07:00,8,18,5,17,1,1,2,4,15,28,15,39,1,0,2,8,7,87,8,72,0,6,0,6,111180,104820,13,6,,1.0,Steelers,2005
257,2005-01-02,Steelers,Bills,15,16,8-19,2-12,1-1,1-1,105,171,157,96,262,267,12-25,16-30,2-15,3-18,43,21,1,2,2,1,3,3,7-40,12-108,0-7,0-6,18,21,2,2,35:03:00,24:57:00,8,19,2,12,1,1,1,1,12,25,16,30,2,15,3,18,7,40,12,108,0,7,0,6,126180,89820,29,24,,1.0,Steelers,2005
258,2005-01-02,Packers,Bears,17,17,4-11,3-15,0-1,1-3,327,136,60,110,387,246,16-26,20-29,0-0,9-60,30,27,0,0,0,1,0,1,5-35,4-29,0-4,0-7,14,16,2,0,28:51:00,31:09:00,4,11,3,15,0,1,1,3,16,26,20,29,0,0,9,60,5,35,4,29,0,4,0,7,103860,112140,31,14,,1.0,Steelers,2005
259,2005-01-02,Lions,Titans,23,15,5-15,5-13,1-2,0-0,331,217,103,95,434,312,33-49,18-33,2-15,0-0,26,26,1,0,1,0,2,0,6-52,7-50,0-6,0-6,16,17,0,2,32:58:00,27:02:00,5,15,5,13,1,2,0,0,33,49,18,33,2,15,0,0,6,52,7,50,0,6,0,6,118680,97320,19,24,1.0,,Steelers,2005
260,2005-01-02,Jets,Rams,22,21,5-18,5-13,1-1,0-0,144,432,180,47,324,479,21-36,29-40,6-37,3-18,39,19,0,1,0,2,0,3,7-75,10-76,0-7,0-8,19,21,3,0,41:08:00,30:50:00,5,18,5,13,1,1,0,0,21,36,29,40,6,37,3,18,7,75,10,76,0,7,0,8,148080,111000,29,32,1.0,,Steelers,2005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,4-11,3-14,0-0,2-4,355,308,73,51,428,359,28-38,30-54,2-11,3-21,30,14,4,1,0,1,4,2,4-45,4-61,2-5,2-3,15,15,0,0,34:08:00,25:52:00,4,11,3,14,0,0,2,4,28,38,30,54,2,11,3,21,4,45,4,61,2,5,2,3,122880,93120,30,27,,1.0,,2022
5084,2022-01-23,Bills,Chiefs,23,30,6-14,8-13,4-4,1-1,313,370,109,182,422,552,27-37,33-44,2-16,2-8,24,27,0,0,0,0,0,0,3-15,1-10,3-3,3-5,9,11,0,0,27:37:00,36:38:00,6,14,8,13,4,4,1,1,27,37,33,44,2,16,2,8,3,15,1,10,3,3,3,5,99420,131880,36,42,1.0,,,2022
5085,2022-01-30,Bengals,Chiefs,21,24,8-14,6-12,0-0,0-0,243,236,116,139,359,375,23-38,26-39,1-7,4-39,27,24,0,0,1,2,1,2,4-30,2-11,1-4,3-5,10,11,0,0,35:56:00,29:42:00,8,14,6,12,0,0,0,0,23,38,26,39,1,7,4,39,4,30,2,11,1,4,3,5,129360,106920,27,24,,1.0,,2022
5086,2022-01-30,49ers,Rams,16,25,3-9,11-18,0-0,0-1,232,326,50,70,282,396,16-30,31-45,0-0,2-11,20,29,0,0,1,1,1,1,6-54,2-10,1-1,2-6,9,9,0,0,24:21:00,35:39:00,3,9,11,18,0,0,0,1,16,30,31,45,0,0,2,11,6,54,2,10,1,1,2,6,87660,128340,17,20,1.0,,,2022


In [16]:
# update year column by subtracting 1 for games in Jan/Feb
df.loc[df['date'].dt.month.isin([1,2]), 'super_bowl_year'] = df.loc[df['date'].dt.month.isin([1,2]), 'super_bowl_year'] - 1

In [17]:
# update the super bowl winner for rows changed
df['super_bowl_winner'] = df['super_bowl_year'].map(winner_dict)

In [18]:
# column to identify whether regular season or playoffs, regular season could be up to 3 days in january
# assign playoffs as after january 3rd and before march
df.loc[(df['date'].dt.month >= 1) & (df['date'].dt.day > 3) & (df['date'].dt.month < 3), 'playoffs'] = 1

In [19]:
# due to 17 game regular season in 2021, create exception for above cell. games till January 9th are regular season
df.loc[(df['date'] > '2022-01-01') & (df['date'] < '2022-01-10'), 'playoffs'] = 0

In [20]:
# fill NaNs in 'playoffs' w/ 0
df['playoffs'] = df['playoffs'].fillna(0)

In [21]:
# fill for NaNs in home and away win columns w/ 0
df[['home_win','away_win']] = df[['home_win','away_win']].fillna(0)

In [22]:
df.columns

Index(['date', 'away', 'home', 'first_downs_away', 'first_downs_home',
       'third_downs_away', 'third_downs_home', 'fourth_downs_away',
       'fourth_downs_home', 'passing_yards_away', 'passing_yards_home',
       'rushing_yards_away', 'rushing_yards_home', 'total_yards_away',
       'total_yards_home', 'comp_att_away', 'comp_att_home', 'sacks_away',
       'sacks_home', 'rushing_attempts_away', 'rushing_attempts_home',
       'fumbles_away', 'fumbles_home', 'int_away', 'int_home',
       'turnovers_away', 'turnovers_home', 'penalties_away', 'penalties_home',
       'redzone_away', 'redzone_home', 'drives_away', 'drives_home',
       'def_st_td_away', 'def_st_td_home', 'possession_away',
       'possession_home', 'third_downs_away0', 'third_downs_away1',
       'third_downs_home0', 'third_downs_home1', 'fourth_downs_away0',
       'fourth_downs_away1', 'fourth_downs_home0', 'fourth_downs_home1',
       'comp_att_away0', 'comp_att_away1', 'comp_att_home0', 'comp_att_home1',
       '

In [23]:
# create dataframe of only regular season games
reg_season_df = df.loc[df['playoffs'] == 0 ,:]

In [24]:
reg_season_df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,passing_yards_home,rushing_yards_away,rushing_yards_home,total_yards_away,total_yards_home,comp_att_away,comp_att_home,sacks_away,sacks_home,rushing_attempts_away,rushing_attempts_home,fumbles_away,fumbles_home,int_away,int_home,turnovers_away,turnovers_home,penalties_away,penalties_home,redzone_away,redzone_home,drives_away,drives_home,def_st_td_away,def_st_td_home,possession_away,possession_home,third_downs_away0,third_downs_away1,third_downs_home0,third_downs_home1,fourth_downs_away0,fourth_downs_away1,fourth_downs_home0,fourth_downs_home1,comp_att_away0,comp_att_away1,comp_att_home0,comp_att_home1,sacks_away0,sacks_away1,sacks_home0,sacks_home1,penalties_away0,penalties_away1,penalties_home0,penalties_home1,redzone_away0,redzone_away1,redzone_home0,redzone_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home,home_win,away_win,super_bowl_winner,super_bowl_year,playoffs
0,2002-09-05,49ers,Giants,13,21,4-12,9-16,0-0,0-1,166,318,113,43,279,361,16-26,28-45,0-0,3-24,25,22,0,0,1,3,1,3,5-29,10-80,0-8,0-6,13,15,0,0,27:32:00,32:28:00,4,12,9,16,0,0,0,1,16,26,28,45,0,0,3,24,5,29,10,80,0,8,0,6,99120,116880,16,13,0.0,1.0,Buccaneers,2002,0.0
1,2002-09-08,Seahawks,Raiders,14,27,1-11,7-12,2-2,1-1,143,202,43,221,423,423,23-32,19-28,3-12,2-12,16,40,0,1,0,1,0,2,13-105,5-45,0-2,0-2,4,4,0,0,25:09:00,34:51:00,1,11,7,12,2,2,1,1,23,32,19,28,3,12,2,12,13,105,5,45,0,2,0,2,90540,125460,17,31,1.0,0.0,Buccaneers,2002,0.0
2,2002-09-08,Jets,Bills,18,26,2-8,7-17,0-0,2-2,193,242,73,142,266,384,24-30,26-39,3-17,4-29,14,32,1,1,0,2,1,3,10-90,10-82,0-9,0-8,17,18,2,0,00021:06,39:08:00,2,8,7,17,0,0,2,2,24,30,26,39,3,17,4,29,10,90,10,82,0,9,0,8,1266,140880,37,31,0.0,1.0,Buccaneers,2002,0.0
3,2002-09-08,Vikings,Bears,19,20,5-13,7-13,0-0,0-0,228,288,140,80,368,368,16-28,20-33,1-6,1-9,33,26,1,1,2,1,3,2,8-52,4-33,0-7,0-6,17,17,0,0,31:31:00,28:29:00,5,13,7,13,0,0,0,0,16,28,20,33,1,6,1,9,8,52,4,33,0,7,0,6,113460,102540,23,27,1.0,0.0,Buccaneers,2002,0.0
4,2002-09-08,Chargers,Bengals,27,13,6-10,4-11,0-0,0-0,160,167,241,36,401,203,15-19,18-31,1-00,4-31,45,13,0,0,0,1,0,1,4-39,9-57,0-7,0-5,11,15,0,0,37:48:00,00022:12,6,10,4,11,0,0,0,0,15,19,18,31,1,0,4,31,4,39,9,57,0,7,0,5,136080,1332,34,6,0.0,1.0,Buccaneers,2002,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5070,2022-01-09,49ers,Rams,23,19,9-14,10-17,0-0,1-1,314,201,135,64,449,265,24-33,21-32,3-26,5-37,31,27,0,0,2,2,2,2,6-56,3-15,2-4,3-4,11,11,0,0,36:44:00,31:24:00,9,14,10,17,0,0,1,1,24,33,21,32,3,26,5,37,6,56,3,15,2,4,3,4,132240,113040,27,24,0.0,1.0,Rams,2021,0.0
5071,2022-01-09,Patriots,Dolphins,21,23,5-10,7-15,0-0,0-1,245,103,134,195,379,298,20-30,15-22,2-16,1-6,27,43,2,0,1,0,3,0,8-78,5-33,3-4,2-3,10,9,0,2,26:24:00,33:36:00,5,10,7,15,0,0,0,1,20,30,15,22,2,16,1,6,8,78,5,33,3,4,2,3,95040,120960,24,33,1.0,0.0,Rams,2021,0.0
5072,2022-01-09,Seahawks,Cardinals,19,20,8-12,9-18,0-0,1-2,229,187,202,118,431,305,15-26,28-39,1-9,5-53,30,28,1,1,1,0,2,1,4-30,6-46,2-3,1-3,11,10,0,1,24:11:00,35:49:00,8,12,9,18,0,0,1,2,15,26,28,39,1,9,5,53,4,30,6,46,2,3,1,3,87060,128940,38,30,0.0,1.0,Rams,2021,0.0
5073,2022-01-09,Panthers,Buccaneers,18,21,4-14,4-11,2-6,1-1,207,324,110,85,317,409,29-43,29-39,2-12,1-2,26,20,1,0,1,0,2,0,1-10,2-10,2-4,3-3,12,12,0,0,35:05:00,24:55:00,4,14,4,11,2,6,1,1,29,43,29,39,2,12,1,2,1,10,2,10,2,4,3,3,126300,89700,17,41,1.0,0.0,Rams,2021,0.0


## Creating home and away DataFrames

In [25]:
# filter only for columns needed
df_home = reg_season_df[['date', 'home', 'first_downs_home', 'passing_yards_home', 
             'rushing_yards_home', 'total_yards_home', 'rushing_attempts_home', 
             'fumbles_home', 'int_home', 'turnovers_home', 'drives_home', 'def_st_td_home',
             'third_downs_home0', 'third_downs_home1', 'fourth_downs_home0','fourth_downs_home1',
             'comp_att_home0', 'comp_att_home1', 'sacks_home0', 'sacks_home1', 'penalties_home0', 'penalties_home1', 
             'possession_home_in_seconds', 'score_home', 'home_win', 'super_bowl_winner', 'super_bowl_year']]

In [26]:
# filter only for columns needed
df_away = reg_season_df[['date', 'away', 'first_downs_away', 'passing_yards_away', 
              'rushing_yards_away', 'total_yards_away', 'rushing_attempts_away', 
              'fumbles_away', 'int_away', 'turnovers_away', 'drives_away', 'def_st_td_away',
              'third_downs_away0', 'third_downs_away1', 'fourth_downs_away0', 'fourth_downs_away1', 
              'comp_att_away0', 'comp_att_away1', 'sacks_away0', 'sacks_away1', 'penalties_away0', 'penalties_away1', 
              'possession_away_in_seconds', 'score_away', 'away_win', 'super_bowl_winner', 'super_bowl_year']]

In [27]:
df_home.shape

(4891, 27)

In [28]:
df_away.shape

(4891, 27)

## 'Stacking' Home and Away DataFrames to get full season stats

In [29]:
# replace away w/ team
new_columns = [column.replace('away', 'team') for column in df_away.columns]
df_away.columns = new_columns

# repalce home w/ team
new_columns = [column.replace('home', 'team') for column in df_home.columns]
df_home.columns = new_columns

In [30]:
df_final = pd.concat([df_home,df_away], axis=0, ignore_index=True)

In [31]:
df_final.columns

Index(['date', 'team', 'first_downs_team', 'passing_yards_team',
       'rushing_yards_team', 'total_yards_team', 'rushing_attempts_team',
       'fumbles_team', 'int_team', 'turnovers_team', 'drives_team',
       'def_st_td_team', 'third_downs_team0', 'third_downs_team1',
       'fourth_downs_team0', 'fourth_downs_team1', 'comp_att_team0',
       'comp_att_team1', 'sacks_team0', 'sacks_team1', 'penalties_team0',
       'penalties_team1', 'possession_team_in_seconds', 'score_team',
       'team_win', 'super_bowl_winner', 'super_bowl_year'],
      dtype='object')

In [32]:
# fix 'total_yards_team' column some were showing as objects w/ ___
df_final['total_yards_team'] = df_final['passing_yards_team'] + df_final['rushing_yards_team']

In [33]:
# df_final.to_excel('complete_data_v4.xlsx')

In [None]:
df_final.groupby(by=['super_bowl_year','team']).mean().columns

In [None]:
df_final.groupby(by=['super_bowl_year','team']).agg({
                                                    'first_downs_team': 'mean',
                                                    'passing_yards_team':'mean', 
                                                    'rushing_yards_team':'mean',
                                                    'rushing_attempts_team':'mean', 
                                                    'fumbles_team':'mean', 
                                                    'int_team':'mean', 
                                                    'turnovers_team':'mean',
                                                    'drives_team':'mean', 
                                                    'def_st_td_team':'mean', 
                                                    'third_downs_team0':'mean',
                                                    'third_downs_team1':'mean', 
                                                    'fourth_downs_team0':'mean', 
                                                    'fourth_downs_team1':'mean',
                                                    'comp_att_team0':'mean', 
                                                    'comp_att_team1':'mean', 
                                                    'sacks_team0':'mean', 
                                                    'sacks_team1':'mean',
                                                    'penalties_team0':'mean', 
                                                    'penalties_team1':'mean', 
                                                    'possession_team_in_seconds':'mean',
                                                    'score_team':'mean', 
                                                    'team_win':'mean', 
                                                    'comp_att_team_percentage':'mean',
                                                    'third_down_team_perc':'mean', 
                                                    'fourth_down_perc_team':'mean', 
                                                    'home_win':'mean',
                                                    'third_down_perc_team':'mean', 
                                                    'is_away':'mean'
                                                    })