In [2]:
# Basic package
import pandas as pd
import numpy as np
import datetime as dt

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Save model for future use
import pickle

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

np.random.seed(42)

In [3]:
# Display all results in a cell, not just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
df_1 =pd.read_csv('NFL Analytics Challenge Data Set 1.csv')
df_2 =pd.read_csv('NFL Analytics Challenge Data Set 2.csv')
df_3 =pd.read_csv('NFL Analytics Challenge Data Set 3.csv')

In [5]:
# df1 observations

## fourth_downs_away needs to be switched
## fourth_downs_home needs to be switched

# df2 observations

## fourth_downs_away needs to be switched
## fourth_downs_home needs to be switched

# df3 observations

## OK

In [6]:
# Data cleaning and engineering
# Dataset 1
df_1['date'] =pd.to_datetime(df_1['date']) # Change to datetime format
df_1['away'] =df_1['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

# Convert text to number then split columns with composite number
df_1_cols_to_split =['third_downs_away','third_downs_home','fourth_downs_away','fourth_downs_home',
                     'comp_att_away','comp_att_home','sacks_away','sacks_home',
                     'penalties_away','penalties_home','redzone_away','redzone_home'
                    ]

dic ={'Jan':'1', 'Feb':'2', 'Mar':'3', 'Apr':'4', 'May':'5', 'Jun':'6',
      'Jul':'7', 'Aug':'8', 'Sep':'9', 'Oct':'10', 'Nov':'11', 'Dec':'12'
     }

for col in df_1_cols_to_split:
    df_1[col] =df_1[col].replace(dic, regex=True)
    df_1 =df_1.join(df_1[col].str.split('-', expand=True).add_prefix(col))
    
# Convert newly created columns to correct datatype to remove leading zeros
for col in df_1.columns[37:]:
    df_1[col] = df_1[col].astype('int64')
    
#df_1.info() #Validate that all columns have correct data type

# Convert two last columns to time in seconds
m =df_1['possession_away'].str.len().max()
df_1['possession_away'] =df_1['possession_away'].str.rjust(m, '0')
df_1['possession_home'] =df_1['possession_home'].str.rjust(m, '0')

df_1['possession_away_in_seconds'] =((df_1['possession_away'].str[:2].astype('int64'))*3600 +
                                            (df_1['possession_away'].str[3:5].astype('int64'))*60 +
                                            (df_1['possession_away'].str[6:].astype('int64')))

df_1['possession_home_in_seconds'] =((df_1['possession_home'].str[:2].astype('int64'))*3600 +
                                            (df_1['possession_home'].str[3:5].astype('int64'))*60 +
                                            (df_1['possession_home'].str[6:].astype('int64')))

In [7]:
# Data cleaning and engineering
# Dataset 2
df_2['date'] =pd.to_datetime(df_2['date']) # Change to datetime format
df_2['away'] =df_2['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

# Convert text to number then split columns with composite number
df_2_cols_to_split =['third_downs_away','third_downs_home','fourth_downs_away','fourth_downs_home',
                     'comp_att_away','comp_att_home','sacks_away','sacks_home',
                     'penalties_away','penalties_home','redzone_away','redzone_home'
                    ]

for col in df_2_cols_to_split:
    df_2[col] =df_2[col].replace(dic, regex=True)
    df_2 =df_2.join(df_2[col].str.split('-', expand=True).add_prefix(col))
    
# Convert newly created columns to correct datatype to remove leading zeros
for col in df_2.columns[37:]:
    df_2[col] = df_2[col].astype('int64')
    
#df_2.info() #Validate that all columns have correct data type

# Convert two last columns to time in seconds
m =df_2['possession_away'].str.len().max()
df_2['possession_away'] =df_2['possession_away'].str.rjust(m, '0')
df_2['possession_home'] =df_2['possession_home'].str.rjust(m, '0')

df_2['possession_away_in_seconds'] =((df_2['possession_away'].str[:2].astype('int64'))*3600 +
                                            (df_2['possession_away'].str[3:5].astype('int64'))*60 +
                                            (df_2['possession_away'].str[6:].astype('int64')))

df_2['possession_home_in_seconds'] =((df_2['possession_home'].str[:2].astype('int64'))*3600 +
                                            (df_2['possession_home'].str[3:5].astype('int64'))*60 +
                                            (df_2['possession_home'].str[6:].astype('int64')))

In [8]:
# Data cleaning and engineering
# Dataset 3
df_3['date'] =pd.to_datetime(df_3['date']) # Change to datetime format
df_3['away'] =df_3['away'].str.replace('.','').str.strip() # Remove punctuation and whitespace

In [9]:
# Create final dataframe
df = pd.concat([df_1, df_2],ignore_index=True)
df =df.merge(df_3, on=['date','away','home'])

In [10]:
df.info()
df.shape
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5088 entries, 0 to 5087
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        5088 non-null   datetime64[ns]
 1   away                        5088 non-null   object        
 2   home                        5088 non-null   object        
 3   first_downs_away            5088 non-null   int64         
 4   first_downs_home            5088 non-null   int64         
 5   third_downs_away            5088 non-null   object        
 6   third_downs_home            5088 non-null   object        
 7   fourth_downs_away           5088 non-null   object        
 8   fourth_downs_home           5088 non-null   object        
 9   passing_yards_away          5088 non-null   int64         
 10  passing_yards_home          5088 non-null   int64         
 11  rushing_yards_away          5088 non-null   int64       

(5088, 65)

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,penalties_home0,penalties_home1,redzone_away0,redzone_away1,redzone_home0,redzone_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home
0,2002-09-05,49ers,Giants,13,21,12-4,16-9,0-0,0-1,166,...,10,80,0,8,0,6,99120,116880,16,13
1,2002-09-08,Seahawks,Raiders,14,27,11-1,12-7,2-2,1-1,143,...,5,45,0,2,0,2,90540,125460,17,31
2,2002-09-08,Jets,Bills,18,26,8-2,17-7,0-0,2-2,193,...,10,82,0,9,0,8,1266,140880,37,31
3,2002-09-08,Vikings,Bears,19,20,13-5,13-7,0-0,0-0,228,...,4,33,0,7,0,6,113460,102540,23,27
4,2002-09-08,Chargers,Bengals,27,13,10-6,11-4,0-0,0-0,160,...,9,57,0,7,0,5,136080,1332,34,6


In [11]:
# Create classifier column, if home team win, then 1, if home team lose, then 0
df.loc[df['score_away'] < df['score_home'], 'home_win'] = 1 
df.loc[df['score_away'] > df['score_home'], 'home_win'] = 0 

# Convert to percentage
df['comp_att_away_percentage'] =df['comp_att_away0']/df['comp_att_away1']
df['comp_att_home_percentage'] =df['comp_att_home0']/df['comp_att_home1']

# Drop redzone columns
df = df.drop(columns=['redzone_away','redzone_home',
                      'redzone_away0', 'redzone_away1',
                      'redzone_home0','redzone_home1'])

# Create column with SuperBowl winner by year
winner_dict ={2002:'Buccaneers',
             2003:'Patriots',
             2004:'Patriots',
             2005:'Steelers',
             2006:'Colts',
             2007:'Giants',
             2008:'Steelers',
             2009:'Saints',
             2010:'Packers',
             2011:'Giants',
             2012:'Ravens',
             2013:'Seahawks',
             2014:'Patriots',
             2015:'Broncos',
             2016:'Patriots',
             2017:'Eagles',
             2018:'Patriots',
             2019:'Chiefs',
             2020:'Buccaneers',
             2021:'Rams'
             }
df['super_bowl_winner'] =df['date'].dt.year.map(winner_dict)

# create a dictionary that shows teams in conference/division
## HA 4/1: This code column no longer needed.
# AFC vs NFC Conferences
# each has North, South, East, and West Divisions
# conf_divisions = {
#     'Colts':'AFC South', 
#     'Ravens':'AFC North', 
#     'Seahawks':'NFC West', 
#     'Patriots':'AFC East', 
#     'Packers':'NFC North', 
#     'Giants':'NFC East',
#     'Steelers':'AFC North', 
#     'Jets':'AFC East', 
#     'Titans':'AFC South', 
#     'Panthers':'NFC South', 
#     'Eagles':'NFC East', 
#     'Chargers':'AFC West',
#     'Saints':'NFC South', 
#     '49ers':'NFC West', 
#     'Falcons':'NFC South', 
#     'Texans':'AFC South', 
#     'Jaguars':'AFC South', 
#     'Chiefs':'AFC West', 
#     'Vikings':'NFC North',
#     'Broncos':'AFC West', 
#     'Cowboys':'NFC East', 
#     'Cardinals':'NFC West', 
#     'Bengals':'AFC North', 
#     'Bills':'AFC East', 
#     'Rams':'NFC West', 
#     'Lions':'NFC North',
#     'Washington':'NFC East', 
#     'Raiders':'AFC West', 
#     'Browns':'AFC North', 
#     'Buccaneers':'NFC South', 
#     'Bears':'NFC North', 
#     'Dolphins':'AFC East'  
# }
# df['Conference_Division'] = df['away'].map(conf_divisions)

# Create column Super Bowl Year
df['super_bowl_year'] =df['date'].dt.year

# TO BE DETERMINED how to treat tie record?
#df['away_win'].unique()
#df[~df['away_win'].isin([0,1])] #11 tie record

In [12]:
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,penalties_home1,possession_away_in_seconds,possession_home_in_seconds,score_away,score_home,home_win,comp_att_away_percentage,comp_att_home_percentage,super_bowl_winner,super_bowl_year
0,2002-09-05,49ers,Giants,13,21,12-4,16-9,0-0,0-1,166,...,80,99120,116880,16,13,0.0,0.615385,0.622222,Buccaneers,2002
1,2002-09-08,Seahawks,Raiders,14,27,11-1,12-7,2-2,1-1,143,...,45,90540,125460,17,31,1.0,0.718750,0.678571,Buccaneers,2002
2,2002-09-08,Jets,Bills,18,26,8-2,17-7,0-0,2-2,193,...,82,1266,140880,37,31,0.0,0.800000,0.666667,Buccaneers,2002
3,2002-09-08,Vikings,Bears,19,20,13-5,13-7,0-0,0-0,228,...,33,113460,102540,23,27,1.0,0.571429,0.606061,Buccaneers,2002
4,2002-09-08,Chargers,Bengals,27,13,10-6,11-4,0-0,0-0,160,...,57,136080,1332,34,6,0.0,0.789474,0.580645,Buccaneers,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11-4,14-3,0-0,4-2,355,...,61,122880,93120,30,27,0.0,0.736842,0.555556,,2022
5084,2022-01-23,Bills,Chiefs,23,30,14-6,13-8,4-4,1-1,313,...,1,99420,131880,36,42,1.0,0.729730,0.750000,,2022
5085,2022-01-30,Bengals,Chiefs,21,24,14-8,12-6,0-0,0-0,243,...,2,129360,106920,27,24,0.0,0.605263,0.666667,,2022
5086,2022-01-30,49ers,Rams,16,25,9-3,18-11,0-0,0-1,232,...,2,87660,128340,17,20,1.0,0.533333,0.688889,,2022


In [13]:
# need to switch third_downs_away and third_downs_home (currently shown as opposite)

# third downs away
df[['third_downs_away','third_downs_completed_away']] = df['third_downs_away'].str.split('-',expand=True)

# third downs home
df[['third_downs_home','third_downs_completed_home']] = df['third_downs_home'].str.split('-',expand=True)

In [14]:
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,possession_home_in_seconds,score_away,score_home,home_win,comp_att_away_percentage,comp_att_home_percentage,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home
0,2002-09-05,49ers,Giants,13,21,12,16,0-0,0-1,166,...,116880,16,13,0.0,0.615385,0.622222,Buccaneers,2002,4,9
1,2002-09-08,Seahawks,Raiders,14,27,11,12,2-2,1-1,143,...,125460,17,31,1.0,0.718750,0.678571,Buccaneers,2002,1,7
2,2002-09-08,Jets,Bills,18,26,8,17,0-0,2-2,193,...,140880,37,31,0.0,0.800000,0.666667,Buccaneers,2002,2,7
3,2002-09-08,Vikings,Bears,19,20,13,13,0-0,0-0,228,...,102540,23,27,1.0,0.571429,0.606061,Buccaneers,2002,5,7
4,2002-09-08,Chargers,Bengals,27,13,10,11,0-0,0-0,160,...,1332,34,6,0.0,0.789474,0.580645,Buccaneers,2002,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0-0,4-2,355,...,93120,30,27,0.0,0.736842,0.555556,,2022,4,3
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4-4,1-1,313,...,131880,36,42,1.0,0.729730,0.750000,,2022,6,8
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0-0,0-0,243,...,106920,27,24,0.0,0.605263,0.666667,,2022,8,6
5086,2022-01-30,49ers,Rams,16,25,9,18,0-0,0-1,232,...,128340,17,20,1.0,0.533333,0.688889,,2022,3,11


In [15]:
# also need to switch fourth_downs_away and fourth_downs_home

### NOTE: need to investigate within each data file because of data issues for fourth down. Data set 1 seems to be
### correct. But dataset 2 has opposite

# fourth_downs_away
df[['fourth_downs_away','fourth_downs_completed_away']] = df['fourth_downs_away'].str.split('-',expand=True)

# fourth_downs_home
df[['fourth_downs_home','fourth_downs_completed_home']] = df['fourth_downs_home'].str.split('-',expand=True)

In [16]:
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,score_home,home_win,comp_att_away_percentage,comp_att_home_percentage,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home
0,2002-09-05,49ers,Giants,13,21,12,16,0,0,166,...,13,0.0,0.615385,0.622222,Buccaneers,2002,4,9,0,1
1,2002-09-08,Seahawks,Raiders,14,27,11,12,2,1,143,...,31,1.0,0.718750,0.678571,Buccaneers,2002,1,7,2,1
2,2002-09-08,Jets,Bills,18,26,8,17,0,2,193,...,31,0.0,0.800000,0.666667,Buccaneers,2002,2,7,0,2
3,2002-09-08,Vikings,Bears,19,20,13,13,0,0,228,...,27,1.0,0.571429,0.606061,Buccaneers,2002,5,7,0,0
4,2002-09-08,Chargers,Bengals,27,13,10,11,0,0,160,...,6,0.0,0.789474,0.580645,Buccaneers,2002,6,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,27,0.0,0.736842,0.555556,,2022,4,3,0,2
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,42,1.0,0.729730,0.750000,,2022,6,8,4,1
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,24,0.0,0.605263,0.666667,,2022,8,6,0,0
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,20,1.0,0.533333,0.688889,,2022,3,11,0,1


In [17]:
# convert third down and fourth down to decimal

# third down away
df['third_down_perc_away'] = df['third_downs_completed_away'].astype(int) / df['third_downs_away'].astype(int)

# third down home
df['third_down_home_perc'] = df['third_downs_completed_home'].astype(int) / df['third_downs_home'].astype(int)

# fourth down away (NOTE: Switched columns because of previous switching, underlying columns will be dropped)
df['fourth_down_perc_away'] = df['fourth_downs_away'].astype(int) / df['fourth_downs_completed_away'].astype(int)

# fourth down home (NOTE: Switched columns because of previous switching, underlying columns will be dropped)
df['fourth_down_perc_home'] = df['fourth_downs_home'].astype(int) / df['fourth_downs_completed_home'].astype(int)

In [18]:
df[df['fourth_down_perc_home'].isna()]

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home
3,2002-09-08,Vikings,Bears,19,20,13,13,0,0,228,...,Buccaneers,2002,5,7,0,0,0.384615,0.538462,,
4,2002-09-08,Chargers,Bengals,27,13,10,11,0,0,160,...,Buccaneers,2002,6,4,0,0,0.600000,0.363636,,
5,2002-09-08,Chiefs,Browns,24,24,11,11,0,0,276,...,Buccaneers,2002,5,4,0,0,0.454545,0.363636,,
7,2002-09-08,Eagles,Titans,17,22,15,10,1,0,181,...,Buccaneers,2002,7,6,1,0,0.466667,0.600000,1.0,
10,2002-09-08,Ravens,Panthers,15,15,14,14,0,0,212,...,Buccaneers,2002,5,5,0,0,0.357143,0.357143,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5076,2022-01-15,Patriots,Bills,20,29,14,7,4,0,216,...,,2022,7,6,4,0,0.500000,0.857143,1.0,
5079,2022-01-16,Steelers,Chiefs,19,26,16,12,2,0,201,...,,2022,7,8,2,0,0.437500,0.666667,1.0,
5080,2022-01-17,Cardinals,Rams,14,16,0,13,2,0,122,...,,2022,9,6,1,0,inf,0.461538,2.0,
5082,2022-01-22,49ers,Packers,12,14,11,12,0,0,106,...,,2022,4,5,1,0,0.363636,0.416667,0.0,


In [19]:
df.to_excel('final_dataset_TEST.xlsx')

In [20]:
# filter for january and february
df[df['date'].dt.month.isin([1,2])]

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home
256,2005-01-02,Jaguars,Raiders,13,16,18,17,1,4,149,...,Steelers,2005,8,5,1,2,0.444444,0.294118,1.0,2.0
257,2005-01-02,Steelers,Bills,15,16,19,12,1,1,105,...,Steelers,2005,8,2,1,1,0.421053,0.166667,1.0,1.0
258,2005-01-02,Packers,Bears,17,17,11,15,0,3,327,...,Steelers,2005,4,3,1,1,0.363636,0.200000,0.0,3.0
259,2005-01-02,Lions,Titans,23,15,15,13,2,0,331,...,Steelers,2005,5,5,1,0,0.333333,0.384615,2.0,
260,2005-01-02,Jets,Rams,22,21,18,13,1,0,144,...,Steelers,2005,5,5,1,0,0.277778,0.384615,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,,2022,4,3,0,2,0.363636,0.214286,,2.0
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,,2022,6,8,4,1,0.428571,0.615385,1.0,1.0
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,,2022,8,6,0,0,0.571429,0.500000,,
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,,2022,3,11,0,1,0.333333,0.611111,,0.0


In [21]:
# update year column by subtracting 1 for games in Jan/Feb
df.loc[df['date'].dt.month.isin([1,2]), 'super_bowl_year'] = df.loc[df['date'].dt.month.isin([1,2]), 'super_bowl_year'] - 1

In [22]:
# check 
df[df['date'].dt.month.isin([1,2])]

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home
256,2005-01-02,Jaguars,Raiders,13,16,18,17,1,4,149,...,Steelers,2004,8,5,1,2,0.444444,0.294118,1.0,2.0
257,2005-01-02,Steelers,Bills,15,16,19,12,1,1,105,...,Steelers,2004,8,2,1,1,0.421053,0.166667,1.0,1.0
258,2005-01-02,Packers,Bears,17,17,11,15,0,3,327,...,Steelers,2004,4,3,1,1,0.363636,0.200000,0.0,3.0
259,2005-01-02,Lions,Titans,23,15,15,13,2,0,331,...,Steelers,2004,5,5,1,0,0.333333,0.384615,2.0,
260,2005-01-02,Jets,Rams,22,21,18,13,1,0,144,...,Steelers,2004,5,5,1,0,0.277778,0.384615,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,,2021,4,3,0,2,0.363636,0.214286,,2.0
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,,2021,6,8,4,1,0.428571,0.615385,1.0,1.0
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,,2021,8,6,0,0,0.571429,0.500000,,
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,,2021,3,11,0,1,0.333333,0.611111,,0.0


In [23]:
# update the super bowl winner for rows changed
df['super_bowl_winner'] = df['super_bowl_year'].map(winner_dict)

In [24]:
# check
df

Unnamed: 0,date,away,home,first_downs_away,first_downs_home,third_downs_away,third_downs_home,fourth_downs_away,fourth_downs_home,passing_yards_away,...,super_bowl_winner,super_bowl_year,third_downs_completed_away,third_downs_completed_home,fourth_downs_completed_away,fourth_downs_completed_home,third_down_perc_away,third_down_home_perc,fourth_down_perc_away,fourth_down_perc_home
0,2002-09-05,49ers,Giants,13,21,12,16,0,0,166,...,Buccaneers,2002,4,9,0,1,0.333333,0.562500,,0.0
1,2002-09-08,Seahawks,Raiders,14,27,11,12,2,1,143,...,Buccaneers,2002,1,7,2,1,0.090909,0.583333,1.0,1.0
2,2002-09-08,Jets,Bills,18,26,8,17,0,2,193,...,Buccaneers,2002,2,7,0,2,0.250000,0.411765,,1.0
3,2002-09-08,Vikings,Bears,19,20,13,13,0,0,228,...,Buccaneers,2002,5,7,0,0,0.384615,0.538462,,
4,2002-09-08,Chargers,Bengals,27,13,10,11,0,0,160,...,Buccaneers,2002,6,4,0,0,0.600000,0.363636,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2022-01-23,Rams,Buccaneers,24,20,11,14,0,4,355,...,Rams,2021,4,3,0,2,0.363636,0.214286,,2.0
5084,2022-01-23,Bills,Chiefs,23,30,14,13,4,1,313,...,Rams,2021,6,8,4,1,0.428571,0.615385,1.0,1.0
5085,2022-01-30,Bengals,Chiefs,21,24,14,12,0,0,243,...,Rams,2021,8,6,0,0,0.571429,0.500000,,
5086,2022-01-30,49ers,Rams,16,25,9,18,0,0,232,...,Rams,2021,3,11,0,1,0.333333,0.611111,,0.0


In [25]:
df.columns

Index(['date', 'away', 'home', 'first_downs_away', 'first_downs_home',
       'third_downs_away', 'third_downs_home', 'fourth_downs_away',
       'fourth_downs_home', 'passing_yards_away', 'passing_yards_home',
       'rushing_yards_away', 'rushing_yards_home', 'total_yards_away',
       'total_yards_home', 'comp_att_away', 'comp_att_home', 'sacks_away',
       'sacks_home', 'rushing_attempts_away', 'rushing_attempts_home',
       'fumbles_away', 'fumbles_home', 'int_away', 'int_home',
       'turnovers_away', 'turnovers_home', 'penalties_away', 'penalties_home',
       'drives_away', 'drives_home', 'def_st_td_away', 'def_st_td_home',
       'possession_away', 'possession_home', 'third_downs_away0',
       'third_downs_away1', 'third_downs_home0', 'third_downs_home1',
       'fourth_downs_away0', 'fourth_downs_away1', 'fourth_downs_home0',
       'fourth_downs_home1', 'comp_att_away0', 'comp_att_away1',
       'comp_att_home0', 'comp_att_home1', 'sacks_away0', 'sacks_away1',
       'sa

In [26]:
df['Match_Winner'] = np.where(df['score_away'] > df['score_home'], df['away'],
                        np.where(df['score_away'] < df['score_home'], df['home'],''))

In [27]:
df_home =df[['date', 'home', 'first_downs_home', 'third_downs_home',
       'fourth_downs_home', 'passing_yards_home', 'rushing_yards_home',
       'total_yards_home', 'comp_att_home','sacks_home', 'rushing_attempts_home', 
        'fumbles_home', 'int_home', 'turnovers_home', 'penalties_home','drives_home', 
        'def_st_td_home','possession_home','third_downs_home0', 'third_downs_home1',
        'fourth_downs_home0','fourth_downs_home1','comp_att_home0', 'comp_att_home1',
       'sacks_home0', 'sacks_home1', 'penalties_home0', 'penalties_home1',
       'possession_home_in_seconds', 'score_home', 'home_win','comp_att_home_percentage',
       'super_bowl_winner', 'super_bowl_year','third_downs_completed_home',
       'fourth_downs_completed_home','third_down_home_perc',
       'fourth_down_perc_home', 'Match_Winner']]

In [28]:
df_away=df[['date', 'away', 'first_downs_away','third_downs_away', 'fourth_downs_away', 'passing_yards_away',
       'rushing_yards_away', 'total_yards_away', 'comp_att_away', 'sacks_away', 'rushing_attempts_away',
       'fumbles_away', 'int_away',
       'turnovers_away', 'penalties_away','drives_away', 'def_st_td_away','possession_away', 'third_downs_away0',
       'third_downs_away1','fourth_downs_away0', 'fourth_downs_away1', 'comp_att_away0', 'comp_att_away1',
       'sacks_away0', 'sacks_away1','penalties_away0', 'penalties_away1',
       'possession_away_in_seconds','score_away','home_win','comp_att_away_percentage',
       'super_bowl_winner', 'super_bowl_year', 'third_downs_completed_away','fourth_downs_completed_away',
       'third_down_perc_away','fourth_down_perc_away', 'Match_Winner']]

In [29]:
df_home.shape

(5088, 39)

In [30]:
df_home.isna().sum()

date                              0
home                              0
first_downs_home                  0
third_downs_home                  0
fourth_downs_home                 0
passing_yards_home                0
rushing_yards_home                0
total_yards_home                  0
comp_att_home                     0
sacks_home                        0
rushing_attempts_home             0
fumbles_home                      0
int_home                          0
turnovers_home                    0
penalties_home                    0
drives_home                       0
def_st_td_home                    0
possession_home                   0
third_downs_home0                 0
third_downs_home1                 0
fourth_downs_home0                0
fourth_downs_home1                0
comp_att_home0                    0
comp_att_home1                    0
sacks_home0                       0
sacks_home1                       0
penalties_home0                   0
penalties_home1             

In [31]:
df_away.shape

(5088, 39)

In [32]:
df_away.isna().sum()

date                              0
away                              0
first_downs_away                  0
third_downs_away                  0
fourth_downs_away                 0
passing_yards_away                0
rushing_yards_away                0
total_yards_away                  0
comp_att_away                     0
sacks_away                        0
rushing_attempts_away             0
fumbles_away                      0
int_away                          0
turnovers_away                    0
penalties_away                    0
drives_away                       0
def_st_td_away                    0
possession_away                   0
third_downs_away0                 0
third_downs_away1                 0
fourth_downs_away0                0
fourth_downs_away1                0
comp_att_away0                    0
comp_att_away1                    0
sacks_away0                       0
sacks_away1                       0
penalties_away0                   0
penalties_away1             

In [33]:
df_away['is_away']=1
df_home['is_away']=0

new_columns = [column.replace('away', 'team') for column in df_away.columns]
df_away.columns = new_columns

new_columns = [column.replace('home', 'team') for column in df_home.columns]
df_home.columns = new_columns

In [34]:
df_final =pd.concat([df_home,df_away], axis=0, ignore_index=True)

In [35]:
df_final.groupby(by=['super_bowl_year','team']).mean().columns

Index(['first_downs_team', 'passing_yards_team', 'rushing_yards_team',
       'rushing_attempts_team', 'fumbles_team', 'int_team', 'turnovers_team',
       'drives_team', 'def_st_td_team', 'third_downs_team0',
       'third_downs_team1', 'fourth_downs_team0', 'fourth_downs_team1',
       'comp_att_team0', 'comp_att_team1', 'sacks_team0', 'sacks_team1',
       'penalties_team0', 'penalties_team1', 'possession_team_in_seconds',
       'score_team', 'team_win', 'comp_att_team_percentage',
       'third_down_team_perc', 'fourth_down_perc_team', 'is_away', 'home_win',
       'third_down_perc_team', 'is_team'],
      dtype='object')

In [36]:
df_final.groupby(by=['super_bowl_year','team']).agg({
                                                    'first_downs_team': 'mean',
                                                    'passing_yards_team':'mean', 
                                                    'rushing_yards_team':'mean',
                                                    'rushing_attempts_team':'mean', 
                                                    'fumbles_team':'mean', 
                                                    'int_team':'mean', 
                                                    'turnovers_team':'mean',
                                                    'drives_team':'mean', 
                                                    'def_st_td_team':'mean', 
                                                    'third_downs_team0':'mean',
                                                    'third_downs_team1':'mean', 
                                                    'fourth_downs_team0':'mean', 
                                                    'fourth_downs_team1':'mean',
                                                    'comp_att_team0':'mean', 
                                                    'comp_att_team1':'mean', 
                                                    'sacks_team0':'mean', 
                                                    'sacks_team1':'mean',
                                                    'penalties_team0':'mean', 
                                                    'penalties_team1':'mean', 
                                                    'possession_team_in_seconds':'mean',
                                                    'score_team':'mean', 
                                                    'team_win':'mean', 
                                                    'comp_att_team_percentage':'mean',
                                                    'third_down_team_perc':'mean', 
                                                    'fourth_down_perc_team':'mean', 
                                                    'home_win':'mean',
                                                    'third_down_perc_team':'mean', 
                                                    'is_away':'mean'
                                                    })

Unnamed: 0_level_0,Unnamed: 1_level_0,first_downs_team,passing_yards_team,rushing_yards_team,rushing_attempts_team,fumbles_team,int_team,turnovers_team,drives_team,def_st_td_team,third_downs_team0,...,penalties_team1,possession_team_in_seconds,score_team,team_win,comp_att_team_percentage,third_down_team_perc,fourth_down_perc_team,home_win,third_down_perc_team,is_away
super_bowl_year,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2002,49ers,21.055556,221.055556,133.111111,29.0000,0.555556,0.777778,1.333333,15.000000,0.166667,14.5000,...,37.111111,104913.944444,22.888889,0.666667,0.616331,0.503899,1.545455,0.444444,0.597572,0.0
2002,Bears,16.250000,190.687500,84.000000,23.8750,1.062500,1.125000,2.187500,17.250000,0.125000,13.1250,...,52.437500,86728.625000,17.562500,0.375000,0.663036,0.370739,1.181818,0.875000,0.305291,0.0
2002,Bengals,19.687500,217.250000,108.125000,26.6250,0.812500,1.375000,2.187500,16.750000,0.437500,13.9375,...,47.125000,84057.937500,17.437500,0.125000,0.597892,0.393524,0.937500,0.875000,0.379876,0.0
2002,Bills,22.187500,249.687500,99.750000,24.2500,1.000000,0.937500,1.937500,15.875000,0.312500,13.2500,...,62.375000,112781.250000,23.687500,0.625000,0.610519,0.419815,1.000000,0.625000,0.415499,0.0
2002,Broncos,22.312500,239.000000,141.625000,28.5625,0.437500,1.250000,1.687500,14.687500,0.375000,12.6250,...,49.250000,107268.312500,24.500000,0.625000,0.657844,0.394219,1.115385,0.500000,0.367041,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,Steelers,19.333333,181.666667,108.333333,27.0000,0.333333,0.666667,1.000000,13.666667,0.333333,17.0000,...,46.333333,122060.000000,21.000000,1.000000,0.620883,0.277778,1.333333,0.500000,0.395221,0.0
2021,Texans,16.500000,212.500000,75.000000,24.0000,0.000000,0.500000,0.500000,11.000000,0.000000,13.0000,...,66.000000,102510.000000,16.000000,0.000000,0.676610,0.363636,0.500000,1.000000,0.333333,0.0
2021,Titans,18.666667,201.333333,154.000000,33.0000,0.000000,1.000000,1.000000,10.666667,0.000000,12.0000,...,38.000000,112020.000000,26.000000,0.500000,0.688657,0.295833,0.500000,0.000000,0.615385,0.0
2021,Vikings,11.000000,203.000000,65.500000,16.5000,0.000000,0.000000,0.000000,11.000000,0.500000,12.5000,...,25.000000,1340.000000,20.500000,1.000000,0.625874,0.538462,4.000000,1.000000,0.166667,0.0


In [37]:
# delete the conference_division column (what value does it provide?)

In [38]:
# differentiate between playoffs and regular season???

# correct games played to correct super bowl year
# all games in February and January belong to previous year
# filter games in this range


# identify super bowl winner in each season (DONE/UPDATED)

### TO DO for monday:

In [39]:
# step 1
# aggregate data by year (sum yards, etc.)
# groupby
# how to group? sum, average

# step 2
# vertically combine all data to get FULL team season statistics

# step 3
# for each year, add column indicating 0 did not win SB, 1 won SB




In [40]:
# Only considered matches before 2022
df =df[df['date']<'2022-01-01']

# To be discussed
# Only considered no tie matches
df =df[df['home_win'].isin([0,1])]

In [41]:
# PLESE CHANGE THIS CELL!!!!!!!!!!!!

#df_final.isna().sum()
# Plug NaN value with dummy to build quick model
for i in ['team_win','third_down_team_perc','fourth_down_perc_team','home_win','third_down_perc_team','is_away']:
    df_final[i] =df_final[i].fillna(df_final[i].mean())

In [42]:
df_final.columns

Index(['date', 'team', 'first_downs_team', 'third_downs_team',
       'fourth_downs_team', 'passing_yards_team', 'rushing_yards_team',
       'total_yards_team', 'comp_att_team', 'sacks_team',
       'rushing_attempts_team', 'fumbles_team', 'int_team', 'turnovers_team',
       'penalties_team', 'drives_team', 'def_st_td_team', 'possession_team',
       'third_downs_team0', 'third_downs_team1', 'fourth_downs_team0',
       'fourth_downs_team1', 'comp_att_team0', 'comp_att_team1', 'sacks_team0',
       'sacks_team1', 'penalties_team0', 'penalties_team1',
       'possession_team_in_seconds', 'score_team', 'team_win',
       'comp_att_team_percentage', 'super_bowl_winner', 'super_bowl_year',
       'third_downs_completed_team', 'fourth_downs_completed_team',
       'third_down_team_perc', 'fourth_down_perc_team', 'Match_Winner',
       'is_away', 'home_win', 'third_down_perc_team', 'is_team'],
      dtype='object')

In [43]:
X = df_final.drop(columns=['first_downs_team', 'third_downs_team',
       'fourth_downs_team', 'passing_yards_team', 'rushing_yards_team',
       'total_yards_team', 'comp_att_team', 'sacks_team',
       'rushing_attempts_team', 'fumbles_team', 'int_team', 'turnovers_team',
       'penalties_team', 'drives_team', 'def_st_td_team', 'possession_team',
       'score_team', 'team_win','comp_att_team_percentage',
       'super_bowl_winner', 'super_bowl_year',
       'third_downs_completed_team', 'fourth_downs_completed_team',
       'third_down_team_perc', 'fourth_down_perc_team', 'Match_Winner',
       'home_win', 'third_down_perc_team', 'is_away'])

y = df_final['home_win']

In [44]:
df_final

Unnamed: 0,date,team,first_downs_team,third_downs_team,fourth_downs_team,passing_yards_team,rushing_yards_team,total_yards_team,comp_att_team,sacks_team,...,super_bowl_year,third_downs_completed_team,fourth_downs_completed_team,third_down_team_perc,fourth_down_perc_team,Match_Winner,is_away,home_win,third_down_perc_team,is_team
0,2002-09-05,Giants,21,16,0,318,43,361,28-45,24-3,...,2002,9,1,0.562500,0.000000,49ers,0.0,0.568249,inf,
1,2002-09-08,Raiders,27,12,1,202,221,423,19-28,12-2,...,2002,7,1,0.583333,1.000000,Raiders,0.0,0.568249,inf,
2,2002-09-08,Bills,26,17,2,242,142,384,26-39,29-4,...,2002,7,2,0.411765,1.000000,Jets,0.0,0.568249,inf,
3,2002-09-08,Bears,20,13,0,288,80,368,20-33,9-1,...,2002,7,0,0.538462,0.906688,Bears,0.0,0.568249,inf,
4,2002-09-08,Bengals,13,11,0,167,36,203,18-31,4-31,...,2002,4,0,0.363636,0.906688,Chargers,0.0,0.568249,inf,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10171,2022-01-23,Rams,24,11,0,355,73,428,28-38,11-2,...,2021,4,0,inf,0.906688,Rams,0.0,0.000000,0.363636,1.0
10172,2022-01-23,Bills,23,14,4,313,109,422,27-37,16-2,...,2021,6,4,inf,1.000000,Chiefs,0.0,1.000000,0.428571,1.0
10173,2022-01-30,Bengals,21,14,0,243,116,359,23-38,7-1,...,2021,8,0,inf,0.906688,Bengals,0.0,0.000000,0.571429,1.0
10174,2022-01-30,49ers,16,9,0,232,50,282,16-30,0-0,...,2021,3,0,inf,0.906688,Rams,0.0,1.000000,0.333333,1.0


In [45]:
df_final['is_superbowl_winner'] =np.where(df_final['team']==df_final['super_bowl_winner'], 1,0)
df_final['is_match_winner']=np.where(df_final['team']==df_final['Match_Winner'],1,0)

In [46]:
df_final_sample =df_final[df_final['super_bowl_year']==2018]

In [48]:
df_final.head(2)

Unnamed: 0,date,team,first_downs_team,third_downs_team,fourth_downs_team,passing_yards_team,rushing_yards_team,total_yards_team,comp_att_team,sacks_team,...,fourth_downs_completed_team,third_down_team_perc,fourth_down_perc_team,Match_Winner,is_away,home_win,third_down_perc_team,is_team,is_superbowl_winner,is_match_winner
0,2002-09-05,Giants,21,16,0,318,43,361,28-45,24-3,...,1,0.5625,0.0,49ers,0.0,0.568249,inf,,0,0
1,2002-09-08,Raiders,27,12,1,202,221,423,19-28,12-2,...,1,0.583333,1.0,Raiders,0.0,0.568249,inf,,0,1


In [49]:
df_final.columns

Index(['date', 'team', 'first_downs_team', 'third_downs_team',
       'fourth_downs_team', 'passing_yards_team', 'rushing_yards_team',
       'total_yards_team', 'comp_att_team', 'sacks_team',
       'rushing_attempts_team', 'fumbles_team', 'int_team', 'turnovers_team',
       'penalties_team', 'drives_team', 'def_st_td_team', 'possession_team',
       'third_downs_team0', 'third_downs_team1', 'fourth_downs_team0',
       'fourth_downs_team1', 'comp_att_team0', 'comp_att_team1', 'sacks_team0',
       'sacks_team1', 'penalties_team0', 'penalties_team1',
       'possession_team_in_seconds', 'score_team', 'team_win',
       'comp_att_team_percentage', 'super_bowl_winner', 'super_bowl_year',
       'third_downs_completed_team', 'fourth_downs_completed_team',
       'third_down_team_perc', 'fourth_down_perc_team', 'Match_Winner',
       'is_away', 'home_win', 'third_down_perc_team', 'is_team',
       'is_superbowl_winner', 'is_match_winner'],
      dtype='object')

# Model

In [86]:
df_model = df_final[['team','first_downs_team', 'third_downs_team','fourth_downs_team',
                     'super_bowl_year',
                     'score_team',
                     'is_away','is_match_winner',
                     'is_superbowl_winner']]

# Cast to correct type to run XGBoost

df_model['third_downs_team'] =df_model['third_downs_team'].astype('int64')
df_model['fourth_downs_team'] =df_model['fourth_downs_team'].astype('int64')

X=df_model.drop(columns=['team','is_superbowl_winner'])
y=df_model['is_superbowl_winner']

In [87]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10176 entries, 0 to 10175
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   first_downs_team   10176 non-null  int64  
 1   third_downs_team   10176 non-null  int64  
 2   fourth_downs_team  10176 non-null  int64  
 3   super_bowl_year    10176 non-null  int64  
 4   score_team         10176 non-null  int64  
 5   is_away            10176 non-null  float64
 6   is_match_winner    10176 non-null  int32  
dtypes: float64(1), int32(1), int64(5)
memory usage: 516.9 KB


## Remedy for imbalanced dataset

In [69]:
# Break index for prediction

#from collections import Counter
#from imblearn.combine import SMOTEENN

#smote_enn = SMOTEENN(random_state=42)
#X_resampled, y_resampled = smote_enn.fit_resample(X, y)

#print(sorted(Counter(y).items()))
#print(sorted(Counter(y_resampled).items()))

#X = X_resampled
#y = y_resampled

## Logistics Regression

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [90]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)

yhat_binary = LR.predict(X_train)
print("Accuracy:",metrics.accuracy_score(y_train, yhat_binary))

Accuracy: 0.964060437142438


In [91]:
yhat = LR.predict(X_test)
yhat

yhat_prob = LR.predict_proba(X_test)
yhat_prob

array([0, 0, 0, ..., 0, 0, 0])

array([[0.97998448, 0.02001552],
       [0.93466475, 0.06533525],
       [0.92481769, 0.07518231],
       ...,
       [0.98355096, 0.01644904],
       [0.95075045, 0.04924955],
       [0.97505579, 0.02494421]])

In [92]:
teams =df_model['team']
year =df_model['super_bowl_year']

In [93]:
#Recreate index to get final probability table

ind_list =X_test.index
teams =df_model['team'].iloc[ind_list]
year =df_model['super_bowl_year'].iloc[ind_list]

In [94]:
lr_rank = pd.DataFrame({'Team':teams, 'Year':year, 'Prediction':yhat_prob[:,1]}) 
lr_rank.sort_values(by=['Year','Prediction'], ascending=False)

Unnamed: 0,Team,Year,Prediction
5046,Seahawks,2021,0.094885
5079,Chiefs,2021,0.076300
5071,Dolphins,2021,0.058170
5053,Chargers,2021,0.054464
5064,Vikings,2021,0.051031
...,...,...,...
5120,Jets,2002,0.013642
5227,Washington,2002,0.013268
169,Panthers,2002,0.012987
5143,Giants,2002,0.012748


## Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [96]:
yhat_prob_clf = clf.predict_proba(X_test)
yhat_prob_clf

array([[0.98006339, 0.01993661],
       [0.94647409, 0.05352591],
       [0.9441788 , 0.0558212 ],
       ...,
       [0.97811028, 0.02188972],
       [0.95198491, 0.04801509],
       [0.9754977 , 0.0245023 ]])

In [97]:
clf_rank = pd.DataFrame({'Team':teams, 'Year':year, 'Prediction':yhat_prob_clf[:,1]}) 
clf_rank.sort_values(by=['Year','Prediction'], ascending=False)

Unnamed: 0,Team,Year,Prediction
5046,Seahawks,2021,0.067709
5079,Chiefs,2021,0.062625
5071,Dolphins,2021,0.059386
5053,Chargers,2021,0.058333
5064,Vikings,2021,0.054805
...,...,...,...
5239,Cardinals,2002,0.020080
31,Washington,2002,0.019501
5218,Lions,2002,0.019341
5244,Ravens,2002,0.019262


## XGBoost

In [98]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [99]:
yhat_prob_xgb_clf = xgb_clf.predict_proba(X_test)
yhat_prob_xgb_clf

array([[9.9900299e-01, 9.9702552e-04],
       [9.8839313e-01, 1.1606887e-02],
       [9.0820849e-01, 9.1791518e-02],
       ...,
       [9.9829280e-01, 1.7071733e-03],
       [9.2328495e-01, 7.6715074e-02],
       [9.9901915e-01, 9.8087743e-04]], dtype=float32)

In [100]:
xgb_clf_rank = pd.DataFrame({'Team':teams, 'Year':year, 'Prediction':yhat_prob_clf[:,1]}) 
xgb_clf_rank.sort_values(by=['Year','Prediction'], ascending=False)

Unnamed: 0,Team,Year,Prediction
5046,Seahawks,2021,0.067709
5079,Chiefs,2021,0.062625
5071,Dolphins,2021,0.059386
5053,Chargers,2021,0.058333
5064,Vikings,2021,0.054805
...,...,...,...
5239,Cardinals,2002,0.020080
31,Washington,2002,0.019501
5218,Lions,2002,0.019341
5244,Ravens,2002,0.019262


In [105]:
from sklearn.ensemble import VotingClassifier

vote_clf = VotingClassifier(estimators=[
        ('lr', LR), ('clf', clf), ('xgb', xgb_clf)],
        voting='soft', weights=[1,1,1])

vote_clf = vote_clf.fit(X_train, y_train)

print(vote_clf.predict(X_train))

print(vote_clf.transform(X_train).shape)

[0 0 0 ... 0 0 0]
(6817, 6)


In [106]:
yhat_prob_vote_clf = vote_clf.predict_proba(X_test)
yhat_prob_vote_clf

array([[0.98635029, 0.01364972],
       [0.95651066, 0.04348935],
       [0.925735  , 0.07426501],
       ...,
       [0.98665135, 0.01334865],
       [0.94200677, 0.05799324],
       [0.98319088, 0.01680913]])

In [107]:
vote_clf_rank = pd.DataFrame({'Team':teams, 'Year':year, 'Prediction':yhat_prob_vote_clf[:,1]}) 
vote_clf_rank.sort_values(by=['Year','Prediction'], ascending=False)

Unnamed: 0,Team,Year,Prediction
10137,Raiders,2021,0.253939
5064,Vikings,2021,0.129793
5046,Seahawks,2021,0.083946
10140,Rams,2021,0.077735
10139,Eagles,2021,0.054581
...,...,...,...
5120,Jets,2002,0.012411
5218,Lions,2002,0.011900
5227,Washington,2002,0.011492
31,Washington,2002,0.011423


In [None]:
# Todo
# Convert data to correct type before modeling

# Model consideration
## Add more parameter
## Only by year?
## Hyperparameter tuning