<a href="https://colab.research.google.com/github/jamandgar/NFL_Data_Analysis/blob/master/NFL_HallOfFame_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# CSV files to DataFrames
All data was provided by and collected from Pro Football Reference:
https://www.pro-football-reference.com/play-index/psl_finder.cgi

In [26]:
# Folder holding the exported CSV files, relative to the working directory.
path = "drive/My Drive/NFL_Data/"

# Hall of Fame list; it is narrowed to quarterbacks during data cleaning.
hof_csv = path + "All_HOF.csv"
hof_players = pd.read_csv(hof_csv)

Player data was collected for quarterbacks (QBs) who played in and retired from the National Football League (NFL) — known before 1922 as the American Professional Football Association (APFA) — between its inaugural year, 1920, and the current year, 2020. Quarterbacks who played in the All-America Football Conference (AAFC) or the American Football League (AFL) before those leagues merged with the NFL in 1949 and 1970, respectively, are also included.

### Quarterbacks

In [27]:
##Load the four quarterback CSV files from the data folder, in this order:
##  qb_champ   - Win-Loss records of QBs who have played and started in a Super
##               Bowl (SB) or in an NFL/AFL Championship game prior to 1966,
##               when the Super Bowl-era began
##  qb_passing - Passing data
##  qb_totals  - Combined passing, rushing, and/or receiving data
##  qb_misc    - Other data related to in-game statistics, e.g. years played
qb_champ, qb_passing, qb_totals, qb_misc = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "QB_Championships.csv",
        "QB_Passing.csv",
        "QB_Totals.csv",
        "QB_Miscellaneous.csv",
    )
)

# Data Cleaning

### Quarterbacks

In [28]:
##'Player' values of the Hall of Fame inductees whose position is quarterback
is_hof_qb = hof_players['Pos'] == 'QB'
hof_qbs = hof_players.loc[is_hof_qb, 'Player']

##The championship CSV is organized by the team each QB played for, which
##isn't necessary: flatten every cell into one Series with a fresh 0..n-1 index
qb_champ = qb_champ.stack().reset_index(drop=True)

##Split each entry at its last space: the text before it is the player's name
##and the text after it is the win-loss record
qb_champ = (
    qb_champ.str.rpartition(' ')
    .drop(columns=[1])
    .rename(columns={0: 'Name', 2: 'Ch Record'})
)

##Wins are the text between '(' and '-'; losses the text between '-' and ')'
record_halves = qb_champ['Ch Record'].str.partition('-')
qb_champ['Ch W'] = record_halves[0].str.partition('(')[2]
qb_champ['Ch L'] = record_halves[2].str.partition(')')[0]

##Convert the counts to integers and derive the number of games from them
i = 'int64'
qb_champ = qb_champ.astype({'Ch W': i, 'Ch L': i})
qb_champ['Ch G'] = qb_champ['Ch W'] + qb_champ['Ch L']
qb_champ = qb_champ.drop(columns=['Ch Record'])

##Some players have been in championship games with multiple teams and so
##appear on several rows: trim whitespace around the names, then total the
##rows that share a name
qb_champ['Name'] = qb_champ['Name'].str.strip()
qb_champ = qb_champ.groupby('Name', as_index=False).sum()

##The provided 'Player' column is used to uniquely identify individuals, however
##there are inconsistencies with this column between some CSV files
def _with_clean_id(stats):
    """Return a copy of ``stats`` with 'Player' renamed to 'ID' and cleaned.

    Every asterisk is deleted from the ID, together with a single space
    directly in front of it, so that the same player ends up with an identical
    ID in each CSV file.

    The pattern is written as a raw string and ``regex=True`` is passed
    explicitly. Since pandas 2.0 ``Series.str.replace`` defaults to
    ``regex=False``; an escaped pattern would then be searched for literally
    (backslash included) and no asterisk would be removed.
    """
    renamed = stats.rename(columns={'Player': 'ID'})
    # One pass with an optional leading space gives the same result as
    # removing " *" first and any remaining "*" afterwards.
    renamed['ID'] = renamed['ID'].str.replace(r" ?\*", "", regex=True)
    return renamed

qb_passing = _with_clean_id(qb_passing)
qb_totals = _with_clean_id(qb_totals)
qb_misc = _with_clean_id(qb_misc)

##Create new columns for the name of each player, their championship game stats,
##and their Hall of Fame status

##Name: the part of the ID before the backslash (and before any asterisk),
##moved to be the first column
qb_passing['Name'] = (
    qb_passing['ID'].str.partition('\\')[0].str.partition('*')[0]
)
qb_passing.insert(0, 'Name', qb_passing.pop('Name'))

##Championship stats are attached by 'Name' because the CSV file behind
##qb_champ provides no 'ID'; QBs with no matching championship row get zeros
qb_passing = pd.merge(qb_passing, qb_champ, how='left', on='Name')
qb_passing = qb_passing.fillna({'Ch W': 0, 'Ch L': 0, 'Ch G': 0})

##Hall of Fame status: whether the player's ID appears in hof_qbs
qb_misc['HOF'] = qb_misc['ID'].isin(hof_qbs)

##Index all three tables by 'ID'
qb_passing, qb_totals, qb_misc = (
    frame.set_index('ID') for frame in (qb_passing, qb_totals, qb_misc)
)

##Columns removed before combining: redundant, unneeded, and/or unusable
##because the statistic was not recorded until a specific year
passing_unused = ['From','Tm','Lg','Pick6','Sk','Yds.1','Sk%','ANY/A',
                  '4QC', 'GWD']
shared_unused = ['From','To','Draft','Tm','Lg','G','GS']

qb_passing = qb_passing.drop(columns=passing_unused)
qb_totals = qb_totals.drop(columns=shared_unused)
qb_misc = qb_misc.drop(columns=shared_unused + ['AV'])

##Combine all quarterback player data on the shared index, then order the rows
##by that index
qb_data = qb_passing.join([qb_totals, qb_misc]).sort_index()

##Players not eligible for HOF induction must be removed as well as those who
##had fewer than 20 career pass attempts, as to not skew certain stats
LAST_ELIGIBLE_FINAL_YEAR = 2014  # latest value of 'To' that is kept
MIN_CAREER_ATTEMPTS = 20         # smallest value of 'Att' that is kept

keep_row = (
    (qb_data['To'] <= LAST_ELIGIBLE_FINAL_YEAR)
    & (qb_data['Att'] >= MIN_CAREER_ATTEMPTS)
)
##.loc followed by a non-inplace drop yields an independent frame, so the
##column assignment below cannot trigger a SettingWithCopyWarning
qb_data = qb_data.loc[keep_row].drop(columns=['To'])

##Ignore the round and league in which a player was drafted: keep the text
##after the last '-' and before any "AFL" marker
draft_pick = qb_data['Draft'].str.rpartition('-')[2].str.partition("AFL")[0]

##An empty value for 'Draft' indicates an undrafted player; such players are
##given a value one greater than the largest value present.
##The filled Series is assigned back rather than calling
##qb_data['Draft'].fillna(..., inplace=True): that chained in-place call
##modifies a temporary object under pandas Copy-on-Write, which would leave the
##NaNs in place and make the int64 cast below fail.
draft_pick = draft_pick.astype('float64')
qb_data['Draft'] = draft_pick.fillna(draft_pick.max() + 1)

qb_data = qb_data.astype({'Draft': i, 'G': i, 'GS': i, 'W': i, 'L': i, 'T': i,
                          'Ch W': i, 'Ch L': i, 'Ch G': i, 'HOF': i})

qb_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 616 entries, A.J. Feeley\FeelA.00 to Zeke Bratkowski\BratZe00
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    616 non-null    object 
 1   Draft   616 non-null    int64  
 2   G       616 non-null    int64  
 3   GS      616 non-null    int64  
 4   Cmp     616 non-null    int64  
 5   Att     616 non-null    int64  
 6   Cmp%    616 non-null    float64
 7   Yds     616 non-null    int64  
 8   TD      616 non-null    int64  
 9   Int     616 non-null    int64  
 10  TD%     616 non-null    float64
 11  Int%    616 non-null    float64
 12  Rate    616 non-null    float64
 13  Y/A     616 non-null    float64
 14  AY/A    616 non-null    float64
 15  Y/G     616 non-null    float64
 16  W       616 non-null    int64  
 17  L       616 non-null    int64  
 18  T       616 non-null    int64  
 19  Ch W    616 non-null    int64  
 20  Ch L    616 non-null    int64  
 21  Ch G