<a href="https://colab.research.google.com/github/jamandgar/NFL_Data_Analysis/blob/master/NFL_HallOfFame_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [360]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# CSV files to DataFrames
Data collected from Pro Football Reference:
https://www.pro-football-reference.com/play-index/psl_finder.cgi

### League-Wide Data


In [361]:
# Folder holding the exported CSV files. The trailing slash matters: file names
# are appended directly to this string, here and in later cells.
path = "drive/My Drive/NFL_Data/"

# League-wide tables: Hall of Fame players, league MVPs, and Super Bowl MVPs.
hof_players = pd.read_csv(f"{path}All_HOF.csv")
mvps_league = pd.read_csv(f"{path}All_MVP-NFL.csv")
mvps_sb = pd.read_csv(f"{path}All_MVP-SB.csv")

### Individual Player Data
Data was collected for quarterbacks (QBs) who played in and retired from the National Football League (NFL) — known as the American Professional Football Association (APFA) prior to 1922 — between its inaugural year of 1920 and the current year of 2020. Quarterbacks who played in the All-America Football Conference (AAFC) or the American Football League (AFL) before those leagues merged with the NFL in 1949 and 1970, respectively, are also included.

##### Quarterbacks

In [362]:
#Each quarterback table is read from its own CSV file under `path`.

#Super Bowl (SB) appearances and victories; this is typically tracked only for
#the quarterback position
qb_sb_games = pd.read_csv(f"{path}QB_Championships.csv")

#Passing statistics
qb_passing = pd.read_csv(f"{path}QB_Passing.csv")

#Combined passing, rushing, and/or receiving statistics
qb_totals = pd.read_csv(f"{path}QB_Totals.csv")

#Other data related to in-game statistics, e.g. years played
qb_misc = pd.read_csv(f"{path}QB_Miscellaneous.csv")

# Data Cleaning

### League-Wide Data

In [363]:
# 'Player' values of the Hall of Fame rows whose position is 'QB'.
hof_qbs = hof_players.loc[hof_players['Pos'] == 'QB', 'Player']

# Collapse each MVP table to one row per player: the index (labelled
# 'Player ID') holds the 'Player' value and the single column counts how many
# times that player appears in the table.
mvps_league = (
    mvps_league['Player']
    .value_counts()
    .to_frame(name='League_MVPs')
    .rename_axis('Player ID')
)

mvps_sb = (
    mvps_sb['Player']
    .value_counts()
    .to_frame(name='SB_MVPs')
    .rename_axis('Player ID')
)

### Individual Player Data

##### Quarterbacks

In [364]:
##The list of Super Bowl QBs is organized by the team they played for, which
##isn't necessary: flatten every cell into one Series with a fresh 0..n-1 index
qb_sb_games = qb_sb_games.stack().reset_index(drop=True)

##Split each entry at its last space into the player's name and the win-loss
##record (the separator column produced by rpartition is discarded)
qb_sb_games = (
    qb_sb_games.str.rpartition(' ')
    .drop(columns=[1])
    .rename(columns={0: 'Name', 2: 'Record'})
)

##'Wins' is the number between '(' and '-' in the record; 'Played' is the number
##between '-' and ')' plus the wins. Both are stored as int32 and the raw record
##text is dropped once it has been parsed.
qb_sb_games = qb_sb_games.assign(
    Wins=lambda sb: (
        sb['Record'].str.partition('-')[0].str.partition('(')[2].astype('int32')
    ),
    Played=lambda sb: (
        sb['Record'].str.partition('-')[2].str.partition(')')[0].astype('int32')
        + sb['Wins']
    ),
).drop(columns=['Record'])

##The provided 'Player' column is used to uniquely identify individuals, however
##there are inconsistencies with this column between some CSV files. Rename it
##to 'ID' in every quarterback table and strip each '*' marker, together with
##the space directly before it when there is one.
##The pattern is a raw string passed with regex=True on purpose: since pandas
##2.0, str.replace treats the pattern as literal text by default (so an escaped
##asterisk would never match), and "\*" in a normal string is an invalid escape
##sequence.
for qb_table in (qb_passing, qb_totals, qb_misc):
    qb_table.rename(columns={'Player': 'ID'}, inplace=True)
    qb_table['ID'] = qb_table['ID'].str.replace(r" ?\*", "", regex=True)

##Player name: the part of 'ID' before the first backslash (and before any '*'),
##moved to the front of the passing table
qb_passing['Name'] = (
    qb_passing['ID'].str.partition('\\')[0].str.partition('*')[0]
)
qb_passing.insert(0, 'Name', qb_passing.pop('Name'))

##Hall of Fame status: True where the player's 'ID' appears in hof_qbs
##NOTE(review): this relies on hof_qbs using exactly the same ID text as the
##cleaned 'ID' column - confirm against the Hall of Fame CSV
qb_misc['HOF'] = qb_misc['ID'].isin(hof_qbs)

##Use 'ID' as the index of every quarterback table
for qb_table in (qb_passing, qb_totals, qb_misc):
    qb_table.set_index('ID', inplace=True)

##Before combining the tables, discard columns that are redundant, unneeded, or
##unusable because the statistic was not recorded until a specific year
qb_passing = qb_passing.drop(
    columns=['From','Tm','Lg','Pick6','Sk','Yds.1','Sk%','ANY/A', '4QC', 'GWD']
)
qb_totals = qb_totals.drop(
    columns=['From','To','Draft','Tm','Lg','G','GS']
)
qb_misc = qb_misc.drop(
    columns=['From','To','Draft','Tm','Lg','G','GS','AV']
)

##Join the three tables on their shared 'ID' index and order rows by 'ID'
qb_data = qb_passing.join([qb_totals, qb_misc]).sort_index()

##Players not eligible for HOF induction are removed ('To' later than 2014), as
##are those who never attempted a pass in a game; 'To' is only needed for this
##filter, so it is dropped afterwards
qb_data = qb_data.loc[
    (qb_data['To'] <= 2014) & (qb_data['Att'] > 0)
].drop(columns=['To'])

##Ignore the round and league in which a player was drafted: keep only the text
##after the last '-' and before any "AFL" suffix, then convert it to a number.
qb_data['Draft'] = (
    qb_data['Draft']
    .str.rpartition('-')[2]
    .str.partition("AFL")[0]
    .astype('float64')
)

##Fill Null Values
#Results are assigned back to the frame instead of calling
#`qb_data[col].fillna(..., inplace=True)`: that form modifies a temporary Series
#under pandas Copy-on-Write, leaving the NaNs in place and making the integer
#cast below fail.

#An empty value for 'Draft' indicates an undrafted player; such players get a
#value one larger than the largest value present
qb_data['Draft'] = qb_data['Draft'].fillna(qb_data['Draft'].max() + 1)

#Three players have missing: 'W'(Win), 'L'(Lose), and 'T'(Tie) values, which can
#be attributed to never starting a game
qb_data[['W', 'L', 'T']] = qb_data[['W', 'L', 'T']].fillna(0)

#With no nulls left, the count-like columns can be stored as integers
i = 'int64'
qb_data = qb_data.astype({'Draft': i, 'G': i, 'GS': i, 'W': i, 'L': i, 'T': i})
qb_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 760 entries, A.J. Feeley\FeelA.00 to Zeke Bratkowski\BratZe00
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    760 non-null    object 
 1   Draft   760 non-null    int64  
 2   G       760 non-null    int64  
 3   GS      760 non-null    int64  
 4   Cmp     760 non-null    int64  
 5   Att     760 non-null    int64  
 6   Cmp%    760 non-null    float64
 7   Yds     760 non-null    int64  
 8   TD      760 non-null    int64  
 9   Int     760 non-null    int64  
 10  TD%     760 non-null    float64
 11  Int%    760 non-null    float64
 12  Rate    760 non-null    float64
 13  Y/A     760 non-null    float64
 14  AY/A    760 non-null    float64
 15  Y/G     760 non-null    float64
 16  W       760 non-null    int64  
 17  L       760 non-null    int64  
 18  T       760 non-null    int64  
 19  Touch   760 non-null    int64  
 20  TotOff  760 non-null    int64  
 21  YScm