In [36]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [37]:
quarterbacks_df = pd.read_csv('../../ProData/csvs/qb_stats.csv', header=0)

In [38]:
drop_columns = [col for col in quarterbacks_df.columns if col.startswith(('catch', 'rec', 'targets'))]

In [39]:
drop_columns.extend(['Unnamed: 0', 'team', 'rush_receive_td', 'av'])

In [40]:
qbs_df = quarterbacks_df.drop(drop_columns, axis=1)

In [41]:
# Add column with value 1 so that when we group by player 
# and sum we get a count of the number of seasons
qbs_df['season_count'] = 1

# Fill nulls and zeros

## Fillna for each player with the player's average
QBR was only available starting in 2006 and snap data in 2012. For players that span across those time ranges, use their average to fill data from time before it was available.

In [42]:
fillna_cols = ['qbr', 'snap_played_percentage', 'snaps_played', 'total_snaps']

In [43]:
def fill_with_player_averages(group):
    group[fillna_cols] = group[fillna_cols].replace(0, group[fillna_cols].mean())
    group[fillna_cols] = group[fillna_cols].fillna(group[fillna_cols].mean())
    return group

In [44]:
new = qbs_df.groupby('player_id', as_index=False).apply(fill_with_player_averages)

## Fill zeros and nulls with all average

In [45]:
new[fillna_cols] = new[fillna_cols].fillna(new[fillna_cols].mean())

In [46]:
new[fillna_cols] = new[fillna_cols].replace(0, new[fillna_cols].mean())

In [47]:
new.dropna(inplace=True)

In [48]:
len(new)

1029

In [49]:
len(qbs_df)

1099

# Aggregate career statistics for each player

In [50]:
career_stats = new.groupby('player_id').sum()

In [51]:
per_game_avg_fields = [
    ('pass_att_per_game', 'pass_att'), 
    ('pass_yds_per_g', 'pass_yds'), 
    ('rush_yds_per_g', 'rush_yds'),
    ('rush_att_per_g', 'rush_att'),
    ('fumbles_per_g', 'fumbles'),
    ('pass_td_per_g', 'pass_td'),
    ('rush_td_per_g', 'rush_td')
]

In [52]:
for (new_field, metric) in per_game_avg_fields:
    career_stats[new_field] = career_stats[metric] / career_stats.g

In [53]:
career_stats['win_percentage'] = career_stats['wins'] / career_stats.gs
career_stats['pass_yds_per_cmp'] = career_stats.pass_yds / career_stats.pass_cmp
career_stats['rush_yds_per_att'] = career_stats.rush_yds / career_stats.rush_att
career_stats['qbr_avg'] = career_stats.qbr / career_stats.season_count
career_stats['avg_pass_rating_per_season'] = career_stats.pass_rating / career_stats.season_count
career_stats['snap_percentage'] = career_stats.snaps_played / career_stats.total_snaps

In [54]:
per_attempt_stats = [
    ('pass_cmp_perc', 'pass_cmp'), 
    ('pass_td_perc', 'pass_td'), 
    ('pass_int_perc', 'pass_int'),
    ('pass_yds_per_att', 'pass_yds'),  
]

In [55]:
for (new_field, metric) in per_attempt_stats:
    career_stats[new_field] = career_stats[metric] / career_stats.pass_att

In [56]:
career_stats.query('player_id == "BradTo00"')[[
    'pass_yds_per_g', 'pass_cmp_perc', 'pass_yds_per_cmp', 'season_count',
    'qbr_avg', 'win_percentage', 'avg_pass_rating_per_season', 'snap_percentage'
]]

Unnamed: 0_level_0,pass_yds_per_g,pass_cmp_perc,pass_yds_per_cmp,season_count,qbr_avg,win_percentage,avg_pass_rating_per_season,snap_percentage
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BradTo00,266.358306,0.640707,11.680046,20,67.071591,0.767974,97.9,0.974461


# Remove unecessary fields

In [57]:
player_to_seasons_played = career_stats['season_count']

In [58]:
career_stats_clean = career_stats.drop([
    'age', 'fumbles', 'pass_long', 'pass_adj_yds_per_att',
    'pass_rating', 'qbr', 'pass_sacked_perc',
    'pass_net_yds_per_att', 'pass_adj_net_yds_per_att',
    'rush_long', 'touches', 'yds_per_touch', 
    'yds_from_scrimmage', 'total_snaps', 'snap_played_percentage',
    # Test
    'g', 'gs', 'snaps_played', 'season_count', 'rush_att', 'rush_td'
], axis=1)

## Remove Fields that penalize longevity
`[pass_int, pass_sacked, pass_sacked_yds, wins, losses, ties]`
* use pass_int_perc instead of total interceptions
* prefer win percentage to wins/ties/losses

In [59]:
career_stats_clean.drop([
    'pass_int', 'pass_sacked', 'pass_sacked_yds', 'wins', 'losses', 'ties',
    'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_first_down', 
    'rush_yds', 'rush_first_down',
], axis=1, inplace=True)

In [60]:
normalized_qb_df = career_stats_clean.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

## Penalize Negative fields (lower is better)
`[pass_int_perc, fumbles`]

In [61]:
negative_fields = ['pass_int_perc', 'fumbles_per_g']
normalized_qb_df[negative_fields] = normalized_qb_df[negative_fields] * -1

In [62]:
normalized_qb_df.query('player_id == "BradTo00"').to_dict()

{'pass_cmp_perc': {'BradTo00': 0.6407065068179738},
 'pass_td_perc': {'BradTo00': 0.27729477441200695},
 'pass_int_perc': {'BradTo00': -0.08968609865470852},
 'pass_yds_per_att': {'BradTo00': 0.6525754834531259},
 'pass_yds_per_cmp': {'BradTo00': 0.2995286387658906},
 'pass_yds_per_g': {'BradTo00': 0.8787632079299167},
 'comebacks': {'BradTo00': 1.0},
 'gwd': {'BradTo00': 0.9433962264150944},
 'rush_yds_per_att': {'BradTo00': 0.3332126696832579},
 'rush_yds_per_g': {'BradTo00': 0.09672335531225582},
 'rush_att_per_g': {'BradTo00': 0.19047126420712696},
 'win_percentage': {'BradTo00': 0.7679738562091504},
 'pass_att_per_game': {'BradTo00': 0.8953836759587969},
 'fumbles_per_g': {'BradTo00': -0.20846905537459284},
 'pass_td_per_g': {'BradTo00': 0.8101998152559676},
 'rush_td_per_g': {'BradTo00': 0.15728245695672405},
 'qbr_avg': {'BradTo00': 0.9224737039819683},
 'avg_pass_rating_per_season': {'BradTo00': 0.8389031705227078},
 'snap_percentage': {'BradTo00': 0.9735966661221026}}

In [63]:
normalized_qb_df.fillna(0, inplace=True)

In [64]:
len(normalized_qb_df.columns)

19

In [65]:
pca = PCA(n_components=1)

In [66]:
pca_analysis = pd.DataFrame(
    pca.fit_transform(normalized_qb_df), 
    columns=['PCA%i' % i for i in range(1)], 
    index=normalized_qb_df.index)

# Aggregate performance metric with career stats

In [67]:
career_stats_final = pd.merge(player_to_seasons_played, pca_analysis, left_index=True, right_index=True)

In [68]:
career_stats_final['career_score'] = career_stats_final.PCA0
career_stats_final.drop('PCA0', axis=1, inplace=True)

In [73]:
#career_stats_final.to_csv('../../ProData/qb_career_stats_no_career_totals.csv')

In [69]:
career_stats_final = pd.read_csv('../../ProData/qb_career_stats_no_career_totals.csv')

# Plot Age vs Feature

In [70]:
top_50 = career_stats_final.sort_values('career_score', axis=0, ascending=False).head(50)

In [72]:
top_50[['player_id', 'career_score', 'season_count']].head(20)

Unnamed: 0,player_id,career_score,season_count
10,BradTo00,1.299568,20
11,BreeDr00,1.258259,20
144,RoetBe00,1.142879,18
95,MahoPa00,1.137151,5
149,RyanMa00,1.107806,14
186,WilsRu00,1.105767,10
143,RodgAa00,1.078353,17
162,StafMa00,1.062311,13
92,LuckAn00,1.048096,6
179,WatsDe00,0.997253,4
