In [80]:
import pandas as pd
import numpy as np
import re
from sklearn.decomposition import PCA

In [3]:
skill_players = pd.read_csv('../../ProData/csvs/skill_player_stats.csv', header=0)

In [5]:
drop_columns = ['Unnamed: 0', 'team', 'av']

In [6]:
skills_df = skill_players.drop(drop_columns, axis=1)

In [20]:
# Add column with value 1 so that when we group by player 
# and sum we get a count of the number of seasons
skills_df['season_count'] = 1

# Fill nulls and zeros

## Fillna for each player with the player's average
QBR was only available starting in 2006 and snap data in 2012. For players that span across those time ranges, use their average to fill data from time before it was available.

In [8]:
fillna_cols = ['snap_played_percentage', 'snaps_played', 'total_snaps']

In [9]:
def fill_with_player_averages(group):
    group[fillna_cols] = group[fillna_cols].replace(0, group[fillna_cols].mean())
    group[fillna_cols] = group[fillna_cols].fillna(group[fillna_cols].mean())
    return group

In [10]:
new = skills_df.groupby('player_id', as_index=False).apply(fill_with_player_averages)

## Fill zeros and nulls with all average

In [11]:
new[fillna_cols] = new[fillna_cols].fillna(new[fillna_cols].mean())

In [12]:
new[fillna_cols] = new[fillna_cols].replace(0, new[fillna_cols].mean())

In [13]:
new.dropna(inplace=True)

In [14]:
len(new)

6330

In [15]:
len(skills_df)

6330

In [134]:
new.columns

Index(['age', 'g', 'gs', 'targets', 'rec', 'rec_yds', 'rec_yds_per_rec',
       'rec_td', 'rec_first_down', 'rec_long', 'rec_per_g', 'rec_yds_per_g',
       'catch_pct', 'rec_yds_per_tgt', 'rush_att', 'rush_yds', 'rush_td',
       'rush_first_down', 'rush_long', 'rush_yds_per_att', 'rush_yds_per_g',
       'rush_att_per_g', 'touches', 'yds_per_touch', 'yds_from_scrimmage',
       'rush_receive_td', 'fumbles', 'player_id', 'position', 'snaps_played',
       'total_snaps', 'snap_played_percentage', 'season_count'],
      dtype='object')

## Map player_id to position

In [163]:
player_to_position = new[['player_id', 'position']].drop_duplicates()
player_to_position

Unnamed: 0,player_id,position
0,CookJa00,FB
1,McHuSe00,FB
2,SmitTe00,FB
3,FeltJe00,FB
4,HillPe00,FB
...,...,...
5675,WilsQu00,RB
5839,NedxLa00,RB
5915,HenrLe20,RB
6289,AkerCa00,RB


# Aggregate career statistics for each player

In [136]:
career_stats = new.groupby('player_id').sum()

In [137]:
sorted(career_stats.columns)

['age',
 'catch_pct',
 'fumbles',
 'g',
 'gs',
 'rec',
 'rec_first_down',
 'rec_long',
 'rec_per_g',
 'rec_td',
 'rec_yds',
 'rec_yds_per_g',
 'rec_yds_per_rec',
 'rec_yds_per_tgt',
 'rush_att',
 'rush_att_per_g',
 'rush_first_down',
 'rush_long',
 'rush_receive_td',
 'rush_td',
 'rush_yds',
 'rush_yds_per_att',
 'rush_yds_per_g',
 'season_count',
 'snap_played_percentage',
 'snaps_played',
 'targets',
 'total_snaps',
 'touches',
 'yds_from_scrimmage',
 'yds_per_touch']

In [122]:
fields_to_divide_by_season = [
    'rec_yds_per_rec', 'catch_pct', 'rec_yds_per_tgt',
    'rush_yds_per_att', 'rush_att_per_g', 'yds_per_touch'
]

fields_to_divide_by_season.extend([col for col in career_stats.columns if col.endswith('per_g')])

In [123]:
for field in fields_to_divide_by_season:
    career_stats[field] = career_stats[field] / career_stats.season_count

In [124]:
per_game_avg_fields = [
    ('targets_per_g', 'targets'),
    ('touches_per_g', 'touches'),
    ('yds_from_scrimmage_per_g', 'yds_from_scrimmage'),
    ('rush_receive_td_per_g', 'rush_receive_td'),
    ('rec_td_per_g', 'rec_td'),
    ('rush_first_down_per_g', 'rush_first_down'),
    ('fumbles_per_g', 'fumbles'),
    ('rush_td_per_g', 'rush_td'),
    ('rec_first_down_per_g', 'rec_first_down'),
]

In [125]:
for (new_field, metric) in per_game_avg_fields:
    career_stats[new_field] = career_stats[metric] / career_stats.g

In [133]:
sorted(career_stats.columns)

['catch_pct',
 'fumbles_per_g',
 'rec_first_down_per_g',
 'rec_per_g',
 'rec_td_per_g',
 'rec_yds_per_g',
 'rec_yds_per_rec',
 'rec_yds_per_tgt',
 'rush_att_per_g',
 'rush_first_down_per_g',
 'rush_receive_td_per_g',
 'rush_td_per_g',
 'rush_yds_per_att',
 'rush_yds_per_g',
 'season_count',
 'snap_percentage',
 'targets_per_g',
 'touches_per_g',
 'yds_from_scrimmage_per_g',
 'yds_per_touch']

In [127]:
career_stats['snap_percentage'] = career_stats.snaps_played / career_stats.total_snaps

In [128]:
per_game_fields = [col for col in career_stats.columns if col.endswith('per_g')]
not_per_game_fields = [col for col in career_stats.columns if not col.endswith('per_g')]

fields_to_remove = []

for field in not_per_game_fields:
    for field_2 in per_game_fields:
        base_field = re.match('.*(?=_per_g)', field_2)
        if base_field[0] == field:
            fields_to_remove.append(field)
            continue

fields_to_remove.extend([
    'age', 'rec_long', 'rush_long', 'total_snaps', 
    'snap_played_percentage','g', 'gs', 'snaps_played'
])

In [129]:
career_stats = career_stats.drop(fields_to_remove, axis=1)

In [131]:
career_stats.query('player_id == "AdamDa01"')

Unnamed: 0_level_0,rec_yds_per_rec,rec_per_g,rec_yds_per_g,catch_pct,rec_yds_per_tgt,rush_yds_per_att,rush_yds_per_g,rush_att_per_g,yds_per_touch,season_count,targets_per_g,touches_per_g,yds_from_scrimmage_per_g,rush_receive_td_per_g,rec_td_per_g,rush_first_down_per_g,fumbles_per_g,rush_td_per_g,rec_first_down_per_g,snap_percentage
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AdamDa01,12.0875,5.75,70.3125,63.875,7.75,0.0,0.0,0.0,12.0875,8,8.611111,5.592593,68.092593,0.601852,0.601852,0.0,0.046296,0.0,3.37963,0.832297


# Add player Position back to data

In [160]:
temp = career_stats.reset_index()

In [162]:
temp

Unnamed: 0,player_id,age,g,gs,targets,rec,rec_yds,rec_yds_per_rec,rec_td,rec_first_down,...,rush_att_per_g,touches,yds_per_touch,yds_from_scrimmage,rush_receive_td,fumbles,snaps_played,total_snaps,snap_played_percentage,season_count
0,AbbrJa00,78,22,0,25,13,163,35.0,0,7,...,0.0,13,35.0,163,0,1,240.000000,1518.000000,0.460000,3
1,AbduAm00,175,89,22,130,94,675,61.6,6,44,...,34.7,475,44.4,2174,12,9,3136.963341,6139.087095,3.722884,8
2,AdamDa01,204,108,101,930,604,7354,96.7,65,365,...,0.0,604,96.7,7354,65,5,6025.000000,7239.000000,6.680000,8
3,AdamJe01,76,30,3,32,24,214,19.1,1,13,...,0.0,24,19.1,214,1,1,1176.361253,2302.157661,1.396081,3
4,AdamJo01,23,9,0,1,1,7,7.0,0,0,...,0.3,4,5.0,20,0,4,22.000000,550.000000,0.040000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,WynnDe00,102,26,4,24,14,122,27.6,0,6,...,10.2,78,22.2,454,5,1,1960.602088,3836.929434,2.326802,5
1274,WynnMi00,47,4,0,7,4,69,17.3,0,3,...,0.0,4,17.3,69,0,0,784.240835,1534.771774,0.930721,2
1275,YeldT.00,147,60,30,252,185,1448,61.9,7,72,...,42.3,677,35.3,3453,13,7,2352.722506,4604.315321,2.792163,6
1276,YounTi00,45,26,17,142,81,990,24.2,10,53,...,0.3,85,23.8,1021,10,1,1206.000000,1454.000000,1.660000,2


In [161]:
temp['position'] = play_to_position[temp.player_id]

TypeError: unhashable type: 'Series'

# Remove unecessary fields

In [132]:
player_to_seasons_played = career_stats['season_count']

## Groupby position and standardize fields
`[pass_int, pass_sacked, pass_sacked_yds, wins, losses, ties]`
* use pass_int_perc instead of total interceptions
* prefer win percentage to wins/ties/losses

In [59]:
career_groups = career_stats.groupby('')

In [60]:
normalized_qb_df = career_stats_clean.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

## Penalize Negative fields (lower is better)
`[pass_int_perc, fumbles`]

In [61]:
negative_fields = ['pass_int_perc', 'fumbles_per_g']
normalized_qb_df[negative_fields] = normalized_qb_df[negative_fields] * -1

In [62]:
normalized_qb_df.query('player_id == "BradTo00"').to_dict()

{'pass_cmp_perc': {'BradTo00': 0.6407065068179738},
 'pass_td_perc': {'BradTo00': 0.27729477441200695},
 'pass_int_perc': {'BradTo00': -0.08968609865470852},
 'pass_yds_per_att': {'BradTo00': 0.6525754834531259},
 'pass_yds_per_cmp': {'BradTo00': 0.2995286387658906},
 'pass_yds_per_g': {'BradTo00': 0.8787632079299167},
 'comebacks': {'BradTo00': 1.0},
 'gwd': {'BradTo00': 0.9433962264150944},
 'rush_yds_per_att': {'BradTo00': 0.3332126696832579},
 'rush_yds_per_g': {'BradTo00': 0.09672335531225582},
 'rush_att_per_g': {'BradTo00': 0.19047126420712696},
 'win_percentage': {'BradTo00': 0.7679738562091504},
 'pass_att_per_game': {'BradTo00': 0.8953836759587969},
 'fumbles_per_g': {'BradTo00': -0.20846905537459284},
 'pass_td_per_g': {'BradTo00': 0.8101998152559676},
 'rush_td_per_g': {'BradTo00': 0.15728245695672405},
 'qbr_avg': {'BradTo00': 0.9224737039819683},
 'avg_pass_rating_per_season': {'BradTo00': 0.8389031705227078},
 'snap_percentage': {'BradTo00': 0.9735966661221026}}

In [63]:
normalized_qb_df.fillna(0, inplace=True)

In [64]:
len(normalized_qb_df.columns)

19

In [65]:
pca = PCA(n_components=1)

In [66]:
pca_analysis = pd.DataFrame(
    pca.fit_transform(normalized_qb_df), 
    columns=['PCA%i' % i for i in range(1)], 
    index=normalized_qb_df.index)

# Aggregate performance metric with career stats

In [67]:
career_stats_final = pd.merge(player_to_seasons_played, pca_analysis, left_index=True, right_index=True)

In [68]:
career_stats_final['career_score'] = career_stats_final.PCA0
career_stats_final.drop('PCA0', axis=1, inplace=True)

In [35]:
#career_stats_final.to_csv('../../ProData/qb_career_stats_no_career_totals.csv')

OSError: Cannot save file into a non-existent directory: 'ProData'

In [69]:
career_stats_final = pd.read_csv('../../ProData/qb_career_stats_no_career_totals.csv')

# Plot Age vs Feature

In [70]:
top_50 = career_stats_final.sort_values('career_score', axis=0, ascending=False).head(50)

In [72]:
top_50[['player_id', 'career_score', 'season_count']].head(20)

Unnamed: 0,player_id,career_score,season_count
10,BradTo00,1.299568,20
11,BreeDr00,1.258259,20
144,RoetBe00,1.142879,18
95,MahoPa00,1.137151,5
149,RyanMa00,1.107806,14
186,WilsRu00,1.105767,10
143,RodgAa00,1.078353,17
162,StafMa00,1.062311,13
92,LuckAn00,1.048096,6
179,WatsDe00,0.997253,4
