In [80]:
import pandas as pd
import numpy as np
import re
from sklearn.decomposition import PCA

In [3]:
skill_players = pd.read_csv('../../ProData/csvs/skill_player_stats.csv', header=0)

In [5]:
drop_columns = ['Unnamed: 0', 'team', 'av']

In [6]:
skills_df = skill_players.drop(drop_columns, axis=1)

In [20]:
# Add column with value 1 so that when we group by player 
# and sum we get a count of the number of seasons
skills_df['season_count'] = 1

# Fill nulls and zeros

## Fillna for each player with the player's average

In [8]:
fillna_cols = ['snap_played_percentage', 'snaps_played', 'total_snaps']

In [9]:
def fill_with_player_averages(group):
    group[fillna_cols] = group[fillna_cols].replace(0, group[fillna_cols].mean())
    group[fillna_cols] = group[fillna_cols].fillna(group[fillna_cols].mean())
    return group

In [10]:
new = skills_df.groupby('player_id', as_index=False).apply(fill_with_player_averages)

## Fill zeros and nulls with all average

In [11]:
new[fillna_cols] = new[fillna_cols].fillna(new[fillna_cols].mean())

In [12]:
new[fillna_cols] = new[fillna_cols].replace(0, new[fillna_cols].mean())

In [13]:
new.dropna(inplace=True)

In [14]:
len(new)

6330

In [15]:
len(skills_df)

6330

In [134]:
new.columns

Index(['age', 'g', 'gs', 'targets', 'rec', 'rec_yds', 'rec_yds_per_rec',
       'rec_td', 'rec_first_down', 'rec_long', 'rec_per_g', 'rec_yds_per_g',
       'catch_pct', 'rec_yds_per_tgt', 'rush_att', 'rush_yds', 'rush_td',
       'rush_first_down', 'rush_long', 'rush_yds_per_att', 'rush_yds_per_g',
       'rush_att_per_g', 'touches', 'yds_per_touch', 'yds_from_scrimmage',
       'rush_receive_td', 'fumbles', 'player_id', 'position', 'snaps_played',
       'total_snaps', 'snap_played_percentage', 'season_count'],
      dtype='object')

## Map player_id to position

In [200]:
player_to_position = new[['player_id', 'position']].drop_duplicates().set_index('player_id')

# Aggregate career statistics for each player

In [179]:
career_stats = new.groupby('player_id').sum()

In [181]:
fields_to_divide_by_season = [
    'rec_yds_per_rec', 'catch_pct', 'rec_yds_per_tgt',
    'rush_yds_per_att', 'rush_att_per_g', 'yds_per_touch'
]

fields_to_divide_by_season.extend([col for col in career_stats.columns if col.endswith('per_g')])

In [182]:
for field in fields_to_divide_by_season:
    career_stats[field] = career_stats[field] / career_stats.season_count

In [183]:
per_game_avg_fields = [
    ('targets_per_g', 'targets'),
    ('touches_per_g', 'touches'),
    ('yds_from_scrimmage_per_g', 'yds_from_scrimmage'),
    ('rush_receive_td_per_g', 'rush_receive_td'),
    ('rec_td_per_g', 'rec_td'),
    ('rush_first_down_per_g', 'rush_first_down'),
    ('fumbles_per_g', 'fumbles'),
    ('rush_td_per_g', 'rush_td'),
    ('rec_first_down_per_g', 'rec_first_down'),
]

In [184]:
for (new_field, metric) in per_game_avg_fields:
    career_stats[new_field] = career_stats[metric] / career_stats.g

In [186]:
career_stats['snap_percentage'] = career_stats.snaps_played / career_stats.total_snaps

In [187]:
per_game_fields = [col for col in career_stats.columns if col.endswith('per_g')]
not_per_game_fields = [col for col in career_stats.columns if not col.endswith('per_g')]

fields_to_remove = []

for field in not_per_game_fields:
    for field_2 in per_game_fields:
        base_field = re.match('.*(?=_per_g)', field_2)
        if base_field[0] == field:
            fields_to_remove.append(field)
            continue

fields_to_remove.extend([
    'age', 'rec_long', 'rush_long', 'total_snaps', 
    'snap_played_percentage','g', 'gs', 'snaps_played'
])

In [188]:
career_stats = career_stats.drop(fields_to_remove, axis=1)

In [189]:
career_stats.query('player_id == "AdamDa01"')

Unnamed: 0_level_0,rec_yds_per_rec,rec_per_g,rec_yds_per_g,catch_pct,rec_yds_per_tgt,rush_yds_per_att,rush_yds_per_g,rush_att_per_g,yds_per_touch,season_count,targets_per_g,touches_per_g,yds_from_scrimmage_per_g,rush_receive_td_per_g,rec_td_per_g,rush_first_down_per_g,fumbles_per_g,rush_td_per_g,rec_first_down_per_g,snap_percentage
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AdamDa01,12.0875,5.75,70.3125,63.875,7.75,0.0,0.0,0.0,12.0875,8,8.611111,5.592593,68.092593,0.601852,0.601852,0.0,0.046296,0.0,3.37963,0.832297


# Add player Position back to data

In [190]:
career_stats= career_stats.join(player_to_position)

# Remove unecessary fields

In [201]:
player_to_seasons_played = career_stats['season_count']

## Groupby position and standardize fields

In [196]:
career_groups = career_stats.groupby('position')

In [197]:
position_group_std_data = {}

for group_name, group_dataframe in career_groups:
    group_dataframe.drop('position', axis=1, inplace=True)
    data = group_dataframe.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    data['fumbles_per_g'] = data['fumbles_per_g'] * -1
    
    position_group_std_data[group_name] = data

In [198]:
len(position_group_std_data['WR'].columns)

20

# Perform PCA on each group

In [202]:
def perform_position_pca(position_df):
    pca = PCA(n_components=1)
    pca_analysis = pd.DataFrame(
    pca.fit_transform(position_df), 
    columns=['PCA%i' % i for i in range(1)], 
    index=position_df.index)
    
    career_stats_final = pd.merge(player_to_seasons_played, pca_analysis, left_index=True, right_index=True)
    career_stats_final['career_score'] = career_stats_final.PCA0
    career_stats_final.drop('PCA0', axis=1, inplace=True)
    
#     career_stats_final.to_csv(f'../../ProData/{}_career_stats_no_career_totals.csv')
    return career_stats_final.sort_values('career_score', axis=0, ascending=False)

In [207]:
for pos, df in position_group_std_data.items():
    pca_df = perform_position_pca(df)
    pca_df.to_csv(f'../../ProData/{pos}_career_stats_scored.csv')

# Aggregate performance metric with career stats

In [208]:
rb_scored = pd.read_csv('../../ProData/RB_career_stats_scored.csv')
wr_scored = pd.read_csv('../../ProData/WR_career_stats_scored.csv')
te_scored = pd.read_csv('../../ProData/TE_career_stats_scored.csv')

# Plot Age vs Feature

In [212]:
top_50_wr = wr_scored.sort_values('career_score', axis=0, ascending=False).head(50)
top_50_wr.head(20)

Unnamed: 0,player_id,season_count,career_score
53,BrowAn04,12,1.754575
285,JoneJu02,11,1.735662
269,JohnCa00,9,1.716101
509,ThomMi05,5,1.650334
29,BeckOd00,8,1.609603
239,HopkDe00,9,1.59044
86,ChasJa00,1,1.536993
233,HillTy00,6,1.529592
7,AlleKe00,9,1.52825
155,EvanMi00,8,1.506239


In [215]:
top_50_rb = rb_scored.sort_values('career_score', axis=0, ascending=False).head(50)
top_50_rb.head(40)

Unnamed: 0,player_id,season_count,career_score
231,McCaCh01,5,1.854627
95,ElliEz00,6,1.770068
351,TomlLa00,11,1.767135
204,KamaAl00,5,1.718884
68,CookDa01,5,1.670815
140,HarrNa00,1,1.653554
76,DaviDo01,3,1.625565
342,TaylJo02,2,1.599314
135,GurlTo01,6,1.475
287,PortCl00,9,1.449085


In [214]:
top_50_te = te_scored.sort_values('career_score', axis=0, ascending=False).head(50)
top_50_te.head(20)

Unnamed: 0,player_id,season_count,career_score
124,KelcTr00,9,1.989751
93,GronRo00,11,1.98356
129,KittGe00,5,1.732385
102,HernAa00,3,1.578132
179,PittKy00,1,1.487152
85,GrahJi00,12,1.432702
186,ReedJo02,7,1.429264
5,AndrMa00,4,1.426805
57,ErtzZa00,10,1.413148
263,WittJa00,17,1.412564
