### Data preparation functions
* Use this to prepare data for use by the models we are applying:
* linear & polynomial regression (keep it to the min explanatory variables) => explanation
* random forest (throw more stuff in)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# To plot matplotlib figures inline on the notebook
%matplotlib inline

from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
from luther_common import season_str_to_season_year

In [5]:
# read in team's seasons
team_seasons_df = pd.read_csv('team_seasons_list.csv', index_col=0)
# add season_year column
team_seasons_df['season_year'] = team_seasons_df['season'].apply(season_str_to_season_year)

In [6]:
# read in the player individual stats
players_df = pd.read_csv('player_list.csv', index_col=0)

In [7]:
# read in the performance of the player for each season
player_seasons_df = pd.read_csv('player_seasons_list.csv', index_col=0)
# add season_year column
player_seasons_df['season_year'] = player_seasons_df['season'].apply(season_str_to_season_year)

In [8]:
# calculate player's individual stats 
# height and weight for the player (seems like those are the useful values)
# rookie year
players_df['height'] = players_df['height'].fillna('6-6')
players_df['weight'] = players_df['weight'].fillna(230)
def height_str_to_height_inches(height_str):
    ft_str, in_str = (height_str.split('-'))
    return float(ft_str) * 12 + float(in_str)
players_df['height_inches'] = players_df['height'].apply(height_str_to_height_inches)
players_df['rookie_yr'] = players_df['year_min']

In [9]:
#general cleanup of player_seasons_df
#get rid of percentage stats (not useful)
player_seasons_df = player_seasons_df.select(lambda x: not re.search('_pct', x), axis=1)
#get rid of other not useful columns
player_seasons_df.drop('lg_id', axis=1, inplace=True)

In [10]:
# FIRST. reduce the data we are working with: only data from analysis_start_year onwards
analysis_start_year = 1985 # 1985 is the max data set (bc of team names, etc.)
player_seasons_df = player_seasons_df[player_seasons_df['season_year'] >= analysis_start_year]

# cleaning player seasons
# 1. if there are multiple rows for the same season, drop any row that is 'TOT'
player_seasons_df = player_seasons_df[(player_seasons_df['team_id'] != 'TOT')]
# 2. dummy variables for pos
player_seasons_df = pd.get_dummies(player_seasons_df, columns=['pos'])
# 3. hack 'categorical' variable for position (1=PG ... 5=C)
player_seasons_df['poscat'] = \
(player_seasons_df['pos_PG'] * 1 + 
player_seasons_df['pos_SG'] * 2 + 
player_seasons_df['pos_SF'] * 3 + 
player_seasons_df['pos_PF'] * 4 + 
player_seasons_df['pos_C'] * 5 ) / \
(player_seasons_df['pos_PG'] * 1 + 
player_seasons_df['pos_SG'] * 1 + 
player_seasons_df['pos_SF'] * 1 + 
player_seasons_df['pos_PF'] * 1 + 
player_seasons_df['pos_C'] * 1 )

# 4. get an efficiency score for the player:
#    (PTS + REB + AST + STL + BLK − ((FGA − FGM) + (FTA − FTM) + TO)) multiply by g to weight it
player_seasons_df['eff_raw'] = (player_seasons_df['pts_per_g'] +\
                            player_seasons_df['trb_per_g'] +\
                            player_seasons_df['ast_per_g'] +\
                            player_seasons_df['stl_per_g'] +\
                            player_seasons_df['blk_per_g'] -\
                           ((player_seasons_df['fga_per_g'] - player_seasons_df['fg_per_g']) +\
                            (player_seasons_df['fta_per_g'] - player_seasons_df['ft_per_g']) +\
                             player_seasons_df['tov_per_g'])) * player_seasons_df['g']

In [11]:
# merge player's team's season into player_seasons_df BEFORE player_seasons_df is separated
temp_team_seasons_df = team_seasons_df.loc[:,['season_year',
                                            'initials',
                                            'pace']]
temp_team_seasons_df.rename(columns={'initials':'team_id',
                                     'pace':'team_pace'}, inplace=True)
player_seasons_df = pd.merge(player_seasons_df, 
                             temp_team_seasons_df, 
                             how='left', 
                             left_on=['season_year','team_id'], 
                             right_on=['season_year','team_id'])

In [12]:
%%time
# merge player's season into the roster - relative strengths of players in the same position
for index, player_season in player_seasons_df.iterrows():
    #get a df of all the people who played in the position on the same team that year and sum their contribution scores
    teammates_df = player_seasons_df[(player_seasons_df['season_year'] == player_season['season_year']) &
                                   (player_seasons_df['team_id'] == player_season['team_id']) &
                                   (player_seasons_df['poscat'] == player_season['poscat'])
                                  ]
    #take contribution score and divide contribution score of position of the team
    denom = sum(teammates_df['eff_raw'])
    num = player_season['eff_raw']
    player_seasons_df.loc[index, 'eff_ratio'] = num/denom

CPU times: user 1min 2s, sys: 451 ms, total: 1min 2s
Wall time: 1min 3s


In [13]:
# save the interim player_seasons (it's useful..)
player_seasons_df.to_csv('player_seasons_list_processed.csv')

In [None]:
%%time
# 5. replace the season rows with weighted averages where relevant
# Define lambda functions to:
# compute the weighted average (by g)
# keep the value of non-averaged rows
mean_wt_by_g = lambda x: np.average(x, weights=player_seasons_df.loc[x.index, 'g'])
keep = lambda x: x.iloc[0]
# Define a dictionary with the functions to apply for each column:
f = {
'age':keep,
'ast_per_g':mean_wt_by_g,
'blk_per_g':mean_wt_by_g,
'canonical':keep,
'drb_per_g':mean_wt_by_g,
'eff_raw':sum,
'eff_ratio':sum,
'fg2_per_g':mean_wt_by_g,
'fg2a_per_g':mean_wt_by_g,
'fg3_per_g':mean_wt_by_g,
'fg3a_per_g':mean_wt_by_g,
'fg_per_g':mean_wt_by_g,
'fga_per_g':mean_wt_by_g,
'ft_per_g':mean_wt_by_g,
'fta_per_g':mean_wt_by_g,
'g':np.sum,
'mp_per_g':mean_wt_by_g,
'name':keep,
'orb_per_g':mean_wt_by_g,
'pf_per_g':mean_wt_by_g,
'pts_per_g':mean_wt_by_g,
'season':keep,
'stl_per_g':mean_wt_by_g,
'team_id':keep,
'tov_per_g':mean_wt_by_g,
'trb_per_g':mean_wt_by_g,
'season_year':keep,
'pos_C':mean_wt_by_g,
'pos_PF':mean_wt_by_g,
'pos_PG':mean_wt_by_g,
'pos_SF':mean_wt_by_g,
'pos_SG':mean_wt_by_g,
'poscat':mean_wt_by_g,
'team_pace':mean_wt_by_g
}
# Groupby and aggregate with the dictionary:
final_player_seasons_df = player_seasons_df.groupby(['season_year', 'canonical']).agg(f)

In [None]:
df = final_player_seasons_df

In [None]:
%%time
# process: iterate through each row in the dataset, 
# if the previous num_yrs_back years stats exist:
# append prev num_yrs_back year stats to X (break multiple rows into cols)
# append ppg to y
num_yrs_back = 3
X_df = pd.DataFrame()
y_df = pd.DataFrame()
# for each row, count it as a row if we can get something from the year before that row
for _ , row in df.iterrows():
    temp_df = df[(df['season_year'] >= row['season_year']-num_yrs_back) &
                 (df['season_year'] <= row['season_year']) &
                 (df['canonical'] == row['canonical'])]
    if temp_df.shape[0] == num_yrs_back + 1:
        year_row_wide = pd.DataFrame()
        years_ago = num_yrs_back
        for _ , year_row in temp_df.iterrows():
            #need to go across the rows and append with 'years_ago'
            for col_name , year_row_item in year_row.iteritems():
                year_row_wide[col_name + '_{}_ya'.format(years_ago)] = [year_row_item]
            years_ago -= 1 #not used yet, put into column name
        X_df = X_df.append(year_row_wide)
        y_df = y_df.append(row)
X_df.reset_index(inplace=True, drop=True)
y_df.reset_index(inplace=True, drop=True)

In [None]:
# AFTER player_seasons_df is separated, merge player's individual stats into X_df 
temp_players_df = players_df.loc[:,['canonical','height_inches','weight','rookie_yr']]
X_df = pd.merge(X_df, temp_players_df, how='left', left_on='canonical_0_ya', right_on='canonical')


In [None]:
# Calculate number of years in league and drop rookie year
for years_ago in range(0,4):
    suffix = '_{}_ya'.format(years_ago)
    X_df['yrs_in_league'+suffix] = X_df['season_year'+suffix] - X_df['rookie_yr']
X_df.drop('rookie_yr', axis=1, inplace=True)

In [None]:
#check that there are no nulls
X_df[X_df.isnull().any(axis=1)]

In [None]:
#X_df.shape
#check nulls
#X_df[X_df.isnull().any(axis=1)]
#fill nans?
#X_df.info()

In [None]:
#X_df.tail(2000)

In [None]:
# our columns got messed up in all that copying and pasting...
X_df = X_df.reindex_axis(sorted(X_df.columns), axis=1)
y_df = y_df.reindex_axis(sorted(y_df.columns), axis=1)

In [None]:
####### WE HAVE OUR DATA AND TARGET SO WRITE IT TO FILE #######
X_df.to_csv('LEBRON_data.csv')
y_df.to_csv('LEBRON_target.csv')