In [86]:
import numpy as np
import sklearn as sk
import scipy as sp
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from sqlite_api import *
from sklearn import linear_model
from math import sqrt

import warnings
warnings.filterwarnings("ignore")

In [91]:
fdb = 'fantasy.db'

player_lbls = ['Player', 'Position', 'Team', 'Season', 'Plays', 'Games', 'RushAttempts', 
              'RushYrds', 'RushTDs', 'PassAttempts', 'Complete', 'PassYrds', 'PassTDs', 'Fumbles', 'Interceptions']

In [92]:
## Scoring metrics

qb = {'PassYrds': 0.04, # QBs get 0.04 points per passing yard 
        'PassTDs': 4, 
        'Interceptions': -2,
        'RushYrds': 0.1,
        'RushTDs': 6,
        'Fumbles': -2}

wr = {'PassYrds': 0.01,
         'PassTDs': 6,
         'Interceptions': -2,
         'RushYrds': 0.1,
         'RushTDs': 6,
         'Fumbles': -2}

rb = wr.copy()

te = wr.copy()

scores = {'QB': qb,
             'RB': rb,
             'WR': wr,
             'TE': te}

# need 2 points for 2point conversions

In [93]:
def score_row(row, qb_scores, wr_scores, rb_scores, te_scores):
    score = 0
    if row['Position'] == 'QB':
        metrics = qb_scores
    elif row['Position'] == 'WR':
        metrics = wr_scores
    elif row['Position'] == 'RB':
        metrics = rb_scores
    elif row['Position'] == 'TE':
        metrics = te_scores
    
    for var in metrics:
        score += row[var] * metrics[var]
        
    return score

In [94]:
plyr = select_to_df(fdb, 'PlayerSeason', player_lbls, where='WHERE Season=2016')  

In [5]:
# create 2016 mock season database
rdb = 'rankings.db'
conn = sqlite3.connect(rdb)
c = conn.cursor()

In [96]:
for pos in ['Quarterbacks', 'RunningBacks', 'WideReceivers', 'TightEnds', 'Kickers']:
    c.execute('''CREATE Table ''' + pos +
                '''(Player VARCHAR(30),
                    Team VARCHAR(3),
                    Position CHARACTER(2),
                    PRIMARY KEY (Player))''')

In [97]:
c.execute('''CREATE Table Defenses
                (Team VARCHAR(3),
                Position CHARACTER(2),
                PRIMARY KEY (Team))''')

<sqlite3.Cursor at 0x21886351f10>

In [102]:
df = select_to_df(fdb, 'PlayerSeason', player_lbls, where='WHERE Season=2016')[['Player', 'Position', 'Team']]
QBs = df[df.Position == 'QB']
for idx in range(len(QBs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO Quarterbacks VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [103]:
RBs = df[df.Position == 'RB']
for idx in range(len(RBs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO RunningBacks VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [104]:
WRs = df[df.Position == 'WR']
for idx in range(len(WRs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO WideReceivers VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [105]:
TEs = df[df.Position == 'TE']
for idx in range(len(TEs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO TightEnds VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [107]:
df = select_to_df(fdb, 'KickerSeason', ['Player', 'Team', 'Position'], 'WHERE Season = 2016')
for idx in range(len(df)):
    c.execute('INSERT INTO Kickers VALUES (?, ?, ?)', df.iloc[idx])

In [108]:
df = select_to_df(fdb, 'DefenseSeason', ['Team', 'Position'], 'WHERE Season = 2016')
for idx in range(len(df)):
    c.execute('INSERT INTO Defenses VALUES (?, ?)', df.iloc[idx])

In [30]:
df = select_to_df(fdb, 'PlayerSeason', player_lbls, 'WHERE Season != 2016')
df['Score'] = ply.apply(score_row, args=(qb, wr, rb, te), axis=1)

## Offensive Features

In [95]:
def add_feature(df, name, func):
    df[name] = df.apply(func, axis=1)
    return df

In [96]:
offense_lbls = ['Team', 
                 'Season', 
                 'Yards',
                 'PassYrds',
                 'RushYrds',
                 'Points']

In [97]:
def pass_rush_rat(row):
    return row['PassYrds'] / row['RushYrds']

def yards_per_game(row):
    return row['Yards'] / 16

def pass_yrds_per_game(row):
    return row['PassYrds'] / 16

def rush_yrds_per_game(row):
    return row['RushYrds'] / 16

def points_per_game(row):
    return row['Points'] / 16

In [98]:
off = select_to_df(fdb, 'OffenseSeason', offense_lbls)

In [99]:
off = add_feature(off, 'OffPassRushRatio', pass_rush_rat)
off = add_feature(off, 'OffYardsPerGame', yards_per_game)
off = add_feature(off, 'OffPassYrdsPerGame', pass_yrds_per_game)
off = add_feature(off, 'OffRushYrdsPerGame', rush_yrds_per_game)
off = add_feature(off, 'OffPointsPerGame', points_per_game)
off = off.drop(['Yards', 'PassYrds', 'RushYrds', 'Points'], axis=1)
off.head()

Unnamed: 0,Team,Season,OffPassRushRatio,OffYardsPerGame,OffPassYrdsPerGame,OffRushYrdsPerGame,OffPointsPerGame
0,NE,2007,2.55868,411.25,295.6875,115.5625,36.8125
1,DAL,2007,2.351088,365.6875,256.5625,109.125,28.4375
2,IND,2007,2.364009,358.6875,252.0625,106.625,28.125
3,JAX,2007,1.391886,357.4375,208.0,149.4375,25.6875
4,SEA,2007,2.448425,348.9375,247.75,101.1875,24.5625


## Additional Player Features

In [100]:
def pass_tdatt_rat(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['PassTDs'] / row['PassAttempts']

def rush_tdatt_rat(row):
    if row['RushAttempts'] == 0:
        return 0
    return row['RushTDs'] / row['RushAttempts']

def pass_rush_att_rat(row):
    if row['RushAttempts'] == 0:
        return 1
    return row['PassAttempts'] / row['RushAttempts']

def complete_perc(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['Complete'] / row['PassAttempts']

def avg_rush_yrds(row):
    if row['RushAttempts'] == 0:
        return 0
    return row['RushYrds'] / row['RushAttempts']

def avg_pass_yrds(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['PassYrds'] / row['PassAttempts']

def avg_plays(row):
    return row['Plays'] / row['Games']

def score_per_play(row):
    return row['Score'] / row['Plays']

def score_per_game(row):
    return row['Score'] / row['Games']

def games_perc(row):
    return row['Games'] / 16

def intercept_rat(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['Interceptions'] / row['PassAttempts']

def pass_att_per_game(row):
    return row['PassAttempts'] / row['Games']

def rush_att_per_game(row):
    return row['RushAttempts'] / row['Games']

def fumb_per_game(row):
    return row['Fumbles'] / row['Games']

In [101]:
df = select_to_df(fdb, 'PlayerSeason', player_lbls)

def add_features(df):
    df['PassTDAttRatio'] = df.apply(pass_tdatt_rat, axis=1)
    df['RushTDAttRatio'] = df.apply(rush_tdatt_rat, axis=1)
    df['AvgPassYrds'] = df.apply(avg_pass_yrds, axis=1)
    df['AvgRushYrds'] = df.apply(avg_rush_yrds, axis=1)
    df['PercentComplete'] = df.apply(complete_perc, axis=1)
    df['PlaysPerGame'] = df.apply(avg_plays, axis=1)
    df['GamesPlayedPercent'] = df.apply(games_perc, axis=1)
    df['InterceptionRatio'] = df.apply(intercept_rat, axis=1)
    df['PassAttPerGame'] = df.apply(pass_att_per_game, axis=1)
    df['RushAttPerGame'] = df.apply(rush_att_per_game, axis=1)
    df['FumblesPerGame'] = df.apply(fumb_per_game, axis=1)
    df['PassRushAttRatio'] = df.apply(pass_rush_att_rat, axis=1)
    
    return df

df = add_features(df)

In [102]:
players = pd.merge(df, off, on=['Team', 'Season'])
players.tail()

Unnamed: 0,Player,Position,Team,Season,Plays,Games,RushAttempts,RushYrds,RushTDs,PassAttempts,...,InterceptionRatio,PassAttPerGame,RushAttPerGame,FumblesPerGame,PassRushAttRatio,OffPassRushRatio,OffYardsPerGame,OffPassYrdsPerGame,OffRushYrdsPerGame,OffPointsPerGame
5224,Danny Vitale,RB,CLE,2016,5.0,9.0,0.0,0.0,0.0,5.0,...,0.0,0.555556,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5225,Gary Barnidge,TE,CLE,2016,82.0,16.0,0.0,0.0,0.0,82.0,...,0.0,5.125,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5226,Seth DeValve,TE,CLE,2016,12.0,12.0,0.0,0.0,0.0,12.0,...,0.0,1.0,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5227,Connor Hamlett,TE,CLE,2016,1.0,3.0,0.0,0.0,0.0,1.0,...,0.0,0.333333,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5228,Randall Telfer,TE,CLE,2016,7.0,14.0,0.0,0.0,0.0,7.0,...,0.0,0.5,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5


## Creating the Models

In [103]:
class Model:
    def __init__(self, feature_list):
        self.features = feature_list
    
    def set_model(self, model):
        self.model = model
        
    def predict(self, feature_dict):
        parameters = list()
        for idx in range(len(self.features)):
            if self.features[idx][2:] in feature_dict:
                parameters.append(feature_dict[self.features[idx][2:]])
            else:
                parameters.append(feature_dict[self.features[idx]])
        
        return self.model.predict(parameters)
        
class PositionModels:
    def __init__(self, position):
        self.models = dict()
        self.position = position
    
    def add_model(self, predict, model):
        self.models[predict] = model
        
    def predict_player(self, feature_dict):
        predictions = dict()
        for cur_predict in self.models:
            if cur_predict[2:] in self.models:
                cur_model = self.models[cur_predict[2:]]
            else:
                cur_model = self.models[cur_predict]
            
            # print(str(cur_predict)+': '+str(cur_model.predict(feature_dict)))
            predictions[cur_predict] = cur_model.predict(feature_dict)
        
        return predictions
        
    def predict_df(self, df):
        def predict_var(row, to_predict):
            #print(len(self.predict_player(row.to_dict())[to_predict]))
            return self.predict_player(row.to_dict())[to_predict][0]
        
        # add interactions
        interacts = set()
        for to_predict in self.models:
            for cur_feat in self.models[to_predict].features:
                if '*' in cur_feat:
                    interacts.add(cur_feat)
        
        for cur_interact in interacts:
            feat1 = cur_interact.split('*')[0][2:]
            feat2 = cur_interact.split('*')[1][2:]
            df[cur_interact] = df.apply(lambda r, f1, f2: r[f1]*r[f2], args=(feat1, feat2), axis=1)
        
        for to_predict in self.models:
            print('Predicting '+to_predict+'...')
            new_col = 'Pred' + to_predict
            df[new_col] = df.apply(predict_var, args=(to_predict,), axis=1)
        
        return df
    
    def create_model(self, df, to_predict, features, game_limit=0, test_cutoff=2014, add_limits=dict(), interacts=list()):
        new_model = create_model(df, self.position, to_predict, features, game_limit=game_limit, 
                                 test_cutoff=test_cutoff, add_limits=add_limits, interacts=interacts)
        self.add_model(to_predict, new_model)

In [104]:
def create_model(df, pos, predict, features, reg_type='linear', game_limit=0, test_cutoff=2016, 
                 add_limits=dict(), interacts=list(), cat_features=list()):
    df = df[df.Position == pos]
    left_vars = ['Player', 'Team', 'Season'] + [predict]
    right_vars = ['Player', 'Team', 'Season'] + features
    model_vars = left_vars.copy()
    orig_vars = model_vars.copy()
    for idx in range(len(features)):
        features[idx] = 'PS' + features[idx]
    
    for cat_var in cat_features:
        dummy_df = pd.get_dummies(df[cat_var])
        for var_name in dummy_df.columns:
            features.append(str(var_name))
    model_vars += features
        
    # build train dataframe
    model_df = pd.DataFrame(columns=model_vars)
    for season in range(2008, test_cutoff):
        prev_season = str(season - 1)
        season = str(season)
        left_df = df[df.GamesPlayedPercent >= (game_limit/16)]
        for limit in add_limits:
            left_df = left_df[left_df[limit] >= add_limits[limit]]
        left_df = left_df[left_vars][df.Season == season]
        right_df = df[right_vars][df.Season == prev_season]
        right_df['Season'] = str(season)
        # add categorical dummy variables
        for cat_var in cat_features:
            dummy_df = pd.get_dummies(df[cat_var])
            right_df = right_df.join(dummy_df)
        
        left_df = pd.merge(left_df, right_df, on=['Player', 'Team', 'Season'])
        left_df.columns = model_vars
        model_df = model_df.append(left_df)
    
    # build interactions
    for cur_inter in interacts:
        feat1 = str(features[cur_inter[0]])
        feat2 = str(features[cur_inter[1]])
        model_df[feat1+'*'+feat2] = model_df.apply(lambda row, feat1, feat2: row[feat1] * row[feat2], 
                                                 args=(feat1, feat2), axis=1)
        features.append(feat1+'*'+feat2)
            
    
    # build test dataframe
    #model_vars = orig_vars
    test_df = pd.DataFrame(columns=model_vars)
    for season in range(test_cutoff, 2017):
        left_df = df[df.GamesPlayedPercent >= (game_limit/16)]
        for limit in add_limits:
            left_df = left_df[left_df[limit] >= add_limits[limit]]
        left_df = left_df[left_vars][df.Season == str(season)]
        right_df = df[right_vars][df.Season == str(season-1)]
        right_df['Season'] = str(season)
        # add categorical dummy variables
        for cat_var in cat_features:
            dummy_df = pd.get_dummies(df[cat_var])
            right_df = right_df.join(dummy_df)

        left_df = pd.merge(left_df, right_df, on=['Player', 'Team', 'Season'])
        left_df.columns = model_vars
        test_df = test_df.append(left_df)
        
    # build interactions
    for cur_inter in interacts:
        feat1 = str(features[cur_inter[0]])
        feat2 = str(features[cur_inter[1]])
        test_df[feat1+'*'+feat2] = test_df.apply(lambda row, feat1, feat2: row[feat1] * row[feat2], 
                                                 args=(feat1, feat2), axis=1)     
    

    if reg_type == 'linear':
        model = linear_model.LinearRegression()
    elif reg_type == 'logistic':
        model = linear_model.LogisticRegression()
    model.fit(model_df[features], model_df[predict])
    print('|=== Predicting '+str(predict)+' ===|\n')
    print('|====== Model Coefficients ======|')
    for idx in range(len(model.coef_)):
        print(str(features[idx])+': '+str(model.coef_[idx]))
    print('Intercept: '+str(model.intercept_))
    print('\n|========== Performance ==========|')
    print('R-Sqr on Seasons '+str(test_cutoff)+' to 2016: '  + str(model.score(model_df[features], model_df[predict])))
    print('Train Data Size: '+str(len(model_df)))
    print('Test Data Size:'+str(len(test_df)))
    model_obj = Model(features)
    model_obj.set_model(model)
    return model_obj

## Quarterbacks

In [105]:
ModelsQB = PositionModels('QB')

In [106]:
AvgPassYrds = create_model(players, 'QB', 'AvgPassYrds', 
                           ['AvgPassYrds', 'PercentComplete', 'OffPassYrdsPerGame'], 
                   game_limit=10, test_cutoff=2014, interacts=[])
ModelsQB.add_model('AvgPassYrds', AvgPassYrds)

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: 0.281325343102
PSPercentComplete: 6.41163920165
PSOffPassYrdsPerGame: 0.000708867372633
Intercept: 1.13018502735

R-Sqr on Seasons 2014 to 2016: 0.52100841563
Train Data Size: 143
Test Data Size:70


In [107]:
PassAttPerGame = create_model(players, 'QB', 'PassAttPerGame', 
                           ['PassAttPerGame', 'AvgPassYrds', 'PassTDAttRatio', 'OffPointsPerGame', 'GamesPlayedPercent'], 
                   game_limit=6, test_cutoff=2014, interacts=[(0,4), (1,3)])
ModelsQB.add_model('PassAttPerGame', PassAttPerGame)

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 0.585682389826
PSAvgPassYrds: 4.66416477994
PSPassTDAttRatio: -19.9779011058
PSOffPointsPerGame: 1.46935242751
PSGamesPlayedPercent: -7.49648014459
PSPassAttPerGame*PSGamesPlayedPercent: 0.230692346385
PSAvgPassYrds*PSOffPointsPerGame: -0.173339137249
Intercept: -23.8767212999

R-Sqr on Seasons 2014 to 2016: 0.595266049834
Train Data Size: 184
Test Data Size:91


In [108]:
PassTDAttRatio = create_model(players, 'QB', 'PassTDAttRatio', 
                           ['PassTDAttRatio', 'AvgPassYrds', 'PercentComplete', 'GamesPlayedPercent', 'RushTDAttRatio'], 
                   game_limit=8, test_cutoff=2014)
ModelsQB.add_model('PassTDAttRatio', PassTDAttRatio)

|=== Predicting PassTDAttRatio ===|

PSPassTDAttRatio: 0.118074081227
PSAvgPassYrds: 0.00221885064841
PSPercentComplete: 0.0325126213073
PSGamesPlayedPercent: 0.0075644469316
PSRushTDAttRatio: 0.0508856621922
Intercept: -0.00569435143606

R-Sqr on Seasons 2014 to 2016: 0.236048860829
Train Data Size: 161
Test Data Size:78


In [109]:
AvgRushYrds = create_model(players, 'QB', 'AvgRushYrds', 
                           ['AvgRushYrds', 'RushAttPerGame', 'PassAttPerGame', 'OffPointsPerGame'], 
                   game_limit=8, test_cutoff=2014)
ModelsQB.add_model('AvgRushYrds', AvgRushYrds)

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.186097495049
PSRushAttPerGame: 0.606931980498
PSPassAttPerGame: -0.0429372088817
PSOffPointsPerGame: -0.0951528762998
Intercept: 4.5258439588

R-Sqr on Seasons 2014 to 2016: 0.36818219927
Train Data Size: 161
Test Data Size:78


In [110]:
RushAttPerGame = create_model(players, 'QB', 'RushAttPerGame', 
                           ['RushAttPerGame', 'RushTDAttRatio', 'PassRushAttRatio', 'GamesPlayedPercent'], 
                   game_limit=6, test_cutoff=2014, interacts=[(0,0)])
ModelsQB.add_model('RushAttPerGame', RushAttPerGame)

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.320401292597
PSRushTDAttRatio: 4.6558133053
PSPassRushAttRatio: -0.0159485482807
PSGamesPlayedPercent: -0.225122925987
PSRushAttPerGame*PSRushAttPerGame: 0.0453150359155
Intercept: 1.53310205211

R-Sqr on Seasons 2014 to 2016: 0.510334569854
Train Data Size: 184
Test Data Size:91


In [111]:
FumblesPerGame = create_model(players, 'QB', 'FumblesPerGame', 
                           ['FumblesPerGame', 'OffPassYrdsPerGame', 'PassAttPerGame'], 
                   game_limit=8, test_cutoff=2014, interacts=[(0,1)])
ModelsQB.add_model('FumblesPerGame', FumblesPerGame)

|=== Predicting FumblesPerGame ===|

PSFumblesPerGame: 0.00225734741466
PSOffPassYrdsPerGame: -0.00064973396875
PSPassAttPerGame: 0.00243968422387
PSFumblesPerGame*PSOffPassYrdsPerGame: 0.000395880115763
Intercept: 0.275483838641

R-Sqr on Seasons 2014 to 2016: 0.0699783290304
Train Data Size: 161
Test Data Size:78


In [112]:
playersQB = players[players.Position == 'QB']

In [113]:
PredictQB = ModelsQB.predict_df(playersQB)
PredictQB[PredictQB.Player == 'Tom Brady'][['Player', 'Team', 'Season', 'AvgPassYrds', 'PredAvgPassYrds']].tail()

Predicting FumblesPerGame...
Predicting AvgPassYrds...
Predicting PassAttPerGame...
Predicting AvgRushYrds...
Predicting RushAttPerGame...
Predicting PassTDAttRatio...


Unnamed: 0,Player,Team,Season,AvgPassYrds,PredAvgPassYrds
2609,Tom Brady,NE,2012,7.577708,7.504745
3312,Tom Brady,NE,2013,6.915605,7.136446
3785,Tom Brady,NE,2014,7.060137,7.408136
4164,Tom Brady,NE,2015,7.644231,7.6145
5012,Tom Brady,NE,2016,8.226852,7.954421


In [114]:
QB2015 = PredictQB[PredictQB.Season == '2015'][['Player', 'Season', 'Team', 'AvgPassYrds', 
            'PredAvgPassYrds', 'PassAttPerGame', 'PredPassAttPerGame']]
QB2016 = PredictQB[PredictQB.Season == '2016'][['Player', 'Season', 'Team', 'AvgPassYrds', 'PassAttPerGame']]
QB2016.columns = ['Player', 'Season', 'Team', '2016AvgPassYrds', '2016PassAttPerGame']
PredictQB2016 = pd.merge(QB2015, QB2016, on=('Player', 'Team'))
PredictQB2016 = PredictQB2016[['Player', 'Team', 'PredAvgPassYrds', '2016AvgPassYrds', 'PredPassAttPerGame', '2016PassAttPerGame']].sort(columns=('2016AvgPassYrds'), ascending=False)
PredictQB2016.head()

Unnamed: 0,Player,Team,PredAvgPassYrds,2016AvgPassYrds,PredPassAttPerGame,2016PassAttPerGame
22,Matt Ryan,ATL,7.677779,9.258427,37.817755,33.375
15,Geno Smith,NYJ,7.206724,9.0,38.422704,4.666667
1,Derek Anderson,CAR,7.251571,8.54717,17.376727,10.6
21,Matt Moore,MIA,11.649222,8.287356,23.003619,17.4
3,Tom Brady,NE,7.6145,8.226852,39.160747,36.0


In [115]:
def predict_total_pass(row):
    return row.PredAvgPassYrds * row.PredPassAttPerGame * 16

def actual_total_pass(row):
    return row['2016AvgPassYrds'] * row['2016PassAttPerGame'] * 16

PredictQB2016['PredPassYrds'] = PredictQB2016.apply(predict_total_pass, axis=1)
PredictQB2016['2016PassYrds'] = PredictQB2016.apply(actual_total_pass, axis=1)
PredictQB2016 = PredictQB2016.sort(columns=('2016PassYrds'), ascending=False)
print('Total Correlation:')
print(PredictQB2016[['Player', 'PredPassYrds', '2016PassYrds']].corr())
print('Top 15 Correlation:')
print(PredictQB2016[['Player', 'PredPassYrds', '2016PassYrds']][:15].corr())
PredictQB2016[['Player', 'PredPassYrds', '2016PassYrds']]

Total Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds       1.00000       0.68197
2016PassYrds       0.68197       1.00000
Top 15 Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.618141
2016PassYrds      0.618141      1.000000


Unnamed: 0,Player,PredPassYrds,2016PassYrds
8,Drew Brees,5185.476151,5208.0
22,Matt Ryan,4645.702003,4944.0
11,Kirk Cousins,4377.06154,4917.0
3,Tom Brady,4771.031841,4738.666667
33,Andrew Luck,4032.291539,4522.666667
6,Carson Palmer,4293.847302,4515.2
16,Aaron Rodgers,3955.949606,4428.0
12,Philip Rivers,4816.15932,4386.0
28,Ben Roethlisberger,5000.604193,4364.571429
10,Matthew Stafford,4442.966268,4327.0


#### Current Metrics

Total: 0.682

Top 15: 0.618

## Running Backs

In [200]:
ModelsRB = PositionModels('RB')

#### Primary Stats

In [201]:
AvgRushYrds = create_model(players, 'RB', 'AvgRushYrds', 
                           ['AvgRushYrds', 'RushTDAttRatio', 'RushAttPerGame'], 
                   game_limit=14, test_cutoff=2014, add_limits={'RushAttPerGame': 8}, interacts=[])
ModelsRB.add_model('AvgRushYrds', AvgRushYrds)

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.233007230324
PSRushTDAttRatio: -1.91252474272
PSRushAttPerGame: -0.0100144505186
Intercept: 3.50947407628

R-Sqr on Seasons 2014 to 2016: 0.112014723196
Train Data Size: 133
Test Data Size:51


In [202]:
RushAttPerGame = create_model(players, 'RB', 'RushAttPerGame', 
                           ['RushAttPerGame', 'AvgRushYrds'], 
                   game_limit=14, test_cutoff=2014)
ModelsRB.add_model('RushAttPerGame', RushAttPerGame)

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.829612058324
PSAvgRushYrds: 0.350216329528
Intercept: 0.0965459334221

R-Sqr on Seasons 2014 to 2016: 0.742479651202
Train Data Size: 329
Test Data Size:149


In [203]:
RushTDAttRatio = create_model(players, 'RB', 'RushTDAttRatio', 
                           ['RushTDAttRatio', 'AvgRushYrds', 'OffRushYrdsPerGame'], 
                   game_limit=14, test_cutoff=2014, interacts=[(0,2)])
ModelsRB.add_model('RushAttPerGame', RushAttPerGame)

|=== Predicting RushTDAttRatio ===|

PSRushTDAttRatio: 3.35812749939
PSAvgRushYrds: -0.00274617361922
PSOffRushYrdsPerGame: 0.000452560830186
PSRushTDAttRatio*PSOffRushYrdsPerGame: -0.0269480824138
Intercept: -0.0172543176205

R-Sqr on Seasons 2014 to 2016: 0.217237107317
Train Data Size: 329
Test Data Size:149


#### Secondary Stats

In [204]:
# create boolean ispasser as addition feature for AvgPassYrds?
# maybe limit train set to only passers and only predict if predicted to be a passer/pass att > certain margin

In [205]:
AvgPassYrds = create_model(players, 'RB', 'AvgPassYrds', 
                           ['AvgPassYrds'], 
                   game_limit=14, test_cutoff=2014, interacts=[(0,0)])
ModelsRB.add_model('AvgPassYrds', AvgPassYrds)

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: 0.0355588811577
PSAvgPassYrds*PSAvgPassYrds: 0.00361711252065
Intercept: 5.18121883081

R-Sqr on Seasons 2014 to 2016: 0.00508318315398
Train Data Size: 329
Test Data Size:149


In [206]:
PassAttPerGame = create_model(players, 'RB', 'PassAttPerGame', 
                           ['PassAttPerGame'], 
                   game_limit=14, test_cutoff=2014, interacts=[(0,0)])
ModelsRB.add_model('PassAttPerGame', PassAttPerGame)

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 1.02159768294
PSPassAttPerGame*PSPassAttPerGame: -0.0568342700929
Intercept: 0.389867015535

R-Sqr on Seasons 2014 to 2016: 0.545003585184
Train Data Size: 329
Test Data Size:149


In [207]:
FumblesPerGame = create_model(players, 'RB', 'FumblesPerGame', 
                           ['FumblesPerGame', 'RushAttPerGame'], 
                   game_limit=14, test_cutoff=2014, interacts=[])
ModelsRB.add_model('FumblesPerGame', FumblesPerGame)

|=== Predicting FumblesPerGame ===|

PSFumblesPerGame: 0.120723512678
PSRushAttPerGame: 0.00465479290049
Intercept: 0.0159142261803

R-Sqr on Seasons 2014 to 2016: 0.274732796335
Train Data Size: 329
Test Data Size:149


In [208]:
def predict_total_rush(row):
    return row.PredAvgRushYrds * row.PredRushAttPerGame * 16

def actual_total_rush(row):
    return row['2016AvgRushYrds'] * row['2016RushAttPerGame'] * 16

PredictRB = ModelsRB.predict_df(players[players.Position == 'RB'])
RB2015 = PredictRB[PredictRB.Season == '2015'][['Player', 'Season', 'Team', 'AvgRushYrds', 
            'PredAvgRushYrds', 'RushAttPerGame', 'PredRushAttPerGame']]
RB2016 = PredictRB[PredictRB.Season == '2016'][['Player', 'Season', 'Team', 'AvgRushYrds', 'RushAttPerGame']]
RB2016.columns = ['Player', 'Season', 'Team', '2016AvgRushYrds', '2016RushAttPerGame']
PredictRB2016 = pd.merge(RB2015, RB2016, on=('Player', 'Team'))
PredictRB2016 = PredictRB2016[['Player', 'Team', 'PredAvgRushYrds', '2016AvgRushYrds', 'PredRushAttPerGame', '2016RushAttPerGame']].sort(columns=('2016AvgRushYrds'), ascending=False)
PredictRB2016['PredRushYrds'] = PredictRB2016.apply(predict_total_rush, axis=1)
PredictRB2016['2016RushYrds'] = PredictRB2016.apply(actual_total_rush, axis=1)
PredictRB2016 = PredictRB2016.sort(columns=('2016RushYrds'), ascending=False)
print('Total Correlation:')
print(PredictRB2016[['Player', 'PredRushYrds', '2016RushYrds']].corr())
print('Top 15 Correlation:')
print(PredictRB2016[['Player', 'PredRushYrds', '2016RushYrds']][:15].corr())
PredictRB2016[['Player', 'PredRushYrds', '2016RushYrds']]

Predicting AvgRushYrds...
Predicting FumblesPerGame...
Predicting AvgPassYrds...
Predicting RushAttPerGame...
Predicting PassAttPerGame...
Total Correlation:
              PredRushYrds  2016RushYrds
PredRushYrds      1.000000      0.699992
2016RushYrds      0.699992      1.000000
Top 15 Correlation:
              PredRushYrds  2016RushYrds
PredRushYrds      1.000000      0.200854
2016RushYrds      0.200854      1.000000


Unnamed: 0,Player,PredRushYrds,2016RushYrds
60,Le'Veon Bell,1232.689505,1690.666667
39,Jay Ajayi,409.840957,1356.800000
50,LeSean McCoy,1088.236749,1351.466667
10,David Johnson,576.611240,1239.000000
28,Melvin Gordon,819.151456,1227.076923
84,Carlos Hyde,1030.047191,1216.000000
4,LeGarrette Blount,893.070565,1161.000000
31,Eddie Lacy,815.577100,1152.000000
41,Devonta Freeman,1080.451754,1079.000000
47,Spencer Ware,549.692464,1052.571429


#### Current Metrics
Total: 0.7

Top 15: 0.19

there is a bigger issue with unpredictable number of games played. most overpredictions
are related to not controlling for games played, but performance is still poor
especially for the top running backs. hopefully will improve with interactions

## Wide Receivers

In [124]:
ModelsWR = PositionModels('WR')

#### Primary Stats

In [125]:
AvgPassYrds = create_model(players, 'WR', 'AvgPassYrds', 
                           ['AvgPassYrds', 'OffPassYrdsPerGame'], 
                   game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[(0,1)])
ModelsWR.add_model('AvgPassYrds', AvgPassYrds)

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: -0.423650868497
PSOffPassYrdsPerGame: -0.0163688784131
PSAvgPassYrds*PSOffPassYrdsPerGame: 0.00252492548065
Intercept: 10.3358299284

R-Sqr on Seasons 2014 to 2016: 0.0888486413396
Train Data Size: 322
Test Data Size:145


In [126]:
PassAttPerGame = create_model(players, 'WR', 'PassAttPerGame', 
                           ['PassAttPerGame'], 
                   game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 0})
ModelsWR.add_model('PassAttPerGame', PassAttPerGame)

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 0.764821757917
Intercept: 1.47599181756

R-Sqr on Seasons 2014 to 2016: 0.613892036713
Train Data Size: 486
Test Data Size:228


In [127]:
PassTDAttRatio = create_model(players, 'WR', 'PassTDAttRatio', 
                           ['PassTDAttRatio', 'OffPassYrdsPerGame', 'OffPointsPerGame'], 
                   game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[(0,0), (0,3), (0,1)])
ModelsWR.add_model('PassTDAttRatio', PassTDAttRatio)

|=== Predicting PassTDAttRatio ===|

PSPassTDAttRatio: -0.303232791479
PSOffPassYrdsPerGame: -5.0709442713e-05
PSOffPointsPerGame: 0.000265962867502
PSPassTDAttRatio*PSPassTDAttRatio: 1.27255018912
PSPassTDAttRatio*PSPassTDAttRatio*PSPassTDAttRatio: -10.9152195032
PSPassTDAttRatio*PSOffPassYrdsPerGame: 0.00202838561932
Intercept: 0.0433359431419

R-Sqr on Seasons 2014 to 2016: 0.0963872329005
Train Data Size: 322
Test Data Size:145


#### Secondary Stats

In [128]:
ModelsWR.create_model(players, 'AvgRushYrds', ['AvgRushYrds', 'RushAttPerGame', 'OffRushYrdsPerGame'], 
                      game_limit=12, test_cutoff=2014, add_limits={})

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.147727772425
PSRushAttPerGame: 3.52344002527
PSOffRushYrdsPerGame: -0.0181129923799
Intercept: 4.03411535313

R-Sqr on Seasons 2014 to 2016: 0.0773486183827
Train Data Size: 486
Test Data Size:228


In [129]:
ModelsWR.create_model(players, 'RushAttPerGame', ['RushAttPerGame'], 
                      game_limit=12, test_cutoff=2014, add_limits={}, interacts=[(0,0)])

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.983191135993
PSRushAttPerGame*PSRushAttPerGame: -0.105245457362
Intercept: 0.00850077169574

R-Sqr on Seasons 2014 to 2016: 0.480727395623
Train Data Size: 486
Test Data Size:228


In [130]:
# static RushTDAttRatio for WRs

In [131]:
PredictWR = ModelsWR.predict_df(players[players.Position == 'WR'])
WR2015 = PredictWR[PredictWR.Season == '2015'][['Player', 'Season', 'Team', 'AvgPassYrds', 
            'PredAvgPassYrds', 'PassAttPerGame', 'PredPassAttPerGame']]
WR2016 = PredictWR[PredictWR.Season == '2016'][['Player', 'Season', 'Team', 'AvgPassYrds', 'PassAttPerGame']]
WR2016.columns = ['Player', 'Season', 'Team', '2016AvgPassYrds', '2016PassAttPerGame']
PredictWR2016 = pd.merge(WR2015, WR2016, on=('Player', 'Team'))
PredictWR2016 = PredictWR2016[['Player', 'Team', 'PredAvgPassYrds', '2016AvgPassYrds', 'PredPassAttPerGame', '2016PassAttPerGame']].sort(columns=('2016AvgPassYrds'), ascending=False)
PredictWR2016['PredPassYrds'] = PredictWR2016.apply(predict_total_pass, axis=1)
PredictWR2016['2016PassYrds'] = PredictWR2016.apply(actual_total_pass, axis=1)
PredictWR2016 = PredictWR2016.sort(columns=('PredPassYrds'), ascending=False)
print('Total Correlation:')
print(PredictWR2016[['Player', 'PredPassYrds', '2016PassYrds']].corr())
print('Top 15 Correlation:')
print(PredictWR2016[['Player', 'PredPassYrds', '2016PassYrds']][:15].corr())
#PredictWR2016[['Player', 'PredPassYrds', '2016PassYrds']]

Predicting AvgRushYrds...
Predicting RushAttPerGame...
Predicting PassAttPerGame...
Predicting AvgPassYrds...
Predicting PassTDAttRatio...
Total Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.739544
2016PassYrds      0.739544      1.000000
Top 15 Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.484234
2016PassYrds      0.484234      1.000000


#### Current Metrics

Total: 0.739

Top 15: 0.48

## Tight Ends

In [216]:
ModelsTE = PositionModels('TE')

#### Primary Stats

In [217]:
ModelsTE.create_model(players, 'AvgPassYrds', ['AvgPassYrds'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[])

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: 0.276619249957
Intercept: 5.42182785496

R-Sqr on Seasons 2014 to 2016: 0.101981319253
Train Data Size: 108
Test Data Size:53


In [218]:
ModelsTE.create_model(players, 'PassAttPerGame', ['PassAttPerGame', 'OffPassYrdsPerGame', 'GamesPlayedPercent'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 2}, interacts=[(0,2)])

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 0.261182571605
PSOffPassYrdsPerGame: 0.00687205792516
PSGamesPlayedPercent: -0.307633412213
PSPassAttPerGame*PSGamesPlayedPercent: 0.29136443834
Intercept: 1.32734612382

R-Sqr on Seasons 2014 to 2016: 0.446777129779
Train Data Size: 178
Test Data Size:80


In [219]:
ModelsTE.create_model(players, 'PassTDAttRatio', ['PassTDAttRatio'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[])

|=== Predicting PassTDAttRatio ===|

PSPassTDAttRatio: 0.21132866975
Intercept: 0.041303701398

R-Sqr on Seasons 2014 to 2016: 0.0761600670352
Train Data Size: 108
Test Data Size:53


In [220]:
ModelsTE.create_model(players, 'AvgRushYrds', ['AvgRushYrds'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[])

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.106086642053
Intercept: 0.353413684091

R-Sqr on Seasons 2014 to 2016: 0.0255345646289
Train Data Size: 108
Test Data Size:53


In [221]:
ModelsTE.create_model(players, 'RushAttPerGame', ['RushAttPerGame'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[])

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.607057579655
Intercept: 0.00724529774755

R-Sqr on Seasons 2014 to 2016: 0.105662979561
Train Data Size: 108
Test Data Size:53


In [222]:
PredictTE = ModelsTE.predict_df(players[players.Position == 'TE'])
TE2015 = PredictTE[PredictTE.Season == '2015'][['Player', 'Season', 'Team', 'AvgPassYrds', 
            'PredAvgPassYrds', 'PassAttPerGame', 'PredPassAttPerGame']]
TE2016 = PredictTE[PredictTE.Season == '2016'][['Player', 'Season', 'Team', 'AvgPassYrds', 'PassAttPerGame']]
TE2016.columns = ['Player', 'Season', 'Team', '2016AvgPassYrds', '2016PassAttPerGame']
PredictTE2016 = pd.merge(TE2015, TE2016, on=('Player', 'Team'))
PredictTE2016 = PredictTE2016[['Player', 'Team', 'PredAvgPassYrds', '2016AvgPassYrds', 'PredPassAttPerGame', '2016PassAttPerGame']].sort(columns=('2016AvgPassYrds'), ascending=False)
PredictTE2016['PredPassYrds'] = PredictTE2016.apply(predict_total_pass, axis=1)
PredictTE2016['2016PassYrds'] = PredictTE2016.apply(actual_total_pass, axis=1)
PredictTE2016 = PredictTE2016.sort(columns=('PredPassYrds'), ascending=False)
print('Total Correlation:')
print(PredictTE2016[['Player', 'PredPassYrds', '2016PassYrds']].corr())
print('Top 15 Correlation:')
print(PredictTE2016[['Player', 'PredPassYrds', '2016PassYrds']][:15].corr())
PredictTE2016[['Player', 'PredPassYrds', '2016PassYrds']]

Predicting AvgRushYrds...
Predicting RushAttPerGame...
Predicting PassAttPerGame...
Predicting AvgPassYrds...
Predicting PassTDAttRatio...
Total Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.763585
2016PassYrds      0.763585      1.000000
Top 15 Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.512718
2016PassYrds      0.512718      1.000000


Unnamed: 0,Player,PredPassYrds,2016PassYrds
2,Rob Gronkowski,947.786632,1080.000000
45,Delanie Walker,895.270610,853.333333
12,Jordan Reed,870.587042,914.666667
0,Greg Olsen,863.328109,1073.000000
56,Gary Barnidge,860.913460,612.000000
39,Zach Ertz,817.186428,932.571429
14,Antonio Gates,795.482197,626.285714
29,Travis Kelce,742.950370,1125.000000
3,Jimmy Graham,718.951890,923.000000
62,Jason Witten,714.505729,673.000000


## Adding Depth Chart Info

In [2]:
conn = sqlite3.connect('fantasy.db')
c = conn.cursor()

In [3]:
c.execute('ALTER TABLE PlayerSeason ADD Rank INT')

OperationalError: duplicate column name: Rank

In [3]:
c.execute('ALTER TABLe PlayerSeason ADD PrevRank INT')

OperationalError: duplicate column name: PrevRank

In [4]:
c.execute('''
    UPDATE PlayerSeason 
    SET Rank = (SELECT Roster.Rank FROM Roster 
        WHERE REPLACE(PlayerSeason.Player, \'.\', \'\') = REPLACE(Roster.Player, \'.\', \'\') 
        COLLATE NOCASE AND PlayerSeason.Season = Roster.Season
        AND PlayerSeason.Team = Roster.Team AND PlayerSeason.Position = Roster.Position)
    ''')

<sqlite3.Cursor at 0x94d9b20>

In [19]:
n = 0
for row in c.execute('SELECT * FROM PlayerSeason WHERE Rank IS NOT NULL ORDER BY Games DESC, Season DESC'):
    print(row)
    n += 1
    if n == 100:
        break

('Joe Banyard', 'MIN', 14, 28, 8, 43, 0, 6, 6, 42, 0, 0, 0, 'RB', '2014', 3, None)
('James Hanna', 'DAL', 11, 18, 0, 0, 0, 11, 8, 86, 0, 0, 0, 'TE', '2012', 3, None)
('Chaz Schilens', 'OAK', 33, 18, 1, -2, 0, 32, 17, 236, 2, 0, 0, 'WR', '2008', 1, None)
('Zach Miller', 'OAK', 86, 18, 0, 0, 0, 86, 64, 902, 1, 0, 0, 'TE', '2008', 1, 1)
('Jerricho Cotchery', 'NYJ', 113, 17, 2, 8, 0, 111, 72, 858, 5, 1, 0, 'WR', '2008', 1, 1)
('Davone Bess', 'MIA', 76, 17, 1, 13, 0, 75, 57, 604, 1, 0, 0, 'WR', '2008', 1, None)
('Thomas Jones', 'NYJ', 356, 17, 314, 1471, 13, 42, 37, 207, 2, 1, 0, 'RB', '2008', 1, 1)
('Ronnie Brown', 'MIA', 276, 17, 230, 1017, 10, 43, 34, 265, 0, 2, 0, 'RB', '2008', 1, 4)
('Leon Washington', 'NYJ', 141, 17, 79, 467, 7, 62, 50, 376, 2, 2, 0, 'RB', '2008', 2, 2)
('Ricky Williams', 'MIA', 212, 17, 173, 708, 4, 39, 31, 225, 1, 2, 0, 'RB', '2008', 2, None)
('Patrick Cobbs', 'MIA', 42, 17, 15, 111, 2, 27, 22, 291, 2, 0, 0, 'RB', '2008', 3, 2)
('Anthony Fasano', 'MIA', 53, 17, 0, 0

In [13]:
for row in c.execute('SELECT * FROM Roster WHERE Player = \'David Johnson\''):
    print(row)

('David Johnson', 'ARI', 'RB', '2015', 2)
('David Johnson', 'ARI', 'RB', '2016', 1)
('David Johnson', 'LAC', 'TE', '2014', 2)
('David Johnson', 'LAC', 'TE', '2015', 1)
('David Johnson', 'PIT', 'TE', '2009', 3)
('David Johnson', 'PIT', 'TE', '2016', 2)


In [17]:
for row in c.execute('SELECT * FROM Roster WHERE Player LIKE \'%Tyreek Hill%\''):
    print(row)

('Tyreek Hill', 'KC', 'WR', '2016', 2)


In [6]:
conn.commit()

In [87]:
conn.close()

#### Rank Difference Feature

Adding a feature of player's rank change since his previous season

In [14]:
sql_input = '''UPDATE PlayerSeason
                SET PrevRank = 
                    (SELECT Rank FROM PlayerSeason p2 
                    WHERE p2.Player = PlayerSeason.Player AND 
                    p2.Team = PlayerSeason.Team AND
                    p2.Position = PlayerSeason.Position AND 
                    CAST(PlayerSeason.Season AS DECIMAL)-1 = CAST(p2.Season AS DECIMAL))'''

c.execute(sql_input)

<sqlite3.Cursor at 0x94d9b20>

In [11]:
conn.commit()

## Predicting Games Played

In [7]:
rank_lbls = player_lbls + ['Rank', 'PrevRank']
rank_df = select_to_df('fantasy.db', 'PlayerSeason', rank_lbls, where='WHERE Season > 2007')
rank_df = add_features(rank_df)

Because the number of fourth and fifth strings are so low, and because I'm planning to treat ranks and rank differences as categorical variables, I'm forcing every fourth and fifth string to be considered a third string.

In [8]:
def rank_fix(row, col):
    if row[col] == 5 or row[col] == 4:
        return 3
    
    if row[col] is None:
        return 'Bench'
    
    return row[col]

rank_df['Rank'] = rank_df.apply(rank_fix, args=('Rank',), axis=1)
rank_df['PrevRank'] = rank_df.apply(rank_fix, args=('PrevRank',), axis=1)

The first plan is to treat the combination of (Rank, PrevRank) as a categorical variable. This is because I wanted the best way of finding an interaction between the two, which is difficult to do in the standard mathematical way as the difference between the two does not include information about the player's current season rank. For example, if I just took the rank difference, then the model would see no difference from a 3rd string going to 2nd string and a 2nd string going to 1st string, which should be different behaviors. It seems above that I have just enough data available to where this might work.

In [9]:
def rank_change(row):
    return '('+str(row['PrevRank']).replace('.0', '')+' -> '+str(row['Rank']).replace('.0', '')+')'

rank_df['RankChange'] = rank_df.apply(rank_change, axis=1)

In [13]:
qb_games_mdl = create_model(rank_df, 'QB', 'GamesPlayedPercent', 
                           ['GamesPlayedPercent'], 
                   game_limit=0, test_cutoff=2013, interacts=[(0,n) for n in range(1,len(rank_df['RankChange'].unique()))], 
             cat_features=['RankChange'])

|=== Predicting GamesPlayedPercent ===|

PSGamesPlayedPercent: 1.5
(1 -> 1): 0.377361898717
(1 -> 2): 0.286483455922
(1 -> 3): 0.00538524705777
(1 -> Bench): 1.16727049296
(2 -> 1): -0.334669162213
(2 -> 2): -0.144074691811
(2 -> 3): 0.0378062072451
(2 -> Bench): -0.103265221326
(3 -> 1): -0.292742947901
(3 -> 2): 0.247862598223
(3 -> 3): -0.170229507041
(3 -> Bench): -0.442183976613
(Bench -> 1): -0.0282767963747
(Bench -> 2): -0.148920451922
(Bench -> 3): -0.187577637882
(Bench -> Bench): -0.270229507041
PSGamesPlayedPercent*(1 -> 1): -1.31956293339
PSGamesPlayedPercent*(1 -> 2): -1.57407407407
PSGamesPlayedPercent*(1 -> 3): -0.327868852459
PSGamesPlayedPercent*(1 -> Bench): -9.0
PSGamesPlayedPercent*(2 -> 1): -0.575862068966
PSGamesPlayedPercent*(2 -> 2): -1.00683090705
PSGamesPlayedPercent*(2 -> 3): -1.59523809524
PSGamesPlayedPercent*(2 -> Bench): -0.471428571429
PSGamesPlayedPercent*(3 -> 1): -0.672043010753
PSGamesPlayedPercent*(3 -> 2): -1.86842105263
PSGamesPlayedPercent*(3 ->

In [14]:
sim_player = {'PSGamesPlayedPercent': 0.8,
                '(1 -> 1)': 0,
                '(1 -> 2)': 0,
                '(1 -> 3)': 0,
                '(1 -> Bench)': 0,
                '(2 -> 1)': 1,
                '(2 -> 2)': 0,
                '(2 -> 3)': 0,
                '(2 -> Bench)': 0,
                '(3 -> 1)': 0,
                '(3 -> 2)': 0, 
                '(3 -> 3)': 0,
                '(3 -> Bench)': 0,
                '(Bench -> 1)': 0,
                '(Bench -> 2)': 0, 
                '(Bench -> 3)': 0, 
                '(Bench -> Bench)': 0,
                 'RankChange': '(2 -> 1)',
                 'Position': 'QB'}

sim_player2 = {'PSGamesPlayedPercent': 0.8,
                  '1': 1,
                  '2': 0, 
                  '3': 0, 
                  'Bench': 0}

for var in sim_player.copy():
    if var != 'PSGamesPlayedPercent':
        sim_player['PSGamesPlayedPercent*'+var] = sim_player['PSGamesPlayedPercent']*sim_player[var]

TypeError: can't multiply sequence by non-int of type 'float'

While R-squared scores increased greatly compared to the old strategy of treating ranks, it is still less than desirable. Not only that, but much of it seems unintuitive. For example, the current model for WR's predict someone going from 2nd string to 1st string to play less than they did the previous season where they were a lower rank. 

The main problem is simply not enough data (as you can see above). By making the categories unique to (Position, Rank, PreviousRank) it reduces the amount of data for many of the categories to be less than 50. Even with this issue, predicting based on this combination still performs much better than just using the Rank and PrevRanks as the categorical variables, so I'm going to stick with this general method.

My next plan is to use the means of each rank (displayed below) to help with this prediction. I want to make my end prediction calculated by some combination of my model's prediction and the means for the ranks. The idea is to weigh my model's prediction heavier when it has more data available to it. So predicting a QB going from 1st string to 1st string (207 data points) will rely more on the model, whereas a QB going from 1st string to 3rd string (12 data points) will rely more on the current rank's mean.

In [15]:
rank_df.groupby(['Position', 'Rank'])['GamesPlayedPercent'].mean()

Position  Rank 
QB        1.0      0.859486
          2.0      0.323123
          3.0      0.287500
          Bench    0.230682
RB        1.0      0.863308
          2.0      0.862284
          3.0      0.701027
          Bench    0.616969
TE        1.0      0.908008
          2.0      0.871336
          3.0      0.762041
          Bench    0.432842
WR        1.0      0.914356
          2.0      0.822476
          3.0      0.657328
          Bench    0.467201
Name: GamesPlayedPercent, dtype: float64

In [16]:
sizes = pd.DataFrame(rank_df.groupby(['Position', 'RankChange']).size())
means = pd.DataFrame(rank_df.groupby(['Position', 'Rank'])['GamesPlayedPercent'].mean())

In [17]:
def get_size(pos, rank_change, groupby):
    return int(groupby.loc[(pos, rank_change)])

In [18]:
def predict_games_played(player, model, sizes, means, pref_games=200):
    model_predict = model.predict(player)
    if model_predict < 0:
        model_predict = 0
    if model_predict > 1.0:
        model_predict = 1
        
    model_prop = sqrt(sizes.loc[(player['Position'], player['RankChange'])]) / sqrt(pref_games)
    if model_prop > 1:
        model_prop = 1
        
    mean_predict = means.loc[(player['Position'], player['Rank'])]
    
    return (model_predict * model_prop) + (mean_predict * (1 - model_prop))

In [471]:
sim_player = {'Position': 'RB',
                'Rank': 2.0,
                'RankChange': '(1 -> 2)',
                'PSGamesPlayedPercent': 0.9,
                '(1 -> 1)': 0,
                '(1 -> 2)': 1,
                '(1 -> 3)': 0,
                '(1 -> Bench)': 0,
                '(2 -> 1)': 0,
                '(2 -> 2)': 0,
                '(2 -> 3)': 0,
                '(2 -> Bench)': 0,
                '(3 -> 1)': 0,
                '(3 -> 2)': 0, 
                '(3 -> 3)': 0,
                '(3 -> Bench)': 0,
                '(Bench -> 1)': 0,
                '(Bench -> 2)': 0, 
                '(Bench -> 3)': 0, 
                '(Bench -> Bench)': 0,
             }

for var in sim_player.copy():
    if var not in ['Position', 'Rank', 'RankChange', 'PSGamesPlayedPercent']:
        sim_player['PSGamesPlayedPercent*'+var] = sim_player['PSGamesPlayedPercent']*sim_player[var]

In [None]:
del rank_df['PrevRank']

In [54]:
def build_season_df(df, interacts=False, categorical=[]):
    primary_key = ['Player', 'Position', 'Team']
    left_vars = df.columns.copy()
    right_vars = ['PS'+str(var) for var in df.columns.copy() if var not in primary_key]
    season_vars = list(left_vars.copy()) + list(right_vars)
    
    season_df = pd.DataFrame(columns=left_vars)
    
    # build base dataframe
    for season in range(2009, 2016):
        prev_season = str(season - 1)
        season = str(season)
        
        left_df = df[left_vars][df.Season == season]
        right_df = df[left_vars][df.Season == prev_season]
        right_df.columns = primary_key + right_vars
        left_df = pd.merge(left_df, right_df, on=primary_key)
        season_df = season_df.append(left_df)
    
    # add categorical dummy variables
    for cur_cat_var in categorical:
        dummy_df = pd.get_dummies(season_df[cur_cat_var])
        season_vars.extend(dummy_df.columns)
        season_df = pd.concat([season_df, dummy_df], axis=1)
    
    # add interaction variables
    numerical_vars = []
    for cur_var in season_vars:
        if cur_var not in categorical and cur_var not in primary_key and cur_var != 'Season':
            numerical_vars.append(cur_var)
    
    def interact_apply(row, numerical_vars, outer_idx, inner_idx):
        return row[numerical_vars[outer_idx]] * row[numerical_vars[inner_idx]]
    
    '''
    print(numerical_vars)
    for outer_idx in range(len(numerical_vars)):
        for inner_idx in range(outer_idx, len(numerical_vars)):
            if numerical_vars[outer_idx] not in categorical and numerical_vars[inner_idx] not in categorical:
                interact_var = numerical_vars[outer_idx]+'*'+numerical_vars[inner_idx]
                if season_df[numerical_vars[inner_idx]] is pd.DataFrame:
                    print(season_df[numerical_vars[inner_idx]].head())
                season_df[interact_var] = season_df.apply(interact_apply, args=(numerical_vars, outer_idx, inner_idx,), axis=1)
                season_vars.append(interact_var)
    '''
       
    return season_df.reindex(columns=season_vars)

In [56]:
season_df = build_season_df(rank_df, interacts=True, 
                            categorical=['RankChange', 'Rank'])
season_df.head()

Unnamed: 0,Player,Position,Team,Season,Plays,Games,RushAttempts,RushYrds,RushTDs,PassAttempts,...,(3 -> 3),(3 -> Bench),(Bench -> 1),(Bench -> 2),(Bench -> 3),(Bench -> Bench),1,2,3,Bench
0,Aaron Rodgers,QB,GB,2009,599.0,16.0,58.0,316.0,5.0,541.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Drew Brees,QB,NO,2009,537.0,15.0,22.0,33.0,2.0,514.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Matt Schaub,QB,HOU,2009,631.0,16.0,48.0,57.0,0.0,583.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Peyton Manning,QB,IND,2009,590.0,16.0,19.0,-13.0,0.0,571.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Tony Romo,QB,DAL,2009,585.0,16.0,35.0,105.0,1.0,550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
def add_interacts(df, interacts):
    def apply_int(row, var1, var2):
        return row[var1] * row[var2]
    
    for tup in interacts:
        df[tup[0]+'*'+tup[1]] = df.apply(apply_int, args=(tup[0], tup[1],), axis=1)
    
    return df

In [70]:
for var in season_df['RankChange'].unique():
    season_df = add_interacts(season_df, [('PSGamesPlayedPercent', var)])

In [72]:
def apply_pred_games(row, games_mdl, sizes, means):
    return predict_games_played(row.to_dict(), games_mdl, sizes, means)

In [78]:
season_df['PredGamesPlayedPercent'] = season_df.apply(apply_pred_games, args=(qb_games_mdl, sizes, means,), axis=1)

In [83]:
season_df[['Player', 'Team', 'Season', 'Position', 'PredGamesPlayedPercent', 
           'GamesPlayedPercent', 'PSGamesPlayedPercent']].tail()

Unnamed: 0,Player,Team,Season,Position,PredGamesPlayedPercent,GamesPlayedPercent,PSGamesPlayedPercent
293,Andrew Quarless,GB,2015,TE,0.292585,0.3125,1.0
294,Cory Harkey,LAR,2015,TE,0.715234,1.0,1.0
295,Matt Spaeth,PIT,2015,TE,0.775544,0.8125,0.9375
296,David Johnson,LAC,2015,TE,0.935743,1.0,0.875
297,Nic Jacobs,JAC,2015,TE,0.517266,0.6875,0.8125


In [88]:
sp.stats.pearsonr(season_df['PredGamesPlayedPercent'], season_df['GamesPlayedPercent'])

(0.71225031010854312, 0.0)

The strategies used seemed to have worked exceptionally well at improving the correlation between the predicted games played and the actual games played. The R-squared has increased from 0.21 to 0.71.

## Final Predictions

In [342]:
GamesPredictQB = season_df[season_df.Position == 'QB']
GamesPredictRB = season_df[season_df.Position == 'RB']
GamesPredictWR = season_df[season_df.Position == 'WR']
GamesPredictTE = season_df[season_df.Position == 'TE']

In [343]:
StatsPredictQB = ModelsQB.predict_df(players[players.Position == 'QB'])
StatsPredictRB = ModelsRB.predict_df(players[players.Position == 'RB'])
StatsPredictWR = ModelsWR.predict_df(players[players.Position == 'WR'])
StatsPredictTE = ModelsTE.predict_df(players[players.Position == 'TE'])

Predicting FumblesPerGame...
Predicting AvgPassYrds...
Predicting PassAttPerGame...
Predicting AvgRushYrds...
Predicting RushAttPerGame...
Predicting PassTDAttRatio...
Predicting AvgRushYrds...
Predicting FumblesPerGame...
Predicting AvgPassYrds...
Predicting RushAttPerGame...
Predicting PassAttPerGame...
Predicting AvgRushYrds...
Predicting RushAttPerGame...
Predicting PassAttPerGame...
Predicting AvgPassYrds...
Predicting PassTDAttRatio...
Predicting AvgRushYrds...
Predicting RushAttPerGame...
Predicting PassAttPerGame...
Predicting AvgPassYrds...
Predicting PassTDAttRatio...


In [344]:
FinalPredictQB = pd.merge(GamesPredictQB, StatsPredictQB, on=['Player', 'Team', 'Position', 'Season'])
FinalPredictRB = pd.merge(GamesPredictRB, StatsPredictRB, on=['Player', 'Team', 'Position', 'Season'])
FinalPredictWR = pd.merge(GamesPredictWR, StatsPredictWR, on=['Player', 'Team', 'Position', 'Season'])
FinalPredictTE = pd.merge(GamesPredictTE, StatsPredictTE, on=['Player', 'Team', 'Position', 'Season'])

In [330]:
def total_pass(row):
    return row['PredAvgPassYrds'] * row['PredPassAttPerGame'] * row['PredGamesPlayedPercent'] * 16

def total_rush(row):
    return row['PredAvgRushYrds'] * row['PredRushAttPerGame'] * row['PredGamesPlayedPercent'] * 16

In [345]:
FinalPredictQB['PredSeasonPassYrds'] = FinalPredictQB.apply(total_pass, axis=1)
FinalPredictQB['PredSeasonRushYrds'] = FinalPredictQB.apply(total_rush, axis=1)
FinalPredictRB['PredSeasonPassYrds'] = FinalPredictRB.apply(total_pass, axis=1)
FinalPredictRB['PredSeasonRushYrds'] = FinalPredictRB.apply(total_rush, axis=1)
FinalPredictWR['PredSeasonPassYrds'] = FinalPredictWR.apply(total_pass, axis=1)
FinalPredictWR['PredSeasonRushYrds'] = FinalPredictWR.apply(total_rush, axis=1)
FinalPredictTE['PredSeasonPassYrds'] = FinalPredictTE.apply(total_pass, axis=1)
FinalPredictTE['PredSeasonRushYrds'] = FinalPredictTE.apply(total_rush, axis=1)

In [300]:
FinalPredictTE[['Player', 'Team', 'Season', 'PredSeasonPassYrds', 'PassYrds_x']][FinalPredictTE.Season == '2015'] \
    .sort('PassYrds_x', ascending=False)

Unnamed: 0,Player,Team,Season,PredSeasonPassYrds,PassYrds_x
369,Rob Gronkowski,NE,2015,803.724160,1176.0
372,Greg Olsen,CAR,2015,741.839258,1104.0
373,Delanie Walker,TEN,2015,759.190512,1088.0
371,Gary Barnidge,CLE,2015,821.240360,1043.0
370,Jordan Reed,WAS,2015,728.934205,960.0
376,Travis Kelce,KC,2015,668.204488,875.0
378,Zach Ertz,PHI,2015,530.306945,853.0
375,Benjamin Watson,NO,2015,750.076189,825.0
380,Jason Witten,DAL,2015,613.959391,713.0
379,Antonio Gates,LAC,2015,310.658447,630.0


Normalize and make predictions more extreme (farther away from the mean than they currently are)
Best multiplication factors so far:

QB: 1.3
RB: 1.2
WR: 1.3
TE: 1.55

In [354]:
sp.stats.pearsonr(FinalPredictQB['PredSeasonPassYrds'], FinalPredictQB['PassYrds_x'])

(0.95448586853139705, 2.0959322762654151e-154)

In [346]:
def extremify_col(df, var, mult_factor):
    mean = df[var].mean()
    def extremify_row(row, var, mean, mult_factor):
        new_val = (row[var] - mean) * mult_factor + mean
        
        if new_val < 0:
            return 0
        return new_val
    
    df[var] = df.apply(extremify_row, args=(var, mean, mult_factor,), axis=1)
    return df

In [348]:
FinalPredictQB = extremify_col(FinalPredictQB, 'PredSeasonPassYrds', 1.3)
FinalPredictRB = extremify_col(FinalPredictRB, 'PredSeasonRushYrds', 1.4)
FinalPredictWR = extremify_col(FinalPredictWR, 'PredSeasonPassYrds', 1.6)
FinalPredictTE = extremify_col(FinalPredictTE, 'PredSeasonPassYrds', 1.55)

In [353]:
FinalPredictQB[['Player', 'Team', 'Season', 'PredSeasonPassYrds', 'PassYrds_x']][FinalPredictQB.Season == '2015'] \
    .sort('PassYrds_x', ascending=False)

Unnamed: 0,Player,Team,Season,PredSeasonPassYrds,PassYrds_x
250,Drew Brees,NO,2015,4546.293063,4870.0
254,Philip Rivers,LAC,2015,4765.723166,4792.0
247,Tom Brady,NE,2015,4715.312972,4770.0
249,Carson Palmer,ARI,2015,4967.776028,4671.0
258,Matt Ryan,ATL,2015,4575.311772,4591.0
251,Eli Manning,NYG,2015,4437.375984,4436.0
252,Matthew Stafford,DET,2015,4348.843383,4262.0
257,Ryan Tannehill,MIA,2015,4018.73007,4208.0
253,Kirk Cousins,WAS,2015,4570.177332,4166.0
248,Russell Wilson,SEA,2015,3879.210463,4024.0
