In [4]:
import numpy as np
import sklearn as sk
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from sqlite_api import *
from sklearn import linear_model

import warnings
warnings.filterwarnings("ignore")

In [8]:
fdb = 'fantasy.db'

player_lbls = ['Player', 'Position', 'Team', 'Season', 'Plays', 'Games', 'RushAttempts', 
              'RushYrds', 'RushTDs', 'PassAttempts', 'Complete', 'PassYrds', 'PassTDs', 'Fumbles', 'Interceptions']

In [3]:
## Scoring metrics

qb = {'PassYrds': 0.04, # QBs get 0.04 points per passing yard 
        'PassTDs': 4, 
        'Interceptions': -2,
        'RushYrds': 0.1,
        'RushTDs': 6,
        'Fumbles': -2}

wr = {'PassYrds': 0.01,
         'PassTDs': 6,
         'Interceptions': -2,
         'RushYrds': 0.1,
         'RushTDs': 6,
         'Fumbles': -2}

rb = wr.copy()

te = wr.copy()

scores = {'QB': qb,
             'RB': rb,
             'WR': wr,
             'TE': te}

# need 2 points for 2point conversions

In [4]:
def score_row(row, qb_scores, wr_scores, rb_scores, te_scores):
    score = 0
    if row['Position'] == 'QB':
        metrics = qb_scores
    elif row['Position'] == 'WR':
        metrics = wr_scores
    elif row['Position'] == 'RB':
        metrics = rb_scores
    elif row['Position'] == 'TE':
        metrics = te_scores
    
    for var in metrics:
        score += row[var] * metrics[var]
        
    return score

In [12]:
select_to_df(fdb, 'OffenseSeason', ['Team', 'Season', 'Yards']).tail()

Unnamed: 0,Team,Season,Yards
91,SEA,2014,6414.0
92,SF,2014,5771.0
93,TB,2014,5215.0
94,TEN,2014,5159.0
95,WAS,2014,6177.0


In [5]:
plyr = select_to_df(fdb, 'PlayerSeason', player_lbls, where='WHERE Season=2016')  

In [5]:
# create 2016 mock season database
rdb = 'rankings.db'
conn = sqlite3.connect(rdb)
c = conn.cursor()

In [95]:
c.execute('DROP TABLE Kickers')

OperationalError: no such table: Kickers

In [96]:
for pos in ['Quarterbacks', 'RunningBacks', 'WideReceivers', 'TightEnds', 'Kickers']:
    c.execute('''CREATE Table ''' + pos +
                '''(Player VARCHAR(30),
                    Team VARCHAR(3),
                    Position CHARACTER(2),
                    PRIMARY KEY (Player))''')

In [97]:
c.execute('''CREATE Table Defenses
                (Team VARCHAR(3),
                Position CHARACTER(2),
                PRIMARY KEY (Team))''')

<sqlite3.Cursor at 0x21886351f10>

In [102]:
df = select_to_df(fdb, 'PlayerSeason', player_lbls, where='WHERE Season=2016')[['Player', 'Position', 'Team']]
QBs = df[df.Position == 'QB']
for idx in range(len(QBs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO Quarterbacks VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [103]:
RBs = df[df.Position == 'RB']
for idx in range(len(RBs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO RunningBacks VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [104]:
WRs = df[df.Position == 'WR']
for idx in range(len(WRs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO WideReceivers VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [105]:
TEs = df[df.Position == 'TE']
for idx in range(len(TEs)):
    row = df.iloc[idx]
    c.execute('INSERT INTO TightEnds VALUES (?, ?, ?)', (row[0], row[2], row[1]))

In [107]:
df = select_to_df(fdb, 'KickerSeason', ['Player', 'Team', 'Position'], 'WHERE Season = 2016')
for idx in range(len(df)):
    c.execute('INSERT INTO Kickers VALUES (?, ?, ?)', df.iloc[idx])

In [108]:
df = select_to_df(fdb, 'DefenseSeason', ['Team', 'Position'], 'WHERE Season = 2016')
for idx in range(len(df)):
    c.execute('INSERT INTO Defenses VALUES (?, ?)', df.iloc[idx])

In [30]:
df = select_to_df(fdb, 'PlayerSeason', player_lbls, 'WHERE Season != 2016')
df['Score'] = ply.apply(score_row, args=(qb, wr, rb, te), axis=1)

## Offensive Features

In [7]:
def add_feature(df, name, func):
    df[name] = df.apply(func, axis=1)
    return df

In [8]:
offense_lbls = ['Team', 
                 'Season', 
                 'Yards',
                 'PassYrds',
                 'RushYrds',
                 'Points']

In [9]:
def pass_rush_rat(row):
    return row['PassYrds'] / row['RushYrds']

def yards_per_game(row):
    return row['Yards'] / 16

def pass_yrds_per_game(row):
    return row['PassYrds'] / 16

def rush_yrds_per_game(row):
    return row['RushYrds'] / 16

def points_per_game(row):
    return row['Points'] / 16

In [10]:
off = select_to_df(fdb, 'OffenseSeason', offense_lbls)

In [11]:
off = add_feature(off, 'OffPassRushRatio', pass_rush_rat)
off = add_feature(off, 'OffYardsPerGame', yards_per_game)
off = add_feature(off, 'OffPassYrdsPerGame', pass_yrds_per_game)
off = add_feature(off, 'OffRushYrdsPerGame', rush_yrds_per_game)
off = add_feature(off, 'OffPointsPerGame', points_per_game)
off = off.drop(['Yards', 'PassYrds', 'RushYrds', 'Points'], axis=1)
off.head()

Unnamed: 0,Team,Season,OffPassRushRatio,OffYardsPerGame,OffPassYrdsPerGame,OffRushYrdsPerGame,OffPointsPerGame
0,NE,2007,2.55868,411.25,295.6875,115.5625,36.8125
1,DAL,2007,2.351088,365.6875,256.5625,109.125,28.4375
2,IND,2007,2.364009,358.6875,252.0625,106.625,28.125
3,JAX,2007,1.391886,357.4375,208.0,149.4375,25.6875
4,SEA,2007,2.448425,348.9375,247.75,101.1875,24.5625


## Additional Player Features

In [12]:
def pass_tdatt_rat(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['PassTDs'] / row['PassAttempts']

def rush_tdatt_rat(row):
    if row['RushAttempts'] == 0:
        return 0
    return row['RushTDs'] / row['RushAttempts']

def pass_rush_att_rat(row):
    if row['RushAttempts'] == 0:
        return 1
    return row['PassAttempts'] / row['RushAttempts']

def complete_perc(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['Complete'] / row['PassAttempts']

def avg_rush_yrds(row):
    if row['RushAttempts'] == 0:
        return 0
    return row['RushYrds'] / row['RushAttempts']

def avg_pass_yrds(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['PassYrds'] / row['PassAttempts']

def avg_plays(row):
    return row['Plays'] / row['Games']

def score_per_play(row):
    return row['Score'] / row['Plays']

def score_per_game(row):
    return row['Score'] / row['Games']

def games_perc(row):
    return row['Games'] / 16

def intercept_rat(row):
    if row['PassAttempts'] == 0:
        return 0
    return row['Interceptions'] / row['PassAttempts']

def pass_att_per_game(row):
    return row['PassAttempts'] / row['Games']

def rush_att_per_game(row):
    return row['RushAttempts'] / row['Games']

def fumb_per_game(row):
    return row['Fumbles'] / row['Games']

In [13]:
df = select_to_df(fdb, 'PlayerSeason', player_lbls)
df['PassTDAttRatio'] = df.apply(pass_tdatt_rat, axis=1)
df['RushTDAttRatio'] = df.apply(rush_tdatt_rat, axis=1)
df['AvgPassYrds'] = df.apply(avg_pass_yrds, axis=1)
df['AvgRushYrds'] = df.apply(avg_rush_yrds, axis=1)
df['PercentComplete'] = df.apply(complete_perc, axis=1)
df['PlaysPerGame'] = df.apply(avg_plays, axis=1)
df['GamesPlayedPercent'] = df.apply(games_perc, axis=1)
df['InterceptionRatio'] = df.apply(intercept_rat, axis=1)
df['PassAttPerGame'] = df.apply(pass_att_per_game, axis=1)
df['RushAttPerGame'] = df.apply(rush_att_per_game, axis=1)
df['FumblesPerGame'] = df.apply(fumb_per_game, axis=1)
df['PassRushAttRatio'] = df.apply(pass_rush_att_rat, axis=1)
#df['ScorePerPlay'] = df.apply(score_per_play, axis=1)
#df['ScorePerGame'] = df.apply(score_per_game, axis=1)

In [14]:
players = pd.merge(df, off, on=['Team', 'Season'])
players.tail()

Unnamed: 0,Player,Position,Team,Season,Plays,Games,RushAttempts,RushYrds,RushTDs,PassAttempts,...,InterceptionRatio,PassAttPerGame,RushAttPerGame,FumblesPerGame,PassRushAttRatio,OffPassRushRatio,OffYardsPerGame,OffPassYrdsPerGame,OffRushYrdsPerGame,OffPointsPerGame
5224,Danny Vitale,RB,CLE,2016,5.0,9.0,0.0,0.0,0.0,5.0,...,0.0,0.555556,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5225,Gary Barnidge,TE,CLE,2016,82.0,16.0,0.0,0.0,0.0,82.0,...,0.0,5.125,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5226,Seth DeValve,TE,CLE,2016,12.0,12.0,0.0,0.0,0.0,12.0,...,0.0,1.0,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5227,Connor Hamlett,TE,CLE,2016,1.0,3.0,0.0,0.0,0.0,1.0,...,0.0,0.333333,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5
5228,Randall Telfer,TE,CLE,2016,7.0,14.0,0.0,0.0,0.0,7.0,...,0.0,0.5,0.0,0.0,1.0,1.906542,311.0,204.0,107.0,16.5


## Creating the Models

In [15]:
class Model:
    def __init__(self, feature_list):
        self.features = feature_list
    
    def set_model(self, model):
        self.model = model
        
    def predict(self, feature_dict):
        parameters = list()
        for idx in range(len(self.features)):
            if self.features[idx][2:] in feature_dict:
                parameters.append(feature_dict[self.features[idx][2:]])
            else:
                parameters.append(feature_dict[self.features[idx]])
        
        return self.model.predict(parameters)
        
class PositionModels:
    def __init__(self, position):
        self.models = dict()
        self.position = position
    
    def add_model(self, predict, model):
        self.models[predict] = model
        
    def predict_player(self, feature_dict):
        predictions = dict()
        for cur_predict in self.models:
            if cur_predict[2:] in self.models:
                cur_model = self.models[cur_predict[2:]]
            else:
                cur_model = self.models[cur_predict]
            
            # print(str(cur_predict)+': '+str(cur_model.predict(feature_dict)))
            predictions[cur_predict] = cur_model.predict(feature_dict)
        
        return predictions
        
    def predict_df(self, df):
        def predict_var(row, to_predict):
            #print(len(self.predict_player(row.to_dict())[to_predict]))
            return self.predict_player(row.to_dict())[to_predict][0]
        
        # add interactions
        interacts = set()
        for to_predict in self.models:
            for cur_feat in self.models[to_predict].features:
                if '*' in cur_feat:
                    interacts.add(cur_feat)
        
        for cur_interact in interacts:
            feat1 = cur_interact.split('*')[0][2:]
            feat2 = cur_interact.split('*')[1][2:]
            df[cur_interact] = df.apply(lambda r, f1, f2: r[f1]*r[f2], args=(feat1, feat2), axis=1)
        
        for to_predict in self.models:
            print('Predicting '+to_predict+'...')
            new_col = 'Pred' + to_predict
            df[new_col] = df.apply(predict_var, args=(to_predict,), axis=1)
        
        return df
    
    def create_model(self, df, to_predict, features, game_limit=0, test_cutoff=2014, add_limits=dict(), interacts=list()):
        new_model = create_model(df, self.position, to_predict, features, game_limit=game_limit, 
                                 test_cutoff=test_cutoff, add_limits=add_limits, interacts=interacts)
        self.add_model(to_predict, new_model)

In [16]:
# todo: enable interactions
# get rid of zeros
# maybe get dont train on low games played
# maybe binary predict upward trend/downward trend

def create_model(df, pos, predict, features, reg_type='linear', game_limit=0, test_cutoff=2016, 
                 add_limits=dict(), interacts=list()):
    df = df[df.Position == pos]
    left_vars = ['Player', 'Team', 'Season'] + [predict]
    right_vars = ['Player', 'Team', 'Season'] + features
    model_vars = left_vars.copy()
    orig_vars = model_vars.copy()
    for idx in range(len(features)):
        features[idx] = 'PS' + features[idx]
    model_vars += features
        
    # build train dataframe
    model_df = pd.DataFrame(columns=model_vars)
    for season in range(2008, test_cutoff):
        prev_season = str(season - 1)
        season = str(season)
        left_df = df[df.GamesPlayedPercent >= (game_limit/16)]
        for limit in add_limits:
            left_df = left_df[left_df[limit] >= add_limits[limit]]
        left_df = left_df[left_vars][df.Season == season]
        right_df = df[right_vars][df.Season == prev_season]
        right_df['Season'] = str(season)
        left_df = pd.merge(left_df, right_df, on=['Player', 'Team', 'Season'])
        left_df.columns = model_vars
        model_df = model_df.append(left_df)
    
    # build interactions
    for cur_inter in interacts:
        feat1 = features[cur_inter[0]]
        feat2 = features[cur_inter[1]]
        model_df[feat1+'*'+feat2] = model_df.apply(lambda row, feat1, feat2: row[feat1] * row[feat2], 
                                                 args=(feat1, feat2), axis=1)
        features.append(feat1+'*'+feat2)
            
        
    # build test dataframe
    #model_vars = orig_vars
    test_df = pd.DataFrame(columns=model_vars)
    for season in range(test_cutoff, 2017):
        left_df = df[df.GamesPlayedPercent >= (game_limit/16)]
        for limit in add_limits:
            left_df = left_df[left_df[limit] >= add_limits[limit]]
        left_df = left_df[left_vars][df.Season == str(season)]
        right_df = df[right_vars][df.Season == str(season-1)]
        right_df['Season'] = str(season)
        left_df = pd.merge(left_df, right_df, on=['Player', 'Team', 'Season'])
        left_df.columns = model_vars
        test_df = test_df.append(left_df)
        
    # build interactions
    for cur_inter in interacts:
        feat1 = features[cur_inter[0]]
        feat2 = features[cur_inter[1]]
        test_df[feat1+'*'+feat2] = test_df.apply(lambda row, feat1, feat2: row[feat1] * row[feat2], 
                                                 args=(feat1, feat2), axis=1)     
  
    if reg_type == 'linear':
        model = linear_model.LinearRegression()
    elif reg_type == 'logistic':
        model = linear_model.LogisticRegression()
    model.fit(model_df[features], model_df[predict])
    print('|=== Predicting '+str(predict)+' ===|\n')
    print('|====== Model Coefficients ======|')
    for idx in range(len(model.coef_)):
        print(features[idx]+': '+str(model.coef_[idx]))
    print('Intercept: '+str(model.intercept_))
    print('\n|========== Performance ==========|')
    print('R-Sqr on Seasons '+str(test_cutoff)+' to 2016: '  + str(model.score(model_df[features], model_df[predict])))
    print('Train Data Size: '+str(len(model_df)))
    print('Test Data Size:'+str(len(test_df)))
    model_obj = Model(features)
    model_obj.set_model(model)
    return model_obj

## Quarterbacks

In [29]:
ModelsQB = PositionModels('QB')

In [75]:
AvgPassYrds = create_model(players, 'QB', 'AvgPassYrds', 
                           ['AvgPassYrds', 'PercentComplete', 'OffPassYrdsPerGame'], 
                   game_limit=10, test_cutoff=2014, interacts=[])
ModelsQB.add_model('AvgPassYrds', AvgPassYrds)

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: 0.281325343102
PSPercentComplete: 6.41163920165
PSOffPassYrdsPerGame: 0.000708867372633
Intercept: 1.13018502735

R-Sqr on Seasons 2014 to 2016: 0.52100841563
Train Data Size: 143
Test Data Size:70


In [95]:
PassAttPerGame = create_model(players, 'QB', 'PassAttPerGame', 
                           ['PassAttPerGame', 'AvgPassYrds', 'PassTDAttRatio', 'OffPointsPerGame', 'GamesPlayedPercent'], 
                   game_limit=6, test_cutoff=2014, interacts=[(0,4), (1,3)])
ModelsQB.add_model('PassAttPerGame', PassAttPerGame)

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 0.585682389826
PSAvgPassYrds: 4.66416477994
PSPassTDAttRatio: -19.9779011058
PSOffPointsPerGame: 1.46935242751
PSGamesPlayedPercent: -7.49648014459
PSPassAttPerGame*PSGamesPlayedPercent: 0.230692346385
PSAvgPassYrds*PSOffPointsPerGame: -0.173339137249
Intercept: -23.8767212999

R-Sqr on Seasons 2014 to 2016: 0.595266049834
Train Data Size: 184
Test Data Size:91


In [100]:
PassTDAttRatio = create_model(players, 'QB', 'PassTDAttRatio', 
                           ['PassTDAttRatio', 'AvgPassYrds', 'PercentComplete', 'GamesPlayedPercent', 'RushTDAttRatio'], 
                   game_limit=8, test_cutoff=2014)
ModelsQB.add_model('PassTDAttRatio', PassTDAttRatio)

|=== Predicting PassTDAttRatio ===|

PSPassTDAttRatio: 0.118074081227
PSAvgPassYrds: 0.00221885064841
PSPercentComplete: 0.0325126213073
PSGamesPlayedPercent: 0.0075644469316
PSRushTDAttRatio: 0.0508856621922
Intercept: -0.00569435143606

R-Sqr on Seasons 2014 to 2016: 0.236048860829
Train Data Size: 161
Test Data Size:78


In [105]:
AvgRushYrds = create_model(players, 'QB', 'AvgRushYrds', 
                           ['AvgRushYrds', 'RushAttPerGame', 'PassAttPerGame', 'OffPointsPerGame'], 
                   game_limit=8, test_cutoff=2014)
ModelsQB.add_model('AvgRushYrds', AvgRushYrds)

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.186097495049
PSRushAttPerGame: 0.606931980498
PSPassAttPerGame: -0.0429372088817
PSOffPointsPerGame: -0.0951528762998
Intercept: 4.5258439588

R-Sqr on Seasons 2014 to 2016: 0.36818219927
Train Data Size: 161
Test Data Size:78


In [187]:
RushAttPerGame = create_model(players, 'QB', 'RushAttPerGame', 
                           ['RushAttPerGame', 'RushTDAttRatio', 'PassRushAttRatio', 'GamesPlayedPercent'], 
                   game_limit=6, test_cutoff=2014, interacts=[(0,0)])
ModelsQB.add_model('RushAttPerGame', RushAttPerGame)

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.320401292597
PSRushTDAttRatio: 4.6558133053
PSPassRushAttRatio: -0.0159485482807
PSGamesPlayedPercent: -0.225122925987
PSRushAttPerGame*PSRushAttPerGame: 0.0453150359155
Intercept: 1.53310205211

R-Sqr on Seasons 2014 to 2016: 0.510334569854
Train Data Size: 184
Test Data Size:91


In [108]:
FumblesPerGame = create_model(players, 'QB', 'FumblesPerGame', 
                           ['FumblesPerGame', 'OffPassYrdsPerGame', 'PassAttPerGame'], 
                   game_limit=8, test_cutoff=2014, interacts=[(0,1)])
ModelsQB.add_model('FumblesPerGame', FumblesPerGame)

|=== Predicting FumblesPerGame ===|

PSFumblesPerGame: 0.00225734741466
PSOffPassYrdsPerGame: -0.00064973396875
PSPassAttPerGame: 0.00243968422387
PSFumblesPerGame*PSOffPassYrdsPerGame: 0.000395880115763
Intercept: 0.275483838641

R-Sqr on Seasons 2014 to 2016: 0.0699783290304
Train Data Size: 161
Test Data Size:78


In [91]:
playersQB = players[players.Position == 'QB']

In [97]:
PredictQB = ModelsQB.predict_df(playersQB)
PredictQB[PredictQB.Player == 'Tom Brady'][['Player', 'Team', 'Season', 'AvgPassYrds', 'PredAvgPassYrds']].tail()

Predicting RushAttPerGame...
Predicting PassTDAttRatio...
Predicting AvgRushYrds...
Predicting PassAttPerGame...
Predicting FumblesPerGame...
Predicting AvgPassYrds...


Unnamed: 0,Player,Team,Season,AvgPassYrds,PredAvgPassYrds
2609,Tom Brady,NE,2012,7.577708,7.504745
3312,Tom Brady,NE,2013,6.915605,7.136446
3785,Tom Brady,NE,2014,7.060137,7.408136
4164,Tom Brady,NE,2015,7.644231,7.6145
5012,Tom Brady,NE,2016,8.226852,7.954421


In [98]:
QB2015 = PredictQB[PredictQB.Season == '2015'][['Player', 'Season', 'Team', 'AvgPassYrds', 
            'PredAvgPassYrds', 'PassAttPerGame', 'PredPassAttPerGame']]
QB2016 = PredictQB[PredictQB.Season == '2016'][['Player', 'Season', 'Team', 'AvgPassYrds', 'PassAttPerGame']]
QB2016.columns = ['Player', 'Season', 'Team', '2016AvgPassYrds', '2016PassAttPerGame']
PredictQB2016 = pd.merge(QB2015, QB2016, on=('Player', 'Team'))
PredictQB2016 = PredictQB2016[['Player', 'Team', 'PredAvgPassYrds', '2016AvgPassYrds', 'PredPassAttPerGame', '2016PassAttPerGame']].sort(columns=('2016AvgPassYrds'), ascending=False)
PredictQB2016.head()

Unnamed: 0,Player,Team,PredAvgPassYrds,2016AvgPassYrds,PredPassAttPerGame,2016PassAttPerGame
22,Matt Ryan,ATL,7.677779,9.258427,37.817755,33.375
15,Geno Smith,NYJ,7.206724,9.0,38.422704,4.666667
1,Derek Anderson,CAR,7.251571,8.54717,17.376727,10.6
21,Matt Moore,MIA,11.649222,8.287356,23.003619,17.4
3,Tom Brady,NE,7.6145,8.226852,39.160747,36.0


In [99]:
def predict_total_pass(row):
    return row.PredAvgPassYrds * row.PredPassAttPerGame * 16

def actual_total_pass(row):
    return row['2016AvgPassYrds'] * row['2016PassAttPerGame'] * 16

PredictQB2016['PredPassYrds'] = PredictQB2016.apply(predict_total_pass, axis=1)
PredictQB2016['2016PassYrds'] = PredictQB2016.apply(actual_total_pass, axis=1)
PredictQB2016 = PredictQB2016.sort(columns=('2016PassYrds'), ascending=False)
print('Total Correlation:')
print(PredictQB2016[['Player', 'PredPassYrds', '2016PassYrds']].corr())
print('Top 15 Correlation:')
print(PredictQB2016[['Player', 'PredPassYrds', '2016PassYrds']][:15].corr())
PredictQB2016[['Player', 'PredPassYrds', '2016PassYrds']]

Total Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds       1.00000       0.68197
2016PassYrds       0.68197       1.00000
Top 15 Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.618141
2016PassYrds      0.618141      1.000000


Unnamed: 0,Player,PredPassYrds,2016PassYrds
8,Drew Brees,5185.476151,5208.0
22,Matt Ryan,4645.702003,4944.0
11,Kirk Cousins,4377.06154,4917.0
3,Tom Brady,4771.031841,4738.666667
33,Andrew Luck,4032.291539,4522.666667
6,Carson Palmer,4293.847302,4515.2
16,Aaron Rodgers,3955.949606,4428.0
12,Philip Rivers,4816.15932,4386.0
28,Ben Roethlisberger,5000.604193,4364.571429
10,Matthew Stafford,4442.966268,4327.0


#### Current Metrics

Total: 0.682

Top 15: 0.618

## Running Backs

In [110]:
ModelsRB = PositionModels('RB')

#### Primary Stats

In [114]:
AvgRushYrds = create_model(players, 'RB', 'AvgRushYrds', 
                           ['AvgRushYrds', 'RushTDAttRatio', 'RushAttPerGame'], 
                   game_limit=14, test_cutoff=2014, add_limits={'RushAttPerGame': 8}, interacts=[])
ModelsRB.add_model('AvgRushYrds', AvgRushYrds)

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.233007230324
PSRushTDAttRatio: -1.91252474272
PSRushAttPerGame: -0.0100144505186
Intercept: 3.50947407628

R-Sqr on Seasons 2014 to 2016: 0.112014723196
Train Data Size: 133
Test Data Size:51


In [117]:
RushAttPerGame = create_model(players, 'RB', 'RushAttPerGame', 
                           ['RushAttPerGame', 'AvgRushYrds'], 
                   game_limit=14, test_cutoff=2014)
ModelsRB.add_model('RushAttPerGame', RushAttPerGame)

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.829612058324
PSAvgRushYrds: 0.350216329528
Intercept: 0.0965459334221

R-Sqr on Seasons 2014 to 2016: 0.742479651202
Train Data Size: 329
Test Data Size:149


In [121]:
RushTDAttRatio = create_model(players, 'RB', 'RushTDAttRatio', 
                           ['RushTDAttRatio', 'AvgRushYrds', 'OffRushYrdsPerGame'], 
                   game_limit=14, test_cutoff=2014, interacts=[(0,2)])
ModelsRB.add_model('RushAttPerGame', RushAttPerGame)

|=== Predicting RushTDAttRatio ===|

PSRushTDAttRatio: 3.35812749939
PSAvgRushYrds: -0.00274617361922
PSOffRushYrdsPerGame: 0.000452560830186
PSRushTDAttRatio*PSOffRushYrdsPerGame: -0.0269480824138
Intercept: -0.0172543176205

R-Sqr on Seasons 2014 to 2016: 0.217237107317
Train Data Size: 329
Test Data Size:149


#### Secondary Stats

In [577]:
# create boolean ispasser as addition feature for AvgPassYrds?
# maybe limit train set to only passers and only predict if predicted to be a passer/pass att > certain margin

In [191]:
PassAttPerGame = create_model(players, 'RB', 'PassAttPerGame', 
                           ['PassAttPerGame'], 
                   game_limit=14, test_cutoff=2014, interacts=[(0,0)])
ModelsRB.add_model('PassAttPerGame', PassAttPerGame)

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 1.02159768294
PSPassAttPerGame*PSPassAttPerGame: -0.0568342700929
Intercept: 0.389867015535

R-Sqr on Seasons 2014 to 2016: 0.545003585184
Train Data Size: 329
Test Data Size:149


In [195]:
FumblesPerGame = create_model(players, 'RB', 'FumblesPerGame', 
                           ['FumblesPerGame', 'RushAttPerGame'], 
                   game_limit=14, test_cutoff=2014, interacts=[])
ModelsRB.add_model('FumblesPerGame', FumblesPerGame)

|=== Predicting FumblesPerGame ===|

PSFumblesPerGame: 0.120723512678
PSRushAttPerGame: 0.00465479290049
Intercept: 0.0159142261803

R-Sqr on Seasons 2014 to 2016: 0.274732796335
Train Data Size: 329
Test Data Size:149


In [202]:
def predict_total_rush(row):
    return row.PredAvgRushYrds * row.PredRushAttPerGame * 16

def actual_total_rush(row):
    return row['2016AvgRushYrds'] * row['2016RushAttPerGame'] * 16

PredictRB = ModelsRB.predict_df(players[players.Position == 'RB'])
RB2015 = PredictRB[PredictRB.Season == '2015'][['Player', 'Season', 'Team', 'AvgRushYrds', 
            'PredAvgRushYrds', 'RushAttPerGame', 'PredRushAttPerGame']]
RB2016 = PredictRB[PredictRB.Season == '2016'][['Player', 'Season', 'Team', 'AvgRushYrds', 'RushAttPerGame']]
RB2016.columns = ['Player', 'Season', 'Team', '2016AvgRushYrds', '2016RushAttPerGame']
PredictRB2016 = pd.merge(RB2015, RB2016, on=('Player', 'Team'))
PredictRB2016 = PredictRB2016[['Player', 'Team', 'PredAvgRushYrds', '2016AvgRushYrds', 'PredRushAttPerGame', '2016RushAttPerGame']].sort(columns=('2016AvgRushYrds'), ascending=False)
PredictRB2016['PredRushYrds'] = PredictRB2016.apply(predict_total_rush, axis=1)
PredictRB2016['2016RushYrds'] = PredictRB2016.apply(actual_total_rush, axis=1)
PredictRB2016 = PredictRB2016.sort(columns=('2016RushYrds'), ascending=False)
print('Total Correlation:')
print(PredictRB2016[['Player', 'PredRushYrds', '2016RushYrds']].corr())
print('Top 15 Correlation:')
print(PredictRB2016[['Player', 'PredRushYrds', '2016RushYrds']][:15].corr())
PredictRB2016[['Player', 'PredRushYrds', '2016RushYrds']]

Predicting RushAttPerGame...
Predicting FumblesPerGame...
Predicting PassAttPerGame...
Predicting AvgRushYrds...
Total Correlation:
              PredRushYrds  2016RushYrds
PredRushYrds      1.000000      0.699992
2016RushYrds      0.699992      1.000000
Top 15 Correlation:
              PredRushYrds  2016RushYrds
PredRushYrds      1.000000      0.200854
2016RushYrds      0.200854      1.000000


Unnamed: 0,Player,PredRushYrds,2016RushYrds
60,Le'Veon Bell,1232.689505,1690.666667
39,Jay Ajayi,409.840957,1356.800000
50,LeSean McCoy,1088.236749,1351.466667
10,David Johnson,576.611240,1239.000000
28,Melvin Gordon,819.151456,1227.076923
84,Carlos Hyde,1030.047191,1216.000000
4,LeGarrette Blount,893.070565,1161.000000
31,Eddie Lacy,815.577100,1152.000000
41,Devonta Freeman,1080.451754,1079.000000
47,Spencer Ware,549.692464,1052.571429


#### Current Metrics
Total: 0.7

Top 15: 0.19

there is a bigger issue with unpredictable number of games played. most overpredictions
are related to not controlling for games played, but performance is still poor
especially for the top running backs. hopefully will improve with interactions

## Wide Receivers

In [125]:
ModelsWR = PositionModels('WR')

#### Primary Stats

In [157]:
AvgPassYrds = create_model(players, 'WR', 'AvgPassYrds', 
                           ['AvgPassYrds', 'OffPassYrdsPerGame'], 
                   game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[(0,1)])
ModelsWR.add_model('AvgPassYrds', AvgPassYrds)

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: -0.423650868497
PSOffPassYrdsPerGame: -0.0163688784131
PSAvgPassYrds*PSOffPassYrdsPerGame: 0.00252492548065
Intercept: 10.3358299284

R-Sqr on Seasons 2014 to 2016: 0.0888486413396
Train Data Size: 322
Test Data Size:145


In [149]:
PassAttPerGame = create_model(players, 'WR', 'PassAttPerGame', 
                           ['PassAttPerGame'], 
                   game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 0})
ModelsWR.add_model('PassAttPerGame', PassAttPerGame)

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 0.764821757917
Intercept: 1.47599181756

R-Sqr on Seasons 2014 to 2016: 0.613892036713
Train Data Size: 486
Test Data Size:228


In [184]:
PassTDAttRatio = create_model(players, 'WR', 'PassTDAttRatio', 
                           ['PassTDAttRatio', 'OffPassYrdsPerGame', 'OffPointsPerGame'], 
                   game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[(0,0), (0,3), (0,1)])
ModelsWR.add_model('PassTDAttRatio', PassTDAttRatio)

|=== Predicting PassTDAttRatio ===|

PSPassTDAttRatio: -0.303232791479
PSOffPassYrdsPerGame: -5.0709442713e-05
PSOffPointsPerGame: 0.000265962867502
PSPassTDAttRatio*PSPassTDAttRatio: 1.27255018912
PSPassTDAttRatio*PSPassTDAttRatio*PSPassTDAttRatio: -10.9152195032
PSPassTDAttRatio*PSOffPassYrdsPerGame: 0.00202838561932
Intercept: 0.0433359431419

R-Sqr on Seasons 2014 to 2016: 0.0963872329005
Train Data Size: 322
Test Data Size:145


#### Secondary Stats

In [151]:
ModelsWR.create_model(players, 'AvgRushYrds', ['AvgRushYrds', 'RushAttPerGame', 'OffRushYrdsPerGame'], 
                      game_limit=12, test_cutoff=2014, add_limits={})

|=== Predicting AvgRushYrds ===|

PSAvgRushYrds: 0.147727772425
PSRushAttPerGame: 3.52344002527
PSOffRushYrdsPerGame: -0.0181129923799
Intercept: 4.03411535313

R-Sqr on Seasons 2014 to 2016: 0.0773486183827
Train Data Size: 486
Test Data Size:228


In [196]:
ModelsWR.create_model(players, 'RushAttPerGame', ['RushAttPerGame'], 
                      game_limit=12, test_cutoff=2014, add_limits={}, interacts=[(0,0)])

|=== Predicting RushAttPerGame ===|

PSRushAttPerGame: 0.983191135993
PSRushAttPerGame*PSRushAttPerGame: -0.105245457362
Intercept: 0.00850077169574

R-Sqr on Seasons 2014 to 2016: 0.480727395623
Train Data Size: 486
Test Data Size:228


In [153]:
# static RushTDAttRatio for WRs

In [158]:
PredictWR = ModelsWR.predict_df(players[players.Position == 'WR'])
WR2015 = PredictWR[PredictWR.Season == '2015'][['Player', 'Season', 'Team', 'AvgPassYrds', 
            'PredAvgPassYrds', 'PassAttPerGame', 'PredPassAttPerGame']]
WR2016 = PredictWR[PredictWR.Season == '2016'][['Player', 'Season', 'Team', 'AvgPassYrds', 'PassAttPerGame']]
WR2016.columns = ['Player', 'Season', 'Team', '2016AvgPassYrds', '2016PassAttPerGame']
PredictWR2016 = pd.merge(WR2015, WR2016, on=('Player', 'Team'))
PredictWR2016 = PredictWR2016[['Player', 'Team', 'PredAvgPassYrds', '2016AvgPassYrds', 'PredPassAttPerGame', '2016PassAttPerGame']].sort(columns=('2016AvgPassYrds'), ascending=False)
PredictWR2016['PredPassYrds'] = PredictWR2016.apply(predict_total_pass, axis=1)
PredictWR2016['2016PassYrds'] = PredictWR2016.apply(actual_total_pass, axis=1)
PredictWR2016 = PredictWR2016.sort(columns=('PredPassYrds'), ascending=False)
print('Total Correlation:')
print(PredictWR2016[['Player', 'PredPassYrds', '2016PassYrds']].corr())
print('Top 15 Correlation:')
print(PredictWR2016[['Player', 'PredPassYrds', '2016PassYrds']][:15].corr())
PredictWR2016[['Player', 'PredPassYrds', '2016PassYrds']]

Predicting RushAttPerGame...
Predicting PassTDAttRatio...
Predicting PassAttPerGame...
Predicting AvgPassYrds...
Predicting AvgRushYrds...
Total Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.739544
2016PassYrds      0.739544      1.000000
Top 15 Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.484234
2016PassYrds      0.484234      1.000000


Unnamed: 0,Player,PredPassYrds,2016PassYrds
52,Julio Jones,1488.297483,1610.285714
71,Antonio Brown,1456.025902,1369.600000
88,DeAndre Hopkins,1338.143512,954.000000
30,Keenan Allen,1292.361168,1008.000000
21,Odell Beckham Jr,1265.279021,1367.000000
33,Brandon Marshall,1257.672805,840.533333
86,Steve Smith Sr.,1249.789402,913.142857
98,Demaryius Thomas,1235.143099,1083.000000
65,Alshon Jeffery,1198.372980,1010.461538
49,Jarvis Landry,1155.808079,1136.000000


#### Current Metrics

Total: 0.739

Top 15: 0.48

## Tight Ends

In [159]:
ModelsTE = PositionModels('TE')

#### Primary Stats

In [163]:
ModelsTE.create_model(players, 'AvgPassYrds', ['AvgPassYrds'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[])

|=== Predicting AvgPassYrds ===|

PSAvgPassYrds: 0.276619249957
Intercept: 5.42182785496

R-Sqr on Seasons 2014 to 2016: 0.101981319253
Train Data Size: 108
Test Data Size:53


In [198]:
ModelsTE.create_model(players, 'PassAttPerGame', ['PassAttPerGame', 'OffPassYrdsPerGame', 'GamesPlayedPercent'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 2}, interacts=[(0,2)])

|=== Predicting PassAttPerGame ===|

PSPassAttPerGame: 0.261182571605
PSOffPassYrdsPerGame: 0.00687205792516
PSGamesPlayedPercent: -0.307633412213
PSPassAttPerGame*PSGamesPlayedPercent: 0.29136443834
Intercept: 1.32734612382

R-Sqr on Seasons 2014 to 2016: 0.446777129779
Train Data Size: 178
Test Data Size:80


In [168]:
ModelsTE.create_model(players, 'PassTDAttRatio', ['PassTDAttRatio'], 
                      game_limit=12, test_cutoff=2014, add_limits={'PassAttPerGame': 4}, interacts=[])

|=== Predicting PassTDAttRatio ===|

PSPassTDAttRatio: -0.0206970270841
PSGamesPlayedPercent: -0.0187078258131
PSPassTDAttRatio*PSGamesPlayedPercent: 0.255220122996
Intercept: 0.058282771252

R-Sqr on Seasons 2014 to 2016: 0.0788882147008
Train Data Size: 108
Test Data Size:53


In [200]:
PredictTE = ModelsTE.predict_df(players[players.Position == 'TE'])
TE2015 = PredictTE[PredictTE.Season == '2015'][['Player', 'Season', 'Team', 'AvgPassYrds', 
            'PredAvgPassYrds', 'PassAttPerGame', 'PredPassAttPerGame']]
TE2016 = PredictTE[PredictTE.Season == '2016'][['Player', 'Season', 'Team', 'AvgPassYrds', 'PassAttPerGame']]
TE2016.columns = ['Player', 'Season', 'Team', '2016AvgPassYrds', '2016PassAttPerGame']
PredictTE2016 = pd.merge(TE2015, TE2016, on=('Player', 'Team'))
PredictTE2016 = PredictTE2016[['Player', 'Team', 'PredAvgPassYrds', '2016AvgPassYrds', 'PredPassAttPerGame', '2016PassAttPerGame']].sort(columns=('2016AvgPassYrds'), ascending=False)
PredictTE2016['PredPassYrds'] = PredictTE2016.apply(predict_total_pass, axis=1)
PredictTE2016['2016PassYrds'] = PredictTE2016.apply(actual_total_pass, axis=1)
PredictTE2016 = PredictTE2016.sort(columns=('PredPassYrds'), ascending=False)
print('Total Correlation:')
print(PredictTE2016[['Player', 'PredPassYrds', '2016PassYrds']].corr())
print('Top 15 Correlation:')
print(PredictTE2016[['Player', 'PredPassYrds', '2016PassYrds']][:15].corr())
PredictTE2016[['Player', 'PredPassYrds', '2016PassYrds']]

Predicting PassTDAttRatio...
Predicting PassAttPerGame...
Predicting AvgPassYrds...
Total Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.763585
2016PassYrds      0.763585      1.000000
Top 15 Correlation:
              PredPassYrds  2016PassYrds
PredPassYrds      1.000000      0.512718
2016PassYrds      0.512718      1.000000


Unnamed: 0,Player,PredPassYrds,2016PassYrds
2,Rob Gronkowski,947.786632,1080.000000
45,Delanie Walker,895.270610,853.333333
12,Jordan Reed,870.587042,914.666667
0,Greg Olsen,863.328109,1073.000000
56,Gary Barnidge,860.913460,612.000000
39,Zach Ertz,817.186428,932.571429
14,Antonio Gates,795.482197,626.285714
29,Travis Kelce,742.950370,1125.000000
3,Jimmy Graham,718.951890,923.000000
62,Jason Witten,714.505729,673.000000


## Adding Depth Chart Info

In [82]:
conn = sqlite3.connect('fantasy.db')
c = conn.cursor()

In [16]:
c.execute('ALTER TABLE PlayerSeason ADD Rank INT')

<sqlite3.Cursor at 0x96d8a40>

In [83]:
c.execute('''
    UPDATE PlayerSeason 
    SET Rank = (SELECT Roster.Rank FROM Roster 
        WHERE PlayerSeason.Player = Roster.Player AND PlayerSeason.Season = Roster.Season
        AND PlayerSeason.Team = Roster.Team AND PlayerSeason.Position = Roster.Position)
    ''')

<sqlite3.Cursor at 0x9717b20>

In [84]:
n = 0
for row in c.execute('SELECT * FROM PlayerSeason ORDER BY Season DESC'):
    print(row)
    n += 1
    if n == 100:
        break

('Aaron Rodgers', 'GB', 677, 16, 67, 369, 4, 610, 401, 4428, 40, 4, 7, 'QB', '2016', 1)
('Drew Brees', 'NO', 696, 16, 23, 20, 2, 673, 471, 5208, 37, 4, 15, 'QB', '2016', 1)
('Matt Ryan', 'ATL', 569, 16, 35, 117, 0, 534, 373, 4944, 38, 2, 7, 'QB', '2016', 1)
('Andrew Luck', 'IND', 609, 15, 64, 341, 2, 545, 346, 4240, 31, 5, 13, 'QB', '2016', 1)
('Kirk Cousins', 'WAS', 640, 16, 34, 96, 4, 606, 406, 4917, 25, 3, 12, 'QB', '2016', 1)
('Philip Rivers', 'LAC', 592, 16, 14, 35, 0, 578, 349, 4386, 33, 5, 21, 'QB', '2016', 1)
('Matthew Stafford', 'DET', 632, 16, 37, 207, 2, 594, 388, 4327, 24, 2, 10, 'QB', '2016', 1)
('Blake Bortles', 'JAC', 684, 16, 58, 359, 3, 625, 368, 3905, 23, 6, 16, 'QB', '2016', None)
('Dak Prescott', 'DAL', 516, 16, 57, 282, 6, 459, 311, 3667, 23, 4, 4, 'QB', '2016', 1)
('Jameis Winston', 'TB', 620, 16, 53, 165, 1, 567, 345, 4090, 28, 6, 18, 'QB', '2016', 1)
('Russell Wilson', 'SEA', 620, 16, 72, 259, 1, 546, 353, 4219, 21, 2, 11, 'QB', '2016', 1)
('Andy Dalton', 'CIN',

In [76]:
for row in c.execute('SELECT * FROM Roster WHERE Player = \'Larry Fitzgerald\''):
    print(row)

('Larry Fitzgerald', 'ARI', 'WR', '2007', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2008', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2009', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2010', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2011', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2012', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2013', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2014', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2015', 1)
('Larry Fitzgerald', 'ARI', 'WR', '2016', 1)


In [79]:
conn.close()

In [66]:
for row in c.execute('SELECT DISTINCT Team FROM Roster'):
    print(row)

ProgrammingError: Cannot operate on a closed database.