In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the fantasy table and the advanced receiving table, then attach the
# advanced receiving columns to every fantasy row with a matching PlID.
df1 = pd.read_csv('Advanced_Table_Clean/fantasyDB.csv')
df2 = pd.read_csv('Advanced_Table_Clean/advancedRec.csv')

df = pd.merge(
    df1,
    df2[['PlID', 'AvgDOT', 'YAC/G', 'YBC/G', 'YdsBC/Rec', 'YdsAC/Rec', 'PassRat']],
    on='PlID',
    how='inner',
)

# Persist the joined table; the row index is written as an extra column.
# Note: to_csv returns None when given a path, so df_join is None.
df_join = df.to_csv('Joined.csv', index=True)


In [16]:
# Number of rows in the joined frame that have a non-missing 'Yr' value.
df['Yr'].notna().sum()

1471

In [4]:
def createPosModel(file, position):
    """Fit a linear regression on the rows of a CSV for a single position.

    The CSV is read with ``pd.read_csv``, every row containing a missing
    value is dropped, and only rows whose ``Pos`` column equals ``position``
    are kept. A ``linear_model.LinearRegression`` is then fitted on the
    position-specific feature columns against the ``Next_Yr_PPG`` column.

    Parameters
    ----------
    file : str or file-like
        Anything ``pd.read_csv`` accepts.
    position : str
        One of 'QB', 'RB', 'WR' or 'TE'.

    Returns
    -------
    The fitted LinearRegression, or None (after printing
    'Invalid position entered') when ``position`` is not recognised.
    """
    # WR and TE share one feature list. NOTE: 'Tgt/G' appears twice; it is
    # kept as-is so the feature count stays the same as in the evaluation
    # helpers, which select the same columns for models fitted here.
    receiverFeatures = ['Tgt/G','Rec/G','RecYds/G','Yds/Rec','RecTD/G','TotTD/G','PPR/G', 'AvgDOT', 'YAC/G', 'YBC/G', 'Tgt/G', 'YdsBC/Rec',
                        'YdsAC/Rec']
    featuresByPosition = {
        'QB': ['PassAtt/G','PassYds/G', 'PassTD/G', 'Int/G', 'RushAtt/G', 'RushYds/G',
               'RushYds/Att', 'RushTD/G','TotTD/G','PPR/G'],
        'RB': ['Age', 'RushAtt/G', 'RushYds/G','RushYds/Att', 'RushTD/G', 'Tgt/G',
               'Rec/G', 'RecYds/G', 'Yds/Rec','RecTD/G','TotTD/G','PPR/G'],
        'WR': receiverFeatures,
        'TE': receiverFeatures,
    }

    # The previous test `position == 'WR' or 'TE'` was always truthy (a
    # non-empty string is truthy), so unknown positions never reached the
    # error branch. A membership lookup rejects them properly.
    if position not in featuresByPosition:
        print('Invalid position entered')
        return

    df = pd.read_csv(file)
    df = df.dropna()
    df = df[df['Pos'] == position]

    X = df[featuresByPosition[position]]
    y = df['Next_Yr_PPG']
    reg = linear_model.LinearRegression()
    reg.fit(X, y)
    return reg

In [5]:
def testModelAccuracy(model, file, position):
    # creates a df from the csvFile, drops na values and rows where FantPos
    # does not equal the position parameter
    df = pd.read_csv(file)
    df = df.dropna()
    df = df[df['Pos'] == position]

    # the model will use different independent variables depending on position
    if position == 'QB':
        XTest = df[['PassAtt/G','PassYds/G', 'PassTD/G', 'Int/G', 'RushAtt/G', 'RushYds/G',
                    'RushYds/Att', 'RushTD/G','TotTD/G','PPR/G']]
    elif position == 'RB':
        XTest = df[['Age', 'RushAtt/G', 'RushYds/G','RushYds/Att', 'RushTD/G', 'Tgt/G', 
                    'Rec/G', 'RecYds/G', 'Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    elif position == 'WR' or 'TE':
        XTest = df[['Tgt/G','Rec/G','RecYds/G','Yds/Rec','RecTD/G','TotTD/G','PPR/G', 'AvgDOT', 'YAC/G', 'YBC/G', 'Tgt/G', 'YdsBC/Rec',
                       'YdsAC/Rec']]
    else:
        print('Invalid position entered')
        return

    yTest = df['Next_Yr_PPG']
    results = model.score(XTest, yTest)
    return results

In [6]:
def testModelDifference(model, file, position):
    # creates a df from the csvFile, drops na values and rows where FantPos
    # does not equal the position parameter
    df = pd.read_csv(file)
    df = df.dropna()
    df = df[df['Pos'] == position]

    # the model will use different independent variables depending on position
    if position == 'QB':
        XTest = df[['PassAtt/G','PassYds/G', 'PassTD/G', 'Int/G', 'RushAtt/G', 'RushYds/G',
                    'RushYds/Att', 'RushTD/G','TotTD/G','PPR/G']]
    elif position == 'RB':
        XTest = df[['Age', 'RushAtt/G', 'RushYds/G','RushYds/Att', 'RushTD/G', 'Tgt/G', 
                    'Rec/G', 'RecYds/G', 'Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    elif position == 'WR' or 'TE':
        XTest = df[['Tgt/G','Rec/G','RecYds/G','Yds/Rec','RecTD/G','TotTD/G','PPR/G', 'AvgDOT', 'YAC/G', 'YBC/G', 'Tgt/G', 'YdsBC/Rec',
                       'YdsAC/Rec']]
    else:
        print('Invalid position entered')
        return


    yPred = model.predict(XTest)
    predAndActual = {'Name': df['Player'], 'Predicted PPG': yPred,
                     'Actual PPG': df['Next_Yr_PPG']}

    # creates df from dictionary above
    database = pd.DataFrame(predAndActual)

    # creates a difference column which depicts the difference between the
    # predicted PPG and actual PPG
    database['Predicted PPG'] = database['Predicted PPG'].round(decimals=3)
    database['Difference'] = database['Predicted PPG'] - database['Actual PPG']
    database['Difference'] = database['Difference'].round(decimals=3)
    database['AbsDifference'] = database['Difference'].abs()
    meanDiff = round(database['Difference'].mean(), 3)
    medianDiff = round(database['Difference'].median(), 3)
    meanAbsDiff = round(database['AbsDifference'].mean(), 3)
    medianAbsDiff = round(database['AbsDifference'].median(), 3)

    return database, meanDiff, medianDiff, meanAbsDiff, medianAbsDiff

In [7]:
def testModel(model, test, train, position):
    """Print a short evaluation report for one position's model.

    The score is obtained from ``testModelAccuracy`` using ``train``; the
    error statistics come from ``testModelDifference`` using ``test``.
    Nothing is returned.
    """
    score = testModelAccuracy(model, train, position)
    # Element 0 of the result is the per-player table; 1-4 are the summaries.
    meanDiff, medDiff, meanAbsDiff, medAbsDiff = testModelDifference(model, test, position)[1:5]

    report = [
        'The accuracy of the {0} model is {1}'.format(position, score),
        'The {0} model has an average error of {1} PPG and an average absolute error of {2} PPG'.format(position, meanDiff, meanAbsDiff),
        'The {0} model has a median error of {1} PPG and a median absolute error of {2} PPG'.format(position, medDiff, medAbsDiff),
        '\n',
    ]
    for line in report:
        print(line)
    return

In [8]:
def useModel(model, df1, df2, position, var):
    """Predict PPG for one position and line the predictions up with actuals.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    df1 : pd.DataFrame
        Rows to predict on; must contain 'plid', 'pos' and the ``var``
        columns.
    df2 : pd.DataFrame
        Actual results; must contain 'plid', 'player', 'pos' and 'ppr_g'.
    position : str
        One of 'QB', 'RB', 'WR' or 'TE'.
    var : list of str (or a single column name)
        Feature columns of ``df1`` passed to ``model.predict``.

    Returns
    -------
    pd.DataFrame with the columns 'plid', 'Name', 'Predicted PPG' and
    'Actual PPG' for players present in both frames (freshly numbered
    index), or None (after printing 'Invalid position entered') when
    ``position`` is not recognised.

    Notes
    -----
    The previous body always raised UnboundLocalError because ``database``
    was read before it was ever assigned; the returned table is built from
    what the dead code (the 'plid' merge and the prediction) was evidently
    working towards. NOTE(review): confirm the column set with the caller.
    """
    # `position == 'WR' or 'TE'` was always truthy; validate by membership.
    if position not in ('QB', 'RB', 'WR', 'TE'):
        print('Invalid position entered')
        return

    featureCols = [var] if isinstance(var, str) else list(var)

    # dropna returns a new frame (the old code discarded the result). Only
    # the columns that are actually used are checked for missing values.
    features = df1.dropna(subset=featureCols + ['plid'])
    features = features[features['pos'] == position]

    actuals = df2[df2['pos'] == position][['plid', 'player', 'ppr_g']].dropna()
    actuals = actuals.rename(columns={'player': 'Name', 'ppr_g': 'Actual PPG'})

    # Skip predict() on an empty frame; estimators commonly reject it.
    yPred = model.predict(features[featureCols]) if len(features) else []

    predictions = pd.DataFrame({
        'plid': features['plid'].to_numpy(),
        'Predicted PPG': pd.Series(yPred, dtype='float64').round(decimals=3).to_numpy(),
    })

    # Inner join keeps only players that appear in both frames, in df1 order.
    database = predictions.merge(actuals, how='inner', on='plid')
    database = database[['plid', 'Name', 'Predicted PPG', 'Actual PPG']]
    database = database.reset_index(drop = True)

    return database

In [9]:
# Fit the wide-receiver model from the joined CSV, then echo the estimator.
WR_Model = createPosModel(file='Joined.csv', position="WR")
WR_Model

LinearRegression()

In [10]:
# Separate into training, testing, using, and actual data by Yr (Fantasy) or Year (other dataframes)
train = df[df['Yr'] <= 2019]
test = df[(df['Yr'] >= 2019) & (df['Yr'] <= 2020)]
use = df[df['Yr'] == 2020]
use = use[use['Next_Yr_Starter'].notna()]
use = use.astype({"Next_Yr_Starter": int})
actuals = df[df['Yr'] == 2021]

test['Yr'].head(50)


1     2020
2     2019
4     2020
6     2020
8     2020
9     2019
11    2020
13    2020
14    2019
17    2020
18    2019
20    2020
21    2019
23    2020
25    2020
26    2019
28    2020
30    2019
32    2020
33    2019
35    2020
36    2019
38    2020
40    2020
41    2019
42    2019
44    2020
45    2019
46    2020
47    2019
49    2020
50    2019
52    2020
53    2019
54    2020
55    2019
57    2020
58    2019
60    2020
61    2019
64    2020
65    2019
67    2019
69    2020
70    2019
71    2019
72    2020
73    2019
75    2020
76    2019
Name: Yr, dtype: int64

In [21]:
# Both the score and the error statistics are computed from 'Joined.csv',
# the same file WR_Model was fitted on, so these figures are in-sample.
testModel(WR_Model, test='Joined.csv', train='Joined.csv', position="WR")

The accuracy of the WR model is 0.6162917736679171
The WR model has an average error of -0.0 PPG and an average absolute error of 2.547 PPG
The WR model has a median error of 0.264 PPG and a median absolute error of 1.957 PPG


