In [1]:
import pandas as pd
# pd.set_option("display.max_rows", None)
def cleanffball(year): # add name to inputs if wanting to create csv
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm'
    df = pd.read_html(url,header=1)[0]
    df = df.drop(['Rk','Tm','VBD','FantPt','DKPt','FDPt'],axis=1)
    df = df[df['FantPos'].notna()]
    df = df[df['FantPos'] != 'FantPos']
    df['Y/A'] = df['Y/A'].fillna(0)
    df['Y/R'] = df['Y/R'].fillna(0)
    df['2PM'] = df['2PM'].fillna(0)
    df['2PP'] = df['2PP'].fillna(0)
    df = df.replace('\+','',regex=True).astype(object)
    df = df.replace('\*','',regex=True).astype(object)
    df['Player'] = df['Player'].str.rstrip()
    columns = df.columns[2:]
    for col in columns:
        df[col] = pd.to_numeric(df[col],errors='coerce')
    df['PPR_OvRank'] = df['PPR'].rank(method='first',ascending=False)
    df = df.drop(columns=['OvRank'],axis=1)
    df = df.dropna()
    df['Year'] = year
    df['PPG'] = df['PPR'] / df['G']
    df['PPG'] = df['PPG'].round(decimals = 3)
    return df
    # df.to_csv(f'{name}.csv',index=False)

In [2]:
def createDB(startYear, endYear):
    database = pd.DataFrame()
    years = []
    # iterates through years specified
    for year in range(startYear, endYear + 1):
        years.append(year)
    for year in years:
        database = database.append(cleanffball(year))
    return database

In [3]:
def cleanDB(database):
    cols = []
    for column in database.columns:
        if column == 'Yds':
            cols.append('PassYds')
        elif column == 'Yds.1':
            cols.append('RushYds')
        elif column == 'Yds.2':
            cols.append('RecYds')
        elif column == 'Att':
            cols.append('PassAtt')
        elif column == 'Att.1':
            cols.append('RushAtt')
        elif column == 'TD':
            cols.append('PassTD')
        elif column == 'TD.1':
            cols.append('RushTD')
        elif column == 'TD.2':
            cols.append('RecTD')
        elif column == 'TD.3':
            cols.append('TotTD')
        else:
            cols.append(column)

    database.columns = cols
    
    listOfColumns = ['Cmp','PassAtt','PassYds','PassTD','Int','RushAtt','RushYds',
                     'Y/A','RushTD','Tgt','Rec','RecYds','Y/R','RecTD','Fmb',
                     'FL','TotTD','2PM','2PP','PPR','PosRank','PPR_OvRank','Year',
                     'PPG']
    
    # Creates columns for stats per game
    for column in listOfColumns:
        if column == 'Y/A':
            newColumn = 'RushYds/Att'
            database[newColumn] = database[column]
        elif column == 'Y/R':
            newColumn = 'Yds/Rec'
            database[newColumn] = database[column]
        elif column == 'PPR':
            newColumn = 'FPts'
            database[newColumn] = database[column]
        elif column == 'PosRank':
            newColumn = 'PosRk'
            database[newColumn] = database[column]
        elif column == 'PPR_OvRank':
            newColumn = 'OvRank'
            database[newColumn] = database[column]
        elif column == 'Year':
            newColumn = 'Yr'
            database[newColumn] = database[column]
        elif column == 'PPG':
            newColumn = 'PPR/G'
            database[newColumn] = database[column]
        else:
            newColumn = column + '/G'
            database[newColumn] = (pd.to_numeric(database[column]) / pd.to_numeric(database['G']))
            database[newColumn] = database[newColumn].round(decimals = 3)
            
    database = database.drop(columns = listOfColumns)
            
    database = database.sort_values(by = ['Player','Yr'], ascending = [True,False])
    nextYearPPG = []
    lastPlayer = 'NaN'
    lastPlayerPPG = 'NaN'

    for index, row in database.iterrows():
            player = row['Player']
            if lastPlayer == player:
                nextYearPPG.append(lastPlayerPPG)
            else:
                nextYearPPG.append('NaN')
            lastPlayer = row['Player']
            lastPlayerPPG = row['PPR/G']
    
    database['Next_Yr_PPG'] = nextYearPPG
    database = database.reset_index(drop = True)
    return database

In [4]:
from sklearn import linear_model
import pandas as pd

In [5]:
def createPosModel(file, position):
    # creates a df from the csvFile, drops na values and rows where FantPos
    # does not equal the position parameter
    df = pd.read_csv(file)
    df = df.dropna()
    df = df[df['FantPos'] == position]
    
    # the model will use different independent variables depending on position
    if position == 'QB':
        X = df[['PassAtt/G','PassYds/G', 'PassTD/G', 'Int/G', 'RushAtt/G', 'RushYds/G',
                    'RushYds/Att', 'RushTD/G','TotTD/G','PPR/G']]
    elif position == 'RB':
        X = df[['Age', 'RushAtt/G', 'RushYds/G','RushYds/Att', 'RushTD/G', 'Tgt/G', 
                    'Rec/G', 'RecYds/G', 'Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    elif position == 'WR' or 'TE':
        X = df[['Tgt/G','Rec/G','RecYds/G','Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    else:
        print('Invalid position entered')
        return

    y = df['Next_Yr_PPG']
    reg = linear_model.LinearRegression()
    reg.fit(X, y)
    return reg

In [6]:
def testModelAccuracy(model, file, position):
    # creates a df from the csvFile, drops na values and rows where FantPos
    # does not equal the position parameter
    df = pd.read_csv(file)
    df = df.dropna()
    df = df[df['FantPos'] == position]

    # the model will use different independent variables depending on position
    if position == 'QB':
        XTest = df[['PassAtt/G','PassYds/G', 'PassTD/G', 'Int/G', 'RushAtt/G', 'RushYds/G',
                    'RushYds/Att', 'RushTD/G','TotTD/G','PPR/G']]
    elif position == 'RB':
        XTest = df[['Age', 'RushAtt/G', 'RushYds/G','RushYds/Att', 'RushTD/G', 'Tgt/G', 
                    'Rec/G', 'RecYds/G', 'Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    elif position == 'WR' or 'TE':
        XTest = df[['Tgt/G','Rec/G','RecYds/G','Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    else:
        print('Invalid position entered')
        return

    yTest = df['Next_Yr_PPG']
    results = model.score(XTest, yTest)
    return results

In [7]:
def testModelDifference(model, file, position):
    # creates a df from the csvFile, drops na values and rows where FantPos
    # does not equal the position parameter
    df = pd.read_csv(file)
    df = df.dropna()
    df = df[df['FantPos'] == position]

    # the model will use different independent variables depending on position
    if position == 'QB':
        XTest = df[['PassAtt/G','PassYds/G', 'PassTD/G', 'Int/G', 'RushAtt/G', 'RushYds/G',
                    'RushYds/Att', 'RushTD/G','TotTD/G','PPR/G']]
    elif position == 'RB':
        XTest = df[['Age', 'RushAtt/G', 'RushYds/G','RushYds/Att', 'RushTD/G', 'Tgt/G', 
                    'Rec/G', 'RecYds/G', 'Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    elif position == 'WR' or 'TE':
        XTest = df[['Tgt/G','Rec/G','RecYds/G','Yds/Rec','RecTD/G','TotTD/G','PPR/G']]
    else:
        print('Invalid position entered')
        return


    yPred = model.predict(XTest)
    predAndActual = {'Name': df['Player'], 'Predicted PPG': yPred,
                     'Actual PPG': df['Next_Yr_PPG']}

    # creates df from dictionary above
    database = pd.DataFrame(predAndActual)

    # creates a difference column which depicts the difference between the
    # predicted PPG and actual PPG
    database['Predicted PPG'] = database['Predicted PPG'].round(decimals=3)
    database['Difference'] = database['Predicted PPG'] - database['Actual PPG']
    database['Difference'] = database['Difference'].round(decimals=3)
    database['AbsDifference'] = database['Difference'].abs()
    meanDiff = round(database['Difference'].mean(), 3)
    medianDiff = round(database['Difference'].median(), 3)
    meanAbsDiff = round(database['AbsDifference'].mean(), 3)
    medianAbsDiff = round(database['AbsDifference'].median(), 3)

    return database, meanDiff, medianDiff, meanAbsDiff, medianAbsDiff

In [8]:
def testModel(model, testCSV, trainingCSV, position):
    accuracy = testModelAccuracy(model, trainingCSV, position)
    differences = testModelDifference(model, testCSV, position)
    meanDiff = differences[1]
    medDiff = differences[2]
    meanAbsDiff = differences[3]
    medAbsDiff = differences[4]

    print('The accuracy of the {0} model is {1}'.format(position, accuracy))
    print('The {0} model has an average error of {1} PPG and an average absolute error of {2} PPG'.format(position, meanDiff, meanAbsDiff))
    print('The {0} model has a median error of {1} PPG and a median absolute error of {2} PPG'.format(position, medDiff, medAbsDiff))
    print('\n')
    return

In [10]:
trainingData = cleanDB(createDB(2012, 2018))
testingData = cleanDB(createDB(2018, 2019))

In [11]:
trainingData.to_csv('trainingData.csv')
testingData.to_csv('testingData.csv')

In [12]:
# creates ML models from data by position
qbModel = createPosModel('trainingData.csv', 'QB')
rbModel = createPosModel('trainingData.csv', 'RB')
wrModel = createPosModel('trainingData.csv', 'WR')
teModel = createPosModel('trainingData.csv', 'TE')

In [13]:
# tests models
testModel(qbModel, 'testingData.csv', 'trainingData.csv', 'QB')
testModel(rbModel, 'testingData.csv', 'trainingData.csv', 'RB')
testModel(wrModel, 'testingData.csv', 'trainingData.csv', 'WR')
testModel(teModel, 'testingData.csv', 'trainingData.csv', 'TE')

The accuracy of the QB model is 0.48370205661354737
The QB model has an average error of 0.186 PPG and an average absolute error of 4.347 PPG
The QB model has a median error of -0.352 PPG and a median absolute error of 2.848 PPG


The accuracy of the RB model is 0.4756002625974515
The RB model has an average error of -0.06 PPG and an average absolute error of 2.706 PPG
The RB model has a median error of 0.268 PPG and a median absolute error of 2.039 PPG


The accuracy of the WR model is 0.5522723446910583
The WR model has an average error of 0.237 PPG and an average absolute error of 2.862 PPG
The WR model has a median error of 0.609 PPG and a median absolute error of 2.601 PPG


The accuracy of the TE model is 0.5159745697291722
The TE model has an average error of 0.284 PPG and an average absolute error of 2.302 PPG
The TE model has a median error of 0.875 PPG and a median absolute error of 1.851 PPG


