In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

**This file is nearly identical to the other RB prediction file, however for this one, we are predicting using only stats based on a yearly basis, rather than accumulated stats. This is for the purpose of including more players in the top 100, but are perhaps a little bit younger.**

**The file titled rbstats.csv is also in the repository. It is a csv file of statistics that I compiled for the players. The file path should be edited before running this file. The purpose of this cell is to import the relevant csv file and then create a dataframe with this data.**

In [2]:
df = pd.read_csv('/home/john/Downloads/rbstats.csv')
#https://www.pro-football-reference.com/leaders/rush_yds_career.htm#leaders::none #has been supplemented with extra stats

**Here is the dataframe, simply titled 'df'. There are more than 100 players in this list because there are 10 non-Running Backs on the list.**

In [3]:
df

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Tm,Pos,Seasons,Yds/Season,TDs,Rec Tds,Rush Tds,TDs/Season,Height (In),Receptions,Rec/Season,Rec Yds,Rec Yds/Season,HOF,HOF_Eligible
0,1,Emmitt Smith,18355,1990,2004,2TM,RB,15,1223.666667,175,11,164,11.666667,69,515,34.333333,3224,214.933333,Y,Y
1,2,Walter Payton,16726,1975,1987,chi,RB,13,1286.615385,125,15,110,9.615385,70,492,37.846154,4538,349.076923,Y,Y
2,3,Frank Gore,15347,2005,2019,4TM,RB,15,1023.133333,97,18,79,6.466667,69,468,31.200000,3896,259.733333,N,N
3,4,Barry Sanders,15269,1989,1998,det,RB,10,1526.900000,109,10,99,10.900000,68,352,35.200000,2921,292.100000,Y,Y
4,5,Adrian Peterson,14216,2007,2019,4TM,RB,13,1093.538462,117,6,111,9.000000,73,289,22.230769,2365,181.923077,N,N
5,6,Curtis Martin,14101,1995,2005,2TM,RB,11,1281.909091,100,10,90,9.090909,71,484,44.000000,3329,302.636364,Y,Y
6,7,LaDainian Tomlinson,13684,2001,2011,2TM,RB,11,1244.000000,162,17,145,14.727273,70,624,56.727273,4772,433.818182,Y,Y
7,8,Jerome Bettis,13662,1993,2005,2TM,RB,13,1050.923077,94,3,91,7.230769,71,200,15.384615,1457,112.076923,Y,Y
8,9,Eric Dickerson,13259,1983,1993,4TM,RB,11,1205.363636,96,6,90,8.727273,75,281,25.545455,2137,194.272727,Y,Y
9,10,Tony Dorsett,12739,1977,1988,2TM,RB,12,1061.583333,90,13,77,7.500000,71,398,33.166667,3554,296.166667,Y,Y


**Now we begin to massage the data. We drop the teams the players played for, as we are not planning to use it in this analysis. We also drop players who are in the top 100 that are not running backs.**

In [4]:
df.drop(['Tm'], axis=1, inplace=True)

df.drop([84, 101, 102, 103, 104, 105, 106, 107, 108, 109], axis=0, inplace=True)

**Next we reset the index, so that the ranking numbers are continuous again.**

In [5]:
df = df.reset_index()

**Resetting the index has included a new column 'index', which we do not need, so we are going to drop it. After this, we change Ys and Ns in the HOF category to 1s and 0s, which are better suited for the logistic regression we plan to apply later.**

In [6]:
df.drop(['index'], axis=1, inplace=True)

In [7]:
df['HOF'] = df['HOF'].replace(['Y'], 1)
df['HOF'] = df['HOF'].replace(['N'], 0)
df['HOF_Eligible'] = df['HOF_Eligible'].replace(['Y'], 1)
df['HOF_Eligible'] = df['HOF_Eligible'].replace(['N'], 0)

In [8]:
df

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Pos,Seasons,Yds/Season,TDs,Rec Tds,Rush Tds,TDs/Season,Height (In),Receptions,Rec/Season,Rec Yds,Rec Yds/Season,HOF,HOF_Eligible
0,1,Emmitt Smith,18355,1990,2004,RB,15,1223.666667,175,11,164,11.666667,69,515,34.333333,3224,214.933333,1,1
1,2,Walter Payton,16726,1975,1987,RB,13,1286.615385,125,15,110,9.615385,70,492,37.846154,4538,349.076923,1,1
2,3,Frank Gore,15347,2005,2019,RB,15,1023.133333,97,18,79,6.466667,69,468,31.200000,3896,259.733333,0,0
3,4,Barry Sanders,15269,1989,1998,RB,10,1526.900000,109,10,99,10.900000,68,352,35.200000,2921,292.100000,1,1
4,5,Adrian Peterson,14216,2007,2019,RB,13,1093.538462,117,6,111,9.000000,73,289,22.230769,2365,181.923077,0,0
5,6,Curtis Martin,14101,1995,2005,RB,11,1281.909091,100,10,90,9.090909,71,484,44.000000,3329,302.636364,1,1
6,7,LaDainian Tomlinson,13684,2001,2011,RB,11,1244.000000,162,17,145,14.727273,70,624,56.727273,4772,433.818182,1,1
7,8,Jerome Bettis,13662,1993,2005,RB,13,1050.923077,94,3,91,7.230769,71,200,15.384615,1457,112.076923,1,1
8,9,Eric Dickerson,13259,1983,1993,RB,11,1205.363636,96,6,90,8.727273,75,281,25.545455,2137,194.272727,1,1
9,10,Tony Dorsett,12739,1977,1988,RB,12,1061.583333,90,13,77,7.500000,71,398,33.166667,3554,296.166667,1,1


**Now we split the data into 2 sets, HOF eligible and HOF ineligible. HOF eligible will be our training set, and we will then apply the model to the HOF ineligible players to predict future HOF players.**

In [9]:
dftrain = df[df.HOF_Eligible == 1]
dftest = df[df.HOF_Eligible == 0]

In [10]:
dftest

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Pos,Seasons,Yds/Season,TDs,Rec Tds,Rush Tds,TDs/Season,Height (In),Receptions,Rec/Season,Rec Yds,Rec Yds/Season,HOF,HOF_Eligible
2,3,Frank Gore,15347,2005,2019,RB,15,1023.133333,97,18,79,6.466667,69,468,31.2,3896,259.733333,0,0
4,5,Adrian Peterson,14216,2007,2019,RB,13,1093.538462,117,6,111,9.0,73,289,22.230769,2365,181.923077,0,0
17,18,Steven Jackson,11438,2004,2015,RB,12,953.166667,78,9,69,6.5,74,461,38.416667,3683,306.916667,0,0
21,22,LeSean McCoy,11071,2009,2019,RB,11,1006.454545,89,16,73,8.090909,71,503,45.727273,3797,345.181818,0,0
28,29,Marshawn Lynch,10413,2007,2019,RB,13,801.0,94,9,85,7.230769,71,287,22.076923,2214,170.307692,0,0
32,33,Matt Forte,9796,2008,2017,RB,10,979.6,75,21,54,7.5,74,554,55.4,4672,467.2,0,0
34,35,Chris Johnson,9651,2008,2017,RB,10,965.1,64,9,55,6.4,71,307,30.7,2255,225.5,0,0
47,48,DeAngelo Williams,8096,2006,2016,RB,11,736.0,70,9,61,6.363636,69,236,21.454545,2106,191.454545,0,0
55,56,Jamaal Charles,7563,2008,2018,RB,11,687.545455,64,20,44,5.818182,71,310,28.181818,2593,235.727273,0,0
58,59,Jonathan Stewart,7335,2008,2018,RB,11,666.818182,58,7,51,5.272727,70,162,14.727273,1295,117.727273,0,0


**Now we set up our Xtrain, ytrain, Xtest, and ytest datasets. We drop the non yearly stats from our dataframe: 'Rank', 'Player', 'Yds', 'First Year', 'Last Year', 'Pos', 'Seasons', 'TDs', 'Rec Tds', 'Rush Tds', 'Height (In)', 'Receptions', 'Rec/Season', 'HOF_Eligible', and 'HOF'. This leaves only 'yds/season', 'TDs/season', 'rec/season', and 'rec yds/season'.**

In [11]:
Xtrain = dftrain.drop(['Rank', 'Player', 'Yds', 'First Year', 'Last Year', 'Pos', 'Seasons', 'TDs', 'Rec Tds', 'Rush Tds', 'Height (In)', 'Receptions', 'Rec/Season', 'HOF_Eligible', 'HOF'], axis=1)
ytrain = dftrain['HOF']

Xtest = dftest.drop(['Rank', 'Player', 'Yds', 'First Year', 'Last Year', 'Pos', 'Seasons', 'TDs', 'Rec Tds', 'Rush Tds', 'Height (In)', 'Receptions', 'Rec/Season', 'HOF_Eligible', 'HOF'], axis=1)
ytest = dftest['HOF']

In [12]:
logistic_regression = LogisticRegression()

**Now we run our logistic regression for our datasets to predict the future HOF members.**

In [13]:
logistic_regression.fit(Xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
ypred = logistic_regression.predict(Xtest)

In [15]:
ypred

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

**Of the 17 ineligible players, three have been predicted to make it: Frank Gore, Adrian Peterson, and Marshawn Lynch. These are the same results as the other file, which predicts from full careeer stats. This is likely because the criteria to be included is being in the top 100 all time rushers, which is a career stat. This excludes anyone with great yearly stats, who hasn't established themselves in the top 100 just yet.**