In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

**The file titled wrstats.csv is also in the repository. It is a csv file of statistics that I compiled for the players. Edit the file path in the cell below when you run this file. The purpose of this cell is to import the relevant csv file and then create a dataframe with this data.**

In [2]:
df = pd.read_csv('/home/john/Downloads/wrstats.csv')
#https://www.pro-football-reference.com/leaders/rec_yds_career.htm #has been supplemented with additional data

**Here is the dataframe, simply titled 'df'. Now we drop the other positions besides WRs on the list who made the top 100 in receiving yards, which ends up being TEs and RBS.**

In [3]:
df

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Tm,Pos,Seasons,Yds/Season,TDs,TDs/Season,40 yd Dash,Height (In),Receptions,Rec/Season,HOF,HOF_Eligible
0,1,Jerry Rice,22895,1985,2004,3TM,WR,20,1144.750000,197,9.850000,4.71,74,1549,77.450000,Y,Y
1,2,Larry Fitzgerald,17083,2004,2019,crd,WR,16,1067.687500,120,7.500000,4.63,75,1378,86.125000,N,N
2,3,Terrell Owens,15934,1996,2010,5TM,WR,15,1062.266667,153,10.200000,4.45,75,1078,71.866667,Y,Y
3,4,Randy Moss,15292,1998,2012,5TM,WR,14,1092.285714,156,11.142857,4.25,76,982,70.142857,Y,Y
4,5,Isaac Bruce,15208,1994,2009,2TM,WR,16,950.500000,91,5.687500,4.48,72,1024,64.000000,Y,Y
5,6,Tony Gonzalez,15127,1997,2013,2TM,TE,17,889.823529,111,6.529412,4.49,77,1325,77.941176,Y,Y
6,7,Tim Brown,14934,1988,2004,2TM,WR,17,878.470588,100,5.882353,4.39,72,1094,64.352941,Y,Y
7,8,Steve Smith,14731,2001,2016,2TM,WR,16,920.687500,81,5.062500,4.39,69,1031,64.437500,N,N
8,9,Marvin Harrison,14580,1996,2008,clt,WR,13,1121.538462,128,9.846154,4.38,72,1102,84.769231,Y,Y
9,10,Reggie Wayne,14345,2001,2014,clt,WR,14,1024.642857,82,5.857143,4.45,72,1070,76.428571,N,Y


**Now we begin to massage the data. We drop the teams the players played for, as we are not planning to use it in this analysis. We also drop players who are in the top 100 that are not running backs.**

In [4]:
df.drop(['Tm'], axis=1, inplace=True)

df.drop([5, 18, 28, 47, 82, 83, 101, 102, 103, 107, 108], axis=0, inplace=True)

**Next we reset the index, so that the ranking numbers are continuous again.**

In [5]:
df = df.reset_index()

**Resetting the index has included a new column 'index', which we do not need, so we are going to drop it. After this, we change Ys and Ns in the HOF category to 1s and 0s, which are better suited for the logistic regression we plan to apply later.**

In [6]:
df.drop(['index'], axis=1, inplace=True)

In [7]:
df['HOF'] = df['HOF'].replace(['Y'], 1)
df['HOF'] = df['HOF'].replace(['N'], 0)
df['HOF_Eligible'] = df['HOF_Eligible'].replace(['Y'], 1)
df['HOF_Eligible'] = df['HOF_Eligible'].replace(['N'], 0)

In [8]:
df

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Pos,Seasons,Yds/Season,TDs,TDs/Season,40 yd Dash,Height (In),Receptions,Rec/Season,HOF,HOF_Eligible
0,1,Jerry Rice,22895,1985,2004,WR,20,1144.750000,197,9.850000,4.71,74,1549,77.450000,1,1
1,2,Larry Fitzgerald,17083,2004,2019,WR,16,1067.687500,120,7.500000,4.63,75,1378,86.125000,0,0
2,3,Terrell Owens,15934,1996,2010,WR,15,1062.266667,153,10.200000,4.45,75,1078,71.866667,1,1
3,4,Randy Moss,15292,1998,2012,WR,14,1092.285714,156,11.142857,4.25,76,982,70.142857,1,1
4,5,Isaac Bruce,15208,1994,2009,WR,16,950.500000,91,5.687500,4.48,72,1024,64.000000,1,1
5,7,Tim Brown,14934,1988,2004,WR,17,878.470588,100,5.882353,4.39,72,1094,64.352941,1,1
6,8,Steve Smith,14731,2001,2016,WR,16,920.687500,81,5.062500,4.39,69,1031,64.437500,0,0
7,9,Marvin Harrison,14580,1996,2008,WR,13,1121.538462,128,9.846154,4.38,72,1102,84.769231,1,1
8,10,Reggie Wayne,14345,2001,2014,WR,14,1024.642857,82,5.857143,4.45,72,1070,76.428571,0,1
9,11,Andre Johnson,14185,2003,2016,WR,14,1013.214286,70,5.000000,4.41,75,1062,75.857143,0,0


**Now we split the data into 2 sets, HOF eligible and HOF ineligible. HOF eligible will be our training set, and we will then apply the model to the HOF ineligible players to predict future HOF players.**

In [9]:
dftrain = df[df.HOF_Eligible == 1]
dftest = df[df.HOF_Eligible == 0]

In [10]:
dftest

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Pos,Seasons,Yds/Season,TDs,TDs/Season,40 yd Dash,Height (In),Receptions,Rec/Season,HOF,HOF_Eligible
1,2,Larry Fitzgerald,17083,2004,2019,WR,16,1067.6875,120,7.5,4.63,75,1378,86.125,0,0
6,8,Steve Smith,14731,2001,2016,WR,16,920.6875,81,5.0625,4.39,69,1031,64.4375,0,0
9,11,Andre Johnson,14185,2003,2016,WR,14,1013.214286,70,5.0,4.41,75,1062,75.857143,0,0
12,14,Anquan Boldin,13779,2003,2016,WR,14,984.214286,82,5.857143,4.71,73,1076,76.857143,0,0
19,22,Brandon Marshall,12351,2006,2018,WR,13,950.076923,83,6.384615,4.52,77,970,74.615385,0,0
22,25,Julio Jones,12125,2011,2019,WR,9,1347.222222,57,6.333333,4.39,75,797,88.555556,0,0
27,31,Calvin Johnson,11619,2007,2015,WR,9,1291.0,83,9.222222,4.35,77,731,81.222222,0,0
31,35,Antonio Brown,11263,2010,2019,WR,10,1126.3,80,8.0,4.47,70,841,84.1,0,0
34,38,Roddy White,10863,2005,2015,WR,11,987.545455,63,5.727273,4.45,72,808,73.454545,0,0
38,42,DeSean Jackson,10420,2008,2019,WR,12,868.333333,55,4.583333,4.35,70,598,49.833333,0,0


In [11]:
dftrain

Unnamed: 0,Rank,Player,Yds,First Year,Last Year,Pos,Seasons,Yds/Season,TDs,TDs/Season,40 yd Dash,Height (In),Receptions,Rec/Season,HOF,HOF_Eligible
0,1,Jerry Rice,22895,1985,2004,WR,20,1144.750000,197,9.850000,4.71,74,1549,77.450000,1,1
2,3,Terrell Owens,15934,1996,2010,WR,15,1062.266667,153,10.200000,4.45,75,1078,71.866667,1,1
3,4,Randy Moss,15292,1998,2012,WR,14,1092.285714,156,11.142857,4.25,76,982,70.142857,1,1
4,5,Isaac Bruce,15208,1994,2009,WR,16,950.500000,91,5.687500,4.48,72,1024,64.000000,1,1
5,7,Tim Brown,14934,1988,2004,WR,17,878.470588,100,5.882353,4.39,72,1094,64.352941,1,1
7,9,Marvin Harrison,14580,1996,2008,WR,13,1121.538462,128,9.846154,4.38,72,1102,84.769231,1,1
8,10,Reggie Wayne,14345,2001,2014,WR,14,1024.642857,82,5.857143,4.45,72,1070,76.428571,0,1
10,12,James Lofton,14004,1978,1993,WR,16,875.250000,75,4.687500,4.29,75,764,47.750000,1,1
11,13,Cris Carter,13899,1987,2002,WR,16,868.687500,130,8.125000,4.63,75,1101,68.812500,1,1
13,15,Henry Ellard,13777,1983,1998,WR,16,861.062500,65,4.062500,4.40,71,814,50.875000,0,1


**Now we set up our Xtrain, ytrain, Xtest, and ytest datasets. We drop from our dataframe 'Rank', 'Player', 'First Year', 'Pos', '40 yd Dash', 'HOF_Eligible', and 'HOF'. This leaves 'yds', 'last year', 'seasons', 'yds/season', 'TDs', 'TDs/season', 'height', 'rec', and 'rec/season' to train from.**

In [12]:
Xtrain = dftrain.drop(['Rank', 'Player', 'First Year', 'Pos', '40 yd Dash', 'HOF_Eligible', 'HOF'], axis=1)
ytrain = dftrain['HOF']

Xtest = dftest.drop(['Rank', 'Player', 'First Year', 'Pos', '40 yd Dash', 'HOF_Eligible', 'HOF'], axis=1)
ytest = dftest['HOF']

In [13]:
logistic_regression = LogisticRegression()

**Now we run our logistic regression for our datasets to predict the future HOF members.**

In [14]:
logistic_regression.fit(Xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
ypred = logistic_regression.predict(Xtest)

In [16]:
ypred

array([1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

**Of the 23 ineligible players, three have been predicted to make it: Larry Fitzgerald, Brandon Marshall, and Calvin Johnson.**