In [14]:
import numpy as np
import pandas as pd

In [15]:
# average rookie contract is 4
years_later = 2

drafts = pd.read_csv(
    "drafts.csv")
rosters = pd.read_csv(
    "rosters.csv")

# removes drafts data from before 2006
drafts.drop(
    drafts.index[drafts['season'] < 2006], inplace=True)
drafts.drop(
    drafts.index[drafts['season'] >= 2019-5], inplace=True)




In [16]:
players = {2006: [], 2007: [], 2008: [], 2009: [], 2010: [],
           2011: [], 2012: [], 2013: [], 2014: [], 2015: [], 2016: [],  2017: [], 2018: [], 2019: []}

for index, row in rosters.iterrows():

    isPlaying = row['playerid'] not in drafts['pfr_id'].values
    for i in players.values():
        if row['playerid'] in i:
            isPlaying = True
            break

    if not isPlaying:



        drow = drafts.loc[(drafts['pfr_id'] ==
                           row['playerid']) & ((drafts['season'] + 5) == row['season'])]
        if drow.empty:
            continue

        players[int(drow['season']) + 5].append(row['playerid'])

In [17]:
# Matches abbreviations with the proper column
# Some teams have changed location over the years, their current abbreviation is used in column names
teams_dict = {
    "BUF": "BUF", "MIA": "MIA", "ATL": "ATL", "PIT": "PIT", "BAL": "BAL", "HOU": "HOU",
    "TB": "TB", "WAS": "WAS", "TEN": "TEN", "CAR": "CAR", "NYJ": "NYJ", "CLE": "CLE", "CIN": "CIN",
    "STL": "LAR", "LAR": "LAR", "LA": "LAR", "OAK": "LV", "LV": "LV", "LAC": "LAC", "SD": "LAC",
    "NYG": "NYG", "DEN": "DEN", "ARI": "ARI", "IND": "IND", "DET": "DET", "MIN": "MIN", "KC": "KC",
    "CAR": "CAR", "DAL": "DAL", "CHI": "CHI", "GB": "GB", "NE": "NE", "SF": "SF", "NO": "NO", "PHI": "PHI",
    "SEA": "SEA", "JAX": "JAX"
}

# For initializing column names
categories_list = ['DL', 'RB', 'QB', 'OL', 'LB', 'TE', 'DB', 'WR', 'K', 'P']
positions_list = ['DE', 'RB', 'QB', 'T', 'LB', 'TE', 'DB', 'DT', 'G', 'WR', 'C', 'K', 'NT', 'P', 'OL', 'DL'] # T is offensive tackle, G is guard, NT is nose tackle



# Matches possible side values with the proper column
side_dict = {
    "O": "is_offense", "D": "is_defense", "S": "is_special_teams"
}


In [18]:
new_cols = ['is_offense', 'is_defense', 'is_special_teams']

for year in range(years_later): # Columns that change value each year
    new_cols.append('games_year_{}'.format(year+1))
    new_cols.append('starts_year_{}'.format(year+1))
    new_cols.append('value_year_{}'.format(year+1))
    # Each team value is stored as a 1 or 0 (1 means the player is on that team's roster)
    for team in teams_dict.values():
        if 'on_{}_year_{}'.format(team, year+1) not in new_cols:
            new_cols.append('on_{}_year_{}'.format(team, year+1)) 

for team in teams_dict.values():
    if 'drafted_by_{}'.format(team) not in new_cols:
        new_cols.append('drafted_by_{}'.format(team))
# Creates a column name for each possible team value
for position in positions_list:
    if 'position_{}'.format(position) not in new_cols:
        new_cols.append('position_{}'.format(position))
for category in categories_list:
    if 'category_{}'.format(category) not in new_cols:
        new_cols.append('category_{}'.format(category))

In [19]:
# Using the drafts .csv as a base for a dataframe of player profiles each year
player_profiles = drafts


# look up pd.explode
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

for col in (new_cols + ['5_years_later']):
    player_profiles[col] = [0 for x in range(len(player_profiles))]




In [20]:
display(rosters)

Unnamed: 0.1,Unnamed: 0,season,team,playerid,full_name,name,side,category,position,games,starts,years,av
0,1,2006,ARI,LeinMa00,Matt Leinart,M.Leinart,O,QB,QB,12.0,11.0,0,8.0
1,2,2006,ARI,LewiJo22,Jonathan Lewis,J.Lewis,,,,4.0,0.0,0,0.0
2,3,2006,ARI,LiwiCh20,Chris Liwienski,C.Liwienski,O,OL,LG,16.0,6.0,8,4.0
3,4,2006,ARI,ArriJ.00,J.J. Arrington,J.Arrington,,,,16.0,0.0,1,2.0
4,5,2006,ARI,LutuDe20,Deuce Lutui,D.Lutui,O,OL,RG,15.0,9.0,0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28612,28613,2019,WAS,IoanMa01,Matthew Ioannidis,M.Ioannidis,D,DL,DT,,,3,
28613,28614,2019,WAS,AlexAd00,Adonis Alexander,A.Alexander,,,,,,1,
28614,28615,2019,WAS,BergTo00,Tony Bergstrom,T.Bergstrom,O,OL,OT,,,7,
28615,28616,2019,WAS,WoodJo01,Josh Woodrum,J.Woodrum,O,QB,QB,,,1,


In [21]:
for index, row in player_profiles.iterrows():
    for year in range(years_later):
        
        # Retreives a player's data for a specific year
        temp = rosters
        temp = temp[temp['playerid'] == (row['pfr_id'])]
        temp = temp[temp['season'] == (row['season'] + year)]
        
        if temp.empty:
            continue
            
        player_profiles.at[index, 'on_{}_year_{}'.format(teams_dict[temp['team'].values[0]], year+1)] = 1
        
        player_profiles.at[index, 'games_year_{}'.format(year+1)] = (temp['games'].values[0] if temp['games'].values[0] != "NaN" else 0)
        player_profiles.at[index, 'starts_year_{}'.format(year+1)] = (temp['starts'].values[0] if temp['starts'].values[0] != "NaN" else 0)        
        player_profiles.at[index, 'value_year_{}'.format(year+1)] = (temp['av'].values[0] if temp['av'].values[0] != "NaN" else 0)
    
    # Checks if a player is on a roster after 5 seasons
    if players[row['season'] + 5].__contains__(row['pfr_id']):
        player_profiles.at[index, '5_years_later'] = 1

    
    player_profiles.at[index, side_dict[row['side']]] = 1
    player_profiles.at[index, 'category_{}'.format(row['category'])] = 1
    player_profiles.at[index, 'position_{}'.format(row['position'])] = 1

    player_profiles.at[index, 'drafted_by_{}'.format(teams_dict[row['team']])] = 1

# Fills all remaining null values with 0
player_profiles.fillna(0, inplace=True)



In [22]:
display(player_profiles)


Unnamed: 0.1,Unnamed: 0,season,team,round,pick,pfr_id,pfr_name,player_id,side,category,...,category_RB,category_QB,category_OL,category_LB,category_TE,category_DB,category_WR,category_K,category_P,5_years_later
7557,7558,2006,HOU,1,1,WillMa22,Mario Williams,0,D,DL,...,0,0,0,0,0,0,0,0,0,1
7558,7559,2006,NO,1,2,BushRe00,Reggie Bush,0,O,RB,...,1,0,0,0,0,0,0,0,0,1
7559,7560,2006,TEN,1,3,YounVi00,Vince Young,00-0024218,O,QB,...,0,1,0,0,0,0,0,0,0,1
7560,7561,2006,NYJ,1,4,FergDB20,D'Brickashaw Ferguson,0,O,OL,...,0,0,1,0,0,0,0,0,0,1
7561,7562,2006,GB,1,5,HawkA.20,A.J. Hawk,0,D,LB,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9586,9587,2013,MIA,7,250,JoneDo02,Don Jones,0,D,DB,...,0,0,0,0,0,1,0,0,0,0
9587,9588,2013,CIN,7,251,JohnTJ00,T.J. Johnson,0,O,OL,...,0,0,1,0,0,0,0,0,0,0
9588,9589,2013,SF,7,252,CoopMa00,Marcus Cooper,0,D,DB,...,0,0,0,0,0,1,0,0,0,1
9589,9590,2013,NYG,7,253,CoxxMi00,Michael Cox,0,O,RB,...,1,0,0,0,0,0,0,0,0,0


In [23]:
# Logistic regression model without cross validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

feature_cols = ['pick'] + new_cols


X = player_profiles[feature_cols]  # Features
y = player_profiles['5_years_later']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=16)

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16, max_iter=2000)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(y_pred)


[1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 1 0 1
 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 1 1 1 0 1 0 0 0 1 0 1 1
 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1
 0 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0 1 0 0 0 1 1 0
 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 1
 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1
 1 0 1 1 0 0 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 1 1 1 1 1 1
 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1
 0 1 1 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 1
 1 1 1 1 1 1 1 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 1
 1 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0
 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1 0
 0 0 0 0 0 0 1 1 0 1 1 0 

In [24]:
from sklearn import metrics 

score = metrics.accuracy_score(y_test,y_pred)


auc = metrics.roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1])

# position and category did absolutely nothing to the auc or k-fold score, brought down the normal score
print("accuracy score: {}\nauc: {}".format(score, auc))


accuracy score: 0.7151277013752456
auc: 0.795995670995671


In [25]:
# Logistic regression model using KFold cross examination
from sklearn.model_selection import KFold, cross_val_score

model = LogisticRegression(solver='liblinear')

X = player_profiles[feature_cols]
y = player_profiles['5_years_later']

kf = KFold(n_splits=4)
acc_score = []

result = cross_val_score(model , X, y, cv = kf)


print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: 0.7350022430889656


In [26]:
import seaborn as sns

sns.regplot(x=X, y=y, data=player_profiles, logistic=True)


ValueError: regplot inputs must be 1d