In [2]:
import pandas as pd
import pipeline as p
import datetime as dt

%load_ext autoreload
%autoreload 2

In [19]:
df = p.open_pkl('Data/merged_df.pkl')

## Pre-merge cleaning

### Convert Stats team abbrevs to Elo abbrevs

In [5]:
elo_stats_team_dict = p.open_pkl('TeamDicts/elo_stats_team_dict.pkl')
corr_dict = p.open_pkl('TeamDicts/moved_teams_corr_dict.pkl')
full_name_dict = p.open_pkl('TeamDicts/full_name_team_dict.pkl')

# change the abbreviations of teams that moved in 2018
full_name_dict['San Diego Chargers'] = 'LAC'
full_name_dict['St. Louis Rams'] = 'LAR'

# create reverse-lookup of elo_stats
stats_elo_team_dict = {v:k for k,v in elo_stats_team_dict.items()}
stats_elo_team_dict['SDG'] = 'LAC'
stats_elo_team_dict['STL'] = 'LAR'

In [20]:
# stats_df['Team'] = stats_df['Team'].apply(lambda x: corr_dict[x] if x in corr_dict.keys() else x)
df['Opp'] = df['Opp'].apply(lambda x: full_name_dict[x])
df['Team'] = df['Team'].apply(lambda x: stats_elo_team_dict[x] if x in stats_elo_team_dict else x)
df['Opp'] = df['Opp'].apply(lambda x: stats_elo_team_dict[x] if x in stats_elo_team_dict else x)

### Clean up datatypes

In [21]:
# sort by date. Change name of 'Year' to 'Season' to avoid confusion
df.sort_values('Date', inplace=True)
df.reset_index(inplace=True)
df.rename(columns={'Year':'Season'}, inplace=True)

In [65]:
# Convert to integers:

# Location: 1 = Home, 0 = Away
df['Location'] = df['Location'].apply(lambda x: 1 if x == '@' else 0)

In [15]:
# OT: 1 = OT occured. 0 = no OT. 
df['OT'] = df['OT'].apply(lambda x: 1 if x == 'OT' else 0)

# Result: 1 = Win, 0 = Loss
df['Result'] = df['Result'].apply(lambda x: 1 if x == 'W' else (0 if x == 'L' else 0.5))

In [25]:
# Time of Possession
def convert_timeposs(row):
    return row.minute + (row.second/60)

df['TimePoss'] = pd.to_datetime(df['Time of Possession'], format='%M:%S').dt.time
# df.drop(columns={'Time of Possession'}, inplace=True)

df['TimePossMins'] = df['TimePoss'].apply(lambda x: convert_timeposs(x))

df.drop(columns=['TimePoss', 'Time of Possession'], inplace=True)

### Extract / Create features

In [36]:
df.dropna(subset=['Record'], inplace=True)

df['Wins'] = df['Record'].apply(lambda row: p.row_split(row, 0))
df['Losses'] = df['Record'].apply(lambda row: p.row_split(row, 1))
df['Games'] = df['Wins'] + df['Losses']
df['Win%'] = df['Wins']/df['Games'] * 100

In [45]:
# shift wins, losses, games, win%
def correct_winloss(row, col):
    if row['Week'] == 1:
        return 0
    else:
        return row[col]

df['Wins'] = df.groupby(['Team'])['Wins'].shift()
df['Losses'] = df.groupby(['Team'])['Losses'].shift()
df['Games'] = df.groupby(['Team'])['Games'].shift()
df['Win%'] = df.groupby(['Team'])['Win%'].shift()

df['Wins'].fillna(0, inplace=True)
df['Losses'].fillna(0, inplace=True)
df['Games'].fillna(0, inplace=True)
df['Win%'].fillna(0, inplace=True)

df['Wins'] = df.apply(lambda row: correct_winloss(row, 'Wins'), axis=1)
df['Losses'] = df.apply(lambda row: correct_winloss(row, 'Losses'), axis=1)
df['Games'] = df.apply(lambda row: correct_winloss(row, 'Games'), axis=1)
df['Win%'] = df.apply(lambda row: correct_winloss(row, 'Win%'), axis=1)

In [38]:
# Passer stats
df['PassComp%'] = df['PassCmp'] / df['PassAtt'] * 100

def passer_rating(row):
    a = (row['PassCmp']/row['PassAtt'] - 0.3) * 5
    b = (row['PassY']/row['PassAtt'] - 3) * 0.25
    c = row['PassTDs']/row['PassAtt'] * 20
    d = 2.375 - (row['INT']/row['PassAtt'] * 25)
    return (a+b+c+d)/6 * 100

df['PassRating'] = df.apply(lambda row: passer_rating(row), axis=1)

In [40]:
df.drop(columns='index', inplace = True)
df.rename(columns={'neutral':'Neutral'}, inplace=True)

In [49]:
p.pkl_this('mreged_df_cleaned.pkl', df)

In [56]:
df.head()

Unnamed: 0,1stD,3rdDAtt,3rdDConv,4thDAtt,Date,Day,DefTO,Fumbles,INT,Location,...,TotY,Week,Season,TimePossMins,Wins,Losses,Games,Win%,PassComp%,PassRating
0,18.0,15.0,6.0,1.0,2007-09-06,Thu,1.0,2.0,2.0,@,...,293.0,1,2007,31.75,0.0,0.0,0.0,0.0,68.292683,57.672764
1,20.0,11.0,4.0,0.0,2007-09-06,Thu,3.0,1.0,0.0,,...,452.0,1,2007,28.25,0.0,0.0,0.0,0.0,60.0,125.416667
2,22.0,14.0,7.0,2.0,2007-09-09,Sun,2.0,1.0,1.0,@,...,438.0,1,2007,32.05,0.0,0.0,0.0,0.0,65.909091,107.575758
3,21.0,13.0,6.0,0.0,2007-09-09,Sun,3.0,0.0,2.0,@,...,392.0,1,2007,26.9,0.0,0.0,0.0,0.0,75.0,102.083333
4,14.0,11.0,3.0,1.0,2007-09-09,Sun,2.0,3.0,2.0,@,...,219.0,1,2007,27.166667,0.0,0.0,0.0,0.0,66.666667,50.94697


### Compute trailing season average stats

In [51]:
sa_cols_drop_true = ['1stD', 'PassY', 'PtsOpp',
       'PtsTm', 'RushY', 'TO', 'DefTO', 
       'TotY', 'RushAtt', 'RushTDs', 'PassCmp', 'PassAtt',
       'PassTDs', 'INT', 'SacksO', 'Fumbles', 'Penalies', 'PenY', '3rdDConv',
       '3rdDAtt', '4thDAtt',  
       'PassComp%',
       'PassRating']
# sa_cols_drop_false = ['EloBefore', 'OppElo']

In [66]:
stats_df = df.copy()

In [67]:
for col in sa_cols_drop_true:
    stats_df = p.calc_season_avg(col, stats_df, drop=True)

# for col in sa_cols_drop_false:
#     stats_df = p.calc_season_avg(col, stats_df, drop=False)  

In [68]:
stats_df.head()

Unnamed: 0,Date,Day,Location,OT,Opp,Record,Result,Team,Time,Week,...,INT_SA,SacksO_SA,Fumbles_SA,Penalies_SA,PenY_SA,3rdDConv_SA,3rdDAtt_SA,4thDAtt_SA,PassComp%_SA,PassRating_SA
0,2007-09-06,Thu,1,,IND,0-1,0.0,NO,20:39:00,1,...,,,,,,,,,,
1,2007-09-06,Thu,0,,NO,1-0,1.0,IND,20:39:00,1,...,,,,,,,,,,
2,2007-09-09,Sun,1,,DAL,0-1,0.0,NYG,20:25:00,1,...,,,,,,,,,,
3,2007-09-09,Sun,1,,OAK,1-0,1.0,DET,16:15:00,1,...,,,,,,,,,,
4,2007-09-09,Sun,1,,HOU,0-1,0.0,KC,13:05:00,1,...,,,,,,,,,,


In [69]:
stats_df.loc[0]

Date             2007-09-06 00:00:00
Day                              Thu
Location                           1
OT                               NaN
Opp                              IND
Record                           0-1
Result                             0
Team                              NO
Time                        20:39:00
Week                               1
Season                          2007
TimePossMins                   31.75
Wins                               0
Losses                             0
Games                              0
Win%                               0
1stD_SA                          NaN
PassY_SA                         NaN
PtsOpp_SA                        NaN
PtsTm_SA                         NaN
RushY_SA                         NaN
TO_SA                            NaN
DefTO_SA                         NaN
TotY_SA                          NaN
RushAtt_SA                       NaN
RushTDs_SA                       NaN
PassCmp_SA                       NaN
P

Week 1 SA are blank. Pull in previous season's stats? Or do we need to revert them to an average?

### Merge with Elo ratings

Use five-thirty-eight's Elo data

In [70]:
elo_df = p.open_pkl('Data/elo_df.pkl')

In [71]:
elo_df.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,score1,score2
0,2007-09-06,2007,0,,IND,NO,1653.923,1515.973,0.762833,0.237167,1668.974,1500.922,41.0,10.0
1,2007-09-09,2007,0,,DAL,NYG,1490.892,1494.06,0.588056,0.411944,1510.108,1474.844,45.0,35.0
2,2007-09-09,2007,0,,LAR,CAR,1478.632,1511.942,0.54548,0.45452,1448.656,1541.917,13.0,27.0
3,2007-09-09,2007,0,,LAC,CHI,1646.204,1574.744,0.68687,0.31313,1660.857,1560.09,14.0,3.0
4,2007-09-09,2007,0,,WSH,MIA,1448.841,1489.941,0.534341,0.465659,1461.613,1477.169,16.0,13.0


In [245]:
elo_df[(elo_df['team1'] == 'NO') | (elo_df['team2'] == 'NO')][['date','team1','team2','elo1','elo2']].head()

Unnamed: 0,date,team1,team2,elo1,elo2
0,2007-09-06,IND,NO,1653.923,1515.973
28,2007-09-16,TB,NO,1402.979,1500.922
47,2007-09-24,NO,TEN,1468.805,1506.098
74,2007-10-07,NO,CAR,1437.203,1484.207
85,2007-10-14,SEA,NO,1531.254,1422.503


Shift Elos to compute delta

In [72]:
elo_append_cols_h = ['date','season','neutral','playoff','team1','elo1','score1']
elo_append_cols_a = ['date','season','neutral','playoff','team2','elo2','score2']

elo_df_h = elo_df[elo_append_cols_h]
elo_df_a = elo_df[elo_append_cols_a]

elo_df_h.rename(columns={'elo1':'elo', 'team1':'team','score1':'score'}, inplace=True)
elo_df_a.rename(columns={'elo2':'elo', 'team2':'team','score2':'score'}, inplace=True)

KeyError: "['elo1'] not in index"

In [286]:
elo_df_h.head()

Unnamed: 0,date,season,neutral,playoff,team,elo,score
0,2007-09-06,2007,0,0,IND,1653.923,41
1,2007-09-09,2007,0,0,WSH,1448.841,16
2,2007-09-09,2007,0,0,BUF,1516.684,14
3,2007-09-09,2007,0,0,SEA,1511.936,20
4,2007-09-09,2007,0,0,CLE,1396.563,7


In [287]:
elo_df_a.head()

Unnamed: 0,date,season,neutral,playoff,team,elo,score
0,2007-09-06,2007,0,0,NO,1515.973,10
1,2007-09-09,2007,0,0,MIA,1489.941,13
2,2007-09-09,2007,0,0,DEN,1558.574,15
3,2007-09-09,2007,0,0,TB,1417.385,6
4,2007-09-09,2007,0,0,PIT,1568.569,34


In [291]:
elo_df_flat = elo_df_h.append(elo_df_a, ignore_index=True)

In [294]:
elo_df_flat.sort_values('date', inplace=True)
elo_df_flat.reset_index(inplace=True, drop=True)

In [296]:
elo_df_flat.drop(columns='index',inplace=True)

In [298]:
elo_df_flat['elo_after'] = elo_df_flat.groupby('team')['elo'].shift(-1)
elo_df_flat['elo_delta'] = elo_df_flat['elo_after'] - elo_df_flat['elo']

In [303]:
elo_df_flat.head()

Unnamed: 0,date,season,neutral,playoff,team,elo,score,elo_after,elo_delta
0,2007-09-06,2007,0,0,IND,1653.923,41,1668.974,15.051
1,2007-09-06,2007,0,0,NO,1515.973,10,1500.922,-15.051
2,2007-09-09,2007,0,0,CAR,1511.942,27,1541.917,29.975
3,2007-09-09,2007,0,0,CHI,1574.744,3,1560.09,-14.654
4,2007-09-09,2007,0,0,DET,1388.019,36,1418.35,30.331


In [305]:
elo_df_flat_for_merge = elo_df_flat[['date','team','elo_delta']]

In [302]:
elo_df.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,2007-09-06,2007,0,0,IND,NO,1653.923,1515.973,0.762833,41,10,1.0
1,2007-09-09,2007,0,0,WSH,MIA,1448.841,1489.941,0.534341,16,13,1.0
2,2007-09-09,2007,0,0,BUF,DEN,1516.684,1558.574,0.533209,14,15,0.0
3,2007-09-09,2007,0,0,SEA,TB,1511.936,1417.385,0.714726,20,6,1.0
4,2007-09-09,2007,0,0,CLE,PIT,1396.563,1568.569,0.350697,7,34,0.0


In [308]:
elo_df = pd.merge(elo_df, elo_df_flat_for_merge, left_on=['date','team1'], right_on=['date','team'])

In [None]:
elo_df.drop(columns='team', inplace=True)`
elo_df.rename(columns={'elo_delta':'elo_delta1'}, inplace=True)

In [313]:
elo_df.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1,elo_delta1
0,2007-09-06,2007,0,0,IND,NO,1653.923,1515.973,0.762833,41,10,1.0,15.051
1,2007-09-09,2007,0,0,WSH,MIA,1448.841,1489.941,0.534341,16,13,1.0,12.772
2,2007-09-09,2007,0,0,BUF,DEN,1516.684,1558.574,0.533209,14,15,0.0,-7.471
3,2007-09-09,2007,0,0,SEA,TB,1511.936,1417.385,0.714726,20,6,1.0,14.406
4,2007-09-09,2007,0,0,CLE,PIT,1396.563,1568.569,0.350697,7,34,0.0,-22.287


In [353]:
df2 = pd.merge(elo_df, stats_df, right_on=['Team','Date'], left_on=['team1','date'])
df2 = pd.merge(df2, stats_df, right_on=['Team','Date'], left_on=['team2','date'], suffixes=('1','2'))

In [344]:
list(df2.columns)

['date',
 'season',
 'neutral',
 'playoff',
 'team1',
 'team2',
 'elo1',
 'elo2',
 'elo_prob1',
 'score1',
 'score2',
 'result1',
 'elo_delta1',
 'Date1',
 'Day1',
 'Location1',
 'OT1',
 'Opp1',
 'Record1',
 'Result1',
 'Team1',
 'Time1',
 'Week1',
 'Season1',
 'Wins1',
 'Losses1',
 'Games1',
 'Win%1',
 'TimePossMins1',
 '1stD_SA1',
 'PassY_SA1',
 'PtsOpp_SA1',
 'PtsTm_SA1',
 'RushY_SA1',
 'TO_SA1',
 'DefTO_SA1',
 'TotY_SA1',
 'RushAtt_SA1',
 'RushTDs_SA1',
 'PassCmp_SA1',
 'PassAtt_SA1',
 'PassTDs_SA1',
 'INT_SA1',
 'SacksO_SA1',
 'Fumbles_SA1',
 'Penalies_SA1',
 'PenY_SA1',
 '3rdDConv_SA1',
 '3rdDAtt_SA1',
 '4thDAtt_SA1',
 'PassComp%_SA1',
 'PassRating_SA1',
 'Date2',
 'Day2',
 'Location2',
 'OT2',
 'Opp2',
 'Record2',
 'Result2',
 'Team2',
 'Time2',
 'Week2',
 'Season2',
 'Wins2',
 'Losses2',
 'Games2',
 'Win%2',
 'TimePossMins2',
 '1stD_SA2',
 'PassY_SA2',
 'PtsOpp_SA2',
 'PtsTm_SA2',
 'RushY_SA2',
 'TO_SA2',
 'DefTO_SA2',
 'TotY_SA2',
 'RushAtt_SA2',
 'RushTDs_SA2',
 'PassCmp_SA2'

In [354]:
cols_to_drop = [
 'Date1',
 'Day1',
 'Location1',
 'Opp1',
 'Record1',
 'Result1',
 'Team1',
 'Season1',
 'Wins1',
 'Losses1',
 'Games1',
 'Date2',
 'Day2',
 'Location2',
 'OT2',
 'Opp2',
 'Record2',
 'Result2',
 'Team2',
 'Time2',
 'Week2',
 'Season2',
 'Wins2',
 'Losses2',
 'Games2']

In [355]:
df2.rename(columns={'OT1':'OT','Week1':'week','Time1':'time'}, inplace=True)

In [357]:
df2.drop(columns=cols_to_drop, inplace=True)

In [360]:
df2.shape

(2929, 66)

In [359]:
list(df2.columns)

['date',
 'season',
 'neutral',
 'playoff',
 'team1',
 'team2',
 'elo1',
 'elo2',
 'elo_prob1',
 'score1',
 'score2',
 'result1',
 'elo_delta1',
 'OT',
 'time',
 'week',
 'Win%1',
 'TimePossMins1',
 '1stD_SA1',
 'PassY_SA1',
 'PtsOpp_SA1',
 'PtsTm_SA1',
 'RushY_SA1',
 'TO_SA1',
 'DefTO_SA1',
 'TotY_SA1',
 'RushAtt_SA1',
 'RushTDs_SA1',
 'PassCmp_SA1',
 'PassAtt_SA1',
 'PassTDs_SA1',
 'INT_SA1',
 'SacksO_SA1',
 'Fumbles_SA1',
 'Penalies_SA1',
 'PenY_SA1',
 '3rdDConv_SA1',
 '3rdDAtt_SA1',
 '4thDAtt_SA1',
 'PassComp%_SA1',
 'PassRating_SA1',
 'Win%2',
 'TimePossMins2',
 '1stD_SA2',
 'PassY_SA2',
 'PtsOpp_SA2',
 'PtsTm_SA2',
 'RushY_SA2',
 'TO_SA2',
 'DefTO_SA2',
 'TotY_SA2',
 'RushAtt_SA2',
 'RushTDs_SA2',
 'PassCmp_SA2',
 'PassAtt_SA2',
 'PassTDs_SA2',
 'INT_SA2',
 'SacksO_SA2',
 'Fumbles_SA2',
 'Penalies_SA2',
 'PenY_SA2',
 '3rdDConv_SA2',
 '3rdDAtt_SA2',
 '4thDAtt_SA2',
 'PassComp%_SA2',
 'PassRating_SA2']

In [363]:
p.pkl_this('Data/merged_by_game_df.pkl', df2)

## Now, fit

In [362]:
df2.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,...,INT_SA2,SacksO_SA2,Fumbles_SA2,Penalies_SA2,PenY_SA2,3rdDConv_SA2,3rdDAtt_SA2,4thDAtt_SA2,PassComp%_SA2,PassRating_SA2
0,2007-09-06,2007,0,0,IND,NO,1653.923,1515.973,0.762833,41,...,,,,,,,,,,
1,2007-09-09,2007,0,0,WSH,MIA,1448.841,1489.941,0.534341,16,...,,,,,,,,,,
2,2007-09-09,2007,0,0,BUF,DEN,1516.684,1558.574,0.533209,14,...,,,,,,,,,,
3,2007-09-09,2007,0,0,SEA,TB,1511.936,1417.385,0.714726,20,...,,,,,,,,,,
4,2007-09-09,2007,0,0,CLE,PIT,1396.563,1568.569,0.350697,7,...,,,,,,,,,,
