In [1]:
import requests
import numpy as np
import re
from bs4 import BeautifulSoup
import pandas as pd
from collections import defaultdict, Counter
from functools import partial, reduce
from datetime import datetime, date
import pickle
import math
path = !pwd
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100

In [2]:
abbrs =   { 'AZ' : 'Arizona Cardinals',
            'ARI' : 'Arizona Cardinals',
            'ATL' : 'Atlanta Falcons',
            'BAL' : 'Baltimore Ravens',
            'BUF' : 'Buffalo Bills',
            'CAR' : 'Carolina Panthers',
            'CHI' : 'Chicago Bears',
            'CIN' : 'Cincinnati Bengals',
            'CLE' : 'Cleveland Browns',
            'DAL' : 'Dallas Cowboys',
            'DEN' : 'Denver Broncos',
            'DET': 'Detroit Lions',
            'GB' : 'Green Bay Packers',
            'HOU' : 'Houston Texans',
            'IND' : 'Indianapolis Colts',
            'JAX' : 'Jacksonville Jaguars',
            'JAC' : 'Jacksonville Jaguars',
            'KC' : 'Kansas City Chiefs',
            'LA' : 'Los Angeles Rams',
            'STL' : 'Los Angeles Rams',
            'MIA' : 'Miami Dolphins',
            'MIN' : 'Minnesota Vikings',
            'NE' : 'New England Patriots',
            'NO' : 'New Orleans Saints',
            'NYG' : 'New York Giants',
            'NYJ' : 'New York Jets',
            'OAK' : 'Oakland Raiders',
            'PHI' : 'Philadelphia Eagles',
            'PIT' : 'Pittsburgh Steelers',
            'SD' : 'San Diego Chargers',
            'SF' : 'San Francisco 49ers',
            'SEA' : 'Seattle Seahawks',
            'TB' : 'Tampa Bay Buccaneers',
            'TEN' : 'Tennessee Titans',
            'WAS' : 'Washington Redskins',
            'WSH' : 'Washington Redskins',
            'Chiefs' : 'Kansas City Chiefs',
            'Lions' : 'Detroit Lions',
            'Jets' : 'New York Jets',
            'Dolphins' : 'Miami Dolphins',
            'Texans' : 'Houston Texans',
            'Ravens' : 'Baltimore Ravens',
            'Saints' : 'New Orleans Saints',
            'Colts' : 'Indianapolis Colts',
            'Raiders' : 'Oakland Raiders',
            'Cardinals' : 'Arizona Cardinals',
            'Titans' : 'Tennessee Titans',
            'Vikings' : 'Minnesota Vikings',
            'Seahawks' : 'Seattle Seahawks',
            'Browns' : 'Cleveland Browns',
            'Panthers' : 'Carolina Panthers',
            'Chargers' : 'San Diego Chargers',
            'Rams' : 'Los Angeles Rams',
            'Wash.' : 'Washington Redskins',
            'Cowboys' : 'Dallas Cowboys',
            'Patriots' : 'New England Patriots',
            'Packers' : 'Green Bay Packers',
            'Eagles' : 'Philadelphia Eagles',
            'Washington' : 'Washington Redskins',
            'Bills' : 'Buffalo Bills',
            'Bengals' : 'Cincinnati Bengals',
            'Steelers' : 'Pittsburgh Steelers',
            'Falcons' : 'Atlanta Falcons',
            'Giants' : 'New York Giants',
            'Broncos' : 'Denver Broncos',
            'Bears' : 'Chicago Bears',
            'Jaguars' : 'Jacksonville Jaguars',
            'Buccaneers' : 'Tampa Bay Buccaneers',
            '49ers' : 'San Francisco 49ers'
          }

team_names_long = defaultdict(str)
team_names_long.update(abbrs)

In [4]:
def sum_of_naughties(c):
    return sum(c[p] for p in naughties)

def sum_of_super_naughties(c):
    return sum(c[p] for p in super_naughties)

naughties = {'CLIPPING',
             'DEFENSIVE PASS INTERFERENCE',
             'DISQUALIFICATION',
             'FACE MASK (15 YARDS)',
             'HORSE COLLAR TACKLE',
             'ILLEGAL BLINDSIDE BLOCK',
             'ILLEGAL BLOCK ABOVE THE WAIST',
             'ILLEGAL CONTACT',
             'ILLEGAL USE OF HANDS',
             'NEUTRAL ZONE INFRACTION',
             'PERSONAL FOUL',
             'ROUGHING THE KICKER',
             'ROUGHING THE PASSER',
             'RUNNING INTO THE KICKER',
             'TAUNTING',
             'TRIPPING',
             'UNNECESSARY ROUGHNESS',
             'UNSPORTSMANLIKE CONDUCT'}

super_naughties = {'UNSPORTSMANLIKE CONDUCT',
                   'UNNECESSARY ROUGHNESS',
                   'ROUGHING THE PASSER',
                   'PERSONAL FOUL',
                   'DISQUALIFICATION'}

In [5]:
words = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen"]
numbers = list(range(20))
numbers_dict = dict(zip(words, numbers))

def week_to_num(s):
    return numbers_dict[s.lower().split(' ')[-1]] 

def scrub_teams(s):
    if s.lower() == 'regional action':
        return [np.nan]
    s = str(re.sub(r'\(.*\)', '', s))
    s = str(re.sub(r' ', '', s))
    return re.split('/', s)

def get_team_name(team_abb):
    return team_names_long[team_abb]

In [6]:
def scrub_2014_date(d):
    d = reduce(string_cat, [d.split(' ')[1], ' ', d.split(' ')[2].zfill(2), ' 2014'])
    return datetime.strptime(d, '%B %d %Y')

In [7]:
def viewers_to_num(n):
    n = n.replace('.', '')
    return int(n[:-1])*(10**6)

In [8]:
def team_in_game(t, row):
    if t == row['team1']:
        return 1
    elif t == row['team2']:
        return 2
    return 0

In [9]:
def my_datetime(d):
    l = d.split(' ')
    if len(l[1]) == 2:
        d = ' '.join([l[0], '0' + l[1], l[2]])
    return datetime.strptime(d, '%b %d, %Y')

In [10]:
def get_best_rank(teams):
    x, y = teams
    return min(team_ranks[x], team_ranks[y])

In [11]:
def union(a, b):
    return set(a) | set(b)

In [12]:
def string_cat(a, b):
    return str(a) + str(b)

In [13]:
def did_team1_win(row):
    return int(row['team1_score'] > row['team2_score'])

def did_team2_win(row):
    return int(row['team2_score'] > row['team1_score'])

In [14]:
def fill_win_columns(df):
    df['team1_won'] = df.apply(did_team1_win, axis=1)
    df['team2_won'] = df.apply(did_team2_win, axis=1)

In [15]:
def align_teams(row):
    if row.team1 != row.team_1:
        t1s = row.team_2_score
        t2s = row.team_1_score
        row['team1_score'] = t1s
        row['team2_score'] = t2s
    else:
        row['team1_score'] = row.team_1_score
        row['team2_score'] = row.team_2_score
    return row

In [16]:
def make_datetime(t):
    return datetime.strptime(t, '%Y-%m-%d')

In [17]:
def set_up_ratings(df, date_is_week):
    df.rename(columns={'Window' : 'program', 
                        'Game' : 'teams', 
                        'Date' : 'week', 
                        'Net' : 'network',
                        '+/-' : 'ratings diff',
                        '+/-.1' : 'viewers diff',
                        'Rtg.' : 'rating',
                        'Vwrs.' : 'viewers'}, inplace=True)
    if date_is_week:
        df['week'] = df['week'].apply(week_to_num)
    else:
        df.rename(columns={'week' : 'date'}, inplace=True)
        df['date'] = df['date'].apply(scrub_2014_date)
    df['teams'] = df['teams'].apply(scrub_teams)
    df['teams'] = df['teams'].apply(lambda x: frozenset({get_team_name(y) for y in x}))
    if date_is_week:
        df = df[['program', 'teams', 'network', 'rating', 'ratings diff', 'viewers', 'viewers diff', 'week']]
    else:
        df = df[['program', 'teams', 'network', 'rating', 'ratings diff', 'viewers', 'viewers diff', 'date']]
    return df

r16 = set_up_ratings(pd.read_csv('ratings_2016.csv',index_col=0), True)
r15 = set_up_ratings(pd.read_csv('ratings_2015.csv',index_col=0), True)
r14 = set_up_ratings(pd.read_csv('ratings_2014.csv',index_col=0), False) 

r16['year'] = 2016
r15['year'] = 2015
r14['year'] = 2014


In [18]:
def date_to_weeknum_2014(d):
    d = datetime.strptime(str(d), '%Y-%m-%d %H:%M:%S')
    start = datetime.strptime('2014-09-04', '%Y-%m-%d')
    return (d-start).days//7 + 1

In [19]:
r14['week'] = r14['date'].apply(date_to_weeknum_2014)
r14 = r14[['program', 'teams', 'network', 'rating', 'ratings diff', 'viewers', 'viewers diff', 'week']]

In [37]:
rtgs = pd.concat([r14, r15, r15])
rtgs.head()

Unnamed: 0,network,program,rating,ratings diff,teams,viewers,viewers diff,week,year
0,NBC,Kickoff,15.5,+4%,"(Seattle Seahawks, Green Bay Packers)",26.91M,+7%,1,
1,FOX,National,15.7,-5%,"(Dallas Cowboys, San Francisco 49ers)",28.00M,-2%,1,
2,NBC,SNF,13.9,-8%,"(Denver Broncos, Indianapolis Colts)",23.69M,-7%,1,
3,CBS,Single,9.9,+4%,(),16.50M,+6%,1,
4,FOX,Regional,8.2,-8%,"(New Orleans Saints, Atlanta Falcons)",14.10M,-4%,1,


I didn't end up using ratings data, used google search data instead

In [21]:
i16 = pd.read_csv('games_2016.csv', index_col = 0)
i15 = pd.read_csv('games_2015.csv', index_col = 0)
i14 = pd.read_csv('games_2014.csv', index_col = 0)
i13 = pd.read_csv('games_2013.csv', index_col = 0)

i16['teams'] = i16.apply(lambda row: frozenset({row[2], row[4]}), axis=1)
i15['teams'] = i15.apply(lambda row: frozenset({row[2], row[4]}), axis=1)
i14['teams'] = i14.apply(lambda row: frozenset({row[2], row[4]}), axis=1)
i13['teams'] = i13.apply(lambda row: frozenset({row[2], row[4]}), axis=1)

i16['date'] = i16['date'].apply(my_datetime)
i15['date'] = i15['date'].apply(my_datetime)
i14['date'] = i14['date'].apply(my_datetime)
i13['date'] = i13['date'].apply(my_datetime)

i13.head()

Unnamed: 0,week,date,team_1,team_1_score,team_2,team_2_score,teams
0,1,2013-09-05,Baltimore Ravens,27,Denver Broncos,49,"(Denver Broncos, Baltimore Ravens)"
1,1,2013-09-08,Oakland Raiders,17,Indianapolis Colts,21,"(Oakland Raiders, Indianapolis Colts)"
2,1,2013-09-08,Miami Dolphins,23,Cleveland Browns,10,"(Cleveland Browns, Miami Dolphins)"
3,1,2013-09-08,Cincinnati Bengals,21,Chicago Bears,24,"(Cincinnati Bengals, Chicago Bears)"
4,1,2013-09-08,New England Patriots,23,Buffalo Bills,21,"(Buffalo Bills, New England Patriots)"


In [22]:
#this compiles the play by play data (downloaded in a csv) for each game into the relevant summary statitics

def get_game_df(df):
    rows = []
    for i in df.GameId.unique():
        g = df[df.GameId == i]
        g.IsFumble.replace(['0','OPP'], 0, inplace=True)
        g.IsFumble.replace('1', 1, inplace=True)
        date = make_datetime(g.GameDate.unique()[0])
        t1, t2 = [get_team_name(x) for x in g.OffenseTeam.unique() if get_team_name(x)]

        p = g[g.IsPenalty == 1]
        t1pen = Counter(p[p.PenaltyTeam.apply(get_team_name) == t1].PenaltyType)
        t2pen = Counter(p[p.PenaltyTeam.apply(get_team_name) == t2].PenaltyType)

        o1 = g[g.OffenseTeam.apply(get_team_name) == t1]
        t1pass = sum(o1.IsPass)/(sum(o1.IsPass) + sum(o1.IsRush))
        t1td = sum(o1.IsTouchdown)
        t1avgyd = np.mean(o1.Yards)

        o2 = g[g.OffenseTeam.apply(get_team_name) == t2]
        t2pass = sum(o2.IsPass)/(sum(o2.IsPass) + sum(o2.IsRush))
        t2td = sum(o2.IsTouchdown)
        t2avgyd = np.mean(o2.Yards)

        d1 = g[g.DefenseTeam.apply(get_team_name) == t1]
        t1sack = sum(d1.IsSack)
        t1int = sum(d1.IsInterception)
        t1fumb = sum(d1.IsFumble)

        d2 = g[g.DefenseTeam.apply(get_team_name) == t2]
        t2sack = sum(d2.IsSack)
        t2int = sum(d2.IsInterception)
        t2fumb = sum(d2.IsFumble)

        rows.append([i, date, t1, t1pen, t1pass, t1td, t1avgyd, t1sack, t1int, t1fumb, 
                           t2, t2pen, t2pass, t2td, t2avgyd, t2sack, t2int, t2fumb,
                    frozenset({t1, t2})])

    return pd.DataFrame(rows, columns=['gameid',
                                      'date',
                                      'team1',
                                      'team1_penalties',
                                      'team1_passratio',
                                      'team1_TD',
                                      'team1_avgYd',
                                      'team1_sacks',
                                      'team1_int',
                                      'team1_fmbl',
                                      'team2',
                                      'team2_penalties',
                                      'team2_passratio',
                                      'team2_TD',
                                      'team2_avgYd',
                                      'team2_sacks',
                                      'team2_int',
                                      'team2_fmbl',
                                      'teams'])

In [23]:
g16 = get_game_df(pd.read_csv('pbp-2016.csv'))
g15 = get_game_df(pd.read_csv('pbp-2015.csv'))
g14 = get_game_df(pd.read_csv('pbp-2014.csv'))
g13 = get_game_df(pd.read_csv('pbp-2013.csv', error_bad_lines=False))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
b'Skipping line 2331: expected 45 fields, saw 48\nSkipping line 12467: expected 45 fields, saw 46\nSkipping line 15362: expected 45 fields, saw 48\nSkipping line 15376: expected 45 fields, saw 48\n'
b'Skipping line 36878: expected 45 fields, saw 48\nSkipping line 36885: expected 45 fields, saw 48\nSkipping line 42706: expected 45 fields, saw 46\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
d16 = pd.merge(g16, i16)
d15 = pd.merge(g15, i15)
d14 = pd.merge(g14, i14)
d13 = pd.merge(g13, i13)

d16 = d16.apply(align_teams, axis=1)
d15 = d15.apply(align_teams, axis=1)
d14 = d14.apply(align_teams, axis=1)
d13 = d13.apply(align_teams, axis=1)

d16['year'] = 2016
d15['year'] = 2015
d14['year'] = 2014
d13['year'] = 2013

dfs = [d13, d14, d15, d16]

start_dates= {12 : '2012-09-05',
              13 : '2013-09-05',
              14 : '2014-09-04',
              15 : '2015-09-10',
              16 : '2016-09-08',
              17 : '2017-09-08'}

def get_year(d):
    return d.year
    
def get_week_num(d):
    y = int(str(get_year(d))[-2:])
    start = datetime.strptime(start_dates[y], '%Y-%m-%d')
    return ((d - start).days // 7) + 1



for d in dfs:
    fill_win_columns(d)
    d['week'] = d['date'].apply(get_week_num)

In [25]:
DF = pd.concat(dfs).reset_index().sort_values('date')

In [27]:
DF.head(10)

Unnamed: 0,index,gameid,date,team1,team1_penalties,team1_passratio,team1_TD,team1_avgYd,team1_sacks,team1_int,team1_fmbl,team2,team2_penalties,team2_passratio,team2_TD,team2_avgYd,team2_sacks,team2_int,team2_fmbl,teams,week,team_1,team_1_score,team_2,team_2_score,team1_score,team2_score,year,team1_won,team2_won
67,67,2013090500,2013-09-05,Denver Broncos,"{'DEFENSIVE HOLDING': 1, 'OFFENSIVE PASS INTER...",0.681818,7,5.520833,4,2,2,Baltimore Ravens,"{'OFFENSIVE PASS INTERFERENCE': 1, 'FALSE STAR...",0.752941,3,3.767857,3,0,1,"(Denver Broncos, Baltimore Ravens)",1,Baltimore Ravens,27,Denver Broncos,49,49,27,2013,1,0
70,70,2013090808,2013-09-08,New Orleans Saints,"{'UNNECESSARY ROUGHNESS': 1, 'OFFENSIVE HOLDIN...",0.569231,2,5.168675,3,1,2,Atlanta Falcons,"{'INTENTIONAL GROUNDING': 1, 'OFFENSIVE HOLDIN...",0.735849,2,5.565217,2,1,0,"(New Orleans Saints, Atlanta Falcons)",1,Atlanta Falcons,17,New Orleans Saints,23,23,17,2013,1,0
68,68,2013090809,2013-09-08,Pittsburgh Steelers,"{'ILLEGAL BLOCK ABOVE THE WAIST': 1, 'DEFENSIV...",0.702128,1,3.253968,1,0,1,Tennessee Titans,"{'OFFENSIVE PASS INTERFERENCE': 1, 'ILLEGAL SH...",0.359375,1,2.927711,5,1,2,"(Pittsburgh Steelers, Tennessee Titans)",1,Tennessee Titans,16,Pittsburgh Steelers,9,9,16,2013,0,1
79,79,2013090812,2013-09-08,Dallas Cowboys,"{'FALSE START': 2, 'OFFENSIVE HOLDING': 3}",0.704225,2,4.385417,3,3,2,New York Giants,"{'ILLEGAL SHIFT': 1, 'FALSE START': 1, 'OFFENS...",0.75,6,7.28,2,1,2,"(New York Giants, Dallas Cowboys)",1,New York Giants,31,Dallas Cowboys,36,36,31,2013,1,0
78,78,2013090811,2013-09-08,San Francisco 49ers,"{'FALSE START': 4, 'ILLEGAL FORMATION': 1, 'DE...",0.52,4,4.930693,2,2,1,Green Bay Packers,"{'ILLEGAL USE OF HANDS': 1, 'PERSONAL FOUL': 1...",0.637931,4,5.782051,2,0,0,"(Green Bay Packers, San Francisco 49ers)",1,Green Bay Packers,28,San Francisco 49ers,34,34,28,2013,1,0
77,77,2013090805,2013-09-08,Indianapolis Colts,"{'LOW BLOCK': 1, 'FACE MASK (15 YARDS)': 1, 'I...",0.510204,3,4.104478,1,2,0,Oakland Raiders,"{'NEUTRAL ZONE INFRACTION': 2, 'FALSE START': ...",0.467742,3,5.584416,4,0,0,"(Oakland Raiders, Indianapolis Colts)",1,Oakland Raiders,17,Indianapolis Colts,21,21,17,2013,1,0
76,76,2013090804,2013-09-08,Minnesota Vikings,"{'ENCROACHMENT': 1, 'ILLEGAL BLOCK ABOVE THE W...",0.591837,4,5.239437,0,1,4,Detroit Lions,"{'OFFENSIVE HOLDING': 3, 'LOW BLOCK': 1, 'DEFE...",0.584416,7,5.049505,4,3,2,"(Detroit Lions, Minnesota Vikings)",1,Minnesota Vikings,24,Detroit Lions,34,24,34,2013,0,1
75,75,2013090802,2013-09-08,Cincinnati Bengals,"{'ILLEGAL BLOCK ABOVE THE WAIST': 1, 'FACE MAS...",0.625,3,5.8,0,1,0,Chicago Bears,"{'INELIGIBLE DOWNFIELD PASS': 1, 'DEFENSIVE PA...",0.540984,3,4.405063,1,2,2,"(Cincinnati Bengals, Chicago Bears)",1,Cincinnati Bengals,21,Chicago Bears,24,21,24,2013,0,1
74,74,2013090803,2013-09-08,Miami Dolphins,"{'FALSE START': 1, 'ILLEGAL FORMATION': 1, 'FA...",0.655738,2,3.712644,6,3,1,Cleveland Browns,"{'DEFENSIVE OFFSIDE': 1, 'ILLEGAL USE OF HANDS...",0.805556,3,3.847826,4,1,0,"(Cleveland Browns, Miami Dolphins)",1,Miami Dolphins,23,Cleveland Browns,10,23,10,2013,1,0
73,73,2013090801,2013-09-08,Seattle Seahawks,"{'DEFENSIVE PASS INTERFERENCE': 1, 'ILLEGAL SH...",0.610169,1,5.197368,1,0,2,Carolina Panthers,"{'DISQUALIFICATION': 1, 'UNSPORTSMANLIKE CONDU...",0.490196,1,4.196721,2,0,2,"(Seattle Seahawks, Carolina Panthers)",1,Seattle Seahawks,12,Carolina Panthers,7,12,7,2013,1,0


In [28]:
s = pd.read_csv('searches.csv', index_col=0)

In [30]:
t1cols =   ['date',
            'week',
            'year',
            'team1_penalties',
            'team1_passratio',
            'team1_TD',
            'team1_avgYd',
            'team1_sacks',
            'team1_int',
            'team1_fmbl',
            'team1_won',
            'team1_score',]
t2cols =   ['date',
            'week',
            'year',
            'team2_penalties',
            'team2_passratio',
            'team2_TD',
            'team2_avgYd',
            'team2_sacks',
            'team2_int',
            'team2_fmbl',
            'team2_won',
            'team2_score',]



tcols = ['date', 'week', 'year', 'penalties', 'passratio', 'touchdowns', 'avg_yd', 'sacks', 'int', 'fmbl', 'won', 'points']
t1rename = dict(zip(t1cols, tcols))
t2rename = dict(zip(t2cols, tcols))

cols_for_running_avg = ['passratio', 
                        'touchdowns', 
                        'avg_yd', 
                        'sacks','int', 
                        'fmbl', 
                        'won',
                        'points',
                        'total_penalties', 
                        'naughties', 
                        'super_naughties']

def get_team_df(t):
    T1 = DF[t == DF['team1']][t1cols].rename(columns=t1rename)
    T2 = DF[t == DF['team2']][t2cols].rename(columns=t2rename)
    T = pd.concat([T1, T2]).sort_values('date')
    T = pd.merge(T, s[['week', 'year', t]])
    T.rename(columns={t : 'interest'}, inplace=True)
    T['total_penalties'] = T['penalties'].apply(lambda x: sum(x[i] for i in x.elements()))
    T['naughties'] = T['penalties'].apply(sum_of_naughties)
    T['super_naughties'] = T['penalties'].apply(sum_of_super_naughties)
    for c in cols_for_running_avg:
        T[c+'_last'] = T[c].shift(1)
        T[c+'_last_three'] = T[c+'_last'].rolling(3).mean()
        T[c+'_last_five'] = T[c+'_last'].rolling(5).mean()
        T[c+'_total_avg'] = T[c].mean()
    return T
    

teams = list(set(DF.team1.unique()) | set(DF.team2.unique()))
team_dfs = {}
for t in teams:
    team_dfs[t] = get_team_df(t)

In [35]:
team_dfs['Arizona Cardinals'].head()

Unnamed: 0,date,week,year,penalties,passratio,touchdowns,avg_yd,sacks,int,fmbl,won,points,interest,total_penalties,naughties,super_naughties,passratio_last,passratio_last_three,passratio_last_five,passratio_total_avg,touchdowns_last,touchdowns_last_three,touchdowns_last_five,touchdowns_total_avg,avg_yd_last,avg_yd_last_three,avg_yd_last_five,avg_yd_total_avg,sacks_last,sacks_last_three,sacks_last_five,sacks_total_avg,int_last,int_last_three,int_last_five,int_total_avg,fmbl_last,fmbl_last_three,fmbl_last_five,fmbl_total_avg,won_last,won_last_three,won_last_five,won_total_avg,points_last,points_last_three,points_last_five,points_total_avg,total_penalties_last,total_penalties_last_three,total_penalties_last_five,total_penalties_total_avg,naughties_last,naughties_last_three,naughties_last_five,naughties_total_avg,super_naughties_last,super_naughties_last_three,super_naughties_last_five,super_naughties_total_avg
0,2013-09-15,2,2013,"{'RUNNING INTO THE KICKER': 1, 'OFFENSIVE HOLD...",0.636364,3,4.521739,1,0,1,1,25,1,9,1,0,,,,0.603154,,,,3.083333,,,,4.565789,,,,2.833333,,,,1.233333,,,,1.3,,,,0.666667,,,,25.166667,,,,10.516667,,,,2.116667,,,,0.683333
1,2013-09-22,3,2013,"{'INELIGIBLE DOWNFIELD PASS': 1, 'DEFENSIVE PA...",0.703704,1,4.449275,4,1,1,0,7,1,4,2,0,0.636364,,,0.603154,3.0,,,3.083333,4.521739,,,4.565789,1.0,,,2.833333,0.0,,,1.233333,1.0,,,1.3,1.0,,,0.666667,25.0,,,25.166667,9.0,,,10.516667,1.0,,,2.116667,0.0,,,0.683333
2,2013-09-29,4,2013,"{'FALSE START': 3, 'ILLEGAL FORMATION': 2, 'EN...",0.694915,1,3.860759,2,2,1,1,13,1,19,3,1,0.703704,,,0.603154,1.0,,,3.083333,4.449275,,,4.565789,4.0,,,2.833333,1.0,,,1.233333,1.0,,,1.3,0.0,,,0.666667,7.0,,,25.166667,4.0,,,10.516667,2.0,,,2.116667,0.0,,,0.683333
3,2013-10-06,5,2013,"{'ILLEGAL FORMATION': 1, 'ROUGHING THE PASSER'...",0.526316,2,3.986667,7,3,1,1,22,2,3,1,1,0.694915,0.678328,,0.603154,1.0,1.666667,,3.083333,3.860759,4.277258,,4.565789,2.0,2.333333,,2.833333,2.0,1.0,,1.233333,1.0,1.0,,1.3,1.0,0.666667,,0.666667,13.0,15.0,,25.166667,19.0,10.666667,,10.516667,3.0,2.0,,2.116667,1.0,0.333333,,0.683333
4,2013-10-13,6,2013,"{'OFFENSIVE HOLDING': 1, 'FALSE START': 1, 'NE...",0.671875,4,6.4,2,1,2,0,20,3,8,3,2,0.526316,0.641645,,0.603154,2.0,1.333333,,3.083333,3.986667,4.098901,,4.565789,7.0,4.333333,,2.833333,3.0,2.0,,1.233333,1.0,1.0,,1.3,1.0,0.666667,,0.666667,22.0,14.0,,25.166667,3.0,8.666667,,10.516667,1.0,2.0,,2.116667,1.0,0.666667,,0.683333


In [34]:
S = pd.concat(team_dfs.values()).dropna()
S.head(10)

Unnamed: 0,date,week,year,penalties,passratio,touchdowns,avg_yd,sacks,int,fmbl,won,points,interest,total_penalties,naughties,super_naughties,passratio_last,passratio_last_three,passratio_last_five,passratio_total_avg,touchdowns_last,touchdowns_last_three,touchdowns_last_five,touchdowns_total_avg,avg_yd_last,avg_yd_last_three,avg_yd_last_five,avg_yd_total_avg,sacks_last,sacks_last_three,sacks_last_five,sacks_total_avg,int_last,int_last_three,int_last_five,int_total_avg,fmbl_last,fmbl_last_three,fmbl_last_five,fmbl_total_avg,won_last,won_last_three,won_last_five,won_total_avg,points_last,points_last_three,points_last_five,points_total_avg,total_penalties_last,total_penalties_last_three,total_penalties_last_five,total_penalties_total_avg,naughties_last,naughties_last_three,naughties_last_five,naughties_total_avg,super_naughties_last,super_naughties_last_three,super_naughties_last_five,super_naughties_total_avg
5,2013-10-13,6,2013,"{'DELAY OF GAME': 3, 'INELIGIBLE DOWNFIELD KIC...",0.569231,2,3.72043,3,0,3,0,7,2,26,1,0,0.5,0.576023,0.523908,0.610804,3.0,2.666667,2.4,2.859155,4.565789,4.646775,4.741985,4.425904,2.0,1.333333,2.6,2.070423,3.0,1.0,0.6,0.84507,2.0,2.333333,1.6,1.295775,1.0,0.333333,0.4,0.422535,27.0,20.666667,19.6,22.478873,13.0,12.333333,12.8,16.408451,3.0,1.333333,1.6,2.591549,2.0,0.666667,0.6,0.647887
6,2013-10-27,8,2013,"{'OFFENSIVE HOLDING': 1, 'DELAY OF GAME': 2, '...",0.345455,3,3.881579,5,3,1,1,21,3,8,1,0,0.569231,0.543545,0.544206,0.610804,2.0,2.333333,2.2,2.859155,3.72043,4.14182,4.369188,4.425904,3.0,2.0,2.4,2.070423,0.0,1.0,0.6,0.84507,3.0,2.333333,2.2,1.295775,0.0,0.333333,0.4,0.422535,7.0,16.0,17.6,22.478873,26.0,16.0,15.2,16.408451,1.0,1.333333,1.0,2.591549,0.0,0.666667,0.4,0.647887
7,2013-11-03,9,2013,"{'OFFENSIVE HOLDING': 3, 'DEFENSIVE OFFSIDE': ...",0.637363,2,5.175439,1,0,0,0,20,1,20,3,1,0.345455,0.471562,0.528551,0.610804,3.0,2.666667,2.6,2.859155,3.881579,4.055933,4.308467,4.425904,5.0,3.333333,2.4,2.070423,3.0,2.0,1.2,0.84507,1.0,2.0,2.2,1.295775,1.0,0.666667,0.4,0.422535,21.0,18.333333,18.0,22.478873,8.0,15.666667,14.2,16.408451,1.0,1.666667,1.2,2.591549,0.0,0.666667,0.4,0.647887
8,2013-11-03,9,2013,"{'OFFENSIVE HOLDING': 3, 'DEFENSIVE OFFSIDE': ...",0.637363,2,5.175439,1,0,0,0,20,2,20,3,1,0.637363,0.517349,0.52269,0.610804,2.0,2.333333,2.4,2.859155,5.175439,4.259149,4.296496,4.425904,1.0,3.0,2.4,2.070423,0.0,1.0,1.2,0.84507,0.0,1.333333,1.6,1.295775,0.0,0.333333,0.4,0.422535,20.0,16.0,17.8,22.478873,20.0,18.0,15.2,16.408451,3.0,1.666667,1.6,2.591549,1.0,0.333333,0.6,0.647887
9,2013-11-10,10,2013,"{'OFFENSIVE HOLDING': 4, 'FALSE START': 2, 'IL...",0.509434,2,3.945946,3,1,2,0,20,2,24,1,0,0.637363,0.54006,0.537882,0.610804,2.0,2.333333,2.4,2.859155,5.175439,4.744152,4.503735,4.425904,1.0,2.333333,2.4,2.070423,0.0,1.0,1.2,0.84507,0.0,0.333333,1.2,1.295775,0.0,0.333333,0.4,0.422535,20.0,20.333333,19.0,22.478873,20.0,16.0,17.4,16.408451,3.0,2.333333,2.2,2.591549,1.0,0.666667,0.8,0.647887
10,2013-11-17,11,2013,"{'FALSE START': 2, 'UNNECESSARY ROUGHNESS': 1,...",0.507937,5,3.78022,2,1,3,1,28,3,24,3,2,0.509434,0.59472,0.539769,0.610804,2.0,2.0,2.2,2.859155,3.945946,4.765608,4.379766,4.425904,3.0,1.666667,2.6,2.070423,1.0,0.333333,0.8,0.84507,2.0,0.666667,1.2,1.295775,0.0,0.0,0.2,0.422535,20.0,20.0,17.6,22.478873,24.0,21.333333,19.6,16.408451,1.0,2.333333,1.8,2.591549,0.0,0.666667,0.4,0.647887
11,2013-11-24,12,2013,"{'ROUGHING THE PASSER': 1, 'FACE MASK (15 YARD...",0.618182,1,4.916667,2,0,3,0,19,5,6,2,1,0.507937,0.551578,0.52751,0.610804,5.0,3.0,2.8,2.859155,3.78022,4.300535,4.391724,4.425904,2.0,2.0,2.4,2.070423,1.0,0.666667,1.0,0.84507,3.0,1.666667,1.2,1.295775,1.0,0.333333,0.4,0.422535,28.0,22.666667,21.8,22.478873,24.0,22.666667,19.2,16.408451,3.0,2.333333,2.2,2.591549,2.0,1.0,0.8,0.647887
12,2013-11-28,13,2013,"{'NEUTRAL ZONE INFRACTION': 2, 'ROUGHING THE P...",0.607843,4,4.197368,2,0,2,0,24,2,25,5,2,0.618182,0.545184,0.582056,0.610804,1.0,2.666667,2.4,2.859155,4.916667,4.214277,4.598742,4.425904,2.0,2.333333,1.8,2.070423,0.0,0.666667,0.4,0.84507,3.0,2.666667,1.6,1.295775,0.0,0.333333,0.2,0.422535,19.0,22.333333,21.4,22.478873,6.0,18.0,18.8,16.408451,2.0,2.0,2.4,2.591549,1.0,1.0,1.0,0.647887
13,2013-12-08,14,2013,"{'ILLEGAL FORMATION': 1, 'FALSE START': 1, 'OF...",0.609375,4,4.776471,1,1,1,0,27,1,5,1,1,0.607843,0.577987,0.576152,0.610804,4.0,3.333333,2.8,2.859155,4.197368,4.298085,4.403128,4.425904,2.0,2.0,2.0,2.070423,0.0,0.333333,0.4,0.84507,2.0,2.666667,2.0,1.295775,0.0,0.333333,0.2,0.422535,24.0,23.666667,22.2,22.478873,25.0,18.333333,19.8,16.408451,5.0,3.333333,2.8,2.591549,2.0,1.666667,1.2,0.647887
14,2013-12-08,14,2013,"{'ILLEGAL FORMATION': 1, 'FALSE START': 1, 'OF...",0.609375,4,4.776471,1,1,1,0,27,2,5,1,1,0.609375,0.6118,0.570554,0.610804,4.0,3.0,3.2,2.859155,4.776471,4.630169,4.323334,4.425904,1.0,1.666667,2.0,2.070423,1.0,0.333333,0.6,0.84507,1.0,2.0,2.2,1.295775,0.0,0.0,0.2,0.422535,27.0,23.333333,23.6,22.478873,5.0,12.0,16.8,16.408451,1.0,2.666667,2.4,2.591549,1.0,1.333333,1.2,0.647887


In [None]:
#commented to prevent accidental overwrite
#S.to_csv('final.csv')

Below is where I started to incorporate the injury data

In [None]:
start_dates= {12 : '2012-09-05',
              13 : '2013-09-05',
              14 : '2014-09-04',
              15 : '2015-09-10',
              16 : '2016-09-08',
              17 : '2017-09-08'}

def get_year(d):
    try:
        if type(d) == str:
            d = datetime.strptime(d, '%Y-%m-%d')
        return d.year
    except ValueError:
        return None
    
def get_week_num(d):
    try:
        if type(d) == str:
            d = datetime.strptime(d, '%Y-%m-%d')
        y = int(str(get_year(d))[-2:])
        start = datetime.strptime(start_dates[y], '%Y-%m-%d')
        return ((d - start).days // 7) + 1
    except ValueError:
        return None
    
def string_cat(a, b):
    return str(a) + str(b)

df = pd.read_csv('final.csv')

In [None]:
I = pd.concat([i13, i14, i15, i16])

In [None]:
inj = pd.read_csv('injuries.csv', index_col=0, header=0)

In [None]:
labels = [x.replace('\xa0', '') for x in list(inj.head().iloc[0])]
rename_dict = dict(zip([str(i) for i in range(5)], labels))
inj.rename(columns=rename_dict, inplace=True)
inj.drop(0, inplace=True)
inj['i'] = inj['Relinquished'].apply(len)
inj = inj[inj.i > 0][['Date', 'Team']]
inj = inj[inj.Date != '\xa0Date']
inj['Date'] = inj['Date'].apply(to_datetime)
inj = inj[inj.Date > '2012-02-01']
inj['team'] = inj['Team'].apply(get_full_name)
inj['year'] = inj['Date'].apply(get_year)
inj['week'] = inj['Date'].apply(get_week_num)

In [None]:
def to_datetime(d):
    return datetime.strptime(d, '%Y-%m-%d')

In [None]:
inj = inj[inj.week > 0]
inj = inj[inj.week < 18]
inj.head()

In [None]:
I['year'] = I['date'].apply(get_year)

In [None]:
m1 = pd.merge(I, inj.rename(columns={'team' : 'team_1'}))
m1

In [None]:
def get_full_name(s):
    n = [t for t in teams if {s.replace(' ','')} <= set(t.split(' '))]
    if n:
        return n[0]
    return np.nan


In [None]:
type('') == str