In [1]:
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import numpy as np
import nflgame
import os
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats

#Setting for displaying inline graphics with matplotlib
%matplotlib inline


In [None]:
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import numpy as np
import nflgame
import os
import seaborn as sns
import matplotlib.pyplot as plt


#Custom color palette for graphs; the active seaborn palette is every
#other color of the full list
custompal = [
    "#F44336", "#E91E63", "#9C27B0", "#673AB7", "#3F51B5",
    "#2196F3", "#03A9F4", "#00BCD4", "#009688", "#4CAF50",
    "#8BC34A", "#CDDC39", "#FFEB3B", "#FFC107", "#FF9800",
    "#FF5722", "#795548", "#9E9E9E", "#607D8B",
]
minicustom = custompal[::2]
sns.set_palette(minicustom)
#sns.palplot(custompal)

#Global seaborn look, then open a 16x8 figure
sns.set_style("whitegrid")
sns.set_context("notebook")
plt.figure(figsize=(16, 8))

#Team names in the hyphenated form used in nflpenalties.com team URLs
teams = [
    'buffalo-bills', 'miami-dolphins', 'new-england-patriots', 'new-york-jets',
    'baltimore-ravens', 'cincinnati-bengals', 'cleveland-browns', 'pittsburgh-steelers',
    'houston-texans', 'indianapolis-colts', 'jacksonville-jaguars', 'tennessee-titans',
    'denver-broncos', 'kansas-city-chiefs', 'oakland-raiders', 'san-diego-chargers',
    'dallas-cowboys', 'new-york-giants', 'philadelphia-eagles', 'washington-redskins',
    'chicago-bears', 'detroit-lions', 'green-bay-packers', 'minnesota-vikings',
    'atlanta-falcons', 'carolina-panthers', 'new-orleans-saints', 'tampa-bay-buccaneers',
    'arizona-cardinals', 'san-francisco-49ers', 'seattle-seahawks', 'st-louis-rams',
]

def GetStdTeam(team):
    """Map a hyphenated team name to nflgame's standard team name.

    team: team string such as 'new-england-patriots'; hyphens are turned
          into spaces before the lookup.

    Returns whatever nflgame.standard_team gives back for the lookup string.
    """
    spaced = team.replace('-', ' ')

    #Rams no longer in St. Louis, so search on the nickname alone
    query = "rams" if "rams" in spaced else spaced

    return nflgame.standard_team(str(query))


def GetPenaltyData(team, year):
    """Download one team's penalty log page and return its content div(s).

    team: hyphenated team name as used in the nflpenalties.com URL.
    year: season as a string (it is concatenated into the URL).

    Returns the result of soup.find_all("div", id="content"), i.e. the
    list of matching elements (empty if the page has none).
    """
    print('Getting data for ' + team + ' ' + year)

    #Set up url string
    teamfile = 'http://www.nflpenalties.com/team/' + team + '?year=' + year + '&view=log'

    #Download the page; close the connection even if the read fails
    response = urllib.urlopen(teamfile)
    try:
        r = response.read()
    finally:
        response.close()

    #Set up BeautifulSoup parser
    soup = BeautifulSoup(r, "lxml")
    print(type(soup))

    #Get content
    content = soup.find_all("div", id="content")

    return content


def ParseContent(content, stdteam):
    """Parse a team's penalty log into a table of per-penalty records.

    content: list of parsed page elements; only content[0] is used and it
             must contain a <tbody> with one <tr> per penalty.
    stdteam: standard team name (string) stored in the 'team' column and
             used as the prefix of each penalty index.

    Returns [table, totalpenalties] where table is a 2-D numpy array whose
    first row is the column header and totalpenalties is the number of
    parsed rows plus the number of rows skipped for having an empty link.

    Opponent names are left exactly as scraped: the table is a numpy
    array, not a DataFrame, so it cannot be indexed by column name here.
    GetAllYears standardizes the Opponent column after loading the CSVs.
    """
    header = ['team','penalty','game','opponent','player','position','week','year','penaltyindex']
    tb = content[0].find('tbody')
    rows = []
    skipped = 0

    for row in tb.findAll('tr'):
        #A row with an empty link has a missing field: skip it, but keep
        #it in the total
        if str(row).count('></a>') != 0:
            skipped += 1
            continue

        for link in row.findAll('a'):
            href = link['href']
            linkcat = str(href.split('/')[1])
            linkdetails = str(link.get_text())

            #The penalty link opens a new record
            if (linkcat == 'penalty'):
                t = [stdteam]

            if ('week.php' not in href):
                t.append(linkdetails)
            else:
                #The week link carries both week and year in its query
                #string and closes the record
                weekandyear = href.split('week=')[1]
                week = weekandyear.split('?')[0]
                year = weekandyear.split('year=')[1]

                penaltyindex = stdteam + '-' + str(year) + '-' + str(len(rows))

                t.extend([week, year, penaltyindex])
                #Store a copy so later appends cannot alter a saved row
                rows.append(list(t))

    #Always 2-D (header row first), even when no penalties were parsed
    z = np.vstack([header] + rows)
    totalpenalties = skipped + len(rows)
    return [z,totalpenalties]

#Writes the CSV of penalty data for one team and year
def WritePenaltyData(penalties, stdteam, year, fname):
    """Print a 'team,year,total' summary line and save the table to fname.

    penalties: [table, totalpenalties] pair as returned by ParseContent.
    stdteam, year: strings used only for the printed summary.
    fname: path of the comma-delimited file to write.
    """
    total = penalties[1]
    summary = ','.join([stdteam, year, str(total)])
    print(summary)
    table = penalties[0]
    np.savetxt(fname, table, fmt='%s', delimiter=',')

#Collects one season of penalty data for every team into a dataframe
def GetAllTeams(year):
    """Return one DataFrame holding every team's penalties for `year`.

    year: season as a string (it is concatenated into file names).

    Each team has its own CSV cache file; a team is only downloaded and
    parsed when that file does not exist yet.
    """
    combined = pd.DataFrame()
    for team in teams:
        stdteam = GetStdTeam(team)
        print(stdteam)
        fname = '/Users/kelly89/NFLPenalties/' + stdteam + year + '.csv'

        if os.path.isfile(fname):
            print('Found data file for ' + stdteam + ' ' + year)
        else:
            #No cached file yet: scrape, parse and write it
            content = GetPenaltyData(team, year)
            penalties = ParseContent(content, stdteam)
            WritePenaltyData(penalties, stdteam, year, fname)

        #Always load from the CSV, whether cached or freshly written
        teamframe = pd.read_csv(fname, sep=',')
        combined = pd.concat([combined, teamframe], ignore_index=True)
    return combined

#Writes main CSV (penalty data for all teams and all years from 2011-2015)
def GetAllYears(fname):
    """Build the combined 2011-2015 penalty table and write it to fname.

    Columns are renamed, the Opponent column is passed through GetStdTeam,
    and rows containing any missing value are dropped before writing.
    """
    alldata = pd.DataFrame()
    for season in range(2011, 2016):
        seasondata = GetAllTeams(str(season))
        alldata = pd.concat([alldata, seasondata], ignore_index=True)

    alldata.columns = ['Team','Penalty','NDate','Opponent','Player','Position','Week','Year','PenaltyIndex']
    alldata['Opponent'] = alldata['Opponent'].apply(GetStdTeam)
    alldata.dropna().to_csv(fname, sep=',', index=False)

def GetRegSeasonRecord(team, year):
    """Count a team's regular-season wins and losses for one season.

    team: team name; converted to str and compared against each game's
          winner / loser attributes.
    year: season; converted to int.

    Returns [wins, losses]. A game in which the team is neither the winner
    nor the loser is reported with an error message and counted in neither
    total.
    """
    team = str(team)
    year = int(year)

    #Counters must start at zero; they used to be incremented before
    #ever being assigned
    w = 0
    l = 0

    #NOTE(review): passing the same team as both home and away is
    #presumably nflgame's way of asking for all of that team's games - confirm
    games = nflgame.games_gen(year, week = None, home = team, away = team, kind = 'REG')

    #NOTE(review): games_gen appears to return None when nothing matches;
    #treat that as "no games" - confirm against the nflgame docs
    for g in games or []:
        print(g)
        if (g.winner == team):
            w += 1
        elif (g.loser == team):
            l += 1
        else:
            print("ERROR: can not determine winner of this game")

        print('W-L: ' + str(w) + '-' + str(l))

    return [w, l]

def GetAllRefInfo(fname):
    """Scrape the officials index page and hand each official to GetAllRefGames.

    fname: output path, passed through unchanged to GetAllRefGames.

    One record is built per table row (the first row is skipped):
    [webpage, <text of each td cell>]. Only records with exactly five
    fields are kept; the header ['webpage','name','games','positions',
    'years'] used previously shows the intended layout.
    """
    pfr = 'http://www.pro-football-reference.com/officials'

    #Download the index page; close the connection even if the read fails
    response = urllib.urlopen(pfr)
    try:
        pfrUrl = response.read()
    finally:
        response.close()

    pfrSoup = BeautifulSoup(pfrUrl, "lxml")
    content = pfrSoup.find_all("div", id="page_content")
    table = content[0].find('table')

    #dirname drops the trailing '/officials'; hrefs are appended to the rest
    siteroot = os.path.dirname(pfr)

    refs = []
    for row in table.findAll('tr')[1:]:
        webpage = siteroot + str(row.findNext('a')['href'])
        fields = [webpage] + [str(val.get_text()) for val in row.findAll('td')]
        #Rows that do not have exactly four data cells are ignored
        if (len(fields) == 5):
            refs.append(fields)

    #With no usable rows this passes an empty list, so no bogus "officials"
    #are requested
    GetAllRefGames(refs, fname)

def GetAllRefGames(refinfo, fname):
    """Scrape every official's game log and save all rows to fname.

    refinfo: sequence of records where item 0 is the official's page URL,
             item 1 the name (the text after the last space is used) and
             item 2 the value stored in the 'exp' column.
    fname: path of the comma-delimited output file; its first row is the
           column header.
    """
    refstats = ['ref','exp','date','visitor','home','position','vpen','vpenyd','hpen','hpenyd']

    for ref in refinfo:
        refname = ref[1].rpartition(' ')[2]
        exp = str(ref[2])
        print('Getting data for ' + refname)

        page = urllib.urlopen(ref[0]).read()
        gamelog = BeautifulSoup(page, "lxml").find_all("div", id="div_game_logs")[0]

        for row in gamelog.findAll('tr'):
            cells = row.findAll("td")
            #Rows without td cells (e.g. header rows) carry no game data
            if cells:
                date = cells[0].get_text()
                #Cell 1 reads '<visitor> @ <home>'; the last word of each
                #side is looked up as the team
                matchup = cells[1].get_text().split(' @ ')
                v = nflgame.standard_team(matchup[0].rpartition(' ')[2])
                h = nflgame.standard_team(matchup[1].rpartition(' ')[2])
                pos = cells[2].get_text().replace(' ','-')
                vpen, vpenyd = cells[4].get_text(), cells[5].get_text()
                hpen, hpenyd = cells[7].get_text(), cells[8].get_text()

                record = np.hstack([refname, exp, date, v, h, pos, vpen, vpenyd, hpen, hpenyd])

                if (len(record) == 10):
                    refstats = np.vstack([refstats, record])
            #Progress dump of everything collected so far, once per row
            print(refstats)
    np.savetxt(fname, refstats, fmt = '%s', delimiter=',')


def NormDate(date):
    """Reorder a '-'-separated date 'A-B-C' into the string 'B/C/A'.

    Used to turn year-first dates into the month/day/year NDate format.
    """
    parts = date.split('-')
    reordered = (parts[1], parts[2], parts[0])
    return str('/'.join(reordered))

def MakeKey(r):
    """Build a join key '<NDate with dashes>-<HTeam>' from a row.

    r: any object with NDate and HTeam string attributes (e.g. a
       DataFrame row passed by apply(axis=1)).
    """
    dashed = r.NDate.replace('/', '-')
    return str('-'.join([dashed, r.HTeam]))

def GetHomeTm(r, homeTms):
    """Return the home side of a row: r.Team if it is in homeTms, else r.Opponent.

    homeTms must support `in` by team value (list, set, ...).
    """
    return r.Team if r.Team in homeTms else r.Opponent


def GetAwayTm(r, homeTms):
    """Return the visiting side of a row: r.Opponent if r.Team is in homeTms, else r.Team.

    homeTms must support `in` by team value (list, set, ...).
    """
    return r.Opponent if r.Team in homeTms else r.Team

def MergePenaltyAndRefs(fname, data, rd):
    """Join penalty rows with referee game rows and write the result to fname.

    fname: path of the CSV to write.
    data:  penalty DataFrame with Team, Opponent and NDate columns.
    rd:    referee DataFrame with Date and HTeam columns.

    Both input frames are modified in place: rd gains 'NDate' and 'key',
    data gains 'HTeam', 'VTeam' and 'key'. The merge is an inner join on
    'key' (date plus home team); only the rd columns that data does not
    already have are brought across.
    """
    rd['NDate'] = rd['Date'].apply(NormDate)

    #Home teams per date. A DataFrame must not be handed to GetHomeTm /
    #GetAwayTm directly: `in` on a DataFrame tests column labels, not
    #values, so every team used to be classified as the visitor.
    homeByDate = {}
    for ndate, hteam in zip(rd['NDate'], rd['HTeam']):
        homeByDate.setdefault(ndate, set()).add(hteam)

    #Add columns saying which side of the game was home / away
    data['HTeam'] = data.apply(lambda x: GetHomeTm(x, homeByDate.get(str(x.NDate), ())), axis=1)
    data['VTeam'] = data.apply(lambda x: GetAwayTm(x, homeByDate.get(str(x.NDate), ())), axis=1)

    #This isn't necessary and sort of hackish. Probably better to multindex?
    rd['key'] = rd.apply(MakeKey, axis=1)
    data['key'] = data.apply(MakeKey, axis=1)

    #Keep only the referee columns that data does not already provide
    uniqcols = rd.columns.difference(data.columns)
    newrd = pd.concat([rd[['key']],rd[uniqcols]], axis=1)

    c = pd.merge(data, newrd, on = 'key')
    print(c.shape)
    print(c.head(40))
    c.to_csv(fname, sep=",", index = False)


def GetGameData(year):
    """Return a DataFrame with one row per game that nflgame yields for `year`.

    Each row holds the game's schedule fields plus HScore, VScore, Winner,
    Season, HVScoreDiff (home minus visitor score) and NDate
    ('month/day/year' built from the schedule fields).
    """
    allgames = pd.DataFrame()
    for g in nflgame.games_gen(year):
        sched = pd.DataFrame.from_dict([g.schedule])
        result = pd.DataFrame.from_dict([{
            'HScore': g.score_home,
            'VScore': g.score_away,
            'Winner': g.winner,
            'Season': g.season(),
        }])

        #One-row frame: schedule columns next to the result columns
        r = pd.concat([sched, result], axis=1)
        r['HVScoreDiff'] = r['HScore'] - r['VScore']
        dateparts = [str(r.month[0]), str(r.day[0]), str(r.year[0])]
        r['NDate'] = str('/'.join(dateparts))

        allgames = pd.concat([allgames, r])
    return allgames

def GetAllGameData(fname):
    """Collect game data for the 2009-2015 seasons and write it to fname.

    The per-season frames from GetGameData are stacked, the columns are
    renamed to the project's names, 'meridiem' and 'year' are dropped and
    the result is written as a comma-separated file without the index.
    """
    allyears = pd.DataFrame()
    hdr = ['HScore','HVScoreDiff','NDate','Season','VScore','Winner','VTeam','day','eid','gamekey','HTeam','meridiem','month','gametype','time','wday','week','year']
    for year in range(2009,2016):
        #Announce the season before the (slow) fetch, not after it
        print('Getting game data for ' + str(year) + ' season')
        yr = GetGameData(year)
        allyears = pd.concat([allyears, yr])

    #hdr is positional. NOTE(review): its order presumably matches the
    #lexicographically sorted source columns (capitalized names first) -
    #confirm. Sorting here means that no longer depends on how
    #pd.concat happens to order columns.
    allyears = allyears[sorted(allyears.columns)]

    #Assign the flat list; wrapping it in another list nests the header
    allyears.columns = hdr
    allyears = allyears.drop(['meridiem','year'], axis = 1)
    allyears.to_csv(fname, sep=",", index = False)




#Main code
#Loads (building first if the CSV is missing) the penalty and game tables,
#then experiments with joining them on team.

#Get penalty data for all years into dataframe
allPenF = '/Users/kelly89/NFLPenalties/AllPenalties.csv'
if (os.path.isfile(allPenF) == False):
    GetAllYears(allPenF)
else:
    print 'Found ' + allPenF
data = pd.read_csv(allPenF, sep=",")
print data.shape

#Get all game data
gameDataF = '/Users/kelly89/NFLPenalties/AllGameData.csv'
if (os.path.isfile(gameDataF) == False):
    GetAllGameData(gameDataF)
else:
    print 'Found ' + gameDataF
gd = pd.read_csv(gameDataF, sep=',')
print gd.shape

#Penalty columns that the game table does not have
uniqcols = data.columns.difference(gd.columns)
print data[uniqcols].columns.values

#Join penalties to games where the penalized team was the home team.
#NOTE(review): 'Team' is the only join key, so each penalty row is paired
#with every home game of that team, not just the game it happened in -
#confirm this is intended.
gd['Team'] = gd['HTeam']
gpd = pd.merge(data[uniqcols], gd, on = 'Team')

print gpd.columns.values
gpd = gpd.drop('Team', axis = 1)

#Penalty columns missing from the joined table after dropping 'Team'
uniq = data.columns.difference(gpd.columns)
print data[uniq].columns.values

#Second join, this time treating the visiting team as 'Team'.
#NOTE(review): same single-key join as above - verify the result size.
gpd['Team'] = gpd['VTeam']
m = pd.merge(gpd[uniq], data, on = 'Team')

print m.head(10)

In [None]:
#Per-row home/away flag, then pick the matching side's penalty count and
#penalty yards.
#NOTE(review): `a` is not assigned at module level anywhere in the visible
#cells - it presumably comes from an earlier interactive session.
a['IsHomeTeam'] = np.where(a['Team'] == a['HTeam'], True, False)
a['NumPenalty'] = np.where(a['IsHomeTeam'], a['HPen'], a['VPen'])
a['PenaltyYds'] = np.where(a['IsHomeTeam'], a['HPenYd'], a['VPenYd'])

In [None]:





#Game columns that the penalty table does not have, next to a join column.
#NOTE(review): nothing in the visible code creates an 'NDate-Team' column
#on gd, so the next line raises KeyError unless the CSV already has it -
#confirm.
uniqcols = gd.columns.difference(data.columns)
newgd = pd.concat([gd['NDate-Team'],gd[uniqcols]], axis=1)
    
#Quick visual check of the three tables
print newgd.head(5)
print gd.head(5)
print data.head(5)

#gpd = pd.merge(data, newgd, left_on = 'NDate-Team', right_on = 'NDate-Team')
#print gpd.shape
#print gpd.head(15)

#Get refdata into dataframe (scraped first if the CSV is missing)
refDataF = '/Users/kelly89/NFLPenalties/RefData.csv'
if (os.path.isfile(refDataF) == False):
    GetAllRefInfo(refDataF)
else:
    print 'Found ' + refDataF
rd = pd.read_csv(refDataF, sep=",")
#Rename the scraped columns to the names used in the rest of the notebook
rd.columns = ['LastName','Exp','Date','VTeam','HTeam','RefPosition','VPen','VPenYd','HPen','HPenYd']
#Keep only rows where the official's position is 'Referee'
rd = rd[(rd['RefPosition'] == 'Referee')]
print rd.shape

#Merge ref data with penalty data (skipped when the merged CSV exists)
refPenF = '/Users/kelly89/NFLPenalties/RefAndPenalties.csv'
if (os.path.isfile(refPenF) == False):
    MergePenaltyAndRefs(refPenF, data, rd)
else:
    print 'Found ' + refPenF
rp = pd.read_csv(refPenF, sep=',')
print rp.shape
#Number of missing values in each column of the merged table
print 'rp descriptive stats:'
for c in rp.columns.values:
    n = rp[c].isnull().sum()
    print c + ' ' + str(n)

#Get all game data (same file and logic as the earlier load of gd; this
#re-reads it from disk)
gameDataF = '/Users/kelly89/NFLPenalties/AllGameData.csv'
if (os.path.isfile(gameDataF) == False):
    GetAllGameData(gameDataF)
else:
    print 'Found ' + gameDataF
gd = pd.read_csv(gameDataF, sep=',')
print gd.shape
#Number of missing values in each column of the game table
print 'gd descriptive stats:'
for c in gd.columns.values:
    n = gd[c].isnull().sum()
    print c + ' ' + str(n)

#Merge ref, penalty, and gamedata
#The merge itself is still commented out below, so rfpF is never written
#by this cell.
rfpF = '/Users/kelly89/NFLPenalties/RefPenaltyAndGameData.csv'
if (os.path.isfile(rfpF) == False):
    #Same date-plus-home-team key that MakeKey builds for the other tables
    gd['key'] = gd.apply(lambda x: MakeKey(x), axis=1)
    print 'gd'
    print gd.head(10)
    #NOTE(review): this indexes rd with a tuple (Index, 'key'), which is
    #not a column selection and will likely raise; the commented-out line
    #below suggests rp, not rd, was the intended frame - confirm.
    uniqcols = rd[rp.columns.difference(data.columns),'key']
    #newrp = pd.concat([rp[['key']],rp[uniqcols]], axis=1)
    print 'uniqcols'
    print uniqcols.head(10)
    #m = pd.merge(gd, newrp, left_on = 'key', right_on = 'key', how = 'outer')
    #print m.shape
    #m.to_csv(rfpF, sep = ',', index = False)
else:
    print 'Found ' + rfpF
#rfp = pd.read_csv(rfpF, sep=',')
#print rfp.shape
#print 'rfp descriptive stats:'

#for c in rfp.columns.values:
#    n = rfp[c].isnull().sum()
#    print c + ' ' + str(n)

#sdF = '/Users/kelly89/NFLPenalties/StadiumData.csv'
#sd = pd.read_csv(sdF, sep = ',')
#print sd.shape

#print sd.columns.values
#print rfp.columns.values 

#rfpgsF = '/Users/kelly89/NFLPenalties/RefPenaltyGameAndStadiumData.csv'
#if (os.path.isfile(rfpgsF) == False):
#    m = pd.merge(rfp, sd, left_on = 'HTeam', right_on = 'HTeam')
#    m.to_csv(rfpgsF, sep = ',', index = False)
#else:
#    print 'Found ' + rfpgsF
#f = pd.read_csv(rfpgsF, sep = ',')
#print f.shape
