# Box Score Gatherer

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm
from urllib.request import urlopen
pd.set_option('display.max_colwidth', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# HTTP headers passed as the `headers` argument of requests.get in getSeasonIDs;
# only a browser-style User-Agent string is set.
requestHead = {"User-Agent": "Chrome/47.0.2526.111"}

## General

In [3]:
def getSeasonIDs(num,online = True,prefix = None,post=True):
    """Return the game-log IDs linked from a season's GameResults page.

    Parameters
    ----------
    num : int
        Season number. It selects the URL (when ``online``) and the number of
        leading/trailing log links that are skipped.
    online : bool, default True
        If True, download the page from sim-football.com using ``requestHead``.
        If False, read ``prefix + '/GameResults.html'`` from disk instead.
    prefix : str, optional
        Directory containing ``GameResults.html``; only used when ``online``
        is False.
    post : bool, default True
        If True, keep every link after the skipped leading ones. If False,
        also drop the trailing ``postseason`` links.

    Returns
    -------
    list of str
        One ID per selected link: the href with its first five characters
        (presumably the ``Logs/`` directory prefix -- TODO confirm) and a
        trailing ``.html`` extension removed.
    """
    if online:
        # Single-digit seasons are zero-padded in the URL (e.g. 5 -> "05").
        strnum = '0' + str(num) if num < 10 else str(num)
        url = "http://sim-football.com/indexes/NSFLS"+strnum+"/GameResults.html"
        page = requests.get(url, headers = requestHead)
        soup = BeautifulSoup(page.text,'html.parser')
    else:
        with open(prefix+'/GameResults.html') as f:
            soup = BeautifulSoup(f,'html.parser')

    # Number of log links to skip at the start (preseason) and, when
    # post=False, at the end (postseason). The counts depend on the season.
    if num > 21:
        preseason = 24
        postseason = 7
    elif num > 15:
        preseason = 20
        postseason = 7
    elif num > 1:
        preseason = 16
        postseason = 3
    else:
        preseason = 12
        postseason = 3

    # Every anchor whose href mentions "Logs", in page order.
    pbpURLs = [a.get('href') for a in soup.find_all('a',href=re.compile('Logs'))]

    # NOTE(review): when the middle slice has exactly 55 links, one more
    # leading link is kept. The reason is not visible here -- presumably a
    # season with one fewer preseason game; confirm against the site.
    if len(pbpURLs[preseason:-postseason]) == 55:
        preseason = preseason-1

    selected = pbpURLs[preseason:] if post else pbpURLs[preseason:-postseason]

    def _logId(href):
        # Remove the extension as a suffix. str.strip('.html') would instead
        # strip any of the characters ". h t m l" from BOTH ends and could
        # eat into the ID itself.
        name = href[5:]
        if name.endswith('.html'):
            name = name[:-len('.html')]
        return name

    return [_logId(href) for href in selected]

In [4]:
# Game and season this notebook works on.
S = 22
gameID = 5354

# Season number as used in the site's URLs: single digits get a leading zero.
strnum = '0' + str(S) if S < 10 else str(S)

In [5]:
# Fetch the list of game IDs for season S (downloads the season's
# GameResults page, since getSeasonIDs defaults to online=True).
sIDList = getSeasonIDs(S)
# Optional: choose the game by its position in the season list instead of
# using the hard-coded gameID.
# gameID = sIDList[4]
print(gameID)

5354


In [6]:
# Parse every HTML table on the game's box-score page into a list of
# DataFrames; the cells below pick tables out of this list by position.
boxList = pd.read_html('http://sim-football.com/indexes/NSFLS%s/Boxscores/%s.html'%(strnum,gameID))

In [22]:
# Quarters
# Per-quarter line score: table 2 of the page with its last row and last
# column dropped.
boxScore = boxList[2].iloc[:-1,:-1]
boxScore

Unnamed: 0,Team,1st,2nd,3rd,4th,OT,F
0,SaberCats (7-4-0),3,10,3,0,0,16
1,Yeti (6-5-0),3,0,7,6,3,19


In [27]:
# Scoring Summary
scoreSum = boxList[5]
# The first row carries the two team labels in the last two columns; column
# '.4' is used as the away team and column '.5' as the home team.
homeTeam = scoreSum['Scoring Summary.5'].iloc[0]
awayTeam = scoreSum['Scoring Summary.4'].iloc[0]
scoreSum = scoreSum.rename(columns = {'Scoring Summary':'Quarter','Scoring Summary.1':'Score Type','Scoring Summary.2':'Time Remaining',
                                      'Scoring Summary.3':'Play','Scoring Summary.4':awayTeam,'Scoring Summary.5':homeTeam})

# Rows whose 'Quarter' cell is null are the scoring plays; rows where it is
# filled in are treated as period headers.
isnull = scoreSum['Quarter'].isnull()

# Period number of each row = number of header rows seen so far (1-4 for the
# quarters, 5 for overtime). Unlike hard-coded group numbers, this does not
# raise when there are fewer than four groups of scoring plays.
# NOTE(review): assumes every period has its own header row even when nobody
# scored in it -- confirm against a game with a scoreless quarter.
quarterNum = (~isnull).cumsum()

# Drop the (all-null) 'Quarter' column and put the numeric period first.
# .copy() so the insert below works on an independent frame, not a slice.
scoring = scoreSum[isnull].iloc[:,1:].copy()
scoring.insert(0,'Q',quarterNum[isnull])
scoring

Unnamed: 0,Q,Score Type,Time Remaining,Play,SJS,COL
1,1,FG,9:09,41 yd FG by Matthew McDairmid,3,0
2,1,FG,3:18,17 yd FG by Silver Banana,3,3
4,2,FG,14:00,43 yd FG by Matthew McDairmid,6,3
5,2,TD,2:26,Quindarius Tyerucker 5 yd run. (Matthew McDairmid kick),13,3
7,3,FG,12:20,42 yd FG by Matthew McDairmid,16,3
8,3,TD,4:38,Ashley Owens 5 yd run. (Silver Banana kick),16,10
10,4,FG,7:15,43 yd FG by Silver Banana,16,13
11,4,FG,0:25,37 yd FG by Silver Banana,16,16
13,5,FG,13:53,23 yd FG by Silver Banana,16,19


In [83]:
# Team Stats
# Row 0 of table 6 holds the column labels (taken from its 2nd column on);
# the remaining rows are the stat lines. .copy() makes teamStats an
# independent frame rather than a slice of boxList[6].
teamStats = boxList[6].iloc[1:].copy()
teamStats.columns = ['Stat'] + list(boxList[6].iloc[0, 1:])

# Working copy that the steps below overwrite with parsed values, while
# teamStats keeps the original text cells. Numeric conversion is applied at
# the end of this cell, on the assembled frame.
teamStatsDF = teamStats.copy()

def pct(teamStats,iloc1,iloc2):
    """Turn an "x/y" text cell into the ratio x / y.

    The cell is read positionally at row ``iloc1``, column ``iloc2`` of
    ``teamStats``. Returns 0 when the part after the slash is zero.
    """
    pieces = teamStats.iloc[iloc1,iloc2].split('/')
    made, tried = int(pieces[0]), int(pieces[1])
    return made/tried if tried != 0 else 0
    
def _intPair(cell, sep, trim=0):
    """Split a text cell on ``sep`` and return its first two pieces as ints.

    ``trim`` characters are removed from the end of the second piece first
    (used to drop the closing parenthesis of an "n (m)" cell).
    """
    parts = cell.split(sep)
    tail = parts[1][:-trim] if trim else parts[1]
    return int(parts[0]), int(tail)


def _numericIfPossible(col):
    """Return ``col`` converted with pd.to_numeric, or unchanged if any value
    cannot be parsed (same result as the deprecated ``errors='ignore'``)."""
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# Replace the "x/y" cells in rows 1, 2 and 5 with the ratio x/y for both teams.
for op in [1,2,5]:
    teamStatsDF.iloc[op,1] = pct(teamStats,op,1)
    teamStatsDF.iloc[op,2] = pct(teamStats,op,2)

# Break the combined text cells into separate integers. Column 1 feeds the
# a* (away) values and column 2 the h* (home) values.
aComp, aAtt = _intPair(teamStats.iloc[5,1], '/')
hComp, hAtt = _intPair(teamStats.iloc[5,2], '/')
aPen, aPenY = _intPair(teamStats.iloc[10,1], '-')
hPen, hPenY = _intPair(teamStats.iloc[10,2], '-')
aFum, aFumL = _intPair(teamStats.iloc[12,1], ' (', trim=1)
hFum, hFumL = _intPair(teamStats.iloc[12,2], ' (', trim=1)

arr = [['Completions',aComp,hComp],
       ['Attempts',aAtt,hAtt],
       ['Penalties',aPen,hPen],
       ['Penalty Yards',aPenY,hPenY],
       ['Fumbles',aFum,hFum],
       ['Fumbles Lost',aFumL,hFumL]]

extraDF = pd.DataFrame(arr, columns = teamStats.columns)

# Splice the split-out rows in place of the combined rows they came from:
# rows 5, 10 and 12 are each replaced by a pair of rows from extraDF.
teamStatsDF2 = pd.concat([teamStatsDF.iloc[:5],extraDF.iloc[:2],teamStatsDF.iloc[6:10],extraDF.iloc[2:4],teamStatsDF.iloc[11:12],extraDF.iloc[4:],teamStatsDF.iloc[13:]])
teamStatsDF2[awayTeam] = _numericIfPossible(teamStatsDF2[awayTeam])
teamStatsDF2[homeTeam] = _numericIfPossible(teamStatsDF2[homeTeam])

teamStatsDF2

Unnamed: 0,Stat,SJS,COL
1,First Downs,15,17
2,3rd Down Efficiency,0.333333,0.333333
3,4rd Down Efficiency,0,1
4,Total Yards,304,382
5,Passing,165,160
0,Completions,17,12
1,Attempts,29,22
7,Yards per Pass,5.7,7.3
8,Rushing,139,222
9,Rushing Attempts,30,42


In [16]:
def posStatDF(boxList,index,homeTeam,awayTeam):
    """Combine the away and home player tables of one stat category.

    Parameters
    ----------
    boxList : list of DataFrame
        Tables parsed from the box-score page.
    index : int
        Position of the away team's table; the home team's table is taken
        from ``index + 1``.
    homeTeam, awayTeam : str
        Labels written into the ``Team`` column for each side.

    Returns
    -------
    DataFrame
        Home rows followed by away rows, with ``Team`` as the first column
        and ``Player`` as the second. The input tables are not modified.
    """
    # Column labels come from the first row of the away table and are applied
    # to both tables (the home table's own first row is simply dropped).
    header = ['Player'] + list(boxList[index].iloc[0, 1:])

    def _teamRows(table, team):
        # Copy the slice so adding 'Team' never writes into (or warns about)
        # a view of the parsed table.
        rows = table.iloc[1:].copy()
        rows.columns = header
        rows['Team'] = team
        return rows

    away = _teamRows(boxList[index], awayTeam)
    home = _teamRows(boxList[index+1], homeTeam)

    cols = ['Team'] + header
    return pd.concat([home,away])[cols]

In [21]:
# Each stat category occupies two consecutive tables (away, then home),
# starting at table 8 and stepping by two up to table 22.
(passStats, rushStats, recStats, kickStats,
 puntStats, specStats, defStats, othStats) = [
    posStatDF(boxList, tableIdx, homeTeam, awayTeam)
    for tableIdx in range(8, 24, 2)
]