In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
from time import sleep

In [2]:
# Load the previously pickled team dictionary. The ``with`` block closes the
# file on exit, so no explicit close() call is needed afterwards.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../pickled_data/team_dict.pickle", "rb") as dict_file:
    team_dict = pickle.load(dict_file)

In [3]:
# URL template; the three slots are filled with season, page number and view.
url = "http://www.espn.com/mens-college-basketball/bpi/_/season/{}/page/{}/view/{}"
# Used as the exclusive upper bound of range(1, num_pages), i.e. pages 1-15.
num_pages = 16
seasons = list(range(2008, 2018))
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(seasons)

[2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


In [6]:
# Function to scrape data from ESPN into a single DataFrame

def get_stats(statstype, delay=1):
    """Scrape one ESPN BPI view for every season and page into one DataFrame.

    Each page URL is built from the module-level ``url`` template using every
    entry of ``seasons`` and every page number in ``range(1, num_pages)``.
    The page's HTML tables are read with ``pd.read_html``; the table at
    index 1 is kept and tagged with a ``Season`` column.

    Parameters
    ----------
    statstype : str
        Value substituted into the final ``view/{}`` slot of ``url``
        (this notebook passes "resume" and "bpi").
    delay : int or float, optional
        Seconds to sleep before *every* request (default 1). Increase it if
        the connection gets reset while scraping.

    Returns
    -------
    pandas.DataFrame
        All scraped pages concatenated with a fresh 0..n-1 index, or an
        empty DataFrame if no page yielded a table.
    """
    frames = []
    for season in seasons:
        for num in range(1, num_pages):
            formatted_url = url.format(str(season), str(num), statstype)
            # Throttle every request, not only the first one, so the server
            # is less likely to reset the connection mid-scrape.
            sleep(delay)
            try:
                page_df = pd.read_html(formatted_url)[1]
            except IndexError:
                # Depending on the season there can be fewer than 15 pages;
                # report the URL that had no stats table and move on.
                print(formatted_url)
                continue
            page_df["Season"] = season
            frames.append(page_df)

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame page by page.
    return pd.concat(frames, ignore_index=True)

In [7]:
# Scrape ESPN's team "resume" view for every season into one DataFrame.
resume_df = get_stats("resume")
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(resume_df))
resume_df.tail()

http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/1/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/2/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/3/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/4/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/5/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/6/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/7/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/8/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/9/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/10/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/11/view/resume
http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/12/view/resume
http://www.espn.com/mens-

error: [Errno 54] Connection reset by peer

In [22]:
# Strip the short-form team name glued onto the end of TEAM. The pattern only
# removes a trailing run of capitals that directly follows a lowercase
# letter, so values such as "UCLAUCLA" are left untouched.
abbrev_suffix = re.compile(r"(?<=[a-z])[A-Z]+$")
resume_df["TEAM"] = resume_df["TEAM"].apply(lambda name: abbrev_suffix.sub("", name))

# Discard the resume columns that are not used further down.
unused_resume_cols = ["CONF", "Seed", "W-L", "SOR S-Curve"]
resume_df.drop(unused_resume_cols, axis=1, inplace=True)
resume_df.head()

Unnamed: 0,RK,TEAM,SOR RK,Qual W-L,SOS RK,Non-Conf SOS RK,Season
0,1,North Carolina,1,13-3,9,12,2008
1,2,Kansas,2,15-2,46,26,2008
2,3,Memphis,3,10-2,76,7,2008
3,4,UCLAUCLA,4,16-3,28,39,2008
4,5,Tennessee,5,12-4,19,5,2008


In [23]:
# Scrape ESPN's "bpi" view for every season into one DataFrame.
bpi_df = get_stats(statstype="bpi")

http://www.espn.com/mens-college-basketball/bpi/_/season/2008/page/15/view/bpi
http://www.espn.com/mens-college-basketball/bpi/_/season/2009/page/15/view/bpi
http://www.espn.com/mens-college-basketball/bpi/_/season/2010/page/15/view/bpi
http://www.espn.com/mens-college-basketball/bpi/_/season/2011/page/15/view/bpi
http://www.espn.com/mens-college-basketball/bpi/_/season/2012/page/15/view/bpi
http://www.espn.com/mens-college-basketball/bpi/_/season/2013/page/15/view/bpi


In [24]:
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(bpi_df))
# Strip the short-form team name glued onto the end of TEAM. Only a trailing
# run of capitals directly after a lowercase letter is removed, so values
# such as "Alabama A&MAAMU" keep their suffix.
bpi_df["TEAM"] = bpi_df["TEAM"].apply(lambda x: re.sub(r"(?<=[a-z])[A-Z]+$", "", x))
# Discard the BPI columns that are not used further down.
bpi_df.drop(["CONF", "W-L", "7-Day RK CHG"], axis=1, inplace=True)
bpi_df.tail()

3475


Unnamed: 0,RK,TEAM,BPI Off,BPI Def,BPI,Season
3470,347,Arkansas-Pine Bluff,-12.5,-2.8,-15.3,2017
3471,348,Coppin State,-9.0,-7.7,-16.7,2017
3472,349,Presbyterian College,-9.8,-7.3,-17.1,2017
3473,350,North Carolina A&TNCAT,-7.4,-10.1,-17.5,2017
3474,351,Alabama A&MAAMU,-8.5,-10.5,-19.0,2017


In [25]:
# Order both scraped frames by season, then by team name, in place.
sort_keys = ["Season", "TEAM"]
for frame in (resume_df, bpi_df):
    frame.sort_values(by=sort_keys, inplace=True)

In [26]:
# Combine resume and BPI stats per (TEAM, Season). The join is an inner join
# (merge's default, spelled out here), so a row present in only one frame is
# dropped; compare the printed length with len(bpi_df) to spot such losses.
espn_df = resume_df.merge(bpi_df, how="inner", on=["TEAM", "Season"])
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(espn_df))
espn_df.head()

3474


Unnamed: 0,RK_x,TEAM,SOR RK,Qual W-L,SOS RK,Non-Conf SOS RK,Season,RK_y,BPI Off,BPI Def,BPI
0,162,Air Force,162,1-4,155,307,2008,180,-9.5,8.4,-1.1
1,77,Akron,77,0-1,138,133,2008,73,2.9,3.7,6.6
2,97,Alabama,97,4-10,63,135,2008,80,6.5,-0.5,6.0
3,T312,Alabama A&MAAMU,312,0-1,336,205,2008,313,-7.5,-2.6,-10.1
4,205,Alabama State,205,0-1,334,210,2008,238,-3.1,-1.2,-4.3


In [27]:
def _match_team(espn_name, candidates):
    """Return the longest candidate contained in ``espn_name``, else "fail"."""
    hits = [team for team in candidates if team in espn_name]
    # Prefer the longest hit so that e.g. "Alabama A&M" beats "Alabama"
    # regardless of the order in which the candidates are listed.
    return max(hits, key=len) if hits else "fail"


team_list = list(team_dict.values())

# Map every ESPN team string onto a name from team_dict via substring
# matching. Rows without any match get the sentinel "fail"; assigning the
# whole column at once makes sure that sentinel really lands in the frame
# (writing to the row yielded by iterrows only changes a temporary copy).
espn_df["Teamname"] = [_match_team(name, team_list) for name in espn_df["TEAM"]]
            

In [28]:
# Keep only the columns needed downstream, in this order.
cols = [
    "Season",
    "Teamname",
    "SOR RK",
    "SOS RK",
    "Non-Conf SOS RK",
    "Qual W-L",
    "BPI",
]
espn_df = espn_df.loc[:, cols]
espn_df.head()

Unnamed: 0,Season,Teamname,SOR RK,SOS RK,Non-Conf SOS RK,Qual W-L,BPI
0,2008,Air Force,162,155,307,1-4,-1.1
1,2008,Akron,77,138,133,0-1,6.6
2,2008,Alabama,97,63,135,4-10,6.0
3,2008,Alabama A&M,312,336,205,0-1,-10.1
4,2008,Alabama St,205,334,210,0-1,-4.3


In [29]:
# Split "Qual W-L" strings such as "4-10" at the first "-" into two columns.
# expand=True returns a DataFrame that keeps espn_df's index, so the later
# index-based join lines up row for row; n is passed by keyword because
# newer pandas no longer accepts it positionally. Values stay strings.
Q_cols = espn_df["Qual W-L"].str.split("-", n=1, expand=True)
Q_cols.columns = ["Q_wins", "Q_losses"]
Q_cols.head()

Unnamed: 0,Q_wins,Q_losses
0,1,4
1,0,1
2,4,10
3,0,1
4,0,1


In [32]:
# Attach the split win/loss columns by index, then drop the combined
# "Qual W-L" string column they were derived from.
ESPN_df = espn_df.join(Q_cols).drop("Qual W-L", axis=1)
ESPN_df.head()

Unnamed: 0,Season,Teamname,SOR RK,SOS RK,Non-Conf SOS RK,BPI,Q_wins,Q_losses
0,2008,Air Force,162,155,307,-1.1,1,4
1,2008,Akron,77,138,133,6.6,0,1
2,2008,Alabama,97,63,135,6.0,4,10
3,2008,Alabama A&M,312,336,205,-10.1,0,1
4,2008,Alabama St,205,334,210,-4.3,0,1


In [33]:
# Persist the final frame for later use. The ``with`` block closes the file
# on exit, so no explicit close() call is needed afterwards.
with open("../pickled_data/espn_df.pickle", "wb") as e_df:
    pickle.dump(ESPN_df, e_df)