## Basketball-Reference Web Scraping

In [1]:
# special IPython command to prepare the notebook for matplotlib
from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
import string
from pattern import web

###Team Mapping

In [2]:
team_name_list = ["Atlanta Hawks","Boston Celtics","Brooklyn Nets","Charlotte Bobcats","Chicago Bulls","Charlotte Hornets","Cleveland Cavaliers","Dallas Mavericks","Denver Nuggets","Detroit Pistons"
             ,"Golden State Warriors","Houston Rockets","Indiana Pacers","Los Angeles Clippers","Los Angeles Lakers","Memphis Grizzlies","Miami Heat","Milwaukee Bucks","Minnesota Timberwolves","New Jersey Nets"
             ,"New Orleans Hornets","New Orleans/Oklahoma City Hornets","New Orleans Pelicans","New York Knicks","Oklahoma City Thunder","Orlando Magic","Philadelphia 76ers","Phoenix Suns","Portland Trail Blazers","Sacramento Kings"
             ,"San Antonio Spurs","Seattle SuperSonics","Toronto Raptors","Utah Jazz","Washington Wizards","League Average"]

alias_list = ["ATL","BOS","BRK","CHA","CHI","CHO","CLE","DAL","DEN","DET"
              ,"GSW","HOU","IND","LAC","LAL","MEM","MIA","MIL","MIN","NJN"
              ,"NOH","NOK","NOP","NYK","OKC","ORL","PHI","PHO","POR","SAC"
              ,"SAS","SEA","TOR","UTA","WAS","League"]

###Seaon

In [3]:
year_list = (range(2010, 2016))
print year_list
year = year_list[-1]
print year

[2010, 2011, 2012, 2013, 2014, 2015]
2015


## Fetching the basketball-reference

In [4]:
#url = 'http://www.basketball-reference.com/playoffs/NBA_2014.html'

def strYear(year):
    url='http://www.basketball-reference.com/leagues/NBA_%d.html' % year
    return url
team_url = strYear(year)
print team_url

http://www.basketball-reference.com/leagues/NBA_2015.html


## Parsing html data

In [5]:
def get_team_html_tables(html):
    """Parse html and return html tables of basketball-reference team data."""

    dom = web.Element(html)
    
    #### 1. step: get all tables
    #tbls = dom('table')
    #### 2. step: get all tables we care about
    tbls = dom.by_class('sortable')
    return tbls

def team_table_type(tbl):
    ### Extract the team_table type
    return tbl.id

def team_information(team_information_url):    
    team_website_html = requests.get(team_information_url).text
    #print team_website_html.decode
    team_information_tables = get_team_html_tables(team_website_html)
    #print team_information_tables
    
    ### team_table to dict ####
    team_table_by_type = {}
    
    for tbl in team_information_tables:
        typ = team_table_type(tbl)
        if typ not in team_table_by_type:
            team_table_by_type[typ] = list()  # equivalent to []
        team_table_by_type[typ].append(tbl)

    # group the team_tables by type
    team_tables_by_type = defaultdict(list)
    
    # defaultdicts have a default value that is inserted when a new key is accessed
    for tbl in team_information_tables:
        team_tables_by_type[team_table_type(tbl)].append(tbl)
    #print team_tables_by_type
    
    # return team_information_tables,team_tables_by_type    
    return (team_information_tables,team_tables_by_type)

### Function call ###
(team_information_tables,team_tables_by_type) = team_information(team_url)
print "table length: %d" %len(team_information_tables)
for t in team_information_tables:
    print t.attributes

print "=" *80
    
print team_tables_by_type

table length: 7
{u'data-mobile-leave-headers': u'1', u'data-freeze': u'1', u'class': u'sortable  no_highlight stats_table wide_table', u'id': u'E_standings'}
{u'data-mobile-leave-headers': u'1', u'data-freeze': u'1', u'class': u'sortable  no_highlight stats_table wide_table', u'id': u'W_standings'}
{u'class': u'sortable  stats_table', u'id': u'team'}
{u'class': u'sortable  stats_table', u'id': u'opponent'}
{u'class': u'sortable  stats_table', u'id': u'misc'}
{u'class': u'sortable  stats_table', u'id': u'shooting'}
{u'class': u'sortable  stats_table', u'id': u'shooting_opp'}
defaultdict(<type 'list'>, {u'shooting_opp': [Element(tag=u'table')], u'misc': [Element(tag=u'table')], u'E_standings': [Element(tag=u'table')], u'W_standings': [Element(tag=u'table')], u'team': [Element(tag=u'table')], u'shooting': [Element(tag=u'table')], u'opponent': [Element(tag=u'table')]})


## Extracting data and filling it into a dictionary

### Team Stats

In [6]:
def get_teamdata(tables):
    """Extract 2013-14 playoffs team data from Team Stats tables and store it in dictionary."""
    
    ### result = defaultdict(dict) ###
    team_list=[] ### (Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P,2P,2PA,2P,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PTS/G)
    team_dict={}
    
    # 1. step: try to extract data for a single table
    for tbl in tables:
        headers = tbl('tr')
        #print headers
        first_header = headers[0]
        #print first_header
        th_s = first_header('th')
        #print th_s    
        title = [val.content for val in th_s ]
        #print team_title
        
        # 2. step: iterate over all tables, extract headings and actual data and combine data into single dict
        rows = tbl('tr')[1:]        
        for row in rows:
            
            tds = row('td')
            
            ### if row content is null , set default string ###
            f_content = (lambda x: "" if len(x) == 0 else x.content)
            ### if row a_herf is null , set default string ###
            f_ahref = (lambda x: x.content if len(x('a')) == 0 else x('a')[0].content)
            
            Rk           = f_content(tds[0])
            Team         = f_ahref(tds[1])
            Tm           = alias_list[team_name_list.index(Team)]        
            G            = f_content(tds[2])
            MP           = f_content(tds[3])
            FG           = f_content(tds[4])
            FGA          = f_content(tds[5])
            FG_Per       = f_content(tds[6])
            three_P      = f_content(tds[7])
            three_PA     = f_content(tds[8])
            three_P_Per  = f_content(tds[9])
            two_P        = f_content(tds[10])
            two_PA       = f_content(tds[11])
            two_P_Per    = f_content(tds[12])
            FT           = f_content(tds[13])
            FTA          = f_content(tds[14])
            FT_Per       = f_content(tds[15])
            ORB          = f_content(tds[16])
            DRB          = f_content(tds[17])
            TRB          = f_content(tds[18])
            AST          = f_content(tds[19])
            STL          = f_content(tds[20])
            BLK          = f_content(tds[21])
            TOV          = f_content(tds[22])
            PF           = f_content(tds[23])
            PTS          = f_content(tds[24])
            PTSG         = f_content(tds[25]) 
                     
            ### set team_information to list ###
            team_list=[Tm, G, MP, FG, FGA, FG_Per, three_P, three_PA, three_P_Per, two_P, two_PA, 
                       two_P_Per, FT, FTA, FT_Per, ORB, DRB, TRB, AST, STL, BLK, 
                       TOV, PF, PTS, PTSG]            
               
            ### team_dict add Key(Team)，value(team_list) ###
            team_dict[Tm]=team_list
                   
    return team_dict

### team stats table data ###
team_stats_dict={}
team_stats_dict = get_teamdata(team_tables_by_type[u'team'])

if len(team_stats_dict) > 0:    
    df_team_stats = pd.DataFrame(team_stats_dict.values())    
    
    Season = "%d-%d" % (year-1,year)
    df_team_stats['Season'] = Season
    df_team_stats.columns=['tm_Tm','tm_G','tm_MP','tm_FG','tm_FGA','tm_FG_Per','tm_three_P','tm_three_PA','tm_three_P','tm_twoP',
                           'tm_twoPA','tm_twoP','tm_FT','tm_FTA','tm_FT_Per','tm_ORB','tm_DRB','tm_TRB','tm_AST','tm_STL',
                           'tm_BLK','tm_TOV','tm_PF','tm_PTS','tm_PTSG','Season']
    
    ### df league ###
    df_league_team_stats = df_team_stats.loc[df_team_stats['tm_Tm'] == "League"]
    print df_league_team_stats
    
    print "=" * 120
    
    ### df team without row == league ###
    df_team_stats = df_team_stats[df_team_stats['tm_Tm'] != "League"]
    print df_team_stats



     tm_Tm tm_G  tm_MP tm_FG tm_FGA tm_FG_Per tm_three_P tm_three_PA  \
15  League   82  19840  3076   6852      .449        643        1838   

   tm_three_P tm_twoP    ...     tm_DRB tm_TRB tm_AST tm_STL tm_BLK tm_TOV  \
15       .350    2433    ...       2657   3550   1807    634    393   1177   

   tm_PF tm_PTS tm_PTSG     Season  
15  1658   8201   100.0  2014-2015  

[1 rows x 26 columns]
   tm_Tm tm_G  tm_MP tm_FG tm_FGA tm_FG_Per tm_three_P tm_three_PA tm_three_P  \
0    MIL   82  19930  3083   6722      .459        545        1500       .363   
1    GSW   82  19730  3410   7137      .478        883        2217       .398   
2    MIN   82  19805  2986   6820      .438        406        1223       .332   
3    TOR   82  19855  3108   6829      .455        726        2060       .352   
4    ATL   82  19730  3121   6699      .466        818        2152       .380   
5    BOS   82  19880  3193   7211      .443        660        2021       .327   
6    DET   82  19830  3041   7038 

### Opp

In [12]:
### team stats table data ###
opp_stats_dict={}
opp_stats_dict = get_teamdata(team_tables_by_type[u'opponent'])

if len(opp_stats_dict) > 0:
    df_opp_stats = pd.DataFrame(opp_stats_dict.values())
    Season = "%d-%d" % (year-1,year)
    df_opp_stats['Season'] = Season
    df_opp_stats.columns=['opp_Tm','opp_G','opp_MP','opp_FG','opp_FGA','opp_FG_Per','opp_three_P','opp_three_PA','opp_three_P','opp_twoP',
                       'opp_twoPA','opp_twoP','opp_FT','opp_FTA','opp_FT_Per','opp_ORB','opp_DRB','opp_TRB','opp_AST','opp_STL',
                       'opp_BLK','opp_TOV','opp_PF','opp_PTS','opp_PTSG','Season']   

    ### df league opp ###
    df_league_opp_stats = df_opp_stats.loc[df_opp_stats['opp_Tm'] == "League"]
    print df_league_opp_stats
    
    print "=" * 120
    
    ### df opp without row == league ###
    df_opp_stats = df_opp_stats[df_opp_stats['opp_Tm'] != "League"]
    print df_opp_stats

    opp_Tm opp_G opp_MP opp_FG opp_FGA opp_FG_Per opp_three_P opp_three_PA  \
15  League    82  19840   3076    6852       .449         643         1838   

   opp_three_P opp_twoP    ...     opp_DRB opp_TRB opp_AST opp_STL opp_BLK  \
15        .350     2433    ...        2657    3550    1807     634     393   

   opp_TOV opp_PF opp_PTS opp_PTSG     Season  
15    1177   1658    8201    100.0  2014-2015  

[1 rows x 26 columns]
   opp_Tm opp_G opp_MP opp_FG opp_FGA opp_FG_Per opp_three_P opp_three_PA  \
0     MIL    82  19930   2917    6681       .437         676         1974   
1     GSW    82  19730   3031    7084       .428         592         1757   
2     MIN    82  19805   3373    6933       .487         697         1897   
3     MIA    82  19730   2960    6523       .454         666         1887   
4     ATL    82  19730   2991    6815       .439         720         2112   
5     BOS    82  19880   3125    6943       .450         607         1804   
6     DET    82  19830   310

### Output Team stats

In [13]:
def Output_Team(output_df):
    output_file = open("e:\\nba\\nba_team.csv", "ab")
    output_df.to_csv(output_file,index=False)
    output_file.close()

Output_Team(df_team_stats)

### Output League Team stats

In [14]:
def Output_League(output_df):
    output_file = open("e:\\nba\\nba_league_team.csv", "ab")
    output_df.to_csv(output_file,index=False)
    output_file.close()

Output_League(df_league_team_stats)

### Output Opp stats

In [15]:
def Output_Opp(output_df):
    output_file = open("e:\\nba\\nba_opp.csv", "ab")
    output_df.to_csv(output_file,index=False)
    output_file.close()

Output_Opp(df_opp_stats)

### Output League Opp stats

In [16]:
def Output_League_Opp(output_df):
    output_file = open("e:\\nba\\nba_league_opp.csv", "ab")
    output_df.to_csv(output_file,index=False)
    output_file.close()

Output_League_Opp(df_league_opp_stats)

###Team information

In [17]:
def All_team(year,team_url):
    ### Information table ###
    (team_information_tables,team_tables_by_type) = team_information(team_url)

    ###  team stats table ###
    team_stats_dict=get_teamdata(team_tables_by_type[u'team'])
    
    if len(team_stats_dict) > 0:
        df_team_stats = pd.DataFrame(team_stats_dict.values())
        Season = "%d-%d" % (year-1,year)
        df_team_stats['Season'] = Season
        df_team_stats.columns=['tm_Tm','tm_G','tm_MP','tm_FG','tm_FGA','tm_FG_Per','tm_three_P','tm_three_PA','tm_three_P','tm_twoP',
                           'tm_twoPA','tm_twoP','tm_FT','tm_FTA','tm_FT_Per','tm_ORB','tm_DRB','tm_TRB','tm_AST','tm_STL',
                           'tm_BLK','tm_TOV','tm_PF','tm_PTS','tm_PTSG','Season']
        df_league_team_stats =  df_team_stats.loc[df_team_stats['tm_Tm'] == "League"]
        df_team_stats = df_team_stats[df_team_stats['tm_Tm'] != "League"]

        Output_Team(df_team_stats)
        Output_League(df_league_team_stats)
    
    ### opp stats table ###
    opp_stats_dict=get_teamdata(team_tables_by_type[u'opponent'])
    
    if len(opp_stats_dict) > 0:
        df_opp_stats = pd.DataFrame(opp_stats_dict.values())
        Season = "%d-%d" % (year-1,year)
        df_opp_stats['Season'] = Season
        df_opp_stats.columns=['opp_Tm','opp_G','opp_MP','opp_FG','opp_FGA','opp_FG_Per','opp_three_P','opp_three_PA','opp_three_P','opp_twoP',
                           'opp_twoPA','opp_twoP','opp_FT','opp_FTA','opp_FT_Per','opp_ORB','opp_DRB','opp_TRB','opp_AST','opp_STL',
                           'opp_BLK','opp_TOV','opp_PF','opp_PTS','opp_PTSG','Season']
        df_league_opp_stats =  df_opp_stats.loc[df_opp_stats['opp_Tm'] == "League"]
        df_opp_stats = df_opp_stats[df_opp_stats['opp_Tm'] != "League"]

        Output_Opp(df_opp_stats)
        Output_League_Opp(df_league_opp_stats)

### Function Call ###

for year in year_list:
    team_url = strYear(year)
    #print year,team_url
    All_team(year,team_url)