# Project Luther

Kenny Leung - kenleung11@gmail.com

Part 4/8 - Data scraping NBA team statistics

This notebook documents the process of scraping advanced statistics for all teams since 2003 from https://www.basketball-reference.com/.

In [1]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import pickle
from datetime import datetime

# Data Scraping

I hypothesized that an NBA rookie will perform better, or make a bigger contribution, on a team with fewer "good" players (how good a player is can be estimated by the player's BPM, Box Plus/Minus). Here I scrape season-average advanced statistics for all players on each team since 2003. First, I needed to obtain a list of url extensions for each team for each year. I started by getting the urls for all the teams in the 2017 season.

In [2]:
# Fetch the 2017 league page and collect the team-page url extensions.
url = 'https://www.basketball-reference.com/leagues/NBA_2017.html'

response = requests.get(url)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
# strip the HTML comment markers so markup inside comments is parsed as well
page = response.text.replace('<!--', '').replace('-->', '')
soup = BeautifulSoup(page, "lxml")

# Only anchors that actually carry an href are kept; this replaces the former
# bare `except: pass`, which also hid every other kind of error.
team_table = soup.findAll(class_="table_outer_container")[5]
url_list = [anchor['href'] for anchor in team_table.findAll('a', href=True)]

In [99]:
# display the collected team url extensions
url_list

['/teams/GSW/2017.html',
 '/teams/HOU/2017.html',
 '/teams/DEN/2017.html',
 '/teams/CLE/2017.html',
 '/teams/WAS/2017.html',
 '/teams/LAC/2017.html',
 '/teams/BOS/2017.html',
 '/teams/POR/2017.html',
 '/teams/PHO/2017.html',
 '/teams/TOR/2017.html',
 '/teams/OKC/2017.html',
 '/teams/BRK/2017.html',
 '/teams/MIN/2017.html',
 '/teams/SAS/2017.html',
 '/teams/IND/2017.html',
 '/teams/CHO/2017.html',
 '/teams/LAL/2017.html',
 '/teams/NOP/2017.html',
 '/teams/NYK/2017.html',
 '/teams/MIL/2017.html',
 '/teams/MIA/2017.html',
 '/teams/ATL/2017.html',
 '/teams/CHI/2017.html',
 '/teams/SAC/2017.html',
 '/teams/PHI/2017.html',
 '/teams/DET/2017.html',
 '/teams/ORL/2017.html',
 '/teams/UTA/2017.html',
 '/teams/MEM/2017.html',
 '/teams/DAL/2017.html']

# Scrape for urls for all teams since 2003

Now to get the url extensions for all teams since 2003. Note that I cannot simply change the year in the url, because a few organizations have moved their team to another city, which changed the url. For example, prior to 2008 the Oklahoma City Thunder were the Seattle SuperSonics, and in 2012 the New Jersey Nets moved to Brooklyn.

In [101]:
# Collect the team-page url extensions for every season from 2003 through 2017.
url_template = 'https://www.basketball-reference.com/leagues/NBA_{year}.html'
url_list = []

for year in range(2003, 2018):
    print(year)
    url = url_template.format(year=year)  # get the url

    response = requests.get(url)
    response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    # strip the HTML comment markers so markup inside comments is parsed as well
    page = response.text.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(page, "lxml")

    # Only anchors that carry an href are kept; this replaces the former
    # bare `except: pass`, which also hid every other kind of error.
    team_table = soup.findAll(class_="table_outer_container")[5]
    url_list.extend(anchor['href'] for anchor in team_table.findAll('a', href=True))

    # random pause between requests
    time.sleep(.5+2*random.random())

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017


In [None]:
# inspect the url extensions accumulated by the loop above
url_list

In [None]:
# serialize the url list to disk with pickle
with open('team_year_url_list.pkl', 'wb') as out_file:
    pickle.dump(url_list, out_file)

In [30]:
# restore the url list from the pickle file
with open("team_year_url_list.pkl", 'rb') as in_file:
    url_list = pickle.load(in_file)

# Scraping the Advanced Statistics for players on the 2016-2017 Boston Celtics

Now that I have the url extensions for each team since 2003, I can loop through the list to scrape all the data I need. First, I wanted to make sure the code works by focusing on the Boston Celtics' 2016-2017 season.

In [None]:
# Build a DataFrame from one table of the 2016-17 Celtics page as a dry run
# for the full scraping loop.
url = 'https://www.basketball-reference.com/teams/BOS/2017.html'

response = requests.get(url)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
# strip the HTML comment markers so markup inside comments is parsed as well
page = response.text.replace('<!--', '').replace('-->', '')
soup = BeautifulSoup(page, "lxml")

# The first <h1> is split on newlines: piece 1 starts with the 4-digit year,
# piece 2 holds the team name.
title_parts = soup.find_all('h1')[0].getText().split('\n')
Year = int(title_parts[1][:4]) # get year
Team = title_parts[2] # get team name

stats_table = soup.findAll(class_="overthrow table_container")[7]
column_headers = [th.getText() for th in stats_table.findAll('th')][0:27]
data_rows = stats_table.findAll('tr')[1:]

# each row becomes: team name, year, an empty placeholder column, then the <td> values
player_data = [[Team, Year, ''] + [td.getText() for td in row.findAll('td')]
               for row in data_rows]

# The header order must mirror the row prefix (Team, Year). Inserting "Year"
# first and "Team" second yields ['Team', 'Year', ...]; the previous order
# labeled the team column "Year" and the year column "Team".
column_headers.insert(0,"Year") # insert year header
column_headers.insert(0,"Team") # insert team header

df = pd.DataFrame(player_data, columns=column_headers)

In [None]:
# display the frame built for the Celtics test case
df

# Scraping the Advanced Statistics for each player on each team since 2003

Now that the code works, I loop through the url_list obtained above to get the advanced stats for all players on each team for each season since 2003.

In [None]:
# Scrape one stats table per team-season page and stack the results.
url_template2 = "https://www.basketball-reference.com{team_year}"
# One DataFrame per team-season, concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
team_frames = []

for team_year in url_list:
    print(team_year)
    url = url_template2.format(team_year = team_year)

    response = requests.get(url)
    response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    # strip the HTML comment markers so markup inside comments is parsed as well
    page = response.text.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(page, "lxml")

    # first <h1>, split on newlines: piece 1 starts with the year, piece 2 is the team name
    title_parts = soup.find_all('h1')[0].getText().split('\n')
    Year = int(title_parts[1][:4])
    Team = title_parts[2]

    stats_table = soup.findAll(class_="overthrow table_container")[7]
    column_headers = ["Team", "Year"] + [th.getText() for th in stats_table.findAll('th')][0:27]
    data_rows = stats_table.findAll('tr')[1:]

    # each row becomes: team name, year, an empty placeholder column, then the <td> values
    player_data = [[Team, Year, ''] + [td.getText() for td in row.findAll('td')]
                   for row in data_rows]

    df = pd.DataFrame(player_data, columns=column_headers)
    team_frames.append(df)

    # random pause between requests
    time.sleep(.5+2*random.random())

# pd.concat raises on an empty list, so fall back to an empty frame as before
team_year_df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [None]:
# Relabel every column; the three unwanted columns all get the label 'null'.
team_year_df.columns = [
    'Team', 'Year', 'null', 'Player', 'Age', 'G', 'MP',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'null', 'OWS', 'DWS', 'WS', 'WS/48',
    'null', 'OBPM', 'DBPM', 'BPM', 'VORP',
]

In [None]:
# remove every column labeled 'null'
team_year_df = team_year_df.drop(columns=['null'])

In [None]:
# write the scraped advanced stats to a csv file
team_year_df.to_csv(path_or_buf='team_year_raw.csv')

In [98]:
# reload the saved frame, using the first csv column as the index
team_year_df = pd.read_csv(filepath_or_buffer='team_year_raw.csv', index_col=0)

# Scraping the Per Game Statistics for each player on each team since 2003

In [37]:
# Scrape one stats table per team-season page and stack the results.
url_template2 = "https://www.basketball-reference.com{team_year}"
# One DataFrame per team-season, concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
team_frames = []

for team_year in url_list:
    print(team_year)
    url = url_template2.format(team_year = team_year)

    response = requests.get(url)
    response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    # strip the HTML comment markers so markup inside comments is parsed as well
    page = response.text.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(page, "lxml")

    # first <h1>, split on newlines: piece 1 starts with the year, piece 2 is the team name
    title_parts = soup.find_all('h1')[0].getText().split('\n')
    Year = int(title_parts[1][:4])
    Team = title_parts[2]

    stats_table = soup.findAll(class_="overthrow table_container")[3]
    # the first <th> is skipped here, so no placeholder column is needed in the rows
    column_headers = ["Team", "Year"] + [th.getText() for th in stats_table.findAll('th')][1:28]
    data_rows = stats_table.findAll('tr')[1:]

    # each row becomes: team name, year, then the <td> values
    player_data = [[Team, Year] + [td.getText() for td in row.findAll('td')]
                   for row in data_rows]

    df = pd.DataFrame(player_data, columns=column_headers)
    team_frames.append(df)

    # random pause between requests
    time.sleep(.5+2*random.random())

# pd.concat raises on an empty list, so fall back to an empty frame as before
team_year_df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

/teams/DAL/2003.html
/teams/GSW/2003.html
/teams/SAC/2003.html
/teams/LAL/2003.html
/teams/MIL/2003.html
/teams/ORL/2003.html
/teams/MIN/2003.html
/teams/MEM/2003.html
/teams/PHI/2003.html
/teams/IND/2003.html
/teams/NYK/2003.html
/teams/SAS/2003.html
/teams/PHO/2003.html
/teams/NJN/2003.html
/teams/POR/2003.html
/teams/CHI/2003.html
/teams/UTA/2003.html
/teams/ATL/2003.html
/teams/NOH/2003.html
/teams/LAC/2003.html
/teams/HOU/2003.html
/teams/BOS/2003.html
/teams/SEA/2003.html
/teams/WAS/2003.html
/teams/CLE/2003.html
/teams/DET/2003.html
/teams/TOR/2003.html
/teams/MIA/2003.html
/teams/DEN/2003.html
/teams/DAL/2004.html
/teams/SAC/2004.html
/teams/LAL/2004.html
/teams/MIL/2004.html
/teams/DEN/2004.html
/teams/SEA/2004.html
/teams/MEM/2004.html
/teams/BOS/2004.html
/teams/LAC/2004.html
/teams/MIN/2004.html
/teams/PHO/2004.html
/teams/ORL/2004.html
/teams/GSW/2004.html
/teams/CLE/2004.html
/teams/ATL/2004.html
/teams/NYK/2004.html
/teams/NOH/2004.html
/teams/WAS/2004.html
/teams/SAS/20

/teams/HOU/2016.html
/teams/BOS/2016.html
/teams/POR/2016.html
/teams/LAC/2016.html
/teams/CLE/2016.html
/teams/WAS/2016.html
/teams/SAS/2016.html
/teams/CHO/2016.html
/teams/ATL/2016.html
/teams/NOP/2016.html
/teams/TOR/2016.html
/teams/MIN/2016.html
/teams/DAL/2016.html
/teams/IND/2016.html
/teams/ORL/2016.html
/teams/DET/2016.html
/teams/DEN/2016.html
/teams/CHI/2016.html
/teams/PHO/2016.html
/teams/MIA/2016.html
/teams/MEM/2016.html
/teams/MIL/2016.html
/teams/BRK/2016.html
/teams/NYK/2016.html
/teams/UTA/2016.html
/teams/PHI/2016.html
/teams/LAL/2016.html
/teams/GSW/2017.html
/teams/HOU/2017.html
/teams/DEN/2017.html
/teams/CLE/2017.html
/teams/WAS/2017.html
/teams/LAC/2017.html
/teams/BOS/2017.html
/teams/POR/2017.html
/teams/PHO/2017.html
/teams/TOR/2017.html
/teams/OKC/2017.html
/teams/BRK/2017.html
/teams/MIN/2017.html
/teams/SAS/2017.html
/teams/IND/2017.html
/teams/CHO/2017.html
/teams/LAL/2017.html
/teams/NOP/2017.html
/teams/NYK/2017.html
/teams/MIL/2017.html
/teams/MIA/20

In [44]:
# Relabel every column of the per-game frame.
team_year_df.columns = [
    'Team', 'Year', 'Player', 'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS/G',
]

In [46]:
# write the scraped per-game stats to a csv file
team_year_df.to_csv(path_or_buf='team_year_stats.csv')

In [89]:
# reload the saved frame, using the first csv column as the index
team_year_df = pd.read_csv(filepath_or_buffer='team_year_stats.csv', index_col=0)

# Sorting and Grouping the data frame

One of the features that I think will indicate how well a rookie performs is the BPM of the players who were on the team in the previous year. For example, if a team contains many players with high BPM, it is less likely that a rookie will get an opportunity to contribute. Here, I group the data by team and year, then find and sum the BPMs of the top 3, top 5, and top 7 players. I also sum the BPMs of the top 13 players as a stand-in for the entire team for each season.

In [99]:
# Full team names. The order matters: the entries are paired by position
# with the abbreviations in `teams_abr`.
teams = [
    'Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets',
    'Charlotte Bobcats', 'Charlotte Hornets', 'Chicago Bulls',
    'Cleveland Cavaliers', 'Dallas Mavericks', 'Denver Nuggets',
    'Detroit Pistons', 'Golden State Warriors', 'Houston Rockets',
    'Indiana Pacers', 'Los Angeles Clippers', 'Los Angeles Lakers',
    'Memphis Grizzlies', 'Miami Heat', 'Milwaukee Bucks',
    'Minnesota Timberwolves', 'New Jersey Nets', 'New Orleans Hornets',
    'New Orleans Pelicans', 'New Orleans/Oklahoma City Hornets', 'New York Knicks',
    'Oklahoma City Thunder', 'Orlando Magic', 'Philadelphia 76ers',
    'Phoenix Suns', 'Portland Trail Blazers', 'Sacramento Kings',
    'San Antonio Spurs', 'Seattle SuperSonics', 'Toronto Raptors',
    'Utah Jazz', 'Washington Wizards',
]

In [100]:
# Team abbreviations, listed in the same order as the full names in `teams`.
# NOTE(review): 'Charlotte Hornets' is paired with 'CHH' here, while the scraped
# urls use 'CHO' (e.g. /teams/CHO/2016.html) -- confirm which code the draft
# dataframe expects.
teams_abr = [
    'ATL', 'BOS', 'BRK', 'CHA', 'CHH',
    'CHI', 'CLE', 'DAL', 'DEN', 'DET',
    'GSW', 'HOU', 'IND', 'LAC', 'LAL',
    'MEM', 'MIA', 'MIL', 'MIN', 'NJN',
    'NOH', 'NOP', 'NOK', 'NYK', 'OKC',
    'ORL', 'PHI', 'PHO', 'POR', 'SAC',
    'SAS', 'SEA', 'TOR', 'UTA', 'WAS',
]

In [101]:
# Map full team name -> abbreviation by pairing the two lists position by position.
# zip() silently stops at the shorter list, so make sure none of the pairs is lost.
assert len(teams) == len(teams_abr), "teams and teams_abr must have the same length"
team_dict = dict(zip(teams,teams_abr))

In [102]:
def team_abr(name):
    '''
    Change full team name to abbreviation to match draft dataframe.

    A value that is already one of the known abbreviations is returned
    unchanged, so applying the function again to an already converted
    column does not fail.

    Raises KeyError when `name` is neither a known full team name nor a
    known abbreviation.
    '''
    if name not in team_dict and name in team_dict.values():
        return name
    return team_dict[name]

In [103]:
# replace each full team name with its abbreviation
team_year_df['Team'] = team_year_df['Team'].map(team_abr)

# Sorting dataframe based on BPM

In [105]:
# The trailing semicolon suppresses the display of the frame.
# NOTE(review): the cells below sort on a 'BPM' column, but on a top-to-bottom
# run team_year_df was last loaded from 'team_year_stats.csv', whose columns
# (Team ... PTS/G) contain no 'BPM'. Confirm that 'team_year_raw.csv' is loaded
# and abbreviated before this section is run.
team_year_df;

In [106]:
# Per team and year: keep the 3 rows with the highest BPM and total their stats.
top3 = (
    team_year_df
    .sort_values('BPM', ascending=False)
    .groupby(['Team', 'Year']).head(3)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'BPM': 'BPM_top3'})
)

In [107]:
# keep only the key columns and the summed BPM
top3 = top3.loc[:, ['Team', 'Year', 'BPM_top3']]

In [108]:
# Per team and year: keep the 5 rows with the highest BPM and total their stats.
top5 = (
    team_year_df
    .sort_values('BPM', ascending=False)
    .groupby(['Team', 'Year']).head(5)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'BPM': 'BPM_top5'})
)

In [109]:
# keep only the key columns and the summed BPM
top5 = top5.loc[:, ['Team', 'Year', 'BPM_top5']]

In [110]:
# Per team and year: keep the 7 rows with the highest BPM and total their stats.
top7 = (
    team_year_df
    .sort_values('BPM', ascending=False)
    .groupby(['Team', 'Year']).head(7)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'BPM': 'BPM_top7'})
)

In [111]:
# keep only the key columns and the summed BPM
top7 = top7.loc[:, ['Team', 'Year', 'BPM_top7']]

In [112]:
# Per team and year: keep the 13 rows with the highest BPM and total their stats.
# Only the first 13 rows of each group are summed, so a group with more rows is truncated.
top13 = (
    team_year_df
    .sort_values('BPM', ascending=False)
    .groupby(['Team', 'Year']).head(13)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'BPM': 'BPM_top13'})
)

In [113]:
# keep only the key columns and the summed BPM
top13 = top13.loc[:, ['Team', 'Year', 'BPM_top13']]

In [114]:
# left-join the top-5, top-7 and top-13 sums onto the top-3 frame by team and year
BPMs = (
    top3
    .merge(top5, how='left', on=['Team', 'Year'])
    .merge(top7, how='left', on=['Team', 'Year'])
    .merge(top13, how='left', on=['Team', 'Year'])
)

In [115]:
# relabel the key columns as Draft_Tm / Draft_Yr
BPMs.columns = [
    'Draft_Tm', 'Draft_Yr',
    'BPM_top3', 'BPM_top5', 'BPM_top7', 'BPM_top13',
]

In [116]:
# combined key: the year rendered as text followed by the team abbreviation
BPMs['Yr_Tm'] = BPMs["Draft_Yr"].map(str).add(BPMs["Draft_Tm"])

In [117]:
# ratio of the top-3 BPM sum to the top-5 BPM sum
# NOTE(review): a zero or negative denominator gives inf or flips the sign -- confirm this is acceptable
BPMs['top3/top5'] = BPMs['BPM_top3'].div(BPMs['BPM_top5'])

In [118]:
# ratio of the top-3 BPM sum to the top-7 BPM sum
BPMs['top3/top7'] = BPMs['BPM_top3'].div(BPMs['BPM_top7'])

In [119]:
# ratio of the top-3 BPM sum to the top-13 BPM sum
BPMs['top3/top13'] = BPMs['BPM_top3'].div(BPMs['BPM_top13'])

In [120]:
# ratio of the top-5 BPM sum to the top-7 BPM sum
BPMs['top5/top7'] = BPMs['BPM_top5'].div(BPMs['BPM_top7'])

In [121]:
# ratio of the top-5 BPM sum to the top-13 BPM sum
BPMs['top5/top13'] = BPMs['BPM_top5'].div(BPMs['BPM_top13'])

In [122]:
# ratio of the top-7 BPM sum to the top-13 BPM sum
BPMs['top7/top13'] = BPMs['BPM_top7'].div(BPMs['BPM_top13'])

In [123]:
# write the team-level BPM features to a csv file
BPMs.to_csv(path_or_buf='team_bpm.csv')

# Sorting dataframe based on PPG

I also wanted to predict rookie year points per game (PPG) to compare whether the model predicts BPM or PPG better.

In [63]:
# Per team and year: keep the 3 rows with the highest PTS/G and total their stats.
top3 = (
    team_year_df
    .sort_values('PTS/G', ascending=False)
    .groupby(['Team', 'Year']).head(3)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'PTS/G': 'PTS/G_top3'})
)

In [64]:
# keep only the key columns and the summed points per game
top3 = top3.loc[:, ['Team', 'Year', 'PTS/G_top3']]

In [69]:
# Per team and year: keep the 5 rows with the highest PTS/G and total their stats.
top5 = (
    team_year_df
    .sort_values('PTS/G', ascending=False)
    .groupby(['Team', 'Year']).head(5)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'PTS/G': 'PTS/G_top5'})
)

In [70]:
# keep only the key columns and the summed points per game
top5 = top5.loc[:, ['Team', 'Year', 'PTS/G_top5']]

In [71]:
# Per team and year: keep the 7 rows with the highest PTS/G and total their stats.
top7 = (
    team_year_df
    .sort_values('PTS/G', ascending=False)
    .groupby(['Team', 'Year']).head(7)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'PTS/G': 'PTS/G_top7'})
)

In [72]:
# keep only the key columns and the summed points per game
top7 = top7.loc[:, ['Team', 'Year', 'PTS/G_top7']]

In [73]:
# Per team and year: keep the 13 rows with the highest PTS/G and total their stats.
top13 = (
    team_year_df
    .sort_values('PTS/G', ascending=False)
    .groupby(['Team', 'Year']).head(13)
    .sort_values(['Year', 'Team'])
    .groupby(['Team', 'Year']).sum()
    .reset_index()
    .rename(columns={'PTS/G': 'PTS/G_top13'})
)

In [74]:
# keep only the key columns and the summed points per game
top13 = top13.loc[:, ['Team', 'Year', 'PTS/G_top13']]

In [75]:
# left-join the top-5, top-7 and top-13 sums onto the top-3 frame by team and year
PPGs = (
    top3
    .merge(top5, how='left', on=['Team', 'Year'])
    .merge(top7, how='left', on=['Team', 'Year'])
    .merge(top13, how='left', on=['Team', 'Year'])
)

In [76]:
# relabel the key columns as Draft_Tm / Draft_Yr and the sums as PPG_*
PPGs.columns = [
    'Draft_Tm', 'Draft_Yr',
    'PPG_top3', 'PPG_top5', 'PPG_top7', 'PPG_top13',
]

In [77]:
# combined key: the year rendered as text followed by the team abbreviation
PPGs['Yr_Tm'] = PPGs["Draft_Yr"].map(str).add(PPGs["Draft_Tm"])

In [80]:
# ratio of the top-3 PPG sum to the top-5 PPG sum
PPGs['top3/top5'] = PPGs['PPG_top3'].div(PPGs['PPG_top5'])

In [81]:
# ratio of the top-3 PPG sum to the top-7 PPG sum
PPGs['top3/top7'] = PPGs['PPG_top3'].div(PPGs['PPG_top7'])

In [82]:
# ratio of the top-3 PPG sum to the top-13 PPG sum
PPGs['top3/top13'] = PPGs['PPG_top3'].div(PPGs['PPG_top13'])

In [84]:
# ratio of the top-5 PPG sum to the top-7 PPG sum
PPGs['top5/top7'] = PPGs['PPG_top5'].div(PPGs['PPG_top7'])

In [85]:
# ratio of the top-5 PPG sum to the top-13 PPG sum
PPGs['top5/top13'] = PPGs['PPG_top5'].div(PPGs['PPG_top13'])

In [86]:
# ratio of the top-7 PPG sum to the top-13 PPG sum
PPGs['top7/top13'] = PPGs['PPG_top7'].div(PPGs['PPG_top13'])

In [87]:
# write the team-level PPG features to a csv file
PPGs.to_csv(path_or_buf='team_ppg.csv')