In [1]:
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
import math

In [2]:
# Three-letter team abbreviations (30 entries).
# NOTE(review): presumably these are the codes passed as `team` to get_data and
# inserted into the basketball-reference.com URL -- confirm against the callers.
team_codes = ['PHI', 'MIL', 'CHI', 'CLE', 'BOS', 'LAC', 'MEM', 'ATL', 'MIA', 'CHO', 'UTA', 'SAC', 'NYK', 
             'LAL', 'ORL', 'DAL', 'BRK', 'DEN', 'IND', 'NOP', 'DET', 'TOR', 'HOU', 'SAS', 'PHO', 'OKC', 
             'MIN', 'POR', 'GSW', 'WAS']
# Seasons of interest.
# NOTE(review): presumably the values passed as `year` to get_data -- confirm.
years = [2018, 2019]


NOTE: Fetching the page with the `requests` library instead of `urllib` does not make a noticeable difference in runtime. The `requests` version is currently commented out inside `get_data`.

In [3]:
def get_data(year, team, advanced):
    """Scrape one team-season game log from basketball-reference.com.

    Parameters
    ----------
    year : int or str
        Season identifier inserted into the URL path.
    team : str
        Team code inserted into the URL path (e.g. 'SAS').
    advanced : bool-like
        Truthy to fetch the ``gamelog-advanced`` page, falsy to fetch the
        basic ``gamelog`` page.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has a value in every column. Cell values
        are the raw page text (strings). The index is reset to 0..n-1 and,
        where several columns share a name, only the first is kept.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    IndexError
        If the page contains fewer than two ``<tr>`` elements.
    """
    page_name = "gamelog-advanced" if advanced else "gamelog"
    url = "https://www.basketball-reference.com/teams/{}/{}/{}/".format(team, year, page_name)

    # The context manager closes the HTTP response even when parsing fails.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'lxml')
    # Alternative fetch via requests (no big difference in runtime):
    #page = requests.get(url)
    #soup = BeautifulSoup(page.content, 'lxml')

    table_rows = soup.find_all('tr')

    # Column names come from the second <tr>; its first <th> is skipped so the
    # header count lines up with the <td> cells collected below.
    headers = [th.get_text() for th in table_rows[1].find_all('th')][1:]

    # Give selected trailing columns unique names. Columns with a repeated name
    # are dropped at the end of this function, so without the rename these
    # would be lost if they repeat an earlier name.
    # NOTE(review): presumably these are the opponent's stats -- confirm
    # against the page layout.
    if advanced:
        headers[-1] = 'dFT/FGA'
        headers[-2] = 'dDRB%'
        headers[-3] = 'dTOV%'
        headers[-4] = 'deFG%'
    else:
        headers[-1] = 'dPF'
        headers[-9] = 'dFTA'

    # Every <tr> after the header row becomes a list of its <td> texts. Rows
    # without <td> cells become all-missing rows and are removed by dropna().
    gamelog = [[td.get_text() for td in row.find_all('td')]
               for row in table_rows[2:]]

    stats = pd.DataFrame(gamelog, columns=headers)
    stats = stats.dropna().reset_index(drop=True)

    # Keep only the first column for each repeated column name.
    stats = stats.loc[:, ~stats.columns.duplicated()]
    return stats

def combined_data(year, team):
    """Return a team-season game log holding both basic and advanced stats.

    The basic and advanced tables from ``get_data`` are placed side by side,
    columns whose name repeats are reduced to their first occurrence, the stat
    columns listed below are converted from text to floats, and three derived
    two-point columns ('2P', '2PA', '2P%') are appended.
    """
    basic_table = get_data(year, team, False)
    advanced_table = get_data(year, team, True)

    merged = pd.concat([basic_table, advanced_table], axis=1, sort=False)
    merged = merged.loc[:, ~merged.columns.duplicated()]

    # Every column that must be numeric, basic-table stats first and then the
    # advanced-table stats, each listed once.
    numeric_columns = (
        'Tm', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
        'ORB', 'TRB', 'AST', 'PF', 'dPF',
        'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%',
        'BLK%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'deFG%', 'dTOV%', 'dDRB%',
        'dFT/FGA',
    )
    for column in numeric_columns:
        merged[column] = pd.to_numeric(merged[column], downcast='float')

    # Two-point makes and attempts are whatever is left after removing threes.
    merged['2P'] = merged['FG'] - merged['3P']
    merged['2PA'] = merged['FGA'] - merged['3PA']
    merged['2P%'] = merged['2P'] / merged['2PA']

    return merged



    1. Get the 2 teams that are playing that day
    2. For team A, bootstrap their FTs attempted per game
    3. For team B, bootstrap their FTs allowed (opponent FTA) per game
    4. FTA for team A will be 0.2*Step2 + 0.8*Step3
    5. Get cumulative FT% for team A and last-5-games FT% for team A, and average the two (50/50) to get the final FT%
    6. Calculate FT points = Step4*Step5
    7. Bootstrap Team A's pace and 3PAr
    8. Obtain 3P% and 2P% for Team A similarly to step 5
    9. Calculate 2P points = pace*(1-3PAr)*2*2P% and 3P points = pace*3PAr*3*3P%
    10. Add together FT, 2P, and 3P points to get Team A's score
    11. Repeat steps 2-10 for Team B


In [8]:
# hypothetical game 30 of 2018-2019 season between SAS and ORL
# Team A is SAS, Team B is ORL

# Seed NumPy's global RNG so the bootstrap estimates in this notebook come out
# the same on every Restart & Run All.
np.random.seed(42)


def bootstrap_mean(values, n_resamples=1000):
    """Return the average of the means of bootstrap resamples of `values`.

    Each resample draws len(values) items from `values` with replacement
    (the np.random.choice default); the mean of every resample is taken and
    the average of those means is returned.
    """
    values = np.asarray(values, dtype='float')
    resample_means = np.array([
        np.mean(np.random.choice(values, size=len(values)))
        for _ in range(n_resamples)
    ])
    return np.mean(resample_means)


sas_df = combined_data(2019, 'SAS')
orl_df = combined_data(2019, 'ORL')
sas_gm_num = 30
orl_gm_num = 30

# SAS free-throw attempts in the games played before game `sas_gm_num`
sas_fta = np.array(sas_df['FTA'][0:sas_gm_num-1], dtype='float')
sas_bt_fta = bootstrap_mean(sas_fta)

# Free-throw attempts ORL allowed in the games played before game `orl_gm_num`
orl_dfta = np.array(orl_df['dFTA'][0:orl_gm_num-1], dtype='float')
orl_bt_dfta = bootstrap_mean(orl_dfta)

# weight the two values: 20% SAS's own attempts, 80% attempts allowed by ORL
sas_total_fta = sas_bt_fta*0.2 + orl_bt_dfta*0.8

In [9]:
# Calculating SAS FT%
# Index windows: every game before the one being predicted, and the last
# five of those games.
prior_games = slice(0, sas_gm_num-1)
last_five = slice(sas_gm_num-6, sas_gm_num-1)

# Whole-season arrays of free throws made and attempted
sas_ft = np.array(sas_df['FT'], dtype='float')
sas_fta = np.array(sas_df['FTA'], dtype='float')

# Makes over attempts, pooled across each window
sas_cum_ftp = sas_ft[prior_games].sum() / sas_fta[prior_games].sum()
sas_l5_ftp = sas_ft[last_five].sum() / sas_fta[last_five].sum()

# Equal-weight blend of the season-to-date and recent-form percentages
sas_true_ftp = 0.5*sas_cum_ftp + 0.5*sas_l5_ftp

# Calculating Total SAS FT for Gm
sas_total_ft_pts = sas_true_ftp * sas_total_fta

In [10]:
# Calculate SAS 2P and 3P
# Index windows: every game before the one being predicted, and the last
# five of those games.
prior_games = slice(0, sas_gm_num-1)
last_five = slice(sas_gm_num-6, sas_gm_num-1)
n_resamples = 1000

sas_pace = np.array(sas_df['Pace'][prior_games], dtype='float')
sas_3par = np.array(sas_df['3PAr'][prior_games], dtype='float')

# bootstrapping SAS Pace: average of the means of resamples drawn with replacement
pace_resample_means = np.array([
    np.mean(np.random.choice(sas_pace, size=len(sas_pace)))
    for _ in range(n_resamples)
])
sas_bt_pace = np.mean(pace_resample_means)

# bootstrapping SAS 3PAr the same way
three_par_resample_means = np.array([
    np.mean(np.random.choice(sas_3par, size=len(sas_3par)))
    for _ in range(n_resamples)
])
sas_bt_3par = np.mean(three_par_resample_means)

# Calculating 2P% for SAS: pooled makes / attempts per window, blended 50/50
sas_2p = np.array(sas_df['2P'], dtype='float')
sas_2pa = np.array(sas_df['2PA'], dtype='float')

sas_cum_2pp = sas_2p[prior_games].sum() / sas_2pa[prior_games].sum()
sas_l5_2pp = sas_2p[last_five].sum() / sas_2pa[last_five].sum()

sas_true_2pp = 0.5*sas_cum_2pp + 0.5*sas_l5_2pp

# Calculating 3P% for SAS: same recipe as the 2P% above
sas_3p = np.array(sas_df['3P'], dtype='float')
sas_3pa = np.array(sas_df['3PA'], dtype='float')

sas_cum_3pp = sas_3p[prior_games].sum() / sas_3pa[prior_games].sum()
sas_l5_3pp = sas_3p[last_five].sum() / sas_3pa[last_five].sum()

sas_true_3pp = 0.5*sas_cum_3pp + 0.5*sas_l5_3pp

In [11]:
# Calculate SAS Score
# The bootstrapped pace is used as the total number of shot attempts and is
# split into two- and three-point attempts by the bootstrapped 3PAr.
# NOTE(review): pace is normally a possessions figure, not a field-goal-attempt
# count -- confirm this substitution is intended.
sas_2p_attempts = sas_bt_pace*(1-sas_bt_3par)
sas_3p_attempts = sas_bt_pace*(sas_bt_3par)

# Points = attempts * point value * make percentage
sas_total_2p_pts = sas_2p_attempts*2*sas_true_2pp
sas_total_3p_pts = sas_3p_attempts*3*sas_true_3pp

sas_score = sas_total_ft_pts + sas_total_2p_pts + sas_total_3p_pts
sas_score

125.14413729391012