In [1]:
import urllib.request
from io import StringIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Load the Kaggle team-stats dataset; the seasons it is missing are added
# further down from hand-saved stats.nba.com tables.
# 'Unnamed: 0' is dropped right away (presumably a row index that was written
# out when the CSV was saved -- it carries no team information).
# Reading and dropping in a single assignment keeps this cell idempotent.
# The earlier mid-cell `df_kag.head(10)` was removed: only the last expression
# of a cell is displayed, so it had no effect.
df_kag = (
    pd.read_csv('Data/nba_team_stats_00_to_18.csv')
    .drop(columns='Unnamed: 0')
)
df_kag.head()

Unnamed: 0,TEAM,GP,W,L,WIN%,MIN,PTS,FGM,FGA,FG%,...,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-,SEASON
0,Atlanta Hawks,82,29,53,0.354,48.4,113.3,41.4,91.8,45.1,...,46.1,25.8,17.0,8.2,5.1,5.5,23.6,22.2,-6.0,2018-19
1,Boston Celtics,82,49,33,0.598,48.2,112.4,42.1,90.5,46.5,...,44.5,26.3,12.8,8.6,5.3,3.9,20.4,19.5,4.4,2018-19
2,Brooklyn Nets,82,42,40,0.512,48.7,112.2,40.3,89.7,44.9,...,46.6,23.8,15.1,6.6,4.1,5.3,21.5,22.0,-0.1,2018-19
3,Charlotte Hornets,82,39,43,0.476,48.4,110.7,40.2,89.8,44.8,...,43.8,23.2,12.2,7.2,4.9,6.0,18.9,20.6,-1.1,2018-19
4,Chicago Bulls,82,22,60,0.268,48.5,104.9,39.8,87.9,45.3,...,42.9,21.9,14.1,7.4,4.3,5.8,20.3,18.7,-8.4,2018-19


In [3]:
# We want to add data from stats.nba.com for the 2019-20 and 2020-21 seasons.
# Fetching the pages with urllib + BeautifulSoup did not return the table data,
# so the table HTML was copied from the website into local .html files instead.
# url_19_20 = "https://www.nba.com/stats/teams/traditional/?PerMode=Totals&sort=TEAM_NAME&dir=-1&Season=2019-20&SeasonType=Regular%20Season"
# url_20_21 = "https://www.nba.com/stats/teams/traditional/?PerMode=Totals&sort=TEAM_NAME&dir=-1&Season=2020-21&SeasonType=Regular%20Season"

In [4]:
# Here is a function to turn a url address into a beautiful soup object
# url should be a string
# file path should also be a string specifying the path to save the html data
def url_to_soup(url, file_path):
    """Download a page, save its HTML to disk, and return it parsed.

    Args:
        url: Address of the page to fetch, as a string.
        file_path: Path (string) where the downloaded HTML is written.
            The file is opened in 'w' mode, so existing content is replaced.

    Returns:
        A BeautifulSoup object built from the downloaded HTML using the
        'html.parser' backend.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Fetch the page; the response body is decoded as UTF-8.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')

    # Write with an explicit encoding that matches the decode above. Relying
    # on the platform default (e.g. cp1252 on Windows) can raise
    # UnicodeEncodeError or store the page in a different encoding.
    with open(file_path, 'w', encoding='utf-8') as new_file:
        new_file.write(html)

    # Parsing works on the in-memory string; no further network access.
    return BeautifulSoup(html, 'html.parser')

In [5]:
# soup_19_20 = url_to_soup(url_19_20, 'Data/nba-stats-19-to-20-html')
# soup_20_21 = url_to_soup(url_20_21, 'Data/nba-stats-20-to-21-html')
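# The HTML returned by the request does not contain the stats table
# (presumably the table is filled in by JavaScript after the page loads),
# so the hand-saved table HTML files are used below instead.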

In [7]:
# Parse the 2019-20 team stats table from the locally saved HTML file.
with open('Data/19-20-table.html') as file:
    stat_soup = BeautifulSoup(file, 'html.parser')

# pandas >= 2.1 deprecates passing literal HTML text to read_html, so the
# markup is wrapped in StringIO (file-like input is accepted by older versions too).
# NOTE(review): the Kaggle rows label seasons like '2018-19', while this label
# is '2019-2020' -- confirm whether the two formats should be unified before
# filtering or sorting on SEASON.
stat_table_19_20 = (
    pd.read_html(StringIO(str(stat_soup)))[0]
    # drop every column that contains at least one NaN
    .dropna(axis='columns')
    # tag the rows with their season; assign() returns a new frame instead of
    # mutating the dropna() result in place
    .assign(SEASON='2019-2020')
)

# Looks ready to merge with the Kaggle dataset.
stat_table_19_20.head()

Unnamed: 0,TEAM,GP,W,L,WIN%,MIN,PTS,FGM,FGA,FG%,...,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-,SEASON
0,Atlanta Hawks,67,20,47,0.299,48.6,111.8,40.6,90.6,44.9,...,43.3,24.0,16.2,7.8,5.1,6.4,23.1,21.0,-8.0,2019-2020
1,Boston Celtics,72,48,24,0.667,48.4,113.7,41.3,89.6,46.1,...,46.1,23.0,13.8,8.3,5.6,5.5,21.6,20.7,6.3,2019-2020
2,Brooklyn Nets,72,35,37,0.486,48.6,111.8,40.4,90.3,44.8,...,47.9,24.5,15.3,6.4,4.5,5.3,21.0,21.1,-0.6,2019-2020
3,Charlotte Hornets,65,23,42,0.354,48.5,102.9,37.3,85.9,43.4,...,42.8,23.8,14.6,6.6,4.1,5.0,18.8,20.6,-6.8,2019-2020
4,Chicago Bulls,65,22,43,0.338,48.2,106.8,39.6,88.6,44.7,...,41.9,23.2,15.5,10.0,4.1,5.9,21.8,19.2,-3.1,2019-2020


In [8]:
# Parse the 2020-21 team stats table from the locally saved HTML file.
with open('Data/20-21-table.html') as file:
    stat_soup = BeautifulSoup(file, 'html.parser')

# pandas >= 2.1 deprecates passing literal HTML text to read_html, so the
# markup is wrapped in StringIO (file-like input is accepted by older versions too).
# NOTE(review): the Kaggle rows label seasons like '2018-19', while this label
# is '2020-2021' -- confirm whether the two formats should be unified before
# filtering or sorting on SEASON.
stat_table_20_21 = (
    pd.read_html(StringIO(str(stat_soup)))[0]
    # drop every column that contains at least one NaN
    .dropna(axis='columns')
    # tag the rows with their season; assign() returns a new frame instead of
    # mutating the dropna() result in place
    .assign(SEASON='2020-2021')
)

# Looks ready to merge with the Kaggle dataset.
stat_table_20_21.head()

Unnamed: 0,TEAM,GP,W,L,WIN%,MIN,PTS,FGM,FGA,FG%,...,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-,SEASON
0,Atlanta Hawks,46,23,23,0.5,48.2,112.5,39.9,86.8,45.9,...,45.6,24.1,13.8,6.8,4.9,5.3,19.8,20.4,1.5,2020-2021
1,Boston Celtics,46,23,23,0.5,48.2,112.4,41.7,88.7,47.0,...,44.0,23.0,13.9,8.0,5.2,4.9,20.5,19.3,1.2,2020-2021
2,Brooklyn Nets,46,31,15,0.674,48.5,119.3,43.3,87.5,49.5,...,44.1,26.4,13.8,6.6,5.1,4.5,19.0,19.0,4.2,2020-2021
3,Charlotte Hornets,45,23,22,0.511,48.2,111.6,40.4,87.6,46.2,...,44.0,26.9,15.6,8.0,4.7,5.0,18.5,19.0,-0.7,2020-2021
4,Chicago Bulls,44,19,25,0.432,48.5,112.8,42.5,89.0,47.8,...,45.0,26.2,16.0,6.7,4.4,5.0,19.8,18.6,-1.0,2020-2021


In [9]:
# Merge all stat tables together into one
frames = [stat_table_20_21, stat_table_19_20, df_kag]
team_stats = pd.concat(frames)
team_stats.head()

Unnamed: 0,TEAM,GP,W,L,WIN%,MIN,PTS,FGM,FGA,FG%,...,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-,SEASON
0,Atlanta Hawks,46,23,23,0.5,48.2,112.5,39.9,86.8,45.9,...,45.6,24.1,13.8,6.8,4.9,5.3,19.8,20.4,1.5,2020-2021
1,Boston Celtics,46,23,23,0.5,48.2,112.4,41.7,88.7,47.0,...,44.0,23.0,13.9,8.0,5.2,4.9,20.5,19.3,1.2,2020-2021
2,Brooklyn Nets,46,31,15,0.674,48.5,119.3,43.3,87.5,49.5,...,44.1,26.4,13.8,6.6,5.1,4.5,19.0,19.0,4.2,2020-2021
3,Charlotte Hornets,45,23,22,0.511,48.2,111.6,40.4,87.6,46.2,...,44.0,26.9,15.6,8.0,4.7,5.0,18.5,19.0,-0.7,2020-2021
4,Chicago Bulls,44,19,25,0.432,48.5,112.8,42.5,89.0,47.8,...,45.0,26.2,16.0,6.7,4.4,5.0,19.8,18.6,-1.0,2020-2021


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the merged table -- a quick sanity check on the combined data.
team_stats.describe()

Unnamed: 0,GP,W,L,WIN%,MIN,PTS,FGM,FGA,FG%,3PM,...,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-
count,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,...,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0
mean,78.926518,39.463259,39.463259,0.499736,48.354473,100.813578,37.576677,82.834026,45.357348,7.539936,...,31.600319,42.622045,22.101118,14.447604,7.591054,4.902077,4.904473,20.925719,15.831629,-0.00655
std,8.611863,12.723626,12.630228,0.149842,0.183183,6.792056,2.338026,3.837667,1.597223,2.776997,...,2.226437,2.057298,2.121037,1.147004,0.856439,0.800987,0.704417,1.687306,8.805872,4.507587
min,43.0,7.0,9.0,0.106,48.0,84.2,32.4,74.3,40.8,2.6,...,26.9,36.9,17.4,11.2,5.5,2.4,3.0,16.6,0.0,-13.9
25%,82.0,30.0,30.0,0.39,48.2,95.825,35.925,80.0,44.3,5.5,...,29.9,41.2,20.6,13.7,7.0,4.3,4.4,19.8,17.925,-3.1
50%,82.0,41.0,39.0,0.512,48.4,99.5,37.3,82.4,45.3,6.9,...,31.3,42.5,21.9,14.5,7.5,4.9,4.9,20.9,20.0,0.1
75%,82.0,49.0,49.0,0.61,48.5,105.075,39.0,85.6,46.4,9.3,...,33.1,43.975,23.4,15.1,8.2,5.4,5.4,22.1,21.2,3.375
max,82.0,73.0,72.0,0.89,49.0,119.3,44.4,94.0,50.4,17.0,...,42.2,51.7,30.4,18.5,10.3,8.2,6.9,26.7,25.7,11.6
