# Web scraping  
## Loading packages

In [1]:
import pandas as pd
import numpy as np
import requests
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

## Scrape data from stats.nba.com

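We will scrape each team's per-game statistics for the 2021-22 regular season, i.e. from the first game of the regular season up to the start of the play-in tournament. Two tables are requested from the `leaguedashteamstats` endpoint: traditional stats (`MeasureType=Base`) and advanced stats (`MeasureType=Advanced`).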

In [2]:
# Request configuration: the two endpoint URLs and the HTTP headers

## Both queries share every query parameter except MeasureType, so the
## URLs are assembled from one common head and tail.
_url_head = 'https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType='
_url_tail = '&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2021-22&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision='

## traditional stats URL (MeasureType=Base)
url_trad = _url_head + 'Base' + _url_tail

## advanced stats URL (MeasureType=Advanced)
url_adv = _url_head + 'Advanced' + _url_tail

## HTTP headers for the requests; insertion order matches the original dict
header_dict = dict([
    ('User-Agent', 'Mozilla/5.0'),
    ('x-nba-stats-origin', 'stats'),
    ('x-nba-stats-token', 'true'),
    ('Referer', 'https://stats.nba.com'),
    ('Connection', 'keep-alive'),
    ('Pragma', 'no-cache'),
    ('Cache-Control', 'no-cache'),
    ('Host', 'stats.nba.com'),
])

# Download both tables and load each one into a DataFrame.

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely when the server never answers.
request_timeout = 30


def _first_result_set(payload):
    """Return ``(headers, rows)`` from the first entry of ``payload['resultSets']``.

    Args:
        payload: Decoded JSON body of a response from the stats endpoint.

    Returns:
        A tuple ``(headers, rows)`` holding the ``'headers'`` value (used as
        column names) and the ``'rowSet'`` value (used as data rows) of the
        first result set.

    Raises:
        KeyError, IndexError: If the payload does not have the expected layout.
    """
    result_set = payload['resultSets'][0]
    return result_set['headers'], result_set['rowSet']


# traditional stats
res_trad = requests.get(url_trad, headers=header_dict, timeout=request_timeout)
res_trad.raise_for_status()  # stop on an HTTP error instead of failing later in .json()
json_set_trad = res_trad.json()
headers_trad, data_set_trad = _first_result_set(json_set_trad)
df_trad = pd.DataFrame(data_set_trad, columns=headers_trad)

# advanced stats
res_adv = requests.get(url_adv, headers=header_dict, timeout=request_timeout)
res_adv.raise_for_status()  # stop on an HTTP error instead of failing later in .json()
json_set_adv = res_adv.json()
headers_adv, data_set_adv = _first_result_set(json_set_adv)
df_adv = pd.DataFrame(data_set_adv, columns=headers_adv)

In [3]:
# Preview the top five rows of the traditional table as a scraping sanity check
df_trad.head(n=5)

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,...,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,CFID,CFPARAMS
0,1610612737,Atlanta Hawks,82,43,39,0.524,48.1,41.5,88.3,0.47,...,1,22,23,10,7,6,6,14,10,Atlanta Hawks
1,1610612738,Boston Celtics,82,51,31,0.622,48.5,40.7,87.4,0.466,...,13,19,2,11,5,20,12,2,10,Boston Celtics
2,1610612751,Brooklyn Nets,82,44,38,0.537,48.2,42.0,88.4,0.475,...,17,24,5,21,22,16,9,15,10,Brooklyn Nets
3,1610612766,Charlotte Hornets,82,43,39,0.524,48.5,42.8,91.4,0.468,...,10,5,11,12,18,18,4,16,10,Charlotte Hornets
4,1610612741,Chicago Bulls,82,46,36,0.561,48.1,41.7,86.9,0.48,...,6,23,25,24,8,28,13,20,10,Chicago Bulls


In [4]:
# Preview the top five rows of the advanced table as a scraping sanity check
df_adv.head(n=5)

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,...,OREB_PCT_RANK,DREB_PCT_RANK,REB_PCT_RANK,TM_TOV_PCT_RANK,EFG_PCT_RANK,TS_PCT_RANK,PACE_RANK,PIE_RANK,CFID,CFPARAMS
0,1610612737,Atlanta Hawks,82,43,39,0.524,3941.0,114.0,115.4,112.1,...,17,12,14,1,8,6,17,14,10,Atlanta Hawks
1,1610612738,Boston Celtics,82,51,31,0.622,3981.0,112.1,113.6,104.0,...,11,16,10,13,9,9,24,2,10,Boston Celtics
2,1610612751,Brooklyn Nets,82,44,38,0.537,3951.0,111.0,113.2,109.6,...,9,30,15,18,11,11,11,11,10,Brooklyn Nets
3,1610612766,Charlotte Hornets,82,43,39,0.524,3976.0,111.7,113.6,111.0,...,14,29,27,9,7,13,5,15,10,Charlotte Hornets
4,1610612741,Chicago Bulls,82,46,36,0.561,3946.0,111.0,112.7,111.1,...,28,7,17,6,10,8,14,20,10,Chicago Bulls


## Data cleaning

In [5]:
# Join the datasets and drop ID, Games Played, Wins and Losses, Win %, Minutes,
# CFID, and CFParams.

# Leading columns of the advanced table that the traditional table already carries
# (together with TEAM_ID these are the first seven columns of df_adv).
repeated_cols = ['TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN']

# Line the advanced rows up with the traditional rows by TEAM_ID instead of
# trusting that both responses list the teams in the same order; a purely
# positional concat would otherwise pair one team's stats with another's.
adv_stats = (
    df_adv
    .set_index('TEAM_ID')
    .drop(columns=repeated_cols)
    .reindex(df_trad['TEAM_ID'].to_numpy())  # raises if df_adv repeats a TEAM_ID
    .reset_index(drop=True)
)
df = pd.concat([df_trad.reset_index(drop=True), adv_stats], axis=1)
df = df.drop(columns=['TEAM_ID', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'CFID', 'CFPARAMS'])

# get rid of rank columns; .copy() yields an independent frame so the column
# assignment below does not trigger SettingWithCopyWarning
df = df.loc[:, ~df.columns.str.endswith('RANK')].copy()

# rename teams with team codes, looked up by name so a change in row order
# cannot silently mislabel a team
team_codes = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BKN',
    'Charlotte Hornets': 'CHA',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'LA Clippers': 'LAC',
    'Los Angeles Clippers': 'LAC',  # alternate spelling, accepted defensively
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}
unmapped = sorted(set(df['TEAM_NAME']) - set(team_codes))
if unmapped:
    # fail loudly rather than writing NaN team codes into the output
    raise ValueError(f'No team code defined for: {unmapped}')
df['TEAM_NAME'] = df['TEAM_NAME'].map(team_codes)

# print first 5 rows to verify correct cleaning
df.head()

Unnamed: 0,TEAM_NAME,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,...,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,ATL,41.5,88.3,0.47,12.9,34.4,0.374,18.1,22.3,0.812,...,0.731,0.501,0.12,0.543,0.581,100.0,98.67,82.22,8099,0.509
1,BOS,40.7,87.4,0.466,13.2,37.1,0.356,17.0,20.9,0.816,...,0.725,0.509,0.139,0.542,0.578,99.0,97.26,81.05,8068,0.547
2,BKN,42.0,88.4,0.475,11.5,31.7,0.361,17.5,21.7,0.805,...,0.704,0.499,0.141,0.54,0.576,101.6,99.4,82.83,8177,0.512
3,CHA,42.8,91.4,0.468,13.9,38.2,0.365,15.8,21.4,0.74,...,0.706,0.486,0.131,0.544,0.572,102.3,100.52,83.76,8322,0.504
4,CHI,41.7,86.9,0.48,10.6,28.8,0.369,17.5,21.5,0.813,...,0.735,0.495,0.13,0.541,0.579,100.4,98.76,82.3,8124,0.496


## Write to CSV

In [None]:
# Save the cleaned table to disk, leaving out the row index
output_csv = 'Combined_Team_Data.csv'
df.to_csv(output_csv, index=False)