## NBA GM Performance Analysis - Web Scraping - Win Shares

### Imports

In [1]:
import json
import os
import time
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline

In [None]:
!pip install pandas numpy matplotlib seaborn requests beautifulsoup4

In [2]:
# Show every column when a wide stats table is displayed (None removes the cap).
pd.options.display.max_columns = None

In [3]:
# Seasons to process: 2022, 2023 and 2024 (the range end is exclusive).
years = [*range(2022, 2025)]

### Scraping from Basketball Reference

In [4]:
# URL template for a season's advanced-stats page; `{}` is filled with the year.
win_shares_url = (
    'https://www.basketball-reference.com'
    '/leagues/NBA_{}_advanced.html'
)

In [5]:
# Download each season's advanced-stats page and save the raw HTML to disk, so the
# parsing cells below read local files instead of hitting the network again.
html_dir = "../../html/win_shares_stats"
request_pause_seconds = 4

# open() does not create missing parent folders, so make sure the target exists.
os.makedirs(html_dir, exist_ok=True)

for year in years:
    url = win_shares_url.format(year)
    data = requests.get(url, timeout=30)
    # Fail loudly on a 4xx/5xx response instead of saving an error page as if it
    # were the stats table.
    data.raise_for_status()

    with open("{}/{}.html".format(html_dir, year), "w", encoding='utf-8') as f:
        f.write(data.text)

    # Pause so consecutive requests are not fired back-to-back at the site.
    time.sleep(request_pause_seconds)

Testing with just 2024 to see how the data looks.

In [6]:
# Parse the saved 2024 page on its own to inspect the table before looping over
# every season.
with open("../../html/win_shares_stats/2024.html", encoding='utf-8') as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")

# The stats table is the element with id="advanced".
winsharesstats2024 = soup.find(id="advanced")

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
winshares_2024 = pd.read_html(StringIO(str(winsharesstats2024)))[0]
# Hard-code the season this cell parses. Reading `year` here only worked because
# the download loop happened to finish on 2024 and leaked its loop variable.
winshares_2024["Year"] = 2024

winshares_2024

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Awards,Year
0,1.0,DeMar DeRozan,34.0,CHI,SF,79.0,79.0,2989.0,19.7,0.584,0.166,0.452,1.6,11.3,6.4,21.8,1.5,1.5,7.7,25.8,7.0,2.2,9.2,0.147,2.1,-0.3,1.8,2.8,CPOY-2,2024
1,2.0,Domantas Sabonis,27.0,SAC,C,82.0,82.0,2928.0,23.2,0.637,0.081,0.389,11.0,32.3,21.4,33.9,1.2,1.5,17.9,22.2,8.6,4.0,12.6,0.206,4.0,2.4,6.5,6.2,"MVP-8,DPOY-10,NBA3",2024
2,3.0,Coby White,23.0,CHI,PG,79.0,78.0,2881.0,14.5,0.570,0.460,0.215,1.7,12.4,6.9,20.8,0.9,0.6,11.1,22.7,3.1,1.6,4.7,0.078,0.7,-1.3,-0.7,0.9,,2024
3,4.0,Mikal Bridges,27.0,BRK,SF,82.0,82.0,2854.0,14.9,0.560,0.457,0.245,2.5,12.0,7.1,16.3,1.4,0.9,10.3,24.3,2.1,2.1,4.2,0.070,0.7,-1.0,-0.4,1.2,,2024
4,5.0,Paolo Banchero,21.0,ORL,PF,80.0,80.0,2799.0,17.3,0.546,0.249,0.397,3.4,20.0,11.6,25.2,1.3,1.6,13.0,29.7,1.3,4.0,5.3,0.090,1.3,0.0,1.3,2.3,AS,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,732.0,Jalen Crutcher,24.0,NOP,PG,1.0,0.0,3.0,-12.6,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.5,0.0,0.0,0.0,-0.334,-18.5,-7.8,-26.2,0.0,,2024
732,733.0,Dmytro Skapintsev,25.0,NYK,C,2.0,0.0,2.0,-19.3,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,0.0,0.0,0.0,-0.483,-16.0,-9.8,-25.9,0.0,,2024
733,734.0,Justin Jackson,28.0,MIN,SF,2.0,0.0,1.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.031,-6.3,-1.2,-7.5,0.0,,2024
734,735.0,Javonte Smart,24.0,PHI,PG,1.0,0.0,1.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.005,-6.2,-2.1,-8.3,0.0,,2024


In [7]:
# Inspect the parsed column names before dropping any of them.
winshares_2024.columns

Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
       'Awards', 'Year'],
      dtype='object')

In [9]:
# Remove the 'Rk' column from the 2024 test frame; every other column is kept.
drop_cols = ['Rk']
winshares_2024 = winshares_2024.drop(columns=drop_cols)

In [10]:
# Re-display the frame to confirm the 'Rk' column is gone.
winshares_2024

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Awards,Year
0,DeMar DeRozan,34.0,CHI,SF,79.0,79.0,2989.0,19.7,0.584,0.166,0.452,1.6,11.3,6.4,21.8,1.5,1.5,7.7,25.8,7.0,2.2,9.2,0.147,2.1,-0.3,1.8,2.8,CPOY-2,2024
1,Domantas Sabonis,27.0,SAC,C,82.0,82.0,2928.0,23.2,0.637,0.081,0.389,11.0,32.3,21.4,33.9,1.2,1.5,17.9,22.2,8.6,4.0,12.6,0.206,4.0,2.4,6.5,6.2,"MVP-8,DPOY-10,NBA3",2024
2,Coby White,23.0,CHI,PG,79.0,78.0,2881.0,14.5,0.570,0.460,0.215,1.7,12.4,6.9,20.8,0.9,0.6,11.1,22.7,3.1,1.6,4.7,0.078,0.7,-1.3,-0.7,0.9,,2024
3,Mikal Bridges,27.0,BRK,SF,82.0,82.0,2854.0,14.9,0.560,0.457,0.245,2.5,12.0,7.1,16.3,1.4,0.9,10.3,24.3,2.1,2.1,4.2,0.070,0.7,-1.0,-0.4,1.2,,2024
4,Paolo Banchero,21.0,ORL,PF,80.0,80.0,2799.0,17.3,0.546,0.249,0.397,3.4,20.0,11.6,25.2,1.3,1.6,13.0,29.7,1.3,4.0,5.3,0.090,1.3,0.0,1.3,2.3,AS,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,Jalen Crutcher,24.0,NOP,PG,1.0,0.0,3.0,-12.6,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.5,0.0,0.0,0.0,-0.334,-18.5,-7.8,-26.2,0.0,,2024
732,Dmytro Skapintsev,25.0,NYK,C,2.0,0.0,2.0,-19.3,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,0.0,0.0,0.0,-0.483,-16.0,-9.8,-25.9,0.0,,2024
733,Justin Jackson,28.0,MIN,SF,2.0,0.0,1.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.031,-6.3,-1.2,-7.5,0.0,,2024
734,Javonte Smart,24.0,PHI,PG,1.0,0.0,1.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.005,-6.2,-2.1,-8.3,0.0,,2024


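Players who changed teams mid-season appear in several rows: one combined-total row (the "TOT"-style row, labelled `2TM` in this data) plus one row per team. These duplicate rows need to be cleaned out; instead of keeping the combined-total row, I want to keep the row corresponding to the acquiring team.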

### Loop for Yearly Stats

In [12]:
# Build one DataFrame per season from the saved HTML pages and collect them in `dfs`.
dfs = []
relative_folder = "html/win_shares_stats"  # Folder of saved pages, joined onto ../../ below
drop_cols = ['Rk']  # Same for every season, so defined once outside the loop

for year in years:
    file_path = f"../../{relative_folder}/{year}.html"

    # Read the HTML file
    with open(file_path, encoding='utf-8') as f:
        page = f.read()

    # Parse with BeautifulSoup and pick out the table with id="advanced"
    soup = BeautifulSoup(page, "html.parser")
    winsharesstats = soup.find(id="advanced")
    if winsharesstats is None:
        # Without this check str(None) would be handed to read_html and fail
        # with a message that does not mention which file was at fault.
        raise ValueError(f"No element with id='advanced' found in {file_path}")

    # Convert to DataFrame. The markup is wrapped in StringIO because passing a
    # literal HTML string to read_html is deprecated in recent pandas releases.
    winshares = pd.read_html(StringIO(str(winsharesstats)))[0]
    winshares["Year"] = year  # Add year column to track data

    # Drop unnecessary columns; errors="ignore" skips any that are missing for a
    # given season, which is what the original comment here intended.
    winshares = winshares.drop(columns=drop_cols, errors="ignore")

    dfs.append(winshares)  # Append to list

The following data will provide me with the win shares for all players from 2022-2024.

In [13]:
# Stack the per-season frames row-wise. Each season keeps its own row labels
# (ignore_index=False), so index values repeat across years.
winshares_df = pd.concat(dfs, axis=0, ignore_index=False)

In [14]:
# Spot-check a single player across all seasons in the combined frame.
winshares_df.loc[winshares_df['Player'] == 'Precious Achiuwa']

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Awards,Year
167,Precious Achiuwa,22.0,TOR,C,73.0,28.0,1725.0,12.7,0.503,0.259,0.217,8.7,21.7,14.9,6.9,1.1,2.3,11.3,18.5,0.4,2.1,2.5,0.07,-2.0,-0.6,-2.6,-0.2,,2022
297,Precious Achiuwa,23.0,TOR,C,55.0,12.0,1140.0,15.2,0.554,0.267,0.307,9.3,24.4,16.3,6.3,1.3,2.6,11.4,19.4,0.8,1.4,2.2,0.093,-1.4,-0.8,-2.3,-0.1,,2023
204,Precious Achiuwa,24.0,2TM,PF,74.0,18.0,1624.0,14.6,0.545,0.207,0.239,13.0,20.5,16.7,8.4,1.4,3.9,13.8,15.9,1.2,2.2,3.4,0.102,-1.7,0.3,-1.4,0.2,,2024
205,Precious Achiuwa,24.0,TOR,C,25.0,0.0,437.0,15.0,0.512,0.276,0.247,12.3,22.1,17.1,14.5,1.8,2.4,13.3,21.2,0.0,0.4,0.4,0.048,-1.4,-0.2,-1.6,0.0,,2024
206,Precious Achiuwa,24.0,NYK,PF,49.0,18.0,1187.0,14.5,0.564,0.167,0.234,13.3,19.9,16.6,6.2,1.3,4.4,14.1,14.0,1.2,1.8,3.0,0.122,-1.9,0.5,-1.4,0.2,,2024


In [None]:
# Persist the combined frame as CSV; the DataFrame index is not written.
winshares_df.to_csv(
    path_or_buf='../../inputs/win_shares_stats_2022_2024.csv',
    index=False,
)

In [None]:
# Reload the combined frame from the CSV written by the previous cell.
winshares_df = pd.read_csv(
    filepath_or_buffer='../../inputs/win_shares_stats_2022_2024.csv'
)