## NBA GM Performance Analysis - Web Scraping - Win Shares

### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import requests
import time
import json
from bs4 import BeautifulSoup

In [44]:
pd.set_option('display.max_columns', None)

In [4]:
years = list(range(2022,2025))

### Scraping from Basketball Reference

In [5]:
win_shares_url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'

In [None]:
for year in years:
    url = win_shares_url.format(year)
    data = requests.get(url)
    
    # Define a relative path (e.g., saving inside "win_shares_stats" folder in the current working directory)
    relative_folder = os.path.join("html/win_shares_stats")
    
    # Ensure the directory exists
    os.makedirs(relative_folder, exist_ok=True)

    # Construct the relative file path
    file_path = os.path.join(relative_folder, f"{year}.html")

    # Write the file
    with open(file_path, "w+", encoding='utf-8') as f:
        f.write(data.text)

Testing with just 2024 to see how the data looks.

In [None]:
with open("../../html/win_shares_stats/2024.html", encoding='utf-8') as f:
    page = f.read()
    
soup = BeautifulSoup(page, "html.parser")

winsharesstats2024 = soup.find(id="advanced")

winshares_2024 = pd.read_html(str(winsharesstats2024))[0]
winshares_2024["Year"] = year

winshares_2024

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,Year
0,1,Precious Achiuwa,PF-C,24,TOT,74,1624,14.6,.545,.207,...,1.2,2.2,3.4,.102,,-1.7,0.3,-1.4,0.2,2024
1,1,Precious Achiuwa,C,24,TOR,25,437,15.0,.512,.276,...,0.0,0.4,0.4,.048,,-1.4,-0.2,-1.6,0.0,2024
2,1,Precious Achiuwa,PF,24,NYK,49,1187,14.5,.564,.167,...,1.2,1.8,3.0,.122,,-1.9,0.5,-1.4,0.2,2024
3,2,Bam Adebayo,C,26,MIA,71,2416,19.8,.576,.041,...,2.9,4.3,7.2,.144,,0.8,1.7,2.4,2.7,2024
4,3,Ochai Agbaji,SG,23,TOT,78,1641,7.7,.497,.487,...,-0.5,0.6,0.1,.002,,-3.5,-0.9,-4.4,-1.0,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,568,Thaddeus Young,PF,35,PHO,10,89,15.1,.515,.048,...,0.1,0.1,0.3,.137,,-1.5,0.6,-0.9,0.0,2024
759,569,Trae Young,PG,25,ATL,54,1942,20.3,.585,.465,...,4.0,0.6,4.6,.114,,4.9,-2.3,2.6,2.2,2024
760,570,Omer Yurtseven,C,25,UTA,48,545,15.0,.565,.130,...,0.3,0.4,0.7,.062,,-1.6,-1.5,-3.0,-0.1,2024
761,571,Cody Zeller,C,31,NOP,43,320,12.8,.483,.048,...,0.4,0.4,0.8,.124,,-2.9,0.3,-2.6,0.0,2024


In [9]:
winshares_2024.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr',
       'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
       'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48', 'Unnamed: 24', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'Year'],
      dtype='object')

In [10]:
drop_cols = ['Rk','Unnamed: 24', 'Unnamed: 19']
winshares_2024 = winshares_2024.drop(drop_cols, axis = 1)

In [11]:
winshares_2024

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,Precious Achiuwa,PF-C,24,TOT,74,1624,14.6,.545,.207,.239,...,15.9,1.2,2.2,3.4,.102,-1.7,0.3,-1.4,0.2,2024
1,Precious Achiuwa,C,24,TOR,25,437,15.0,.512,.276,.247,...,21.2,0.0,0.4,0.4,.048,-1.4,-0.2,-1.6,0.0,2024
2,Precious Achiuwa,PF,24,NYK,49,1187,14.5,.564,.167,.234,...,14.0,1.2,1.8,3.0,.122,-1.9,0.5,-1.4,0.2,2024
3,Bam Adebayo,C,26,MIA,71,2416,19.8,.576,.041,.381,...,24.9,2.9,4.3,7.2,.144,0.8,1.7,2.4,2.7,2024
4,Ochai Agbaji,SG,23,TOT,78,1641,7.7,.497,.487,.129,...,13.4,-0.5,0.6,0.1,.002,-3.5,-0.9,-4.4,-1.0,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,Thaddeus Young,PF,35,PHO,10,89,15.1,.515,.048,.143,...,12.8,0.1,0.1,0.3,.137,-1.5,0.6,-0.9,0.0,2024
759,Trae Young,PG,25,ATL,54,1942,20.3,.585,.465,.404,...,30.5,4.0,0.6,4.6,.114,4.9,-2.3,2.6,2.2,2024
760,Omer Yurtseven,C,25,UTA,48,545,15.0,.565,.130,.152,...,17.9,0.3,0.4,0.7,.062,-1.6,-1.5,-3.0,-0.1,2024
761,Cody Zeller,C,31,NOP,43,320,12.8,.483,.048,.613,...,12.9,0.4,0.4,0.8,.124,-2.9,0.3,-2.6,0.0,2024


Need to clean out duplicate rows of players. Instead of keeping the TOT row, I will want to keep the row corresponding to the acquiring team.

### Loop for Yearly Stats

In [None]:
dfs = []
relative_folder = "html/win_shares_stats"  # Define relative directory

for year in years:
    # Construct relative file path
    file_path = os.path.join(relative_folder, f"{year}.html")
    
    # Read the HTML file
    with open(file_path, encoding='utf-8') as f:
        page = f.read()
    
    # Parse with BeautifulSoup
    soup = BeautifulSoup(page, "html.parser")
    winsharesstats = soup.find(id="advanced_stats")
    
    # Convert to DataFrame
    winshares = pd.read_html(str(winsharesstats))[0]
    winshares["Year"] = year  # Add year column to track data
    
    # Drop unnecessary columns
    drop_cols = ['Rk', 'Unnamed: 24', 'Unnamed: 19']
    winshares = winshares.drop(columns=drop_cols, errors='ignore')  # Ignore errors if columns are missing
    
    dfs.append(winshares)  # Append to list

The following data will provide me with the win shares for all players from 2022-2024.

In [17]:
winshares_df = pd.concat(dfs)

In [19]:
winshares_df[winshares_df['Player'] == 'Precious Achiuwa']

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,Precious Achiuwa,C,22,TOR,73,1725,12.7,0.503,0.259,0.217,...,18.5,0.4,2.1,2.5,0.07,-2.0,-0.6,-2.6,-0.2,2022
0,Precious Achiuwa,C,23,TOR,55,1140,15.2,0.554,0.267,0.307,...,19.4,0.8,1.4,2.2,0.093,-1.4,-0.8,-2.3,-0.1,2023
0,Precious Achiuwa,PF-C,24,TOT,74,1624,14.6,0.545,0.207,0.239,...,15.9,1.2,2.2,3.4,0.102,-1.7,0.3,-1.4,0.2,2024
1,Precious Achiuwa,C,24,TOR,25,437,15.0,0.512,0.276,0.247,...,21.2,0.0,0.4,0.4,0.048,-1.4,-0.2,-1.6,0.0,2024
2,Precious Achiuwa,PF,24,NYK,49,1187,14.5,0.564,0.167,0.234,...,14.0,1.2,1.8,3.0,0.122,-1.9,0.5,-1.4,0.2,2024


In [None]:
winshares_df.to_csv('../../inputs/win_shares_stats_2022_2024.csv', index=False)

In [None]:
winshares_df = pd.read_csv('../../inputs/win_shares_stats_2022_2024.csv')