## NBA Skills Improvement Project Part 1 - Data Scraping

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import requests
import time
import json

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Remove the column cap so wide stat tables are displayed in full.
pd.options.display.max_columns = None

In [4]:
# Seasons to scrape: 2014 through 2023 inclusive.
years = [*range(2014, 2024)]

### Scraping from Basketball Reference

In [5]:
# URL templates for the league-wide season pages; `{}` is filled with the season year.
shooting_url = 'https://www.basketball-reference.com/leagues/NBA_{}_shooting.html'
# The "defensive" data is taken from the advanced-stats page.
defense_url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'

In [6]:
# Download each season's page once and cache the raw HTML on disk so the
# parsing cells can be re-run without hitting the site again.
REQUEST_DELAY_SECONDS = 4  # pause between requests to avoid being rate-limited

page_targets = [
    (shooting_url, "C:/Users/kevin/Jupyter Notebook Projects/shooting stats/{}.html"),
    (defense_url, "C:/Users/kevin/Jupyter Notebook Projects/defensive stats/{}.html"),
]

for url_template, path_template in page_targets:
    for year in years:
        url = url_template.format(year)
        data = requests.get(url, timeout=30)
        # Fail loudly on an error response (e.g. HTTP 429) instead of saving
        # the error page to disk as if it were the stats table.
        data.raise_for_status()

        with open(path_template.format(year), "w", encoding='utf-8') as f:
            f.write(data.text)

        time.sleep(REQUEST_DELAY_SECONDS)

Testing with just 2023 to see how the data looks.

In [None]:
# Parse the cached 2023 shooting page and extract the shooting-stats table.
with open("C:/Users/kevin/Jupyter Notebook Projects/shooting stats/2023.html", encoding='utf-8') as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")

shootingstats2023 = soup.find(id="all_shooting_stats")

shooting_2023 = pd.read_html(str(shootingstats2023))[0]
# Tag the season explicitly instead of reusing the `year` variable left over
# from the download loop, which is only 2023 if that loop ran to completion.
shooting_2023["Year"] = 2023

shooting_2023

In [None]:
# Discard the top header level so each column has a single flat label.
flat_columns = shooting_2023.columns.droplevel(0)
shooting_2023.columns = flat_columns
shooting_2023

In [33]:
# Inspect the flattened labels: several repeat, and the spacer columns are 'Unnamed: …'.
shooting_2023.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG%', 'Dist.',
       'Unnamed: 9_level_1', '2P', '0-3', '3-10', '10-16', '16-3P', '3P',
       'Unnamed: 16_level_1', '2P', '0-3', '3-10', '10-16', '16-3P', '3P',
       'Unnamed: 23_level_1', '2P', '3P', 'Unnamed: 26_level_1', '%FGA', '#',
       'Unnamed: 29_level_1', '%3PA', '3P%', 'Unnamed: 32_level_1', 'Att.',
       '#', ''],
      dtype='object')

In [None]:
# Remove the rank column and the 'Unnamed' spacer columns.
drop_cols = [
    'Rk',
    'Unnamed: 9_level_1',
    'Unnamed: 16_level_1',
    'Unnamed: 23_level_1',
    'Unnamed: 26_level_1',
    'Unnamed: 29_level_1',
    'Unnamed: 32_level_1',
]
shooting_2023 = shooting_2023.drop(columns=drop_cols)
shooting_2023

In [35]:
# Re-check the labels after the drop; the remaining duplicates are renamed in the next cell.
shooting_2023.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG%', 'Dist.', '2P', '0-3',
       '3-10', '10-16', '16-3P', '3P', '2P', '0-3', '3-10', '10-16', '16-3P',
       '3P', '2P', '3P', '%FGA', '#', '%3PA', '3P%', 'Att.', '#', ''],
      dtype='object')

In [None]:
# Replace the ambiguous repeated labels with unique names, position by position.
shooting_cols = [
    # labels kept as-is
    'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG%', 'Dist.',
    # first repeated group, suffixed _FGA%
    '2P_FGA%', '0-3_FGA%', '3-10_FGA%', '10-16_FGA%', '16-3P_FGA%', '3P_FGA%',
    # second repeated group, suffixed _FG%
    '2P_FG%', '0-3_FG%', '3-10_FG%', '10-16_FG%', '16-3P_FG%', '3P_FG%',
    # remaining pairs, prefixed with their group name
    '2P_FGast', '3P_FGast',
    'Dunks_%FGA', 'Dunks_#',
    'Corner_%3PA', 'Corner_3P%',
    'Heaves_Att.', 'Heaves_#',
    # the season column, whose label is '' after the header level was dropped
    'Year',
]

shooting_2023.columns = shooting_cols
shooting_2023

In [None]:
# Parse the cached 2023 advanced-stats page and extract its table.
with open("C:/Users/kevin/Jupyter Notebook Projects/defensive stats/2023.html", encoding='utf-8') as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")

defensivestats2023 = soup.find(id="all_advanced_stats")

defensive_2023 = pd.read_html(str(defensivestats2023))[0]
# Tag the season explicitly instead of reusing the `year` variable left over
# from an earlier loop, which is only 2023 if that loop ran to completion.
defensive_2023["Year"] = 2023

defensive_2023

In [39]:
# Inspect the column labels to find the spacer columns to drop.
defensive_2023.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr',
       'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
       'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48', 'Unnamed: 24', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'Year'],
      dtype='object')

In [None]:
# Remove the rank column and the two 'Unnamed' spacer columns.
drop_cols = [
    'Rk',
    'Unnamed: 19',
    'Unnamed: 24',
]
defensive_2023 = defensive_2023.drop(columns=drop_cols)
defensive_2023

Now that I have worked out the logic for getting the basic dataframe for both the shooting and defensive metrics, I can integrate the logic into a loop that adds info for each year.

### Loop for Shooting Stats

In [37]:
# The column clean-up rules are identical for every season, so define them once.
drop_cols = [
    'Rk',
    'Unnamed: 9_level_1',
    'Unnamed: 16_level_1',
    'Unnamed: 23_level_1',
    'Unnamed: 26_level_1',
    'Unnamed: 29_level_1',
    'Unnamed: 32_level_1',
]
shooting_cols = [
    'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG%', 'Dist.',
    '2P_FGA%', '0-3_FGA%', '3-10_FGA%', '10-16_FGA%', '16-3P_FGA%', '3P_FGA%',
    '2P_FG%', '0-3_FG%', '3-10_FG%', '10-16_FG%', '16-3P_FG%', '3P_FG%',
    '2P_FGast', '3P_FGast',
    'Dunks_%FGA', 'Dunks_#',
    'Corner_%3PA', 'Corner_3P%',
    'Heaves_Att.', 'Heaves_#',
    'Year',
]

# Build one cleaned shooting table per season from the cached HTML.
dfs = []
for year in years:
    html_path = "C:/Users/kevin/Jupyter Notebook Projects/shooting stats/{}.html".format(year)
    with open(html_path, encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    table_html = str(soup.find(id="all_shooting_stats"))
    season_table = pd.read_html(table_html)[0]

    # Added while the header still has two levels, so this column's label
    # becomes '' after the droplevel and is restored by the rename below.
    season_table["Year"] = year

    season_table.columns = season_table.columns.droplevel(0)
    season_table = season_table.drop(columns=drop_cols)
    season_table.columns = shooting_cols

    dfs.append(season_table)

In [43]:
# Stack the per-season shooting tables into a single DataFrame.
shooting_df = pd.concat(dfs)

In [45]:
# Persist the combined shooting table; index=False keeps the row labels out of the file.
shooting_df.to_csv('C:/Users/kevin/Downloads/shooting stats 2014-2023.csv', index=False)

In [6]:
# Reload the combined shooting table from the CSV written above.
shooting_df = pd.read_csv("C:/Users/kevin/Downloads/shooting stats 2014-2023.csv")

### Loop for Defensive Data

In [9]:
# The same spacer columns are dropped for every season, so define them once.
drop_cols = ['Rk', 'Unnamed: 19', 'Unnamed: 24']

# Build one cleaned advanced-stats table per season from the cached HTML.
dfs2 = []
for year in years:
    html_path = "C:/Users/kevin/Jupyter Notebook Projects/defensive stats/{}.html".format(year)
    with open(html_path, encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    table_html = str(soup.find(id="all_advanced_stats"))
    season_table = pd.read_html(table_html)[0]
    season_table["Year"] = year

    season_table = season_table.drop(columns=drop_cols)

    dfs2.append(season_table)

In [10]:
# Stack the per-season advanced-stats tables into a single DataFrame.
defensive_df = pd.concat(dfs2)

In [13]:
# Persist the combined defensive table. The file name must match the path the
# reload cell reads ("... 2014-2023.csv"), otherwise the reload cannot find it.
defensive_df.to_csv('C:/Users/kevin/Downloads/defensive stats 2014-2023.csv', index=False)

In [7]:
# Reload the combined defensive table from CSV.
defensive_df = pd.read_csv("C:/Users/kevin/Downloads/defensive stats 2014-2023.csv")

### Scraping Measurement Data from NBA API

In [8]:
import nba_api

In [9]:
# Browser-like HTTP headers targeting stats.nba.com.
# NOTE(review): no cell shown here passes this dict to a request or endpoint
# call — confirm whether the API calls below were meant to use it.
custom_headers = {
    'Host': 'stats.nba.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

Below is the player dictionary I previously saved from the NBA API; its player IDs serve as the keys for querying other endpoints.

In [10]:
# Load the saved player lookup table; later cells read its `full_name` and `id` columns.
player_dict = pd.read_csv("C:/Users/kevin/Downloads/player dictionary.csv")

In [11]:
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.library.parameters import SeasonType

In [12]:
from nba_api.stats.endpoints import commonplayerinfo

Finding Scottie Barnes's ID from the player dictionary as an example to pull from the endpoint.

In [14]:
# Select the lookup row(s) whose full name is exactly 'Scottie Barnes'.
name_mask = player_dict['full_name'] == 'Scottie Barnes'
player_dict[name_mask]

Unnamed: 0,id,full_name,first_name,last_name,is_active
224,1630567,Scottie Barnes,Scottie,Barnes,True


In [20]:
# Query the CommonPlayerInfo endpoint for one player and show its first result table.
barnes_info = commonplayerinfo.CommonPlayerInfo(player_id = '1630567')
barnes_info.get_data_frames()[0]

Unnamed: 0,PERSON_ID,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,LAST_AFFILIATION,HEIGHT,WEIGHT,SEASON_EXP,JERSEY,POSITION,ROSTERSTATUS,GAMES_PLAYED_CURRENT_SEASON_FLAG,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CODE,TEAM_CITY,PLAYERCODE,FROM_YEAR,TO_YEAR,DLEAGUE_FLAG,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GREATEST_75_FLAG
0,1630567,Scottie,Barnes,Scottie Barnes,"Barnes, Scottie",S. Barnes,scottie-barnes,2001-08-01T00:00:00,Florida State,USA,Florida State/USA,6-7,237,2,4,Forward-Guard,Active,Y,1610612761,Raptors,TOR,raptors,Toronto,scottie_barnes,2021,2023,N,Y,Y,2021,1,4,N


I wanted height and weight as the main pre-draft indicators, and this endpoint provides both (the `HEIGHT` and `WEIGHT` columns).

In [13]:
# Outer-join the shooting and defensive tables on player name and season.
# NOTE(review): if a player has more than one row for a season in either table,
# this join pairs every such row with every matching row — confirm that is intended.
merge_keys = ['Player', 'Year']
merged_df = pd.merge(
    left=shooting_df,
    right=defensive_df,
    how='outer',
    on=merge_keys,
)

In [14]:
# Distinct player names across both tables, used to drive the bio lookups.
player_names = merged_df['Player']
merged_list = player_names.unique()

In [None]:
# Resolve each player name to an NBA API id via player_dict and collect that
# player's CommonPlayerInfo row into `player_bios`.
player_bios = pd.DataFrame()

if 'full_name' not in player_dict.columns:
    # Checked once up front: without this column no name can be resolved.
    print("Column 'full_name' not found in player_dict.")
else:
    # Collect the per-player frames in a list and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole table
    # on every iteration.
    bio_frames = []
    try:
        for i, name in enumerate(merged_list, start=1):
            matching_rows = player_dict.loc[player_dict['full_name'] == name, 'id']

            if matching_rows.empty:
                print(f"No matching rows found for {name} in player_dict.")
                continue

            # Use the first match when several rows share the same name.
            player_id = matching_rows.values[0]

            # Make the API call
            result_df = commonplayerinfo.CommonPlayerInfo(player_id=player_id).get_data_frames()[0]
            bio_frames.append(result_df)

            print(f"Processed player {i}/{len(merged_list)}: {name}")

            # Random pause between calls so requests are not sent in a tight burst.
            lag = np.random.uniform(low=0.3, high=3)
            print(f'...waiting {round(lag, 1)} seconds')
            time.sleep(lag)
    finally:
        # Runs even if a request fails part-way, so the rows gathered so far
        # are kept in player_bios while the error still propagates.
        if bio_frames:
            player_bios = pd.concat(bio_frames, ignore_index=True)

In [19]:
# Persist the scraped player bios; index=False keeps the row labels out of the file.
player_bios.to_csv('C:/Users/kevin/Downloads/player bios 2014-2023.csv', index=False)

In [20]:
# Reload the player bios from the CSV written above.
player_bios = pd.read_csv('C:/Users/kevin/Downloads/player bios 2014-2023.csv')