In [None]:
#Import libraries 
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

#Paste player links for the 2020s Hall of Fame class
player_urls = [
    "https://www.basketball-reference.com/players/w/westje01.html",  #Jerry West 2024
    "https://www.basketball-reference.com/players/b/barnedi01.html",  #Dick Barnett 2024
    "https://www.basketball-reference.com/players/b/billuch01.html",  #Chauncey Billups 2024
    "https://www.basketball-reference.com/players/c/cartevi01.html",   #Vince Carter 2024
    "https://www.basketball-reference.com/players/c/coopemi01.html",  #Michael Cooper 2024
    "https://www.basketball-reference.com/players/d/daviswa03.html",  #Walter Davis 2024
    "https://www.basketball-reference.com/players/g/gasolpa01.html",  #Pau Gasol 2023
    "https://www.basketball-reference.com/players/n/nowitdi01.html",  #Dirk Nowitzki 2023
    "https://www.basketball-reference.com/players/p/parketo01.html",  #Tony Parker 2023
    "https://www.basketball-reference.com/players/w/wadedw01.html",   #Dwayne Wade 2023
    "https://www.basketball-reference.com/players/g/ginobma01.html",  #Manu Ginobili 2022
    "https://www.basketball-reference.com/players/h/hardati01.html",  #Tim Hardaway (Sr) 2022
    "https://www.basketball-reference.com/players/h/hudsolo01.html",  #Lou Hudson 2022
    "https://www.basketball-reference.com/players/b/boshch01.html",   #Chris Bosh 2021
    "https://www.basketball-reference.com/players/d/dandrbo01.html",  #Bob Danridge 2021
    "https://www.basketball-reference.com/players/k/kukocto01.html",  #Toni Kukoc 2021
    "https://www.basketball-reference.com/players/p/piercpa01.html",  #Paul Pierce 2021
    "https://www.basketball-reference.com/players/w/wallabe01.html",  #Ben Wallace 2021
    "https://www.basketball-reference.com/players/w/webbech01.html",  #Chris Webber 2021
    "https://www.basketball-reference.com/players/b/bryanko01.html",  #Kobe Bryant 2020
    "https://www.basketball-reference.com/players/d/duncati01.html",  #Tim Duncan 2020
    "https://www.basketball-reference.com/players/g/garneke01.html"    #Kevin Garnett 2020
]

#Headers for the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

#Initialize a list to store all players' data
all_players_data = []

#Player position abbreviations
position_map = {
    'Center': 'C',
    'Power Forward': 'PF',
    'Small Forward': 'SF',
    'Shooting Guard': 'SG',
    'Point Guard': 'PG'
}

#Function to clean and extract player position
def get_position(soup):
    position = None  #Default position if not found
    p_elems = soup.find_all('p')

    for p_elem in p_elems:
        if 'Position:' in p_elem.get_text():
            position_text = p_elem.get_text(separator=" ").split("Position:")[1].strip()
            position_text = position_text.replace('▪', '').strip()
            if 'Shoots:' in position_text:
                position_text = position_text.split('Shoots:')[0].strip()
            position_text = " ".join(position_text.split())
            position_text = position_text.replace(' and ', ', ')
            positions = position_text.split(',')
            primary_position = positions[0].strip()
            position = position_map.get(primary_position, primary_position)
            break
    return position



#Function to safely extract MVP count
def get_mvp_count(soup):
    mvp_count = 0
    mvp_elem = soup.find_all('li', {'class': 'poptip'}, string=lambda s: s and 'MVP' in s and 'Finals' not in s and 'AS' not in s and 'MBWA NBA' not in s)
    for elem in mvp_elem:
        text = elem.text.strip()
        if 'x' in text:
            mvp_count += int(text.split('x')[0])  #For players with multiple MVPs, split on 'x' and extract the first element
        else:
            mvp_count += 1  #For players with only 1 MVP
    return mvp_count

#Function to safely extract Scoring Championships count
def get_scoring_champ_count(soup):
    scoring_champ_count = 0
    scoring_champ_elem = soup.find_all('li', {'data-tip': lambda x: x and 'NBA Scoring Champ' in x})
    for elem in scoring_champ_elem:
        text = elem.text.strip()
        if 'x' in text:
            scoring_champ_count += int(text.split('x')[0])  # For players with multiple scoring championships, split on 'x'
        else:
            scoring_champ_count += 1  # For players with just 1 scoring championship
    return scoring_champ_count

#Function to safely extract NBA Championships count
def get_chips_count(soup):
    chips_count = 0
    chip_elem = soup.find_all('li', class_='', string=lambda s: s and 'NBA Champ' in s)
    for elem in chip_elem:
        text = elem.text.strip()
        if 'x' in text:
            chips_count += int(text.split('x')[0])  #For players with multiple championships, split on 'x'
        else:
            chips_count += 1  #For players with 1 championship
    return chips_count

#Loop over each player URL to scrape their data
for url in player_urls:
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        try:
            #Get player name
            player_name = soup.find('h1').find('span').text.strip()

            #Get player position
            position = get_position(soup)

            #Safeguard function for extracting stats
            def safe_find(tag, text):
                element = soup.find('span', {'data-tip': text})
                if element:
                    return element.find_next('p').find_next('p').text.strip()
                return None

            games = int(safe_find('Games', 'Games'))
            ppg = float(safe_find('Points', 'Points'))
            rpg = float(safe_find('Total Rebounds', 'Total Rebounds'))
            apg = float(safe_find('Assists', 'Assists'))

            #Extract PER
            per_elem = soup.find('span', {'data-tip': lambda x: x and 'Player Efficiency Rating' in x})
            per = float(per_elem.find_next('p').find_next('p').text.strip() if per_elem else None)

            #Extract Field Goal Percentage and Free Throw Percentage
            fg_pct = float(safe_find('Field Goal Percentage', 'Field Goal Percentage'))
            ft_pct = float(safe_find('Free Throw Percentage', 'Free Throw Percentage'))

            #Extract awards counts
            mvp_count = get_mvp_count(soup)
            scoring_champ_count = get_scoring_champ_count(soup)
            chips_count = get_chips_count(soup)

            #Extract All-Stars, All-NBA, All-Defense, and other honors
            all_stars = int(soup.find('li', {'class': 'all_star'}).find('a').text.strip().split('x')[0] if soup.find('li', {'class': 'all_star'}) else 0)
            all_nba_count = sum([int(a.text.strip().split('x')[0]) if 'x' in a.text else 1 for a in soup.find('li', string=lambda s: s and 'All-NBA' in s).find('a')]) if soup.find('li', string=lambda s: s and 'All-NBA' in s) else 0
            all_defense_count = sum([int(a.text.strip().split('x')[0]) if 'x' in a.text else 1 for a in soup.find('li', {'class': 'poptip'}, string=lambda s: s and 'All-Defensive' in s).find('a')]) if soup.find_all('li', {'class': 'poptip'}, string=lambda s: s and 'All-Defensive' in s) else 0
            all_rookie = 1 if soup.find('li', {'data-tip': lambda s: s and 'All-Rookie' in s}) else 0
            roy = 1 if soup.find('li', {'class': 'poptip'}, string=lambda s: s and 'ROY' in s) else 0
            dpoy_count = sum([int(text.split('x')[0]) if 'x' in text else 1 for text in [elem.text.strip() for elem in soup.find_all('li', class_='poptip', string=lambda s: s and 'Def. POY' in s)]])
            
            #All players are in the Hall of Fame
            hof = 1

            #Store the data
            player_data = {
                'Name': player_name,
                'Position': position,
                'Games': games,
                'PPG': ppg,
                'RPG': rpg,
                'APG': apg,
                'PER': per,
                'FG%': fg_pct,
                'FT%': ft_pct,
                'All-Stars': all_stars,
                'All-NBA': all_nba_count,
                'All-Defense': all_defense_count,
                'All-Rookie Team': all_rookie,
                'MVPs': mvp_count,
                'Chips': chips_count,
                'ROY': roy,
                'DPOYs': dpoy_count,
                'Scoring Champ': scoring_champ_count,
                'HOF': hof
            }
            all_players_data.append(player_data)

        except Exception as e:
            print(f"Error scraping data for {url}: {e}")

        time.sleep(1)  #Be polite with requests, avoid overwhelming the server

#Create a DataFrame from the collected data
df = pd.DataFrame(all_players_data)

#Save to a CSV file
df.to_csv('2020_hof_players.csv', index=False)

In [2]:
#Display the scraped DataFrame (one row per player that was scraped without an error)
df

Unnamed: 0,Name,Position,Games,PPG,RPG,APG,PER,FG%,FT%,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Jerry West,PG,932,27.0,5.8,6.7,22.9,47.4,81.4,14,12,5,0,0,1,0,0,1,1
1,Dick Barnett,SG,971,15.8,2.9,2.8,14.7,45.6,76.1,1,0,0,0,0,2,0,0,0,1
2,Chauncey Billups,PG,1043,15.2,2.9,5.4,18.8,41.5,89.4,5,3,2,0,0,1,0,0,0,1
3,Vince Carter,SG,1541,16.7,4.3,3.1,18.6,43.5,79.8,8,2,0,1,0,0,1,0,0,1
4,Michael Cooper,SG,873,8.9,3.2,4.2,12.8,46.9,83.3,0,0,8,0,0,5,0,1,0,1
5,Walter Davis,SG,1033,18.9,3.0,3.8,19.1,51.1,85.1,6,2,0,1,0,0,1,0,0,1
6,Pau Gasol,C,1226,17.0,9.2,3.2,21.4,50.7,75.3,6,4,0,1,0,2,1,0,0,1
7,Dirk Nowitzki,PF,1522,20.7,7.5,2.4,22.4,47.1,87.9,14,12,0,0,1,1,0,0,0,1
8,Tony Parker,PG,1254,15.5,2.7,5.6,18.2,49.1,75.1,6,4,0,1,0,4,0,0,0,1
9,Dwyane Wade,SG,1054,22.0,4.7,5.4,23.5,48.0,76.5,13,8,3,1,0,3,0,0,1,1


**Note: these are regular-season career stats.**

In [3]:
#Check column dtypes and non-null counts to make sure every stat was parsed as a number
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             22 non-null     object 
 1   Position         22 non-null     object 
 2   Games            22 non-null     int64  
 3   PPG              22 non-null     float64
 4   RPG              22 non-null     float64
 5   APG              22 non-null     float64
 6   PER              22 non-null     float64
 7   FG%              22 non-null     float64
 8   FT%              22 non-null     float64
 9   All-Stars        22 non-null     int64  
 10  All-NBA          22 non-null     int64  
 11  All-Defense      22 non-null     int64  
 12  All-Rookie Team  22 non-null     int64  
 13  MVPs             22 non-null     int64  
 14  Chips            22 non-null     int64  
 15  ROY              22 non-null     int64  
 16  DPOYs            22 non-null     int64  
 17  Scoring Champ    2

In [4]:
#Check for missing values: number of nulls in each column
df.isnull().sum()

Name               0
Position           0
Games              0
PPG                0
RPG                0
APG                0
PER                0
FG%                0
FT%                0
All-Stars          0
All-NBA            0
All-Defense        0
All-Rookie Team    0
MVPs               0
Chips              0
ROY                0
DPOYs              0
Scoring Champ      0
HOF                0
dtype: int64

In [5]:
#Lastly a couple players positions are wrong, while this is minor it's still worth fixing
df.loc[0,'Position'] = 'SG' #Jerry West's primary position was a Shooting Guard 
df.loc[13,'Position'] = 'PF' #Chris Bosh's primary position was Power Forward
df.loc[20,'Position'] = 'PF' #Tim Duncan is considerd one of the best if not the best Power Foward of all time

In [6]:
#Display the DataFrame again to confirm the position corrections were applied
df

Unnamed: 0,Name,Position,Games,PPG,RPG,APG,PER,FG%,FT%,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Jerry West,SG,932,27.0,5.8,6.7,22.9,47.4,81.4,14,12,5,0,0,1,0,0,1,1
1,Dick Barnett,SG,971,15.8,2.9,2.8,14.7,45.6,76.1,1,0,0,0,0,2,0,0,0,1
2,Chauncey Billups,PG,1043,15.2,2.9,5.4,18.8,41.5,89.4,5,3,2,0,0,1,0,0,0,1
3,Vince Carter,SG,1541,16.7,4.3,3.1,18.6,43.5,79.8,8,2,0,1,0,0,1,0,0,1
4,Michael Cooper,SG,873,8.9,3.2,4.2,12.8,46.9,83.3,0,0,8,0,0,5,0,1,0,1
5,Walter Davis,SG,1033,18.9,3.0,3.8,19.1,51.1,85.1,6,2,0,1,0,0,1,0,0,1
6,Pau Gasol,C,1226,17.0,9.2,3.2,21.4,50.7,75.3,6,4,0,1,0,2,1,0,0,1
7,Dirk Nowitzki,PF,1522,20.7,7.5,2.4,22.4,47.1,87.9,14,12,0,0,1,1,0,0,0,1
8,Tony Parker,PG,1254,15.5,2.7,5.6,18.2,49.1,75.1,6,4,0,1,0,4,0,0,0,1
9,Dwyane Wade,SG,1054,22.0,4.7,5.4,23.5,48.0,76.5,13,8,3,1,0,3,0,0,1,1


In [7]:
#Save the final (position-corrected) DataFrame to a CSV file, without the index column.
#Note: this file name ('2020s_...') differs from the '2020_hof_players.csv' written right after scraping.
df.to_csv('2020s_hof_players.csv',index=False)