In [10]:
#Import libraries 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

#Paste player links for the 2000s Hall of Fame class
#Each page lives at /players/<first letter of the id>/<id>.html
player_urls = [
    f"https://www.basketball-reference.com/players/{player_id[0]}/{player_id}.html"
    for player_id in (
        "jordami01",  #Michael Jordan 2009
        "robinda01",  #David Robinson 2009
        "stockjo01",  #John Stockton 2009
        "dantlad01",  #Adrian Dantley 2008
        "ewingpa01",  #Patrick Ewing 2008
        "olajuha01",  #Hakeem Olajuwon 2008
        "barklch01",  #Charles Barkley 2006
        "dumarjo01",  #Joe Dumars 2006
        "wilkido01",  #Dominique Wilkins 2006
        "drexlcl01",  #Clyde Drexler 2004
        "stokema01",  #Maurice Stokes 2004
        "parisro01",  #Robert Parish 2003
        "worthja01",  #James Worthy 2003
        "johnsma02",  #Magic Johnson 2002
        "petrodr01",  #Drazen Petrovic 2002
        "malonmo01",  #Moses Malone 2001
        "mcadobo01",  #Bob McAdoo 2000
        "thomais01",  #Isiah Thomas 2000
    )
]

#Request headers: send a desktop-browser User-Agent with every page request
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    )
}

#Accumulator that receives one dict of stats per scraped player
all_players_data = list()

#Lookup from a full position name to its abbreviation
position_map = dict([
    ('Center', 'C'),
    ('Power Forward', 'PF'),
    ('Small Forward', 'SF'),
    ('Shooting Guard', 'SG'),
    ('Point Guard', 'PG'),
])

#Function to clean and extract player position
def get_position(soup):
    """Return the player's primary position from the first paragraph containing 'Position:'.

    The text after 'Position:' is cut at 'Shoots:' (when present), bullet
    characters are dropped, whitespace is collapsed and ' and ' is treated as
    a comma. The first comma-separated entry is returned as its abbreviation
    from position_map, or unchanged when it has no entry there.

    Returns None when no paragraph contains 'Position:'.
    """
    for paragraph in soup.find_all('p'):
        if 'Position:' not in paragraph.get_text():
            continue

        #Keep only what follows the label, without the bullet separator
        listed = paragraph.get_text(separator=" ").split("Position:")[1].strip()
        listed = listed.replace('▪', '').strip()

        #Anything from 'Shoots:' onward belongs to a different field
        if 'Shoots:' in listed:
            listed = listed.split('Shoots:')[0].strip()

        #Collapse whitespace runs and turn "A and B" into "A, B"
        listed = " ".join(listed.split()).replace(' and ', ', ')

        primary = listed.split(',')[0].strip()
        return position_map.get(primary, primary)

    return None



#Function to safely extract MVP count
def get_mvp_count(soup):
    """Count MVP awards from the 'poptip' list items on a player page.

    Only items whose string mentions 'MVP' and none of 'Finals', 'AS' or
    'MBWA NBA' are considered. An item written as '<n>x ...' adds n; any other
    matching item adds 1. Returns 0 when nothing matches.
    """
    excluded_markers = ('Finals', 'AS', 'MBWA NBA')

    def is_plain_mvp(label):
        #Empty or missing strings never match
        return bool(label) and 'MVP' in label and not any(marker in label for marker in excluded_markers)

    total = 0
    for item in soup.find_all('li', {'class': 'poptip'}, string=is_plain_mvp):
        label = item.text.strip()
        #The number before the first 'x' is the count; no 'x' means a single award
        total += int(label.split('x')[0]) if 'x' in label else 1
    return total

#Function to safely extract Scoring Championships count
def get_scoring_champ_count(soup):
    """Count scoring titles from list items whose 'data-tip' mentions 'NBA Scoring Champ'.

    An item whose text is written as '<n>x ...' adds n; any other matching
    item adds 1. Returns 0 when nothing matches.
    """
    def mentions_scoring_title(tip):
        #Items without a data-tip attribute never match
        return bool(tip) and 'NBA Scoring Champ' in tip

    labels = (
        item.text.strip()
        for item in soup.find_all('li', {'data-tip': mentions_scoring_title})
    )
    #The number before the first 'x' is the count; no 'x' means a single title
    return sum(int(label.split('x')[0]) if 'x' in label else 1 for label in labels)

#Function to safely extract NBA Championships count
def get_chips_count(soup):
    """Count championships from list items whose string mentions 'NBA Champ' or 'ABA Champ'.

    An item written as '<n>x ...' adds n; any other matching item adds 1.
    Returns 0 when nothing matches.
    """
    def is_title(label):
        #Empty or missing strings never match
        return bool(label) and any(league in label for league in ('NBA Champ', 'ABA Champ'))

    titles = 0
    for item in soup.find_all('li', class_='', string=is_title):
        label = item.text.strip()
        if 'x' not in label:
            #No multiplier: a single championship
            titles += 1
            continue
        #Multiplier form: the number before the first 'x' is the count
        titles += int(label.split('x')[0])
    return titles

#Seconds to wait for a player page before giving up on that request
REQUEST_TIMEOUT_SECONDS = 30


def _award_count(label):
    """Return the count carried by an award label.

    A label containing 'x' is read as '<count>x <award>' and the integer before
    the first 'x' is returned; a label without 'x' counts as a single award.
    """
    label = label.strip()
    return int(label.split('x')[0]) if 'x' in label else 1


def _stat_text(soup, data_tip, stat_name):
    """Return the text of the second 'p' element that follows a stat's header span.

    data_tip is the value (or predicate) matched against the span's 'data-tip'
    attribute; stat_name is only used in the error message.

    Raises ValueError when no such span exists, so the caller can report which
    stat was missing instead of failing on a conversion of None.
    """
    header = soup.find('span', {'data-tip': data_tip})
    if header is None:
        raise ValueError(f"could not find the '{stat_name}' stat")
    return header.find_next('p').find_next('p').text.strip()


def _scrape_player(soup):
    """Build one player's record (a dict keyed by dataframe column name) from a parsed page.

    Any lookup that fails raises; the caller reports the error and skips the player.
    """
    #Get player name
    player_name = soup.find('h1').find('span').text.strip()

    #Get player career length: first token of the text right after the 'Career Length:' label
    career_length_elem = soup.find('strong', string=lambda s: s and 'Career Length:' in s)
    career_length = int(career_length_elem.next_sibling.strip().split()[0])

    #All-Star selections: number before the 'x' in the all_star item, 0 when the item is absent
    all_star_elem = soup.find('li', {'class': 'all_star'})
    all_stars = int(all_star_elem.find('a').text.strip().split('x')[0]) if all_star_elem is not None else 0

    #All-NBA / All-ABA and All-Defensive selections, 0 when the item is absent
    all_nba_elem = soup.find('li', string=lambda s: s and ('All-NBA' in s or 'All-ABA' in s))
    all_nba_count = _award_count(all_nba_elem.find('a').text) if all_nba_elem is not None else 0

    all_defense_elem = soup.find('li', {'class': 'poptip'}, string=lambda s: s and 'All-Defensive' in s)
    all_defense_count = _award_count(all_defense_elem.find('a').text) if all_defense_elem is not None else 0

    #Defensive Player of the Year awards, summed over every matching item
    dpoy_elems = soup.find_all('li', class_='poptip', string=lambda s: s and 'Def. POY' in s)
    dpoy_count = sum(_award_count(elem.text) for elem in dpoy_elems)

    return {
        'Name': player_name,
        'Position': get_position(soup),
        'Games': int(_stat_text(soup, 'Games', 'Games')),
        'Career Length': career_length,
        'PPG': float(_stat_text(soup, 'Points', 'Points')),
        'RPG': float(_stat_text(soup, 'Total Rebounds', 'Total Rebounds')),
        'APG': float(_stat_text(soup, 'Assists', 'Assists')),
        #PER is matched by substring because its data-tip holds more than the bare name
        'PER': float(_stat_text(soup, lambda x: x and 'Player Efficiency Rating' in x, 'Player Efficiency Rating')),
        'FG%': float(_stat_text(soup, 'Field Goal Percentage', 'Field Goal Percentage')),
        'FT%': float(_stat_text(soup, 'Free Throw Percentage', 'Free Throw Percentage')),
        'All-Stars': all_stars,
        'All-NBA': all_nba_count,
        'All-Defense': all_defense_count,
        'All-Rookie Team': 1 if soup.find('li', {'data-tip': lambda s: s and 'All-Rookie' in s}) else 0,
        'MVPs': get_mvp_count(soup),
        'Chips': get_chips_count(soup),
        'ROY': 1 if soup.find('li', {'class': 'poptip'}, string=lambda s: s and 'ROY' in s) else 0,
        'DPOYs': dpoy_count,
        'Scoring Champ': get_scoring_champ_count(soup),
        #All players are in the Hall of Fame
        'HOF': 1
    }


#Loop over each player URL to scrape their data
for url in player_urls:
    try:
        #A timeout keeps one unresponsive page from hanging the whole run
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        #Connection errors and timeouts skip this player instead of aborting the run
        print(f"Error requesting {url}: {e}")
    else:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            try:
                all_players_data.append(_scrape_player(soup))
            except Exception as e:
                #Best effort: report the page that could not be parsed and move on
                print(f"Error scraping data for {url}: {e}")
        else:
            #Report skipped pages so a missing row is never silent
            print(f"Skipping {url}: HTTP status {response.status_code}")

    time.sleep(1)  #Be polite with requests, avoid overwhelming the server

#Create a DataFrame from the collected data
df_2000s = pd.DataFrame(all_players_data)

#Later cells attach values by row position, so make any skipped player visible now
if len(df_2000s) != len(player_urls):
    print(f"Warning: scraped {len(df_2000s)} of {len(player_urls)} players; values keyed by row index will not line up")

In [11]:
#Display the scraped 2000s Hall of Fame dataframe
df_2000s

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Michael Jordan,SG,1072,15,30.1,6.2,5.3,27.9,49.7,83.5,14,11,9,1,5,6,1,1,10,1
1,David Robinson,C,987,14,21.1,10.6,2.5,26.2,51.8,73.6,10,10,8,1,1,2,1,1,1,1
2,John Stockton,PG,1504,19,13.1,2.7,10.5,21.8,51.5,82.6,10,11,5,0,0,0,0,0,0,1
3,Adrian Dantley,SF,955,15,24.3,5.7,3.0,21.5,54.0,81.8,6,2,0,1,0,0,1,0,2,1
4,Patrick Ewing,C,1183,17,21.0,9.8,1.9,21.0,50.4,74.0,11,7,3,1,0,0,1,0,0,1
5,Hakeem Olajuwon,C,1238,18,21.8,11.1,2.5,23.6,51.2,71.2,12,12,9,1,1,2,0,2,0,1
6,Charles Barkley,PF,1073,16,22.1,11.7,3.9,24.6,54.1,73.5,11,11,0,1,1,0,0,0,0,1
7,Joe Dumars,SG,1018,14,16.1,2.2,4.5,15.3,46.0,84.3,6,3,5,1,0,2,0,0,0,1
8,Dominique Wilkins,SF,1074,15,24.8,6.7,2.5,21.6,46.1,81.1,9,7,0,1,0,0,0,0,1,1
9,Clyde Drexler,SG,1086,15,20.4,6.1,5.6,21.1,47.2,78.8,10,5,0,0,0,1,0,0,0,1


I decided to add the Win Shares stat to my dataframe. Instead of adding it to the scraping code (which I'll do later), I'll add it manually for now.

In [12]:
#Hand-entered Win Shares, keyed by row position in df_2000s
#(rows follow the order of player_urls)
win_shares_2000s = {
    0:214.0,
    1:178.7,
    2:207.7,
    3:134.2,
    4:126.4,
    5:162.8,
    6:177.2,
    7:86.2,
    8:117.5,
    9:135.6,
    10:16.1,
    11:147.0,
    12:81.2,
    13:155.8,
    14:21.5,
    15:179.1,
    16:89.1,
    17:80.7
}

#The values are matched by position only: if a player was skipped while scraping,
#every later row would receive another player's value, so require one entry per row
if len(df_2000s) != len(win_shares_2000s):
    raise ValueError(
        f"df_2000s has {len(df_2000s)} rows but win_shares_2000s has "
        f"{len(win_shares_2000s)} entries; mapping by row position would misalign the values"
    )

#Create new column and map the win_shares dictionary to it
df_2000s['Win Shares'] = df_2000s.index.map(win_shares_2000s)

In [14]:
#Inspect the column names of df_2000s
df_2000s.columns

Index(['Name', 'Position', 'Games', 'Career Length', 'PPG', 'RPG', 'APG',
       'PER', 'FG%', 'FT%', 'All-Stars', 'All-NBA', 'All-Defense',
       'All-Rookie Team', 'MVPs', 'Chips', 'ROY', 'DPOYs', 'Scoring Champ',
       'HOF', 'Win Shares'],
      dtype='object')

In [20]:
#Column layout that places 'Win Shares' directly after 'FT%'
new_column_order_2000s = [
    'Name', 'Position', 'Games', 'Career Length',
    'PPG', 'RPG', 'APG', 'PER', 'FG%', 'FT%', 'Win Shares',
    'All-Stars', 'All-NBA', 'All-Defense', 'All-Rookie Team',
    'MVPs', 'Chips', 'ROY', 'DPOYs', 'Scoring Champ', 'HOF',
]
df_2000s = df_2000s.loc[:, new_column_order_2000s]

#Lift the column display limit so wide frames are shown in full
pd.set_option('display.max_columns', None)

#Show the reordered frame under a heading
print('Reordered Dataframe:')
df_2000s

Reordered Dataframe:


Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Michael Jordan,SG,1072,15,30.1,6.2,5.3,27.9,49.7,83.5,214.0,14,11,9,1,5,6,1,1,10,1
1,David Robinson,C,987,14,21.1,10.6,2.5,26.2,51.8,73.6,178.7,10,10,8,1,1,2,1,1,1,1
2,John Stockton,PG,1504,19,13.1,2.7,10.5,21.8,51.5,82.6,207.7,10,11,5,0,0,0,0,0,0,1
3,Adrian Dantley,SF,955,15,24.3,5.7,3.0,21.5,54.0,81.8,134.2,6,2,0,1,0,0,1,0,2,1
4,Patrick Ewing,C,1183,17,21.0,9.8,1.9,21.0,50.4,74.0,126.4,11,7,3,1,0,0,1,0,0,1
5,Hakeem Olajuwon,C,1238,18,21.8,11.1,2.5,23.6,51.2,71.2,162.8,12,12,9,1,1,2,0,2,0,1
6,Charles Barkley,PF,1073,16,22.1,11.7,3.9,24.6,54.1,73.5,177.2,11,11,0,1,1,0,0,0,0,1
7,Joe Dumars,SG,1018,14,16.1,2.2,4.5,15.3,46.0,84.3,86.2,6,3,5,1,0,2,0,0,0,1
8,Dominique Wilkins,SF,1074,15,24.8,6.7,2.5,21.6,46.1,81.1,117.5,9,7,0,1,0,0,0,0,1,1
9,Clyde Drexler,SG,1086,15,20.4,6.1,5.6,21.1,47.2,78.8,135.6,10,5,0,0,0,1,0,0,0,1


In [21]:
#Read in the 2010s and 2020s Hall of Fame csv files so they can be combined with the 2000s data
df_2010s = pd.read_csv('2010s_hof_players.csv')
df_2020s = pd.read_csv('2020s_hof_players.csv')

In [24]:
#Career lengths for the 2020s dataframe, listed in row order (key = row position)
career_length_2020s = dict(enumerate([
    14, 14, 17, 22, 12,  #rows 0-4
    15, 18, 21, 18, 16,  #rows 5-9
    16, 13, 13, 13, 13,  #rows 10-14
    13, 19, 16, 15, 20,  #rows 15-19
    19, 21,              #rows 20-21
]))

In [None]:
#Add win share column
#2020s first: hand-entered values keyed by row position in df_2020s
win_shares_2020s = {
    0:162.6,
    1:68.7,
    2:120.8,
    3:125.3,
    4:52.5,
    5:76.9,
    6:144.1,
    7:206.3,
    8:111.3,
    9:120.7,
    10:106.4,
    11:85.0,
    12:81.0,
    13:106.0,
    14:83.0,
    15:59.6,
    16:150.0,
    17:93.5,
    18:84.7,
    19:172.7,
    20:206.4,
    21:191.4
}

#Both lookups are matched by row position only, so a csv with a different number of
#rows would leave gaps or drop entries without any error; require one entry per row
for column_name, values_by_row in (('Career Length', career_length_2020s),
                                   ('Win Shares', win_shares_2020s)):
    if len(values_by_row) != len(df_2020s):
        raise ValueError(
            f"df_2020s has {len(df_2020s)} rows but the '{column_name}' lookup has "
            f"{len(values_by_row)} entries; mapping by row position would misalign the values"
        )

#Create new columns and map the dictionaries to them
df_2020s['Career Length'] = df_2020s.index.map(career_length_2020s)
df_2020s['Win Shares'] = df_2020s.index.map(win_shares_2020s)

#Define new column order
new_column_order_2020s = ['Name', 'Position', 'Games', 'Career Length', 'PPG', 'RPG', 'APG',
       'PER', 'FG%', 'FT%', 'Win Shares', 'All-Stars', 'All-NBA', 'All-Defense',
       'All-Rookie Team', 'MVPs', 'Chips', 'ROY', 'DPOYs', 'Scoring Champ',
       'HOF']
df_2020s = df_2020s[new_column_order_2020s]
df_2020s

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Jerry West,SG,932,14,27.0,5.8,6.7,22.9,47.4,81.4,162.6,14,12,5,0,0,1,0,0,1,1
1,Dick Barnett,SG,971,14,15.8,2.9,2.8,14.7,45.6,76.1,68.7,1,0,0,0,0,2,0,0,0,1
2,Chauncey Billups,PG,1043,17,15.2,2.9,5.4,18.8,41.5,89.4,120.8,5,3,2,0,0,1,0,0,0,1
3,Vince Carter,SG,1541,22,16.7,4.3,3.1,18.6,43.5,79.8,125.3,8,2,0,1,0,0,1,0,0,1
4,Michael Cooper,SG,873,12,8.9,3.2,4.2,12.8,46.9,83.3,52.5,0,0,8,0,0,5,0,1,0,1
5,Walter Davis,SG,1033,15,18.9,3.0,3.8,19.1,51.1,85.1,76.9,6,2,0,1,0,0,1,0,0,1
6,Pau Gasol,C,1226,18,17.0,9.2,3.2,21.4,50.7,75.3,144.1,6,4,0,1,0,2,1,0,0,1
7,Dirk Nowitzki,PF,1522,21,20.7,7.5,2.4,22.4,47.1,87.9,206.3,14,12,0,0,1,1,0,0,0,1
8,Tony Parker,PG,1254,18,15.5,2.7,5.6,18.2,49.1,75.1,111.3,6,4,0,1,0,4,0,0,0,1
9,Dwyane Wade,SG,1054,16,22.0,4.7,5.4,23.5,48.0,76.5,120.7,13,8,3,1,0,3,0,0,1,1


In [27]:
#Add career length and win shares column to 2010s dataset
#Both lookups are hand-entered and keyed by row position in df_2010s
career_length_2010s = {
    0:13,
    1:6,
    2:16,
    3:12,
    4:11,
    5:14,
    6:12,
    7:18,
    8:15,
    9:18,
    10:19,
    11:18,
    12:4,
    13:10,
    14:11,
    15:16,
    16:12,
    17:14,
    18:8,
    19:19,
    20:12,
    21:13,
    22:18,
    23:12,
    24:7,
    25:15,
    26:14,
    27:12,
    28:8,
    29:13,
    30:14,
    31:17,
    32:9,
    33:18,
    34:9,
    35:13,
    36:12,
    37:17,
    38:16,
    39:14,
    40:7,
    41:14,
    42:10,
    43:19,
    44:17
}
win_shares_2010s = {
    0:64.3,
    1:11.6,
    2:96.4,
    3:94.1,
    4:90.3,
    5:112.5,
    6:67.7,
    7:145.1,
    8:103.5,
    9:99.9,
    10:138.6,
    11:129.7,
    12:14.3,
    13:41.4,
    14:74.6,
    15:97.3,
    16:106.0,
    17:99.0,
    18:65.9,
    19:181.7,
    20:74.1,
    21:78.5,
    22:117.0,
    23:54.0,
    24:20.0,
    25:89.7,
    26:79.3,
    27:33.3,
    28:65.2,
    29:69.2,
    30:75.4,
    31:145.5,
    32:64.8,
    33:174.4,
    34:20.1,
    35:117.4,
    36:71.3,
    37:189.7,
    38:93.1,
    39:89.8,
    40:47.3,
    41:82.6,
    42:36.9,
    43:234.6,
    44:125.1
}

#Values are matched by row position only, so a csv with a different number of rows
#would leave gaps or drop entries without any error; require one entry per row
for column_name, values_by_row in (('Career Length', career_length_2010s),
                                   ('Win Shares', win_shares_2010s)):
    if len(values_by_row) != len(df_2010s):
        raise ValueError(
            f"df_2010s has {len(df_2010s)} rows but the '{column_name}' lookup has "
            f"{len(values_by_row)} entries; mapping by row position would misalign the values"
        )

#Create new columns and map the dictionaries to them
df_2010s['Career Length'] = df_2010s.index.map(career_length_2010s)
df_2010s['Win Shares'] = df_2010s.index.map(win_shares_2010s)

#Define new column order
new_column_order_2010s = ['Name', 'Position', 'Games', 'Career Length', 'PPG', 'RPG', 'APG',
       'PER', 'FG%', 'FT%', 'Win Shares', 'All-Stars', 'All-NBA', 'All-Defense',
       'All-Rookie Team', 'MVPs', 'Chips', 'ROY', 'DPOYs', 'Scoring Champ',
       'HOF']
df_2010s = df_2010s[new_column_order_2010s]
df_2010s

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Carl Braun,PG,788,13,13.5,3.4,3.7,15.8,38.3,80.4,64.3,5,2,0,0,0,1,0,0,0,1
1,Chuck Cooper,SF,409,6,6.7,5.9,1.8,11.1,33.9,74.3,11.6,0,0,0,0,0,0,0,0,0,1
2,Vlade Divac,C,1134,16,11.8,8.2,3.1,17.7,49.5,69.2,96.4,1,0,0,1,0,0,0,0,0,1
3,Bobby Jones,PF,941,12,12.1,6.1,2.7,18.2,56.0,76.6,94.1,5,1,11,1,0,1,0,0,0,1
4,Sidney Moncrief,SG,767,11,15.6,4.7,3.6,18.7,50.2,83.1,90.3,5,5,5,0,0,0,0,2,0,1
5,Jack Sikma,C,1107,14,15.6,9.8,3.2,17.3,46.4,84.9,112.5,7,0,1,1,0,1,0,0,0,1
6,Paul Westphal,SG,823,12,15.6,1.9,4.4,19.4,50.4,82.0,67.7,5,4,0,0,0,1,0,0,0,1
7,Ray Allen,SG,1300,18,18.9,4.1,3.4,18.6,45.2,89.4,145.1,10,2,0,1,0,2,0,0,0,1
8,Maurice Cheeks,PG,1101,15,11.1,2.8,6.7,16.5,52.3,79.3,103.5,4,0,5,0,0,1,0,0,0,1
9,Grant Hill,SF,1026,18,16.7,6.0,4.1,19.0,48.3,76.9,99.9,7,5,0,1,0,0,1,0,0,1


In [30]:
#Finally, stack the three decade frames (2020s, then 2010s, then 2000s) under one fresh row index
df = pd.concat(
    objs=[df_2020s, df_2010s, df_2000s],
    ignore_index=True,
)
df

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Jerry West,SG,932,14,27.0,5.8,6.7,22.9,47.4,81.4,162.6,14,12,5,0,0,1,0,0,1,1
1,Dick Barnett,SG,971,14,15.8,2.9,2.8,14.7,45.6,76.1,68.7,1,0,0,0,0,2,0,0,0,1
2,Chauncey Billups,PG,1043,17,15.2,2.9,5.4,18.8,41.5,89.4,120.8,5,3,2,0,0,1,0,0,0,1
3,Vince Carter,SG,1541,22,16.7,4.3,3.1,18.6,43.5,79.8,125.3,8,2,0,1,0,0,1,0,0,1
4,Michael Cooper,SG,873,12,8.9,3.2,4.2,12.8,46.9,83.3,52.5,0,0,8,0,0,5,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,Magic Johnson,PG,906,13,19.5,7.2,11.2,24.1,52.0,84.8,155.8,12,10,0,1,3,5,0,0,0,1
81,Dražen Petrović,SG,290,4,15.4,2.3,2.4,16.4,50.6,84.1,21.5,0,1,0,0,0,0,0,0,0,1
82,Moses Malone,C,1455,21,20.3,12.3,1.3,22.0,49.5,76.0,179.1,13,8,2,1,3,1,0,0,0,1
83,Bob McAdoo,C,852,14,22.1,9.4,2.3,20.7,50.3,75.4,89.1,5,2,0,1,1,2,1,0,3,1


In [31]:
#Save the combined dataframe to a csv file, leaving out the row index
df.to_csv('HOF Players 2020-2000.csv',index=False)