In [5]:
#Import libraries 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

#Scrape the rest of the links for the players inducted in the hall of fame in the 1900s

#Paste player links for the 1900s Hall of Fame class
#One Basketball-Reference player page per inductee; the trailing comment on
#each line gives the player's name and Hall of Fame induction year.
player_urls = [
    "https://www.basketball-reference.com/players/c/chambwi01.html", #Wilt Chamberlain 1979
    "https://www.basketball-reference.com/players/a/arizipa01.html", #Paul Arizin 1978
    "https://www.basketball-reference.com/players/f/fulksjo01.html", #Joe Fulks 1978
    "https://www.basketball-reference.com/players/h/hagancl01.html", #Cliff Hagan 1978
    "https://www.basketball-reference.com/players/p/pollaji01.html", #Jim Pollard 1978
    "https://www.basketball-reference.com/players/b/bayloel01.html", #Elgin Baylor 1977
    "https://www.basketball-reference.com/players/g/golato01.html", #Tom Gola 1976
    "https://www.basketball-reference.com/players/s/sharmbi01.html", #Bill Sharman 1976
    "https://www.basketball-reference.com/players/r/russebi01.html", #Bill Russell 1975
    "https://www.basketball-reference.com/players/s/schaydo01.html", #Dolph Schayes 1973
    "https://www.basketball-reference.com/players/c/cousybo01.html", #Bob Cousy 1971
    "https://www.basketball-reference.com/players/p/pettibo01.html", #Bob Pettit 1971
    "https://www.basketball-reference.com/players/d/daviebo01.html", #Bob Davies 1970
    "https://www.basketball-reference.com/players/p/phillan01.html", #Andy Phillip 1961
    "https://www.basketball-reference.com/players/m/macaued01.html", #Ed Macauley 1960
    "https://www.basketball-reference.com/players/m/mikange01.html" #George Mikan 1959
]

#Browser-like headers sent with every page request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.36"
    ),
}

#Collects one dict of stats per successfully scraped player
all_players_data = list()

#Full position names (as they appear after "Position:") -> abbreviations
position_map = {
    "Center": "C",
    "Power Forward": "PF",
    "Small Forward": "SF",
    "Shooting Guard": "SG",
    "Point Guard": "PG",
}

#Function to clean and extract player position
def get_position(soup):
    """Return the player's primary position, abbreviated when possible.

    Looks at the first ``<p>`` whose text contains ``Position:``, keeps the
    text between that label and ``Shoots:`` (if present), and takes the first
    entry of the resulting comma/"and"-separated list. The name is translated
    through ``position_map``; names missing from the map are returned as-is.

    Returns ``None`` when no paragraph mentions ``Position:``.
    """
    for paragraph in soup.find_all('p'):
        if 'Position:' not in paragraph.get_text():
            continue

        #Everything after the label, with the bullet separator removed
        raw = paragraph.get_text(separator=" ").split("Position:")[1].strip()
        raw = raw.replace('▪', '').strip()

        #Drop the handedness section that may follow the position list
        raw = raw.partition('Shoots:')[0].strip()

        #Collapse whitespace runs, then treat "A and B" like "A, B"
        normalized = " ".join(raw.split()).replace(' and ', ', ')
        primary = normalized.split(',')[0].strip()
        return position_map.get(primary, primary)

    return None



#Shared helper for the award counters below
def _award_multiplier(text):
    """Return how many awards a badge label stands for.

    A label whose text starts with digits followed by ``x`` (the form the
    counters below split on, e.g. ``"4x MVP"``) yields that number. Every
    other label counts as one award. Only a leading all-digit prefix is
    accepted, so an ``x`` elsewhere in the label can no longer make ``int()``
    raise ``ValueError``.
    """
    prefix, marker, _ = text.strip().partition('x')
    prefix = prefix.strip()
    return int(prefix) if marker and prefix.isdecimal() else 1


#Function to safely extract MVP count
def get_mvp_count(soup):
    """Sum the MVP awards listed in ``<li class="poptip">`` badges.

    A badge is counted when its string contains ``MVP`` and none of the
    excluded markers (``Finals``, ``AS``, ``MBWA NBA``, ``USBWA MVP``).
    Returns 0 when no badge matches.
    """
    excluded = ('Finals', 'AS', 'MBWA NBA', 'USBWA MVP')

    def is_mvp_badge(s):
        return bool(s) and 'MVP' in s and not any(marker in s for marker in excluded)

    badges = soup.find_all('li', {'class': 'poptip'}, string=is_mvp_badge)
    return sum(_award_multiplier(badge.text) for badge in badges)


#Function to safely extract Scoring Championships count
def get_scoring_champ_count(soup):
    """Sum the scoring titles from ``<li>`` badges whose ``data-tip``
    attribute contains ``NBA Scoring Champ``. Returns 0 when none exist.
    """
    badges = soup.find_all('li', {'data-tip': lambda x: x and 'NBA Scoring Champ' in x})
    return sum(_award_multiplier(badge.text) for badge in badges)


#Function to safely extract NBA Championships count
def get_chips_count(soup):
    """Sum the championships from ``<li>`` badges (searched with an empty
    ``class_`` filter) whose string mentions an NBA, ABA or BAA title.
    Returns 0 when none exist.
    """
    def is_title_badge(s):
        return bool(s) and ('NBA Champ' in s or 'ABA Champ' in s or 'BAA Champ' in s)

    badges = soup.find_all('li', class_='', string=is_title_badge)
    return sum(_award_multiplier(badge.text) for badge in badges)

#Seconds to wait for the server before a request is treated as failed
REQUEST_TIMEOUT = 30


def _leading_count(text):
    """Return N for award text that starts with ``Nx`` (digits then ``x``), else 1.

    Text without such a prefix is treated as a single award instead of being
    passed to ``int()``, which would raise and discard the whole player.
    """
    prefix, marker, _ = text.strip().partition('x')
    prefix = prefix.strip()
    return int(prefix) if marker and prefix.isdecimal() else 1


def award_count(soup, award_name, tag='a', attributes=None):
    """Count an award from the first matching element.

    Finds the first ``tag`` element (filtered by ``attributes``) whose string
    contains ``award_name``. Returns the leading ``Nx`` count of its text,
    1 when the text has no such count, and 0 when no element matches.
    """
    elem = soup.find(tag, attributes, string=lambda s: s and award_name in s)
    return _leading_count(elem.text) if elem is not None else 0


def _career_stat(soup, data_tip):
    """Return the stripped text of the second ``<p>`` following the ``<span>``
    whose ``data-tip`` matches ``data_tip`` (a string or a predicate).

    Returns ``0.0`` when no such span exists, so callers can wrap the result
    in ``int()`` or ``float()`` either way.
    """
    header = soup.find('span', {'data-tip': data_tip})
    if header is None:
        return 0.0
    return header.find_next('p').find_next('p').text.strip()


def _fetch_player_page(url):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Prints the reason and returns ``None`` when the request fails or the
    response status is not 200, so a single bad page cannot stop the run.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Skipping {url}: HTTP status {response.status_code}")
        return None

    return BeautifulSoup(response.content, 'html.parser')


def _scrape_player(soup):
    """Build one row of stats and honors from a parsed player page.

    Raises whatever the underlying lookups raise (for example
    ``AttributeError`` when an expected element is missing); the caller
    decides how to report it.
    """
    #Get player name
    player_name = soup.find('h1').find('span').text.strip()

    #Get player position
    position = get_position(soup)

    #Get player career length (first token of the text following the label)
    career_length_elem = soup.find('strong', string=lambda s: s and 'Career Length:' in s)
    career_length = int(career_length_elem.next_sibling.strip().split()[0])

    #Career summary stats
    games = int(_career_stat(soup, 'Games'))
    ppg = float(_career_stat(soup, 'Points'))
    rpg = float(_career_stat(soup, 'Total Rebounds'))
    apg = float(_career_stat(soup, 'Assists'))
    per = float(_career_stat(soup, lambda tip: tip and 'Player Efficiency Rating' in tip))
    fg_pct = float(_career_stat(soup, 'Field Goal Percentage'))
    ft_pct = float(_career_stat(soup, 'Free Throw Percentage'))

    #Win Shares uses a different navigation path from the stats above
    win_shares_elem = soup.find('span', {'data-tip': lambda x: x and 'Win Shares' in x})
    if win_shares_elem is not None:
        win_shares = float(win_shares_elem.next_sibling.find_next('p').text.strip())
    else:
        win_shares = 0.0

    #Extract awards counts
    mvp_count = get_mvp_count(soup)
    scoring_champ_count = get_scoring_champ_count(soup)
    chips_count = get_chips_count(soup)

    #Extract All-Star selections
    all_star_elem = soup.find('li', {'class': 'all_star'})
    all_stars = _leading_count(all_star_elem.find('a').text) if all_star_elem is not None else 0

    #Extract All-NBA, All-ABA, or All-BAA awards
    all_nba = award_count(soup, 'All-NBA', tag='li', attributes={'class':  ""})
    all_aba = award_count(soup, 'All-ABA')
    all_baa = award_count(soup, 'All-BAA')
    all_nba_total = all_nba + all_aba + all_baa

    #Extract All-Defensive selections from the first matching badge
    all_defense_elem = soup.find('li', {'class': 'poptip'}, string=lambda s: s and 'All-Defensive' in s)
    all_defense_count = _leading_count(all_defense_elem.text) if all_defense_elem is not None else 0

    all_rookie = 1 if soup.find('li', {'data-tip': lambda s: s and 'All-Rookie' in s}) else 0
    roy = 1 if soup.find('li', {'class': 'poptip'}, string=lambda s: s and 'ROY' in s and 'MBWA NBA' not in s) else 0
    dpoy_count = sum(
        _leading_count(elem.text)
        for elem in soup.find_all('li', class_='poptip', string=lambda s: s and 'Def. POY' in s)
    )

    #All players are in the Hall of Fame
    hof = 1

    return {
        'Name': player_name,
        'Position': position,
        'Games': games,
        'Career Length': career_length,
        'PPG': ppg,
        'RPG': rpg,
        'APG': apg,
        'PER': per,
        'FG%': fg_pct,
        'FT%': ft_pct,
        'Win Shares': win_shares,
        'All-Stars': all_stars,
        'All-NBA': all_nba_total,
        'All-Defense': all_defense_count,
        'All-Rookie Team': all_rookie,
        'MVPs': mvp_count,
        'Chips': chips_count,
        'ROY': roy,
        'DPOYs': dpoy_count,
        'Scoring Champ': scoring_champ_count,
        'HOF': hof
    }


#Loop over each player URL to scrape their data
for url in player_urls:
    soup = _fetch_player_page(url)
    if soup is not None:
        try:
            all_players_data.append(_scrape_player(soup))
        except Exception as e:
            #Best effort: report the page that could not be parsed and keep going
            print(f"Error scraping data for {url}: {e}")

    #Pause after every attempt, including failed ones
    time.sleep(1)  #Be polite with requests, avoid overwhelming the server

#Create a DataFrame from the collected data
df_1900s = pd.DataFrame(all_players_data)

In [3]:
#Display all columns (None removes the column limit)
pd.options.display.max_columns = None

In [6]:
#Display the freshly scraped dataframe
df_1900s

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Wilt Chamberlain,C,1045,14,30.1,22.9,4.4,26.2,54.0,51.1,247.3,13,10,2,0,4,2,1,0,7,1
1,Paul Arizin,SF,713,10,22.8,8.6,2.3,19.8,42.1,81.0,108.8,10,4,0,0,0,1,1,0,2,1
2,Joe Fulks,PF,489,8,16.4,5.3,1.2,10.9,30.2,76.6,29.2,2,4,0,0,0,1,0,0,1,1
3,Cliff Hagan,SF,839,13,17.7,6.6,3.2,19.8,45.4,79.9,85.3,6,2,0,0,0,1,0,0,0,1
4,Jim Pollard,SF,438,7,13.2,7.8,3.2,15.6,36.0,75.0,34.9,4,4,0,0,0,5,0,0,0,1
5,Elgin Baylor,SF,846,14,27.4,13.5,4.3,22.7,43.1,78.0,104.2,11,10,0,0,0,0,1,0,0,1
6,Tom Gola,SG,698,10,11.3,8.0,4.2,14.2,43.1,76.0,53.2,5,1,0,0,0,1,0,0,0,1
7,Bill Sharman,SG,711,11,17.8,3.9,3.0,18.3,42.6,88.3,82.8,8,7,0,0,0,4,0,0,0,1
8,Bill Russell,C,963,13,15.1,22.5,4.3,18.9,44.0,56.1,163.5,12,11,1,0,5,11,0,0,0,1
9,Dolph Schayes,PF,996,15,18.5,12.1,3.1,22.1,38.0,84.9,142.4,12,12,0,0,0,1,0,0,0,1


In [23]:
#Load the HOF player CSV files for the 1990s and 1980s classes;
#the first column of each file is used as the row index
df_1990s = pd.read_csv(
    '1990s HOF Players.csv',
    index_col=0,
)
df_1980s = pd.read_csv(
    '1980s HOF players.csv',
    index_col=0,
)

In [24]:
#Preview the first rows of the 1990s dataframe
df_1990s.head()

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Kevin McHale,PF,971,13,17.9,7.3,1.7,20.0,55.4,79.8,113.0,7,1,6,1,0,3,0,0,0,1
1,Larry Bird,SF,897,13,24.3,10.0,6.3,23.5,49.6,88.6,145.8,12,10,3,1,3,3,1,0,0,1
2,Arnie Risen,C,637,10,12.0,9.7,1.7,16.7,38.1,69.9,56.0,4,1,0,0,0,2,0,0,0,1
3,Alex English,SF,1193,15,21.5,5.5,3.6,19.9,50.7,83.2,100.7,8,3,0,0,0,0,0,0,1,1
4,Bailey Howell,PF,950,12,18.7,9.9,2.0,19.1,48.0,76.2,114.8,6,1,0,0,0,2,0,0,0,1


In [25]:
#Preview the first rows of the 1980s dataframe
df_1980s.head()

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,K.C. Jones,PG,676,9,7.4,3.5,4.3,10.4,38.7,64.7,38.6,0,0,0,0,0,8,0,0,0,1
1,Lenny Wilkens,PG,1077,15,16.5,4.7,6.7,16.8,43.2,77.4,95.5,9,0,0,0,0,0,0,0,0,1
2,Clyde Lovellette,C,704,11,17.0,9.5,1.6,21.7,44.3,75.7,70.6,4,1,0,0,0,3,0,0,0,1
3,Wes Unseld,C,984,13,10.8,14.0,3.9,16.0,50.9,63.3,110.1,5,1,0,1,1,1,1,0,0,1
4,Rick Barry,SF,1020,14,24.8,6.7,4.9,21.0,45.6,89.3,128.9,12,10,0,1,0,1,1,0,1,1


In [30]:
#Stack the three dataframes in order (1990s, 1980s, then the scraped set)
#and renumber the rows from 0
df = pd.concat(
    [df_1990s, df_1980s, df_1900s],
    ignore_index=True,
)

In [31]:
#Inspect the combined dataframe
df

Unnamed: 0,Name,Position,Games,Career Length,PPG,RPG,APG,PER,FG%,FT%,Win Shares,All-Stars,All-NBA,All-Defense,All-Rookie Team,MVPs,Chips,ROY,DPOYs,Scoring Champ,HOF
0,Kevin McHale,PF,971,13,17.9,7.3,1.7,20.0,55.4,79.8,113.0,7,1,6,1,0,3,0,0,0,1
1,Larry Bird,SF,897,13,24.3,10.0,6.3,23.5,49.6,88.6,145.8,12,10,3,1,3,3,1,0,0,1
2,Arnie Risen,C,637,10,12.0,9.7,1.7,16.7,38.1,69.9,56.0,4,1,0,0,0,2,0,0,0,1
3,Alex English,SF,1193,15,21.5,5.5,3.6,19.9,50.7,83.2,100.7,8,3,0,0,0,0,0,0,1,1
4,Bailey Howell,PF,950,12,18.7,9.9,2.0,19.1,48.0,76.2,114.8,6,1,0,0,0,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Bob Pettit,PF,792,11,26.4,16.2,3.0,25.4,43.6,76.1,136.0,11,11,0,0,2,1,1,0,2,1
63,Bob Davies,PG,462,7,14.3,2.9,4.9,18.1,37.8,75.9,49.7,4,5,0,0,0,1,0,0,0,1
64,Andy Phillip,PG,701,11,9.1,4.4,5.4,14.1,36.8,69.5,60.5,5,2,0,0,0,1,0,0,0,1
65,Ed Macauley,C,641,10,17.5,7.5,3.2,20.5,43.6,76.1,100.4,7,4,0,0,0,1,0,0,0,1


In [32]:
#Save the combined dataframe to a csv file without the row index column
#NOTE(review): the input CSVs above are read with index_col=0, so they appear
#to have been saved with an index; confirm that readers of this file expect none
df.to_csv(
    '1900s HOF Players.csv',
    index=False,
)