In [7]:
import pandas as pd
import requests
import string
from time import sleep
from bs4 import BeautifulSoup

In [3]:
def scrape_players_by_letter(letter):
    base_url = f"https://www.basketball-reference.com/players/{letter}/"
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extracting player information
    tag1 = soup.find_all('th', {'data-stat': 'player'})
    names = [tag.text.strip() for tag in tag1]

    tag2 = soup.find_all('td', {'data-stat': 'year_min'})
    start_year = [tag.text.strip() for tag in tag2]

    tag3 = soup.find_all('td', {'data-stat': 'year_max'})
    end_year = [tag.text.strip() for tag in tag3]

    tag4 = soup.find_all('td', {'data-stat': 'pos'})
    position = [tag.text.strip() for tag in tag4]

    tag5 = soup.find_all('td', {'data-stat': 'height'})
    height = [tag.text.strip() for tag in tag5]

    tag6 = soup.find_all('td', {'data-stat': 'weight'})
    weight = [tag.text.strip() for tag in tag6]

    tag7 = soup.find_all('td', {'data-stat': 'birth_date'})
    birth_day = [tag.text.strip() for tag in tag7]

    tag8 = soup.find_all('td', {'data-stat': 'colleges'})
    colleges = [tag.text.strip() for tag in tag8]

    tag9 = soup.select("#players > tbody > tr > td:nth-child(8) > a ")
    college_links = ["https://www.basketball-reference.com" + tag['href'] for tag in tag9]

    # Ensure all lists have the same length
    min_length = min(len(names), len(start_year), len(end_year), len(position), len(height), len(weight), len(birth_day), len(colleges), len(college_links))
    names = names[:min_length]
    start_year = start_year[:min_length]
    end_year = end_year[:min_length]
    position = position[:min_length]
    height = height[:min_length]
    weight = weight[:min_length]
    birth_day = birth_day[:min_length]
    colleges = colleges[:min_length]
    college_links = college_links[:min_length]

    df = pd.DataFrame({
        'Player': names,
        'Start Year': start_year,
        'End Year': end_year,
        'Position': position,
        'Height': height,
        'Weight': weight,
        'Birthday': birth_day,
        'Colleges': colleges,
        'College Link': college_links
    })

    return df


In [8]:
# get lowercase alphabets using string module
alph = list(string.ascii_lowercase)

# Iterate over all letters from 'a' to 'z'
all_dataframes = []

for letter in alph:
    sleep(1)
    df = scrape_players_by_letter(letter)
    all_dataframes.append(df)

result_df = pd.concat(all_dataframes, ignore_index=True)

In [11]:
result_df.head()

Unnamed: 0,Player,Start Year,End Year,Position,Height,Weight,Birthday,Colleges,College Link
0,Player,1991,1995,F-C,6-10,240,"June 24, 1968",Duke,https://www.basketball-reference.com/friv/coll...
1,Alaa Abdelnaby,1969,1978,C-F,6-9,235,"April 7, 1946",Iowa State,https://www.basketball-reference.com/friv/coll...
2,Zaid Abdul-Aziz,1970,1989,C,7-2,225,"April 16, 1947",UCLA,https://www.basketball-reference.com/friv/coll...
3,Kareem Abdul-Jabbar*,1991,2001,G,6-1,162,"March 9, 1969",LSU,https://www.basketball-reference.com/friv/coll...
4,Mahmoud Abdul-Rauf,1998,2003,F,6-6,223,"November 3, 1974","Michigan, San Jose State",https://www.basketball-reference.com/friv/coll...


In [12]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5137 entries, 0 to 5136
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Player        5137 non-null   object
 1   Start Year    5137 non-null   object
 2   End Year      5137 non-null   object
 3   Position      5137 non-null   object
 4   Height        5137 non-null   object
 5   Weight        5137 non-null   object
 6   Birthday      5137 non-null   object
 7   Colleges      5137 non-null   object
 8   College Link  5137 non-null   object
dtypes: object(9)
memory usage: 361.3+ KB


In [13]:
result_df.to_csv('nba-players.csv')