In [1]:
import time
import numpy as np
import pandas as pd

# for Web Scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Desktop Chrome user-agent string presented to the site.
my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.7499.170 Safari/537.36"

# Request headers carrying the same user agent (originally noted as "for BeautifulSoup").
headers = {'User-Agent': my_user_agent}

# Headless Chrome session shared by the scraping functions in this notebook.
options = Options()
for chrome_flag in (
    "--headless",
    f"--user-agent={my_user_agent}",
    "--disable-blink-features=AutomationControlled",
):
    options.add_argument(chrome_flag)
browser = webdriver.Chrome(options=options)

#### Get links to team pages

In [3]:
def get_teams_page_links(url):
    """Collect the franchise page URLs linked from a teams index page.

    Opens ``url`` in the module-level ``browser`` and reads the ``href``
    attribute of every anchor found inside a ``th[data-stat="franch_name"]``
    cell.

    Parameters
    ----------
    url : str
        Address of the page to open.

    Returns
    -------
    list
        One ``href`` value per matching anchor, in the order the browser
        returned the elements.
    """
    browser.get(url)
    franchise_anchors = browser.find_elements(By.CSS_SELECTOR, 'th[data-stat="franch_name"] a')
    return [anchor.get_attribute("href") for anchor in franchise_anchors]

In [5]:
# Teams index page to scrape.
link = "https://www.basketball-reference.com/teams/"
# Collect the franchise page URLs from the index (uses the shared browser).
link_team_page = get_teams_page_links(link)
# Show the collected URLs as the cell output.
link_team_page

['https://www.basketball-reference.com/teams/ATL/',
 'https://www.basketball-reference.com/teams/BOS/',
 'https://www.basketball-reference.com/teams/NJN/',
 'https://www.basketball-reference.com/teams/CHA/',
 'https://www.basketball-reference.com/teams/CHI/',
 'https://www.basketball-reference.com/teams/CLE/',
 'https://www.basketball-reference.com/teams/DAL/',
 'https://www.basketball-reference.com/teams/DEN/',
 'https://www.basketball-reference.com/teams/DET/',
 'https://www.basketball-reference.com/teams/GSW/',
 'https://www.basketball-reference.com/teams/HOU/',
 'https://www.basketball-reference.com/teams/IND/',
 'https://www.basketball-reference.com/teams/LAC/',
 'https://www.basketball-reference.com/teams/LAL/',
 'https://www.basketball-reference.com/teams/MEM/',
 'https://www.basketball-reference.com/teams/MIA/',
 'https://www.basketball-reference.com/teams/MIL/',
 'https://www.basketball-reference.com/teams/MIN/',
 'https://www.basketball-reference.com/teams/NOH/',
 'https://ww

In [None]:
# Last active team
last_req_link = "https://www.basketball-reference.com/teams/WAS/"
# Zero-based position of that page in the scraped list (raises ValueError if
# it is absent). The recorded output is 29, which matches the n=30 passed to
# get_teams_player further down.
link_team_page.index(last_req_link)

29

#### Get information about each team's players

In [None]:
def get_teams_player(url, n=None):
    """Scrape the all-time player table of the first ``n`` team pages.

    The team links are collected with ``get_teams_page_links(url)``. For each
    of the first ``n`` links the team page is opened in the shared ``browser``
    to read its ``h1`` heading, then the first table of the team's
    ``players.html`` page is parsed with ``pandas.read_html``.

    Parameters
    ----------
    url : str
        Teams index page handed to ``get_teams_page_links``.
    n : int or None, optional
        Number of leading team links to process. ``None`` (the default)
        processes every link.

    Returns
    -------
    list of tuple
        One tuple per table row: the row's cell values followed by the text
        of the team page's ``h1`` heading.

    Notes
    -----
    The module-level ``browser`` is quit before returning, so a new driver
    has to be created before any further scraping call.
    """
    links_list = get_teams_page_links(url)

    players_list = []
    for link in links_list[:n]:
        browser.get(link)
        print(link)
        heading = WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1"))
        )
        # Read the heading once per team: every ``.text`` access is a
        # separate request to the driver, so it must not sit in the row loop.
        team_name = heading.text

        # The scraped links already end with "/", so strip it to avoid
        # requesting ".../ATL//players.html".
        players_url = link.rstrip("/") + "/players.html"
        players_table = pd.read_html(players_url)[0]

        # Plain tuples of cell values, each extended with the team heading.
        for row in players_table.itertuples(index=False, name=None):
            players_list.append(row + (team_name,))

        # Pause between teams to keep the request rate low.
        time.sleep(2)

    browser.quit()

    return players_list

In [None]:
# ---------- Store team player information ------------
# Scrape the player tables of the first 30 team links from the teams index.
# get_teams_player prints each team URL as it goes and quits the shared
# browser when it finishes.
link = "https://www.basketball-reference.com/teams/"
players_team_info = get_teams_player(link,30)

https://www.basketball-reference.com/teams/OKC/
https://www.basketball-reference.com/teams/ORL/
https://www.basketball-reference.com/teams/PHI/
https://www.basketball-reference.com/teams/PHO/
https://www.basketball-reference.com/teams/POR/
https://www.basketball-reference.com/teams/SAC/
https://www.basketball-reference.com/teams/SAS/
https://www.basketball-reference.com/teams/TOR/
https://www.basketball-reference.com/teams/UTA/
https://www.basketball-reference.com/teams/WAS/


#### Remove incorrect rows and select the columns to keep

In [9]:
# Flat column labels for the scraped rows; 'Team' is the value appended to
# every row by get_teams_player.
col_name_list = [
    'Rk', 'Player', 'From', 'To', 'Yrs', 'G', 'MP', 'FG', 'FGA', '3P', '3PA',
    'FT', 'FTA', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%',
    '3P%', 'FT%', 'MP_PG', 'PTS_PG', 'TRB_PG', 'AST_PG', 'STL_PG', 'BLK_PG', 'Team',
]

# Build the frame, then discard rows whose "G" cell reads 'Totals' or whose
# "Rk" cell reads 'Rk' (presumably header rows repeated inside the table body).
teams_players_df = (
    pd.DataFrame(players_team_info, columns=col_name_list)
    .loc[lambda frame: frame["G"] != 'Totals']
    .loc[lambda frame: frame["Rk"] != 'Rk']
)

# Columns kept for the exported sheet, in output order.
col = ['Team', 'Player', 'From', 'To', 'Yrs', 'G', 'MP', 'PTS_PG', 'PTS', 'FG%', '3P%', 'FT%']
df = teams_players_df.loc[:, col]

In [None]:
# Write the selected player columns to an Excel workbook, without the row index.
# NOTE(review): the assigned name only captures to_excel's return value
# (presumably None) — confirm nothing relies on it.
team_players_excel = df.to_excel("team_players_info.xlsx", index=False)

#### List of the teams

In [17]:
# Teams index page; its first HTML table is the franchise list.
team_url = "https://www.basketball-reference.com/teams/"

# read_html fetches the page itself (no shared browser involved) and returns
# one DataFrame per table it finds.
tables = pd.read_html(team_url)
team_table = tables[0]
# NOTE(review): team_table is presumably already a DataFrame, so this wrap
# only creates a second frame over the same data — confirm it is needed.
team_df = pd.DataFrame(team_table)
# Display the table as the cell output.
team_df

Unnamed: 0,Franchise,Lg,From,To,Yrs,G,W,L,W/L%,Plyfs,Div,Conf,Champ
0,Atlanta Hawks,NBA,1949-50,2025-26,77,6073,2993,3079,0.493,49,12,0,1
1,Atlanta Hawks,NBA,1968-69,2025-26,58,4654,2295,2359,0.493,36,6,0,0
2,St. Louis Hawks,NBA,1955-56,1967-68,13,1005,553,452,0.550,12,6,0,1
3,Milwaukee Hawks,NBA,1951-52,1954-55,4,282,91,190,0.324,0,0,0,0
4,Tri-Cities Blackhawks,NBA,1949-50,1950-51,2,132,54,78,0.409,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,Washington Bullets,NBA,1974-75,1996-97,23,1886,887,999,0.470,13,2,3,1
81,Capital Bullets,NBA,1973-74,1973-74,1,82,47,35,0.573,1,1,0,0
82,Baltimore Bullets,NBA,1963-64,1972-73,10,813,401,412,0.493,7,4,1,0
83,Chicago Zephyrs,NBA,1962-63,1962-63,1,80,25,55,0.313,0,0,0,0


In [18]:
# Remove the games/record/playoff/division columns from the team table.
# errors="ignore" keeps this cell re-runnable: it reassigns team_df from
# itself, so on a second run the columns are already gone and a plain drop
# would raise KeyError. `columns=` already selects the axis, so no axis
# argument is needed.
team_df = team_df.drop(columns=["G","W","L","W/L%","Plyfs","Div"], errors="ignore")
team_df

Unnamed: 0,Franchise,Lg,From,To,Yrs,Conf,Champ
0,Atlanta Hawks,NBA,1949-50,2025-26,77,0,1
1,Atlanta Hawks,NBA,1968-69,2025-26,58,0,0
2,St. Louis Hawks,NBA,1955-56,1967-68,13,0,1
3,Milwaukee Hawks,NBA,1951-52,1954-55,4,0,0
4,Tri-Cities Blackhawks,NBA,1949-50,1950-51,2,0,0
...,...,...,...,...,...,...,...
80,Washington Bullets,NBA,1974-75,1996-97,23,3,1
81,Capital Bullets,NBA,1973-74,1973-74,1,0,0
82,Baltimore Bullets,NBA,1963-64,1972-73,10,1,0
83,Chicago Zephyrs,NBA,1962-63,1962-63,1,0,0


In [25]:
# Save the trimmed team table to an Excel workbook, without the row index.
# NOTE(review): the assigned name only captures to_excel's return value
# (presumably None) — confirm nothing relies on it.
team_excel = team_df.to_excel("team_info.xlsx", index=False)

#### League index

In [None]:
# League index page; its first HTML table lists one row per season.
leage_url = "https://www.basketball-reference.com/leagues/"

tables_leage = pd.read_html(leage_url)
leage_table = tables_leage[0]

# Re-create the table from plain row tuples so it can be given flat,
# single-level column names.
leage_list = [
    tuple(leage_table.iloc[position].values)
    for position in range(len(leage_table))
]

leage_df = pd.DataFrame(
    leage_list,
    columns=[
        'Season', 'Lg', 'Champion', 'MVP', 'Rookie of the Year',
        'Points', 'Rebounds', 'Assists', 'Win Shares',
    ],
)

leage_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Award Winners,Award Winners,Top Performers,Top Performers,Top Performers,Top Performers
Unnamed: 0_level_1,Season,Lg,Champion,MVP,Rookie of the Year,Points,Rebounds,Assists,Win Shares
0,2025-26,NBA,,,,S. Gilgeous-Alexander (1558),K. Towns (572),C. Cunningham (429),S. Gilgeous-Alexander (11.4)
1,2024-25,NBA,Oklahoma City Thunder,S. Gilgeous-Alexander,S. Castle,S. Gilgeous-Alexander (2484),I. Zubac (1010),T. Young (880),S. Gilgeous-Alexander (16.7)
2,2023-24,NBA,Boston Celtics,N. Jokić,V. Wembanyama,L. Dončić (2370),D. Sabonis (1120),T. Haliburton (752),N. Jokić (17.0)
3,2022-23,NBA,Denver Nuggets,J. Embiid,P. Banchero,J. Tatum (2225),D. Sabonis (973),T. Young (741),N. Jokić (14.9)
4,2021-22,NBA,Golden State Warriors,N. Jokić,S. Barnes,T. Young (2155),N. Jokić (1019),T. Young (737),N. Jokić (15.2)
...,...,...,...,...,...,...,...,...,...
84,1950-51,NBA,Rochester Royals,,P. Arizin,G. Mikan (1932),D. Schayes (1080),A. Phillip (414),G. Mikan (23.4)
85,1949-50,NBA,Minneapolis Lakers,,A. Groza,G. Mikan (1865),,D. McGuire (386),G. Mikan (21.1)
86,1948-49,BAA,Minneapolis Lakers,,H. Shannon,G. Mikan (1698),,B. Davies (321),G. Mikan (20.9)
87,1947-48,BAA,Baltimore Bullets,,P. Hoffman,M. Zaslofsky (1007),,H. Dallmar (120),B. Feerick (10.5)


In [23]:
# Save the league table to an Excel workbook, without the row index.
# NOTE(review): the assigned name only captures to_excel's return value
# (presumably None) — confirm nothing relies on it.
leage_excel = leage_df.to_excel("leage_info.xlsx", index=False)

## merge

In [32]:
# Attach league-season details to every team row whose name equals that
# season's champion; team rows with no matching champion keep empty league columns.
# NOTE(review): the join key is the team name only. The recorded output pairs
# the 1963-64..1972-73 "Baltimore Bullets" row with the 1947-48 champion of
# the same name, and the result grows from 84 to 173 rows because a team row
# repeats once per title — confirm whether Season should also be required to
# fall between From and To.
df_merged = pd.merge(
    team_df,
    leage_df,
    how="left",
    left_on="Franchise",
    right_on="Champion",
)
df_merged

Unnamed: 0,Franchise,Lg_x,From,To,Yrs,Conf,Champ,Season,Lg_y,Champion,MVP,Rookie of the Year,Points,Rebounds,Assists,Win Shares
0,Atlanta Hawks,NBA,1949-50,2025-26,77,0,1,,,,,,,,,
1,Atlanta Hawks,NBA,1968-69,2025-26,58,0,0,,,,,,,,,
2,St. Louis Hawks,NBA,1955-56,1967-68,13,0,1,1957-58,NBA,St. Louis Hawks,B. Russell,W. Sauldsberry,G. Yardley (2001),B. Russell (1564),B. Cousy (463),D. Schayes (13.7)
3,Milwaukee Hawks,NBA,1951-52,1954-55,4,0,0,,,,,,,,,
4,Tri-Cities Blackhawks,NBA,1949-50,1950-51,2,0,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Washington Bullets,NBA,1974-75,1996-97,23,3,1,1977-78,NBA,Washington Bullets,B. Walton,W. Davis,G. Gervin (2232),T. Robinson (1288),K. Porter (837),D. Thompson (12.7)
170,Capital Bullets,NBA,1973-74,1973-74,1,0,0,,,,,,,,,
171,Baltimore Bullets,NBA,1963-64,1972-73,10,1,0,1947-48,BAA,Baltimore Bullets,,P. Hoffman,M. Zaslofsky (1007),,H. Dallmar (120),B. Feerick (10.5)
172,Chicago Zephyrs,NBA,1962-63,1962-63,1,0,0,,,,,,,,,


In [33]:
# "Champion" duplicates "Franchise" on every matched row, so drop it before export.
# errors="ignore" keeps this cell re-runnable: it reassigns df_merged from
# itself, so on a second run the column is already gone and a plain drop
# would raise KeyError.
df_merged = df_merged.drop(columns=["Champion"], errors="ignore")
# Write the merged table to an Excel workbook, without the row index.
team_info_excel = df_merged.to_excel("Teams_info.xlsx", index=False)