In [15]:
# Get the NBA standings data for 2003 to 2024 and add Year, Conference, and Division columns
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to parse the table and convert to DataFrame
def parse_table(table, year, conference):
    """Turn one standings table into a DataFrame.

    Column names are read from the ``th`` cells of the last row of the
    table's ``thead``; the first one is renamed to ``'Team'``. Body rows
    whose ``class`` attribute contains ``'thead'`` are treated as division
    headings: their text is remembered and attached, as a ``Division``
    column, to every data row that follows until the next heading.

    Args:
        table: Parsed table element exposing ``find`` / ``find_all``
            (a BeautifulSoup tag in this script).
        year: Value stored in the ``Year`` column of every row.
        conference: Value stored in the ``Conference`` column of every row.

    Returns:
        A DataFrame with the table's columns followed by ``Division``,
        ``Year`` and ``Conference``. Cell values are the stripped cell text.
    """
    # Column names come from the last header row only
    last_header_row = table.find('thead').find_all('tr')[-1]
    columns = [th.getText() for th in last_header_row.find_all('th')]
    columns[0] = 'Team'

    records = []
    division = None  # No division heading seen yet
    for tr in table.find('tbody').find_all('tr'):
        # A row flagged with the 'thead' class is a division heading, not data
        if 'thead' in tr.attrs.get('class', ()):
            division = tr.get_text(strip=True)
            continue

        values = [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
        records.append(values + [division])

    columns.append('Division')
    standings = pd.DataFrame(records, columns=columns)
    return standings.assign(Year=year, Conference=conference)

# Initialize WebDriver
# NOTE(review): the driver path is passed positionally here; newer Selenium
# releases expect a Service object instead — confirm the installed Selenium
# version before upgrading.
driver = webdriver.Chrome(ChromeDriverManager().install())

# Initialize an empty list to hold all the DataFrames
all_standings = []

try:
    # Loop through the years 2003 to 2024
    for year in range(2003, 2025):
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
        driver.get(url)
        # Fixed pause before reading the page source (presumably to let the
        # page finish rendering; it also spaces out the requests)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each conference has its own divisional standings table on the page
        eastern_table = soup.find('table', id='divs_standings_E')
        western_table = soup.find('table', id='divs_standings_W')
        if eastern_table:
            eastern_df = parse_table(eastern_table, year, 'Eastern Conference')
            all_standings.append(eastern_df)
        else:
            print(f"Eastern Conference table not found for {year}.")
        if western_table:
            western_df = parse_table(western_table, year, 'Western Conference')
            all_standings.append(western_df)
        else:
            print(f"Western Conference table not found for {year}.")
finally:
    # Always shut the browser down, even if a page load or parse fails
    # part-way through, so no Chrome process is left running
    driver.quit()

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually went wrong instead
if not all_standings:
    raise ValueError("No standings tables were found for any year; nothing to combine.")

# Combine all the DataFrames into one
final_standings = pd.concat(all_standings, ignore_index=True)
print("Combined NBA Standings from 2003 to 2024:")
print(final_standings)
# Optionally, save the final DataFrame to a CSV file
final_standings.to_csv("NBA_Standings.csv", index=False)


Combined NBA Standings from 2003 to 2024:
                      Team   W   L  W/L%    GB   PS/G   PA/G    SRS  \
0         New Jersey Nets*  49  33  .598     —   95.4   90.1   4.42   
1      Philadelphia 76ers*  48  34  .585   1.0   96.8   94.5   1.76   
2          Boston Celtics*  44  38  .537   5.0   92.7   93.1  -0.75   
3           Orlando Magic*  42  40  .512   7.0   98.5   98.4  -0.39   
4       Washington Wizards  37  45  .451  12.0   91.5   92.5  -1.47   
..                     ...  ..  ..   ...   ...    ...    ...    ...   
653      Dallas Mavericks*  50  32  .610     —  117.9  115.6   2.30   
654  New Orleans Pelicans*  49  33  .598   1.0  115.1  110.7   4.46   
655        Houston Rockets  41  41  .500   9.0  114.3  113.2   1.24   
656      Memphis Grizzlies  27  55  .329  23.0  105.8  112.8  -6.57   
657      San Antonio Spurs  22  60  .268  28.0  112.1  118.6  -5.80   

               Division  Year          Conference  
0     Atlantic Division  2003  Eastern Conference  
1