In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# StringIO is needed so the table HTML can be handed to pandas as a file-like
# object; passing a literal HTML string to read_html is deprecated.
from io import StringIO

# Set up a headless Chrome session.
options = Options()
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("user-agent=Mozilla/5.0")

driver = webdriver.Chrome(options=options)

# Everything that needs the live browser happens inside try/finally so the
# Chrome process is always shut down, even if navigation or rendering fails.
try:
    # Go to the page
    url = "https://www.baseball-reference.com/leagues/majors/2025-standard-batting.shtml"
    driver.get(url)

    # Fixed pause to give the page's JavaScript time to finish rendering.
    time.sleep(5)

    # Snapshot the rendered DOM; the browser is not needed after this point.
    html = driver.page_source
finally:
    driver.quit()

# Parse the rendered page and look the batting table up by its element id.
soup = BeautifulSoup(html, "html.parser")
table = soup.find('table', {'id': 'players_standard_batting'})

# Fail loudly if the table is missing instead of continuing with None.
if table is None:
    print("❌ Table not found after JS load")
    raise ValueError("Table still not found after waiting.")

# Read table into pandas (read_html returns a list of frames; take the first).
df = pd.read_html(StringIO(str(table)))[0]

# Drop rows whose 'Rk' cell is the literal header text 'Rk' (repeated headers).
df = df[df['Rk'] != 'Rk'].reset_index(drop=True)

print("✅ DataFrame preview:")
print(df.head())


  df = pd.read_html(str(table))[0]


✅ DataFrame preview:
  Rk           Player Age Team  Lg  WAR   G   PA   AB   R  ...  rOBA Rbat+  \
0  1        Ian Happ#  30  CHC  NL  1.1  36  173  150  29  ...  .355   128   
1  2    Jarren Duran*  28  BOS  AL  0.5  35  168  155  19  ...  .335   106   
2  3     Kyle Tucker*  28  CHC  NL  1.5  36  168  141  30  ...  .418   163   
3  4  Corbin Carroll*  24  ARI  NL  1.4  35  165  150  28  ...  .404   155   
4  5   Rafael Devers*  28  BOS  AL  0.8  36  165  136  22  ...  .365   133   

   TB GIDP HBP SH SF IBB    Pos Awards  
0  61    3   0  0  2   0     *7    NaN  
1  65    1   2  0  1   0  *7/89    NaN  
2  79    3   0  0  1   3   *9/D    NaN  
3  89    1   4  0  0   1   *9/H    NaN  
4  60    2   1  0  2   2     *D    NaN  

[5 rows x 34 columns]


In [2]:
import unicodedata


In [3]:
# Keep only the two columns needed for the player -> team lookup. The explicit
# .copy() makes df2 an independent frame, so the later in-place column
# assignments on df2 do not trigger pandas' SettingWithCopyWarning.
df2 = df[['Player', 'Team']].copy()

In [None]:
# Step 1: Drop rows whose Team value is a multi-team marker ('2TM', '3TM', ...)
# rather than a single club. df2 was built from the 'Player' and 'Team' columns,
# so the lookups must use that exact capitalisation; lower-case 'team' /
# 'player' raise KeyError.
multi_team_flags = ['2TM', '3TM', '4TM', '5TM']
filtered_df = df2[~df2['Team'].isin(multi_team_flags)].copy()


# Step 2: Keep only the LAST team listed for each player (i.e., current team)
# NOTE(review): this assumes the source table lists a player's teams in
# chronological order - confirm against the site before relying on it.
df2 = filtered_df.drop_duplicates(subset='Player', keep='last').reset_index(drop=True)

In [5]:
def clean_name_symbols(name):
    """Return ``name`` with every '*' and '#' character removed.

    Anything that is not a ``str`` (for example a NaN float or ``None``) is
    handed back unchanged.
    """
    if isinstance(name, str):
        # A deletion-only translation table strips both marker characters
        # in a single pass.
        return name.translate(str.maketrans('', '', '*#'))
    return name
# Strip the '*' / '#' markers from every entry of the Player column.
df2['Player'] = df2['Player'].map(clean_name_symbols)

In [6]:
def normalize_name(name):
    """Return ``name`` reduced to plain ASCII, dropping accents.

    The string is NFKD-decomposed so accented letters split into a base
    letter plus combining marks; every non-ASCII code point is then
    discarded. Non-string values are returned untouched.
    """
    if isinstance(name, str):
        decomposed = unicodedata.normalize('NFKD', name)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8')
    return name
# Reduce every entry of the Player column to its accent-free ASCII form.
df2['Player'] = df2['Player'].map(normalize_name)


In [7]:
# Show the distinct player names so the cleaning steps can be checked by eye.
pd.unique(df2['Player'])

array(['Ian Happ', 'Jarren Duran', 'Kyle Tucker', 'Corbin Carroll',
       'Rafael Devers', 'Lars Nootbaar', 'Francisco Lindor',
       'Pete Alonso', 'Juan Soto', 'Alex Bregman', 'Bryan Reynolds',
       'Brent Rooker', 'Willy Adames', 'Xavier Edwards',
       'Jackson Chourio', 'Elly De La Cruz', 'Aaron Judge',
       'Bobby Witt Jr.', 'Geraldo Perdomo', 'Julio Rodriguez',
       'Bo Bichette', 'James Wood', 'Bryce Harper', 'Shohei Ohtani',
       'Cal Raleigh', 'Dansby Swanson', 'Yandy Diaz', 'TJ Friedl',
       'Lawrence Butler', 'Matt Chapman', 'Willson Contreras',
       'Kyle Schwarber', 'Tyler Soderstrom', 'Pete Crow-Armstrong',
       'Austin Riley', 'Heliot Ramos', 'Spencer Torkelson',
       'Vladimir Guerrero Jr.', 'Matt Olson', 'Ozzie Albies',
       'Steven Kwan', 'Josh Naylor', 'Vinnie Pasquantino',
       'Brendan Donovan', 'Paul Goldschmidt', 'Anthony Santander',
       'Brice Turang', 'Jose Altuve', 'JJ Bleday', 'Jung Hoo Lee',
       'Nathaniel Lowe', 'Christian Yeli

In [8]:
# Write the player/team table to disk; the path is relative to the notebook's
# working directory, and the frame's index is written as the first column
# (to_csv default).
df2.to_csv(path_or_buf='../batter_record_1_hit/stats/batter_team_2025.csv')