In [None]:
from selenium import webdriver                                            
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import random
from bs4 import BeautifulSoup

# Season schedule page; every finished game on it carries a "Boxscore" link.
url = 'https://www.baseball-reference.com/leagues/MLB-schedule.shtml'

# Folder the raw box-score pages are written to. It has to exist as a name
# before os.makedirs() and scrape_box_scores() use it; the value is kept
# identical to `dir_path` in the parsing cell so the parser finds the pages.
save_dir = 'Box_Scores_New'
os.makedirs(save_dir, exist_ok=True)

# A single headless Chrome session is shared by every page fetch in this cell.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

def save_fully_loaded_html(box_url, save_path):
    """Open ``box_url`` in the shared browser and write its HTML to ``save_path``.

    The page gets up to 5 seconds (polled every 0.1 s) for an element with id
    ``content`` to appear. A failed wait is only reported as a warning; the
    page source the browser holds at that point is written regardless.
    """
    driver.get(box_url)
    try:
        waiter = WebDriverWait(driver, 5, poll_frequency=0.1)
        waiter.until(EC.presence_of_element_located((By.ID, "content")))
    except Exception as wait_error:
        print(f"Warning: {wait_error}")
    with open(save_path, 'w', encoding='utf-8') as html_file:
        html_file.write(driver.page_source)

def get_box_score_links(main_url):
    """Return absolute URLs for every "Boxscore" anchor found on ``main_url``.

    The page is loaded in the shared browser (waiting up to 5 seconds for the
    element with id ``content``; a timeout propagates to the caller). Only
    anchors whose text is exactly ``Boxscore`` and whose href starts with
    ``/boxes/`` are kept, in document order.
    """
    driver.get(main_url)
    WebDriverWait(driver, 5, poll_frequency=0.1).until(
        EC.presence_of_element_located((By.ID, "content"))
    )
    schedule = BeautifulSoup(driver.page_source, 'html.parser')
    hrefs = (anchor.get('href') for anchor in schedule.find_all('a', string='Boxscore'))
    return [
        'https://www.baseball-reference.com' + href
        for href in hrefs
        if href and href.startswith('/boxes/')
    ]

def scrape_box_scores(box_score_links):
    """Download each URL in ``box_score_links`` into ``save_dir``.

    The file name is the last path segment of the URL plus ``.html``. Files
    that already exist are skipped. A failed download is retried without limit,
    sleeping 300 seconds before the first retry and doubling the pause after
    every further failure. After a successful save the loop pauses for a random
    1-2 seconds before moving on.
    """
    total_links = len(box_score_links)
    print(f"Total box scores to scrape: {total_links}")

    for position, link in enumerate(box_score_links, start=1):
        file_name = link.split('/')[-1] + '.html'
        save_path = os.path.join(save_dir, file_name)

        if os.path.exists(save_path):
            print(f"{position}/{total_links}: {file_name} already exists, skipping.")
            continue

        backoff_seconds = 300
        saved = False
        while not saved:
            try:
                save_fully_loaded_html(link, save_path)
                print(f"{position}/{total_links}: Saved {file_name}")
                saved = True
                # Short randomized pause between successful requests.
                time.sleep(random.uniform(1, 2))
            except Exception as err:
                print(f"Error saving {file_name}: {err}")
                print(f"Retrying in {backoff_seconds // 60} minutes...")
                time.sleep(backoff_seconds)
                backoff_seconds *= 2

# Collect the box-score links and download them. The browser is shut down in
# `finally` so that an error or an interrupt during scraping does not leave a
# headless Chrome process running.
try:
    box_score_links = get_box_score_links(url)
    scrape_box_scores(box_score_links)
finally:
    driver.quit()

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import re
import numpy as np
import datetime

# Folder with the downloaded pages; hidden files and non-.html entries are ignored.
dir_path = 'Box_Scores_New'
box_scores = [
    entry for entry in os.listdir(dir_path)
    if entry.endswith('.html') and not entry.startswith('.')
]
file_paths = [os.path.join(dir_path, entry) for entry in box_scores]

# Resume from an earlier run when the consolidated CSV exists: pages whose
# file name already appears in its 'Source_File' column are not parsed again.
consolidated_file_path = 'New_Box_Scores.csv'
if not os.path.exists(consolidated_file_path):
    consolidated_df = pd.DataFrame()
    processed_files = set()
else:
    consolidated_df = pd.read_csv(consolidated_file_path)
    processed_files = set(consolidated_df['Source_File'].unique())

def parse_html(box_score):
    """Read the UTF-8 file at path ``box_score`` and return it parsed with 'html.parser'."""
    with open(box_score, 'r', encoding='utf-8') as page:
        markup = page.read()
    return BeautifulSoup(markup, 'html.parser')

def read_line_score(soup):
    """Extract the per-team totals from the page's ``linescore`` table.

    Returns a DataFrame with columns ``Team``, ``TR``, ``TH`` and ``TE`` taken
    from the table's second column and its last three columns. Only rows whose
    last three cells are all digit strings are kept. Returns ``None`` when the
    table is missing or cannot be read.
    """
    table_tag = soup.find("table", {"class": "linescore"})
    if not table_tag:
        return None

    parsed = pd.read_html(StringIO(str(table_tag)))
    if not parsed:
        return None

    def has_numeric_totals(row):
        # Rows without digit-only values in the last three cells are dropped.
        return all(str(row.iloc[offset]).isdigit() for offset in (-3, -2, -1))

    raw_table = parsed[0]
    totals = raw_table[raw_table.apply(has_numeric_totals, axis=1)]
    totals = totals.iloc[:, [1, -3, -2, -1]]
    totals.columns = ["Team", "TR", "TH", "TE"]
    return totals

def extract_position(df, name_column='Name'):
    """Split "<player name> <positions>" strings into ``Name`` and ``Position``.

    ``df[name_column]`` is matched against "<anything> <POS>[-<POS>...]" where
    every POS is one of the known position codes. Any number of hyphen-joined
    positions is accepted, so a player who appeared at four or more positions
    keeps both name and position string.

    The frame is modified in place (columns ``Name`` and ``Position`` are
    written) and also returned. Rows that do not match the pattern, such as a
    totals line, get NaN in both columns.
    """
    positions = {'C', '1B', '2B', 'SS', '3B', 'LF', 'RF', 'CF', 'P', 'DH', 'PR', 'PH'}
    # Sorted (longest first) so the generated pattern is the same on every run
    # instead of depending on set iteration order.
    ordered_positions = sorted(positions, key=lambda code: (-len(code), code))
    one_position = '(?:{})'.format('|'.join(ordered_positions))
    pattern = r'^(.*)\s+({pos}(?:-{pos})*)$'.format(pos=one_position)

    df[['Name', 'Position']] = df[name_column].str.extract(pattern, expand=True)
    df['Position'] = df['Position'].replace('Unknown', np.nan)
    return df

def extract_detail_columns(df, column_name):
    """Turn a comma-separated details column into one count column per detail.

    Every distinct, whitespace-stripped item found in ``df[column_name]``
    becomes a column initialised to 0 on ``df`` itself. Each row then counts
    how often each item occurs in its own details string; rows with a missing
    details value are left at 0. The returned frame no longer contains
    ``column_name``.
    """
    detail_names = {
        piece.strip()
        for cell in df[column_name].dropna()
        for piece in cell.split(',')
    }
    for name in detail_names:
        df[name] = 0

    def tally_row(row):
        cell = row[column_name]
        if pd.isna(cell):
            return row
        for piece in cell.split(','):
            label = piece.strip()
            if label in df.columns:
                row[label] += 1
        return row

    expanded = df.apply(tally_row, axis=1)
    expanded.drop(columns=[column_name], inplace=True)
    return expanded

def read_batting_away(soup, away_team):
    """Read the away team's batting table into a cleaned DataFrame.

    Uses the table inside the first ``<div>`` whose id ends with ``batting``.
    Rows without a value in ``Batting`` are dropped, the column is renamed to
    ``Name`` and split into name/position, and the ``Details`` column is
    expanded into count columns. The last remaining row is relabelled
    "<away_team> Batting Totals" (presumably the table's totals line).

    Returns ``None`` when no matching div or table is found.
    """
    batting_divs = soup.find_all("div", {"id": lambda x: x and x.endswith("batting")})
    if not batting_divs:
        return None
    batting_table = batting_divs[0].find("table")
    if not batting_table:
        return None
    tables = pd.read_html(StringIO(str(batting_table)))
    if not tables:
        return None

    batting = tables[0].dropna(subset=['Batting'])
    batting_cleaned = batting.rename(columns={'Batting': 'Name'})
    batting_cleaned = extract_position(batting_cleaned, 'Name')
    batting_cleaned = extract_detail_columns(batting_cleaned, 'Details')
    # Guard against an empty frame instead of indexing its last label blindly.
    if not batting_cleaned.empty:
        batting_cleaned.loc[batting_cleaned.index[-1], 'Name'] = f'{away_team} Batting Totals'
    return batting_cleaned

def read_batting_home(soup, home_team, line_score):
    """Read the home team's batting table into a cleaned DataFrame.

    Divs are matched by an id ending in ``<home_team without spaces>batting``;
    the table inside the second such div is used, so at least two matches are
    required. Cleaning is the same as for the away side: rows without a
    ``Batting`` value are dropped, the column becomes ``Name`` and is split
    into name/position, ``Details`` is expanded into count columns, and the
    last remaining row is relabelled "<home_team> Batting Totals".

    ``line_score`` is accepted for call compatibility and is not used.
    Returns ``None`` when fewer than two divs match or no table is found.
    """
    id_suffix = (home_team + 'batting').replace(" ", "")
    batting_divs = soup.find_all("div", {"id": lambda x: x and x.endswith(id_suffix)})
    if len(batting_divs) <= 1:
        return None
    batting_table = batting_divs[1].find("table")
    if not batting_table:
        return None
    tables = pd.read_html(StringIO(str(batting_table)))
    if not tables:
        return None

    batting = tables[0].dropna(subset=['Batting'])
    batting_cleaned = batting.rename(columns={'Batting': 'Name'})
    batting_cleaned = extract_position(batting_cleaned, 'Name')
    batting_cleaned = extract_detail_columns(batting_cleaned, 'Details')
    # Guard against an empty frame instead of indexing its last label blindly.
    if not batting_cleaned.empty:
        batting_cleaned.loc[batting_cleaned.index[-1], 'Name'] = f'{home_team} Batting Totals'
    return batting_cleaned


def read_pitching_away(soup, away_team):
    """Read the away team's pitching table into a cleaned DataFrame.

    Uses the table inside the first ``<div>`` whose id ends with ``pitching``.
    Rows without a ``Pitching`` value are dropped and everything after the
    first comma in that column is discarded. Stat columns that share a name
    with batting stats are renamed with an ``-A`` suffix (``cWPA`` becomes
    ``CWPA-A``), ``Position`` is set to ``'P'`` for every row, and the last
    row is relabelled "<away_team> Pitching Totals".

    Returns ``None`` when no matching div or table is found.
    """
    pitching_divs = soup.find_all("div", {"id": lambda x: x and x.endswith("pitching")})
    if not pitching_divs:
        return None
    pitching_table = pitching_divs[0].find("table")
    if not pitching_table:
        return None
    tables = pd.read_html(StringIO(str(pitching_table)))
    if not tables:
        return None

    # .copy() so the column assignment below acts on an independent frame
    # rather than on a possible view of the parsed table.
    pitching_cleaned = tables[0].dropna(subset=['Pitching']).copy()
    pitching_cleaned['Pitching'] = pitching_cleaned['Pitching'].apply(lambda x: x.split(',')[0])
    pitching_cleaned2 = pitching_cleaned.rename(columns={'Pitching': 'Name', 'R': 'R-A', 'H': 'H-A', 'BB': 'BB-A', 'SO': 'SO-A', 'Str': 'Str-A', 'WPA': 'WPA-A', 'cWPA': 'CWPA-A', 'acLI': 'acLI-A', 'RE24': 'RE24-A', 'aLI': 'aLI-A', 'HR': 'HR-A', 'Pit': 'Pit-A'})
    pitching_cleaned2['Position'] = 'P'
    # Guard against an empty frame instead of indexing its last label blindly.
    if not pitching_cleaned2.empty:
        pitching_cleaned2.loc[pitching_cleaned2.index[-1], 'Name'] = f'{away_team} Pitching Totals'
    return pitching_cleaned2

def read_pitching_home(soup, home_team, line_score):
    """Read the home team's pitching table into a cleaned DataFrame.

    Divs are matched by an id ending in ``<home_team without spaces>pitching``;
    the table inside the second such div is used, so at least two matches are
    required. Rows without a ``Pitching`` value are dropped and everything
    after the first comma in that column is discarded. Stat columns that share
    a name with batting stats are renamed with an ``-A`` suffix (``cWPA``
    becomes ``CWPA-A``), ``Position`` is set to ``'P'`` for every row, and the
    last row is relabelled "<home_team> Pitching Totals".

    ``line_score`` is accepted for call compatibility and is not used.
    Returns ``None`` when fewer than two divs match or no table is found.
    """
    id_suffix = (home_team + 'pitching').replace(" ", "")
    pitching_divs = soup.find_all("div", {"id": lambda x: x and x.endswith(id_suffix)})
    if len(pitching_divs) <= 1:
        return None
    pitching_table = pitching_divs[1].find("table")
    if not pitching_table:
        return None
    tables = pd.read_html(StringIO(str(pitching_table)))
    if not tables:
        return None

    # .copy() so the column assignment below acts on an independent frame
    # rather than on a possible view of the parsed table.
    pitching_cleaned = tables[0].dropna(subset=['Pitching']).copy()
    pitching_cleaned['Pitching'] = pitching_cleaned['Pitching'].apply(lambda x: x.split(',')[0])
    pitching_cleaned2 = pitching_cleaned.rename({'Pitching': 'Name', 'R': 'R-A', 'H': 'H-A', 'BB': 'BB-A', 'SO': 'SO-A', 'Str': 'Str-A', 'WPA': 'WPA-A', 'cWPA': 'CWPA-A', 'acLI': 'acLI-A', 'RE24': 'RE24-A', 'aLI': 'aLI-A', 'HR': 'HR-A', 'Pit': 'Pit-A'}, axis=1)
    pitching_cleaned2['Position'] = 'P'
    # Guard against an empty frame instead of indexing its last label blindly.
    if not pitching_cleaned2.empty:
        pitching_cleaned2.loc[pitching_cleaned2.index[-1], 'Name'] = f'{home_team} Pitching Totals'
    return pitching_cleaned2
    
def read_game_info(soup):
    """Collect the game's meta data from the ``.scorebox_meta`` block.

    The six values are read purely by position: the n-th ``<div>`` (by
    ``nth-of-type``) inside ``.scorebox_meta`` supplies the n-th field below.
    For the labelled fields the "<label>: " prefix is removed from the text.

    Returns a one-row DataFrame with the columns ``Date``, ``Start Time``,
    ``Attendance``, ``Venue``, ``Duration`` and ``Conditions``.

    NOTE(review): a page with fewer divs makes ``select_one`` return no element
    and the ``get_text`` call fail; a page with a missing field in the middle
    would presumably shift later values into the wrong column - confirm against
    real pages.
    """
    # (output column, 1-based div position, label prefix to strip or None)
    fields = (
        ('Date', 1, None),
        ('Start Time', 2, "Start Time: "),
        ('Attendance', 3, "Attendance: "),
        ('Venue', 4, "Venue: "),
        ('Duration', 5, "Duration: "),
        ('Conditions', 6, None),
    )
    data = {}
    for column, position, prefix in fields:
        text = soup.select_one(f'.scorebox_meta div:nth-of-type({position})').get_text()
        data[column] = text if prefix is None else text.replace(prefix, "")
    return pd.DataFrame([data])

def _coalesce_side_columns(frame, base_columns):
    """Fold ``<col>_away`` / ``<col>_home`` pairs in ``frame`` back into ``<col>`` (in place)."""
    for col in base_columns:
        away_col = f'{col}_away'
        home_col = f'{col}_home'
        has_away = away_col in frame.columns
        has_home = home_col in frame.columns
        if has_away and has_home:
            frame[col] = frame[away_col].combine_first(frame[home_col])
            frame.drop(columns=[away_col, home_col], inplace=True)
        elif has_away:
            frame.rename(columns={away_col: col}, inplace=True)
        elif has_home:
            frame.rename(columns={home_col: col}, inplace=True)


def process_file(file_path):
    """Parse one saved box-score page into a per-player DataFrame.

    Steps:
      1. Read the line score; its first row is treated as the away team and
         its second row as the home team.
      2. Read batting and pitching tables for both sides and outer-merge them
         per side on ``Name``.
      3. Attach team totals (``Team``, ``TR``, ``TH``, ``TE``), ``Home/Away``
         (1 = home), ``Opposing Team``, ``Opposing Pitcher`` (the row labelled
         0 of the opposing pitching table) and ``W/L`` (1 = win; the home side
         wins only with strictly more runs).
      4. Stack both sides, attach the game meta data, convert the percentage
         columns to fractions, fill gaps with 0 and add ``H+R+RBI`` and the
         date-derived columns plus ``Source_File``.

    Returns the resulting frame. Any failure is reported with a printed
    message and yields an empty DataFrame instead of raising.
    """
    try:
        soup = parse_html(file_path)
        line_score = read_line_score(soup)
        # Both teams are required; report the problem explicitly rather than
        # failing later on names that were never assigned.
        if line_score is None or len(line_score) < 2:
            print(f"Error processing file {file_path}: no usable line score found")
            return pd.DataFrame()

        away_team = line_score.iloc[0, 0]
        home_team = line_score.iloc[1, 0]
        batting_away = read_batting_away(soup, away_team)
        batting_home = read_batting_home(soup, home_team, line_score)
        pitching_away = read_pitching_away(soup, away_team)
        pitching_home = read_pitching_home(soup, home_team, line_score)
        game_info = read_game_info(soup)

        away_merged = batting_away.merge(pitching_away, on='Name', how='outer', suffixes=('', '_Pitching'))
        home_merged = batting_home.merge(pitching_home, on='Name', how='outer', suffixes=('', '_Pitching'))

        # Prefer the batting position; fall back to the pitching table's 'P'.
        for merged in (away_merged, home_merged):
            if 'Position' in merged.columns and 'Position_Pitching' in merged.columns:
                merged['Position'] = merged['Position'].combine_first(merged['Position_Pitching'])
                merged.drop(columns=['Position_Pitching'], inplace=True)

        for col in line_score.columns:
            away_merged[col] = line_score.iloc[0][col]
            home_merged[col] = line_score.iloc[1][col]

        home_merged['Home/Away'] = 1
        away_merged['Home/Away'] = 0
        away_opposing_team = home_merged.loc[0, 'Team']
        home_opposing_team = away_merged.loc[0, 'Team']
        away_merged['Opposing Team'] = away_opposing_team
        home_merged['Opposing Team'] = home_opposing_team
        away_opposing_pitcher = pitching_home.loc[0, 'Name']
        home_opposing_pitcher = pitching_away.loc[0, 'Name']
        away_merged['Opposing Pitcher'] = away_opposing_pitcher
        home_merged['Opposing Pitcher'] = home_opposing_pitcher

        # Compare runs as integers: the parsed values may be strings, and
        # '10' > '9' is False when compared as text. Rows are taken by
        # position, matching how the teams were read above.
        home_runs = int(line_score.iloc[1]['TR'])
        away_runs = int(line_score.iloc[0]['TR'])
        if home_runs > away_runs:
            home_merged['W/L'] = 1
            away_merged['W/L'] = 0
        else:
            home_merged['W/L'] = 0
            away_merged['W/L'] = 1

        all_merged = away_merged.merge(home_merged, on=['Name', 'W/L', 'Opposing Pitcher', 'Opposing Team', 'Home/Away'], how='outer', suffixes=('_away', '_home'))

        # Stat columns first (everything from the four source tables except
        # the merge key), then the team-total columns added afterwards.
        stat_columns = batting_away.columns.union(batting_home.columns).union(pitching_away.columns).union(pitching_home.columns)
        _coalesce_side_columns(all_merged, [col for col in stat_columns if col != 'Name'])
        _coalesce_side_columns(all_merged, ['Team', 'TR', 'TH', 'TE'])

        if game_info is not None and not game_info.empty:
            for col in game_info.columns:
                all_merged[col] = game_info.iloc[0][col]

        # Percent strings such as '12.5%' become fractions (0.125).
        for column in ('CWPA-A', 'cWPA'):
            all_merged[column] = all_merged[column].str.replace('%', '').astype(float) / 100

        all_merged.fillna(0, inplace=True)
        all_merged.replace('*', 0, inplace=True)
        # .copy() so the column assignments below do not write into a slice.
        all_merged = all_merged[all_merged['Name'] != '0'].copy()
        all_merged['Date'] = pd.to_datetime(all_merged['Date'], format='%A, %B %d, %Y').dt.strftime('%Y-%m-%d')
        all_merged['H+R+RBI'] = all_merged[['H', 'R', 'RBI']].sum(axis=1)

        # Every row carries the same game date, so the first row is enough.
        game_date = all_merged['Date'].iloc[0]
        all_merged['Year'] = game_date[:4]
        all_merged['Month'] = game_date[5:7]
        all_merged['Day'] = game_date[8:]
        all_merged['Day_of_Week'] = datetime.datetime.strptime(game_date, "%Y-%m-%d").strftime("%A")

        all_merged['Source_File'] = os.path.basename(file_path)
        return all_merged
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return pd.DataFrame()

# Parse every page that is not yet part of the consolidated CSV and write the
# CSV to disk whenever the running position reaches a multiple of ten.
for i, file_path in enumerate(file_paths):
    if os.path.basename(file_path) not in processed_files:
        print(f"Processing file {i+1}/{len(file_paths)}: {file_path}")
        processed_df = process_file(file_path)
        if not processed_df.empty:
            consolidated_df = pd.concat([consolidated_df, processed_df], ignore_index=True)

        # Periodic checkpoint (only reached for files that were processed now).
        if (i + 1) % 10 == 0:
            consolidated_df.to_csv(consolidated_file_path, index=False)
            print(f"Progress saved after processing {i+1} files.")

# Final save
consolidated_df.to_csv(consolidated_file_path, index=False)
print("All files processed and saved successfully.")

In [None]:
import pandas as pd
import joblib
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Load the consolidated box scores, fill gaps with 0 and drop rows whose name
# is the '0' placeholder.
df = pd.read_csv('New_Box_Scores.csv')
df = df.fillna(0)
df = df.loc[df['Name'] != '0']

columns_to_encode = [
    'Opposing Team', 'Opposing Pitcher', 'Position', 'Team',
    'Venue', 'Conditions', 'Day_of_Week', 'Name', 'Start Time']

# One previously fitted LabelEncoder per column, stored as '<column>_encoder.joblib'.
# NOTE(review): joblib.load unpickles the file, so only load encoder files you created yourself.
encoders = {}

for col in columns_to_encode:
    encoders[col] = joblib.load(f'{col}_encoder.joblib')

for col in columns_to_encode:
    le = encoders[col]
    # Labels the stored encoder has not seen are appended after the known
    # classes, so existing codes stay unchanged. They are sorted so that the
    # codes given to new labels are the same on every run instead of depending
    # on set iteration order.
    new_classes = sorted(set(df[col].unique()) - set(le.classes_), key=str)
    if new_classes:
        le.classes_ = np.concatenate([le.classes_, new_classes])
    df[f'{col} Encoded'] = le.transform(df[col])
# NOTE(review): the extended encoders are not written back to disk here, so a
# later run starts again from the stored classes - confirm this is intended.

df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['Name Encoded', 'Date'])

excluded_columns = [
    'Date', 'Start Time', 'Attendance', 'Duration', 'Source_File',
    'Opposing Team', 'Opposing Pitcher', 'Position', 'Team',
    'Venue', 'Conditions', 'Day_of_Week', 'Name',
    'Opposing Team Encoded', 'Opposing Pitcher Encoded', 'Position Encoded',
    'Team Encoded', 'Venue Encoded', 'Conditions Encoded', 'Day_of_Week Encoded',
    'Name Encoded', 'Start Time Encoded']

# When sorting with these long strings, there is a fluky issue I've encountered
# within pandas where some of the strings switch columns for whatever reason.
# For a more advanced model leveraging string-type data, make sure you do not
# sort the CSV file before encoding the data.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
columns_to_average = [col for col in numeric_columns if col not in excluded_columns]

# Average of each player's previous five games (the current game is excluded
# by the shift). shift() runs per player while rolling() runs over the whole
# shifted column; because the frame is sorted by player and each player's
# first shifted value is NaN, any window reaching into another player's rows
# contains a NaN and yields NaN (default min_periods equals the window), so
# averages never mix players. The NaNs are replaced with 0 below.
for col in tqdm(columns_to_average, desc="Processing columns"):
    rolling_avg_col_name = f'{col} Last 5 Avg'
    df[rolling_avg_col_name] = df.groupby('Name Encoded')[col].shift().rolling(window=5).mean()

df = df.fillna(0)

df.to_csv('New_Box_Scores_Filtered.csv', index=False)

In [None]:
import pandas as pd     #Merges Current Season Data with Past Seasons Data

# Current-season features and the historical feature table.
New_Box_Scores_Filtered = pd.read_csv('New_Box_Scores_Filtered.csv')
box_scores_filtered = pd.read_csv('box_scores_filtered.csv')

# Stack both tables (new rows first), drop exact duplicate rows, keep only the
# first of any repeated column label and fill the remaining gaps with 0.
combined_df = (
    pd.concat([New_Box_Scores_Filtered, box_scores_filtered], axis=0, ignore_index=True)
    .drop_duplicates()
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    .fillna(0)
)

combined_df.to_csv('Updated_Box_Scores.csv', index = False)

In [None]:
import pandas as pd                                        #This Script updates the model with the data from recent games 
from sklearn.model_selection import train_test_split       #Should be done periodically for best results
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# Basic model: one linear regression predicting all target stats at once.
df = pd.read_csv('Updated_Box_Scores.csv')

target_columns = ['H-A', 'BB-A', 'H+R+RBI', 'RBI', 'R', 'SO', 'SO-A', 'W/L', 'HR']

# Features are every column ending in 'Avg' plus the game-context columns below.
context_features = [
    'Home/Away', 'Year', 'Month', 'Day', 'Opposing Team Encoded',
    'Opposing Pitcher Encoded',  'Team Encoded', 'Name Encoded']
feature_columns = [
    col for col in df.columns
    if col in context_features or col.endswith('Avg')
]

X, y = df[feature_columns], df[target_columns]

# Fixed 80/20 split (random_state=42) so the reported error is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# One mean squared error per target column.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print(f"Mean Squared Error: {mse}")

# Saved under the same file name as before, replacing the old model.
joblib.dump(model, 'basic_mlb_player_stats_linear_model.joblib')
print("Model saved to 'basic_mlb_player_stats_linear_model.joblib'")