In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

: 

In [None]:
def process_red_sox_data(filepath):
    """
    Read the batting split data from ``filepath`` and derive RC/PA.

    The file is loaded with ``pd.read_csv``. Rows whose 'Split' value starts
    with 'Batting' are kept, the 'Year', 'RC' and 'PA' columns are converted
    to numeric (unparseable values become NaN), rows with a missing value in
    any of those columns or with zero plate appearances are dropped, and
    'RC_per_PA' is computed as RC divided by PA.

    Args:
        filepath (str): The path to the data file (anything accepted by
            ``pd.read_csv``).

    Returns:
        pd.DataFrame: The filtered rows with a numeric 'RC_per_PA' column and
                      an integer 'Year' column, or None if the file cannot be
                      read or lacks one of the required columns.
    """
    required_columns = ['Split', 'Year', 'RC', 'PA']
    numeric_columns = ['Year', 'RC', 'PA']

    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return None
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported and turned
        # into the None sentinel that callers check for.
        print(f"An error occurred while reading the file: {e}")
        return None

    # Fail the same way as a read error when the schema is wrong, instead of
    # letting a KeyError escape from the column lookups below.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(
            f"Error: The file '{filepath}' is missing required column(s): "
            f"{', '.join(missing_columns)}"
        )
        return None

    # --- Data Cleaning and Filtering ---
    # Keep only rows related to batting splits (missing 'Split' counts as no match)
    batting_df = df[df['Split'].str.startswith('Batting', na=False)].copy()

    # Convert relevant columns to numeric, coercing errors to NaN
    for col in numeric_columns:
        batting_df[col] = pd.to_numeric(batting_df[col], errors='coerce')

    # Drop rows where essential data is missing, then rows with PA == 0 so the
    # division below never divides by zero
    batting_df = batting_df.dropna(subset=numeric_columns)
    batting_df = batting_df[batting_df['PA'] != 0].copy()

    # Calculate Runs Created per Plate Appearance (RC/PA)
    batting_df['RC_per_PA'] = batting_df['RC'] / batting_df['PA']

    # Ensure 'Year' is an integer type (safe: NaN years were dropped above)
    batting_df['Year'] = batting_df['Year'].astype(int)

    return batting_df

In [None]:
def create_red_sox_plots(df, start_year=None):
    """
    Generates and displays a grid of subplots, one per batting split,
    showing RC/PA over time with a degree-1 least-squares trendline.

    The grid is three columns wide and at least three rows tall; extra rows
    are added when there are more than nine splits. Splits with fewer than
    two distinct years of data get a placeholder panel instead of a fit.

    Args:
        df (pd.DataFrame): The processed DataFrame containing batting data.
            Must have 'Split', 'Year' and 'RC_per_PA' columns.
        start_year (int, optional): If given, only rows with
            Year >= start_year are plotted. Defaults to None (all rows).
    """
    # Boolean indexing returns a new frame and nothing below mutates it,
    # so no explicit copy is needed.
    analysis_df = df if start_year is None else df[df['Year'] >= start_year]

    unique_splits = sorted(analysis_df['Split'].unique())

    if not unique_splits:
        print("No data rows found to plot for the given criteria.")
        return

    # Keep the 3x3 layout for up to nine splits; grow by rows beyond that so
    # every split gets an axis (ceil division without importing math).
    ncols = 3
    nrows = max(3, -(-len(unique_splits) // ncols))
    _, axes = plt.subplots(nrows, ncols, figsize=(18, 5 * nrows))

    # Flatten the 2D axes array into a 1D array for easy iteration
    axes = axes.flatten()

    # Pair each split with its subplot axis
    for ax, split in zip(axes, unique_splits):
        split_data = analysis_df[analysis_df['Split'] == split]

        # A line fit needs at least two distinct x values; with a single
        # repeated year the least-squares problem is rank-deficient.
        if split_data['Year'].nunique() < 2:
            print(f"Skipping plot for '{split}' due to insufficient data points.")
            ax.set_title(f"'{split}'\n(Not enough data)")
            ax.axis('off')  # Hide axis for empty plot
            continue

        x_axis = split_data['Year']
        y_axis = split_data['RC_per_PA']

        # Slope (m) and intercept (b) of the best-fit line
        m, b = np.polyfit(x_axis, y_axis, 1)

        # Create the scatter plot and the best-fit line on the specific subplot
        ax.scatter(x_axis, y_axis, label='Actual RC/PA Data', color='royalblue', alpha=0.8)
        ax.plot(x_axis, m * x_axis + b, color='red', linewidth=2, label=f'Fit (y={m:.4f}x + {b:.4f})')

        # Format the subplot; the title names the split so panels can be told apart
        ax.set_title(f"{split}: RC/PA vs Year", fontsize=12, weight='bold')
        ax.set_xlabel("Year", fontsize=10)
        ax.set_ylabel("RC per Plate Appearance", fontsize=10)
        ax.legend(fontsize=8)
        ax.grid(True, linestyle='--', alpha=0.6)
        ax.tick_params(axis='x', rotation=45, labelsize=9)
        ax.tick_params(axis='y', labelsize=9)
        ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True, nbins=6))

    # Turn off any unused subplots in the grid
    for unused_ax in axes[len(unique_splits):]:
        unused_ax.axis('off')

    # Adjust layout to prevent titles and labels from overlapping
    plt.tight_layout(pad=3.0)

    plt.show()

In [None]:
def create_summary_table(df, start_years):
    """
    Calculates trendline gradients for different start years and prints a summary table.

    For every start year, the rows with Year >= that year are selected and,
    per split, the slope of a degree-1 least-squares fit of RC_per_PA against
    Year is computed. The printed table has one row per split and one column
    per start year; a cell is NaN when the split has fewer than two distinct
    years in that period.

    Args:
        df (pd.DataFrame): The processed DataFrame containing batting data.
            Must have 'Split', 'Year' and 'RC_per_PA' columns.
        start_years (list of int): A list of years to use as starting points for trend analysis.
    """
    summary_data = {}
    all_splits = sorted(df['Split'].unique())

    # Calculate gradients for each defined start year period
    for year in start_years:
        yearly_gradients = {}
        period_df = df[df['Year'] >= year]

        for split in all_splits:
            split_data = period_df[period_df['Split'] == split]

            # A slope is only defined with at least two distinct years; several
            # rows sharing one year would make the fit rank-deficient and the
            # resulting gradient meaningless.
            if split_data['Year'].nunique() >= 2:
                m, _ = np.polyfit(split_data['Year'], split_data['RC_per_PA'], 1)
                yearly_gradients[split] = m
            else:
                yearly_gradients[split] = np.nan  # Use NaN for insufficient data

        summary_data[f'Gradient ({year}-Present)'] = yearly_gradients

    summary_df = pd.DataFrame(summary_data, index=all_splits)
    summary_df.index.name = 'Batting Split'

    print("\n--- Summary of Gradients (Trend of RC/PA over Different Periods) ---")
    # Explicit sign and six decimals so small positive/negative trends are easy to compare
    print(summary_df.to_string(float_format="{:+.6f}".format))

In [None]:
if __name__ == '__main__':
    # Script entry point: load the data, plot every split, then print the
    # gradient summary table.
    file_path = 'redsox_2003_2025_team_batting_splits.csv'

    # Define the start years to be used as columns in the summary table
    summary_start_years = [2003, 2008, 2013, 2018]

    # 1. Load and process the data
    processed_df = process_red_sox_data(file_path)

    if processed_df is not None and not processed_df.empty:
        # 2. Create and display plots for the full data range.
        # No year filter is passed: plotting from the earliest year in the
        # data is the same as plotting every row.
        create_red_sox_plots(processed_df)

        # 3. Create and print the summary table with multiple trend periods
        create_summary_table(processed_df, summary_start_years)
    else:
        print("Analysis could not be completed due to data processing errors.")