# In [9]:
import pandas as pd
import numpy as np

# In [10]:
# ============================================================================
# Part 1
# ============================================================================
def linear_weights(filename: str):
    """
    Fit runs as a linear function of batting events via the normal equations.

    Builds the design matrix A (a leading column of ones followed by the
    'BB', '1B', '2B', '3B' and 'HR' columns of the csv file), takes b as the
    'runs' column, and solves (A^T A) Beta = A^T b.

    Parameters
    ----------
    filename : str
        csv file path; the file must provide the columns
        'BB', '1B', '2B', '3B', 'HR' and 'runs'.

    Returns
    -------
    beta : numpy.ndarray, shape (6,)
        The estimated coefficients [Beta0, Beta1, Beta2, Beta3, Beta4, Beta5]:
        the intercept first, then one weight per event column in the order
        BB, 1B, 2B, 3B, HR.
    R2 : float
        Coefficient of determination, computed as 1 - SSE / SSTO.
    """
    event_columns = ["BB", "1B", "2B", "3B", "HR"]

    # Read the game table and pull out predictors and response as floats.
    games = pd.read_csv(filename)
    events = games[event_columns].to_numpy(dtype=float)
    runs = games["runs"].to_numpy(dtype=float)
    n_rows = runs.shape[0]

    # Design matrix: intercept column of ones, then the event counts.
    design = np.column_stack([np.ones(n_rows, dtype=float), events])

    # Normal equations: (A^T A) beta = A^T b.
    gram = design.T @ design
    rhs = design.T @ runs
    beta = np.linalg.solve(gram, rhs)

    # SSE: squared length of the residual vector A beta - b.
    errors = design @ beta - runs
    sse = float(errors @ errors)

    # SSTO in its expanded form: b^T b - (1^T b)^2 / n.
    runs_total = float(runs.sum())
    ssto = float(runs @ runs) - (runs_total * runs_total) / n_rows

    r_squared = 1.0 - sse / ssto
    return beta, r_squared

# ============================================================================
# Part 2
# ============================================================================
def linear_weights_test(filename: str):
    """
    Score the fitted model by replaying the season it was fitted on.

    The coefficients come from linear_weights(filename). Rows of the csv
    are then paired consecutively (rows 0 and 1, rows 2 and 3, ...), and
    for every pair the predicted run difference is compared with the
    actual one. A prediction is correct if the team predicted to score
    more runs actually did.

    Parameters
    ----------
    filename : str
        csv file path; read for the columns
        'BB', '1B', '2B', '3B', 'HR' and 'runs'.

    Returns
    -------
    percent_correct : float
        The percentage (0-100) of pairs where the sign of the predicted
        difference matched the sign of the actual difference. A difference
        of exactly zero is grouped with the positive ones.
    """
    # Fit on the same file that is evaluated below.
    weights, _ = linear_weights(filename)

    games = pd.read_csv(filename)
    events = games[["BB", "1B", "2B", "3B", "HR"]].to_numpy(dtype=float)  # (n, 5)
    actual_runs = games["runs"].to_numpy(dtype=float)

    # The intercept weights[0] is left out on purpose: it is identical for
    # both rows of a pair, so it drops out of the difference taken below.
    predicted_runs = events @ weights[1:]

    # Even-indexed rows minus the odd-indexed rows that follow them.
    evens = slice(0, None, 2)
    odds = slice(1, None, 2)
    predicted_margin = predicted_runs[evens] - predicted_runs[odds]
    actual_margin = actual_runs[evens] - actual_runs[odds]

    # A hit is either "both margins non-negative" or "both margins negative".
    both_non_negative = np.logical_and(predicted_margin >= 0, actual_margin >= 0)
    both_negative = np.logical_and(predicted_margin < 0, actual_margin < 0)
    hits = np.logical_or(both_non_negative, both_negative)

    return float(hits.mean()) * 100.0

# ============================================================================
# Part 3
# ============================================================================
def run_values(filename_games, filename_players):
    """
    Estimate each player's run contribution from the fitted event weights.

    The weights are obtained by calling linear_weights(filename_games);
    the intercept is discarded and only the BB, 1B, 2B, 3B and HR weights
    are applied to the player totals.

    Parameters
    ----------
    filename_games : str
        csv path to the game data
    filename_players : str
        csv path to the player data. Any of the event columns
        'BB', '1B', '2B', '3B', 'HR' that is absent is added and filled
        with 0.

    Returns
    -------
    df_players : pandas.DataFrame
        Players DataFrame with two added columns:
        'estimated_runvalues' (weighted sum of the event columns) and
        'estimated_runvalues_per_PA' (that sum divided by 'PA'; NaN where
        'PA' is 0, and NaN everywhere when there is no 'PA' column).
    """
    event_columns = ["BB", "1B", "2B", "3B", "HR"]

    # Coefficients from game data (intercept dropped).
    coeffs, _ = linear_weights(filename_games)
    w_bb, w_1b, w_2b, w_3b, w_hr = coeffs[1:].tolist()
    event_weights = [w_bb, w_1b, w_2b, w_3b, w_hr]

    # Load player data, back-filling event columns the file does not have.
    players = pd.read_csv(filename_players).copy()
    for name in event_columns:
        if name not in players.columns:
            players[name] = 0

    # Run contributions, accumulated term by term in column order.
    terms = [
        players[name].to_numpy(float) * weight
        for name, weight in zip(event_columns, event_weights)
    ]
    total = terms[0]
    for term in terms[1:]:
        total = total + term
    players["estimated_runvalues"] = total

    # Per-PA rate; without a PA column there is nothing to divide by.
    if "PA" not in players.columns:
        players["estimated_runvalues_per_PA"] = np.nan
        return players

    # Zero plate appearances become NaN so the division yields NaN, not inf.
    plate_appearances = players["PA"].replace(0, np.nan)
    players["estimated_runvalues_per_PA"] = (
        players["estimated_runvalues"] / plate_appearances
    )
    return players