In [2]:
#Imports
import os

import numpy as np
import pandas as pd
import pulp
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MultiLabelBinarizer

Restricted license - for non-production use only - expires 2026-11-23
Restricted license - for non-production use only - expires 2026-11-23
Restricted license - for non-production use only - expires 2026-11-23
Restricted license - for non-production use only - expires 2026-11-23
Restricted license - for non-production use only - expires 2026-11-23
Restricted license - for non-production use only - expires 2026-11-23


In [3]:
# ==========================================
# 1. USER CONFIGURATION & INPUTS
# ==========================================
# Change these values to run different simulations
FOCAL_TEAM = "Canada"    # team whose lineup is being optimized
OPPONENT   = "Sweden"    # only stints played against this team are analysed
RATING_CAP = 8.0         # upper bound on the summed ratings of a lineup
LINEUP_SIZE = 4          # number of players picked per lineup
ALPHA = 0.1  # Regularization strength

# File paths. The data directory can be overridden without editing the notebook
# by setting the RUGBY_DATA_DIR environment variable before starting the kernel;
# when it is unset, the original local location is used.
DATA_DIR = os.environ.get(
    "RUGBY_DATA_DIR",
    '/Users/jordanngo/Documents/4B/MSE 433/RugbyLineupOptimizer-main',
)
PLAYER_DATA_PATH = os.path.join(DATA_DIR, 'player_data.csv')
STINT_DATA_PATH  = os.path.join(DATA_DIR, 'stint_data.csv')

In [4]:
# ==========================================
# 2. CORE PROCESSING FUNCTIONS
# ==========================================

def get_focal_players(df, focal_team):
    """Collect, per stint, the four players fielded by ``focal_team``.

    For each row the ``home1``..``home4`` columns are used when ``h_team``
    equals ``focal_team``; otherwise the ``away1``..``away4`` columns are used.

    Returns a Series of 4-element lists that shares ``df``'s index.
    """
    home_side = df[["home1", "home2", "home3", "home4"]].values.tolist()
    away_side = df[["away1", "away2", "away3", "away4"]].values.tolist()
    plays_at_home = (df["h_team"] == focal_team).tolist()

    rosters = []
    for at_home, home_row, away_row in zip(plays_at_home, home_side, away_side):
        rosters.append(home_row if at_home else away_row)

    return pd.Series(rosters, index=df.index)

def solve_lineup_optimization(coefs, rating_map, cap, size):
    """Pick the highest-value lineup under a rating cap using PuLP.

    Parameters
    ----------
    coefs : pandas.Series
        Player value per candidate, indexed by player label. Labels are
        expected to be unique.
    rating_map : mapping
        Player label -> rating. Candidates without a rating are dropped.
    cap : float
        Maximum allowed sum of ratings across the selected players.
    size : int
        Exact number of players to select.

    Returns
    -------
    pandas.DataFrame
        Columns ``player``, ``value``, ``rating`` for the selected players,
        sorted by ``value`` descending. The frame is empty (same columns) when
        no lineup satisfies the constraints, so callers never receive a lineup
        that violates the size or rating limits.
    """
    df = pd.DataFrame({"player": coefs.index, "value": coefs.values})
    df["rating"] = df["player"].map(rating_map)
    df = df.dropna(subset=["rating"])

    # With fewer rated candidates than slots no valid lineup exists; skip the solver.
    if len(df) < size:
        return df.iloc[0:0]

    # Pull the columns out once instead of re-filtering the frame for every player.
    names = df["player"].tolist()
    values = df["value"].tolist()
    ratings = df["rating"].tolist()

    prob = pulp.LpProblem("Lineup_Optimization", pulp.LpMaximize)

    # Binary decision variables: 1 if selected, 0 if not
    pick = {p: pulp.LpVariable(f"p_{p}", 0, 1, cat="Binary") for p in names}

    # Objective: Maximize total coefficient value
    prob += pulp.lpSum(v * pick[p] for p, v in zip(names, values))

    # Constraints
    prob += pulp.lpSum(pick[p] for p in names) == size  # Lineup size
    prob += pulp.lpSum(r * pick[p] for p, r in zip(names, ratings)) <= cap  # Rating cap

    prob.solve(pulp.PULP_CBC_CMD(msg=False))

    # Variable values are only meaningful for an optimal solve; on an infeasible
    # problem they may be unset or violate the constraints.
    if prob.status != pulp.LpStatusOptimal:
        return df.iloc[0:0]

    selected = [p for p in names if pulp.value(pick[p]) > 0.5]
    return df[df["player"].isin(selected)].sort_values("value", ascending=False)

In [9]:
# ==========================================
# 3. EXECUTION AND TABULAR OUTPUT
# ==========================================

def _apply_scenario(base_coefs, params):
    """Return a copy of ``base_coefs`` with one scenario's adjustments applied.

    Players listed under ``params["unavailable"]`` are removed (unknown labels
    are ignored). Players listed under ``params["tired"]`` have their value
    reduced by the fraction ``params["penalty"]``.
    """
    adjusted = base_coefs.drop(labels=params["unavailable"], errors="ignore").copy()
    for player in params["tired"]:
        if player in adjusted.index:
            value = adjusted[player]
            # Scaling a negative coefficient by (1 - penalty) would move it toward
            # zero and make a tired player look better; scale it away from zero
            # instead so fatigue always lowers the value.
            factor = (1 - params["penalty"]) if value >= 0 else (1 + params["penalty"])
            adjusted[player] = value * factor
    return adjusted


def run_analysis():
    """Fit the player-value model for the configured matchup and solve each scenario.

    Reads the player and stint CSVs, keeps the stints played between
    ``FOCAL_TEAM`` and ``OPPONENT``, fits a Ridge regression of goal rate on
    focal-team player indicators, then solves the lineup optimization once per
    entry in ``SCENARIOS``.

    Returns
    -------
    pandas.DataFrame
        One row per scenario with columns ``Scenario``, ``Lineup``,
        ``Total Value`` and ``Total Rating``.

    Raises
    ------
    ValueError
        If no usable stints exist for the configured matchup.
    """
    # Load Data
    players = pd.read_csv(PLAYER_DATA_PATH)
    stints = pd.read_csv(STINT_DATA_PATH)
    rating_map = dict(zip(players["player"], players["rating"]))

    # Clean Stints
    # NOTE(review): the target sums goals by BOTH teams, so the coefficients
    # describe total scoring pace rather than the focal team's margin -- confirm
    # this is the intended objective.
    stints["total_goals"] = pd.to_numeric(stints["h_goals"]) + pd.to_numeric(stints["a_goals"])
    # Zero-minute stints become NaN here and are dropped on the next line.
    stints["goal_rate"] = stints["total_goals"] / stints["minutes"].replace(0, np.nan)
    stints = stints.dropna(subset=["goal_rate", "h_team", "a_team"])

    # Filter for Matchup (either team may be the home side)
    match_df = stints[((stints["h_team"] == FOCAL_TEAM) & (stints["a_team"] == OPPONENT)) |
                      ((stints["a_team"] == FOCAL_TEAM) & (stints["h_team"] == OPPONENT))].copy()

    # Fail with a clear message instead of an opaque estimator error.
    if match_df.empty:
        raise ValueError(
            f"No usable stints found for {FOCAL_TEAM} vs {OPPONENT}; cannot fit the model."
        )

    match_df["players_on_court"] = get_focal_players(match_df, FOCAL_TEAM)

    # Fit Model: one indicator column per focal-team player
    mlb = MultiLabelBinarizer(sparse_output=True)
    X = mlb.fit_transform(match_df["players_on_court"])
    y = match_df["goal_rate"].values
    model = Ridge(alpha=ALPHA).fit(X, y)
    base_coefs = pd.Series(model.coef_, index=mlb.classes_)

    # Run Scenarios
    summary_data = []
    for name, params in SCENARIOS.items():
        # Apply filters and penalties
        c_coefs = _apply_scenario(base_coefs, params)

        # Optimize
        best_lineup = solve_lineup_optimization(c_coefs, rating_map, RATING_CAP, LINEUP_SIZE)

        summary_data.append({
            "Scenario": name,
            "Lineup": ", ".join(best_lineup["player"].tolist()),
            "Total Value": round(best_lineup["value"].sum(), 4),
            "Total Rating": round(best_lineup["rating"].sum(), 2)
        })

    return pd.DataFrame(summary_data)


# Scenario name -> adjustments applied to the fitted player values:
#   "unavailable": players removed from the candidate pool
#   "tired":       players whose value is scaled down
#   "penalty":     fraction of value removed from each tired player
SCENARIOS = {
    "Baseline":     {"unavailable": [],            "tired": [],            "penalty": 0.0},
    # Canada_p2 is the missing player
    "Missing Star": {"unavailable": ["Canada_p2"], "tired": [],            "penalty": 0.0},
    # Canada_p2 is the fatigued player
    "Fatigue":      {"unavailable": [],            "tired": ["Canada_p2"], "penalty": 0.90},
}

# Run and Display
# Solve every scenario in SCENARIOS and keep the summary table in `results_df`.
results_df = run_analysis()
# Bare trailing expression: lets the notebook render the DataFrame as the cell output.
results_df

Unnamed: 0,Scenario,Lineup,Total Value,Total Rating
0,Baseline,"Canada_p2, Canada_p8, Canada_p1, Canada_p6",1.9739,8.0
1,Missing Star,"Canada_p8, Canada_p11, Canada_p6, Canada_p10",0.7811,8.0
2,Fatigue,"Canada_p8, Canada_p1, Canada_p6, Canada_p2",1.3758,8.0
