In [230]:
# Ingraham ELO Simulation Notebook
# =================================
# Purpose: Estimate playoff and state championship chances
# based on current ELO ratings and season structure.

import pandas as pd
import numpy as np
import random
import copy
from collections import defaultdict
from itertools import permutations

In [231]:
# ------------------
# Step 1: Load Data
# ------------------
results_path = "C:/Users/User/Desktop/Soccer Footage/metroleague_soccer_results_filtered.csv"
class_path = "C:/Users/User/Desktop/Soccer Footage/school_classification_by_season.csv"
elo_log_path = "C:/Users/User/Desktop/Soccer Footage/metroleague_elo_log.csv"

matches_df = pd.read_csv(results_path)
class_df = pd.read_csv(class_path)
elo_log_df = pd.read_csv(elo_log_path)

# Merge classification into match results
matches_df = matches_df.merge(
    class_df, left_on=["Season", "Home Team"], right_on=["Season", "School"], how="left"
).rename(columns={"Classification": "Home Classification"}).drop(columns=["School"])

matches_df = matches_df.merge(
    class_df, left_on=["Season", "Away Team"], right_on=["Season", "School"], how="left"
).rename(columns={"Classification": "Away Classification"}).drop(columns=["School"])

matches_df["Home Score"] = pd.to_numeric(matches_df["Home Score"], errors="coerce")
matches_df["Away Score"] = pd.to_numeric(matches_df["Away Score"], errors="coerce")
matches_df = matches_df.dropna(subset=["Home Score", "Away Score"])




In [232]:
# Sanity check for looking at most recent ELO, Team, and Classification
# Load most recent ELOs
elo_dict = load_latest_elos_from_log(elo_log_df)
elo_df = pd.DataFrame(elo_dict.items(), columns=["Team", "ELO"])

# Add classification from class_df
season = "2024-25"
class_latest = class_df[class_df["Season"] == season][["School", "Classification"]].rename(columns={"School": "Team"})
elo_df = elo_df.merge(class_latest, on="Team", how="left")
TEAM_NAME_CORRECTIONS = {
    "Lakeside": "Lakeside (Seattle)",
    "Seattle Prep.": "Seattle Prep",
    "O'Dea": "O'Dea",  # optional if inconsistencies exist elsewhere
    # Add more as needed
}

def normalize_team_name(name):
    return TEAM_NAME_CORRECTIONS.get(name, name)

# Add division info
def get_division(team):
    if team in init_mountain:
        return "Mountain"
    elif team in init_sound:
        return "Sound"
    else:
        return "Other"

elo_df["Division"] = elo_df["Team"].apply(get_division)

# Final sorted output
elo_df = elo_df.sort_values(by="ELO", ascending=False).reset_index(drop=True)
elo_df

Unnamed: 0,Team,ELO,Classification,Division
0,Lincoln (Seattle),2118.137887,4A,Mountain
1,Bishop Blanchet,1825.993109,3A,Mountain
2,Seattle Prep,1819.396287,,Mountain
3,Lakeside (Seattle),1802.280707,3A,Mountain
4,O'Dea,1749.317223,3A,Mountain
5,Ballard,1716.123242,3A,Mountain
6,Garfield,1714.066464,3A,Mountain
7,Eastside Catholic,1610.821761,3A,Sound
8,Roosevelt,1586.831396,3A,Mountain
9,Lake Washington,1561.312561,,Other


In [233]:
# ------------------
# Round-Robin Scheduler for Remaining Matches
# ------------------
def generate_round_robin_schedule(teams, played_df, season):
    possible_matches = pd.DataFrame([
        {"Home Team": home, "Away Team": away}
        for home, away in permutations(teams, 2)
        if home != away
    ])
    possible_matches["key"] = possible_matches["Home Team"] + "_" + possible_matches["Away Team"]
    
    played = played_df.copy()
    played["key"] = played["Home Team"] + "_" + played["Away Team"]
    
    remaining = possible_matches[~possible_matches["key"].isin(played["key"])].drop(columns="key")
    remaining["Season"] = season
    remaining["Home Score"] = np.nan
    remaining["Away Score"] = np.nan
    # assign a fake date in the near future (for consistency if needed)
    remaining["Date"] = pd.Timestamp.today() + pd.to_timedelta(np.random.randint(1, 30, len(remaining)), unit="D")
    
    return remaining


In [234]:
# ------------------
# Step 2: Load ELOs from Log
# ------------------
def load_latest_elos_from_log(elo_log):
    home_elos = elo_log.sort_values(by=["Season", "Date"]).groupby("Home Team")["Home ELO After"].last()
    away_elos = elo_log.sort_values(by=["Season", "Date"]).groupby("Away Team")["Away ELO After"].last()

    combined = defaultdict(list)
    for team, elo in home_elos.items():
        combined[team].append(elo)
    for team, elo in away_elos.items():
        combined[team].append(elo)

    return {team: sum(elos) / len(elos) for team, elos in combined.items()}


In [235]:
# ------------------
# Simulate a single match result and update ELOs
# ------------------
def simulate_match(home_elo, away_elo, hfa=100):
    win_prob = 1 / (1 + 10 ** ((away_elo - home_elo + hfa) / 400))
    result = np.random.rand()
    if result < win_prob:
        return "H"
    elif result > win_prob + (1 - win_prob) / 2:
        return "A"
    else:
        return "T"

def simulate_remaining_matches(schedule_df, elo_dict, k=40, hfa=100):
    matches = []
    for _, row in schedule_df.iterrows():
        home = row['Home Team']
        away = row['Away Team']
        home_elo = elo_dict.get(home, 1500)
        away_elo = elo_dict.get(away, 1500)

        result = simulate_match(home_elo, away_elo, hfa)
        expected = 1 / (1 + 10 ** ((away_elo - home_elo + hfa) / 400))
        actual = 1 if result == "H" else 0 if result == "A" else 0.5
        delta = k * (actual - expected)
        elo_dict[home] += delta
        elo_dict[away] -= delta
        matches.append({"Home Team": home, "Away Team": away, "Result": result})
    return pd.DataFrame(matches), elo_dict

def compute_standings(matches):
    points = defaultdict(int)
    for _, row in matches.iterrows():
        if row["Result"] == "H":
            points[row["Home Team"]] += 3
        elif row["Result"] == "A":
            points[row["Away Team"]] += 3
        else:
            points[row["Home Team"]] += 1
            points[row["Away Team"]] += 1
    return pd.DataFrame(list(points.items()), columns=["Team", "Points"]).sort_values("Points", ascending=False)


In [236]:
# ------------------
# Simulate playoffs with detailed tracking of playoff advancement stages.
# Stages are defined as:
#  1: Reached Round 1
#  2: Reached Quarterfinals
#  3: Reached Semifinals
#  4: Reached Finals
#  5: Won Championship
# ------------------
def simulate_playoffs_detailed(mountain_top7, sound_top5, elo_dict, hfa=100):
    advancement = defaultdict(int)

    # If not enough teams, return empty results.
    if len(mountain_top7) < 7 or len(sound_top5) < 6:
        return [], None, advancement

    def play_game(t1, t2, stage, neutral=False):
        hfa_adj = 0 if neutral else hfa
        elo1 = elo_dict.get(t1, 1500)
        elo2 = elo_dict.get(t2, 1500)
        win_prob = 1 / (1 + 10 ** ((elo2 - elo1 + hfa_adj) / 400))
        winner = t1 if np.random.rand() < win_prob else t2
        # record the highest stage reached by the winner
        advancement[winner] = max(advancement[winner], stage)
        return winner

    # Round 1
    W1 = play_game(sound_top5[0], sound_top5[4], stage=1)
    W2 = play_game(mountain_top7[4], sound_top5[3], stage=1)
    W3 = play_game(sound_top5[1], mountain_top7[6], stage=1)
    W4 = play_game(mountain_top7[5], sound_top5[2], stage=1)

    # Quarterfinals
    W5 = play_game(W1, mountain_top7[0], stage=2)
    W6 = play_game(W2, mountain_top7[3], stage=2)
    W7 = play_game(W3, mountain_top7[1], stage=2)
    W8 = play_game(W4, mountain_top7[2], stage=2)

    state_qualifiers = [W5, W6, W7, W8]

    # Semifinals (played on a neutral field)
    S1 = play_game(W5, W6, stage=3, neutral=True)
    S2 = play_game(W7, W8, stage=3, neutral=True)

    # Finals (neutral field)
    final = play_game(S1, S2, stage=4, neutral=True)

    # Champion: mark the champion as reaching stage 5
    advancement[final] = 5

    return state_qualifiers, final, advancement

In [239]:
# ------------------
# Helper Function: Convert placement_stats to DataFrame
# ------------------
def placement_stats_to_df(placement_dict, division_name, n_sims, max_rank=7):
    """
    Convert nested placement stats into a DataFrame with percentage probabilities by seed rank.
    
    Args:
        placement_dict (dict): results["Placement"] from the simulation
        division_name (str): "Mountain" or "Sound"
        n_sims (int): total number of simulations
        max_rank (int): maximum rank to include (7 for Mountain, 6 for Sound)

    Returns:
        pd.DataFrame: table showing % chance of finishing in each seed
    """
    records = []
    division_stats = placement_dict.get(division_name, {})
    
    for team, ranks in division_stats.items():
        row = {"Team": team}
        for rank in range(1, max_rank + 1):
            row[f"Seed {rank} %"] = 100 * ranks.get(rank, 0) / n_sims
        records.append(row)
    
    df = pd.DataFrame(records)
    return df.sort_values("Seed 1 %", ascending=False).reset_index(drop=True)


In [250]:
# ------------------
# Monte Carlo Simulation with Detailed Division Placement & Playoff Advancement Tracking
# ------------------
def monte_carlo_simulation_rr_detailed(played_df, init_mountain, init_sound, class_df, elo_log_df, season="2024-25", n_sims=1000):
    # Overall counters for playoff, state berth, and championship appearances
    all_state, all_champs, all_playoffs = defaultdict(int), defaultdict(int), defaultdict(int)

    # Tracking division placement: for each division, record the number of simulations finishing at a given rank
    placement_stats = {
        "Mountain": defaultdict(lambda: defaultdict(int)),  # team -> rank -> count
        "Sound": defaultdict(lambda: defaultdict(int))
    }
    # Tracking playoff advancement: team -> stage (1: Round 1, 2: QF, 3: SF, 4: Finals, 5: Champion) -> count
    playoff_stages = defaultdict(lambda: defaultdict(int))

    base_elos = load_latest_elos_from_log(elo_log_df)
    # Get eligible teams from classification (e.g., 3A)
    eligible_teams = set(class_df[(class_df["Season"] == season) & (class_df["Classification"] == "3A")]["School"])

    # Generate the synthetic future schedule for both divisions
    future_df = pd.concat([
        generate_round_robin_schedule(init_mountain, played_df, season),
        generate_round_robin_schedule(init_sound, played_df, season)
    ], ignore_index=True)

    for sim in range(n_sims):
        if sim % max(1, n_sims // 10) == 0:
            print(f"🌀 Simulation {sim}/{n_sims}...")

        elo_dict = copy.deepcopy(base_elos)
        sim_matches, elo_dict = simulate_remaining_matches(future_df, elo_dict)

        # Combine already played matches with simulated matches
        all_matches = pd.concat([
            played_df.assign(Result=np.nan)[["Home Team", "Away Team", "Result"]],
            sim_matches
        ])

        # Separate matches by division
        m_matches = all_matches[
            all_matches["Home Team"].isin(init_mountain) & all_matches["Away Team"].isin(init_mountain)
        ]
        s_matches = all_matches[
            all_matches["Home Team"].isin(init_sound) & all_matches["Away Team"].isin(init_sound)
        ]

        m_stand = compute_standings(m_matches)
        s_stand = compute_standings(s_matches)

        m_top = m_stand[m_stand["Team"].isin(eligible_teams)].head(7)["Team"].tolist()
        s_top = s_stand[s_stand["Team"].isin(eligible_teams)].head(6)["Team"].tolist()

                # Track division placement: record each eligible team's finishing position
        for rank, row in enumerate(m_stand.itertuples(index=False), start=1):
            if row.Team in eligible_teams:
                placement_stats["Mountain"][row.Team][rank] += 1

        for rank, row in enumerate(s_stand.itertuples(index=False), start=1):
            if row.Team in eligible_teams:
                placement_stats["Sound"][row.Team][rank] += 1


        # Track playoff appearance (if a team is in the top lists)
        for team in m_top + s_top:
            all_playoffs[team] += 1

        # Simulate playoffs and track advancement
        state_teams, final_winner, advancement = simulate_playoffs_detailed(m_top, s_top, elo_dict)
        for team in state_teams:
            if team in eligible_teams:
                all_state[team] += 1
        if final_winner and final_winner in eligible_teams:
            all_champs[final_winner] += 1
        for team, stage in advancement.items():
            if team in eligible_teams:
                playoff_stages[team][stage] += 1

    # Helper function: convert a counter to a DataFrame with percentages
    def to_df(counter, label):
        df = pd.DataFrame(counter.items(), columns=["Team", label])
        df[label + " %"] = 100 * df[label] / n_sims
        return df.sort_values(label + " %", ascending=False).reset_index(drop=True)

    # Build DataFrame for playoff advancement rates
    def playoff_stage_df(playoff_stages, stage_labels):
        rows = []
        for team, stage_counts in playoff_stages.items():
            row = {"Team": team}
            for stage in range(1, 6):
                row[stage_labels[stage]] = 100 * stage_counts.get(stage, 0) / n_sims
            rows.append(row)
        return pd.DataFrame(rows).sort_values(stage_labels[1], ascending=False)



    # For overall rates
    results = {
        "Playoffs": to_df(all_playoffs, "Playoff Berths"),
        "State": to_df(all_state, "State Berths"),
        "Champion": to_df(all_champs, "Championships"),
        "Advancement": playoff_stage_df(playoff_stages, {
            1: "Round 1", 2: "Quarterfinals", 3: "Semifinals", 4: "Finals", 5: "Championship"
        })
    }
    
    # Also include the raw placement stats (which are nested dictionaries)
    results["Placement"] = placement_stats

    return results



In [252]:
# ------------------
# Step 4: Run Simulation and Display Results
# ------------------

# Normalize team names in ELO log before simulation
elo_log_df["Home Team"] = elo_log_df["Home Team"].apply(normalize_team_name)
elo_log_df["Away Team"] = elo_log_df["Away Team"].apply(normalize_team_name)
class_df["School"] = class_df["School"].apply(normalize_team_name)  # optional safety

# Define your divisions
init_mountain = [
    'Seattle Prep', 'Garfield', "O'Dea", 'Roosevelt',
    'Lincoln (Seattle)', 'Bishop Blanchet', 'Ballard', 'Lakeside (Seattle)'
]
init_sound = [
    'Nathan Hale', 'Chief Sealth', 'Cleveland', 'Franklin',
    'Rainier Beach', 'West Seattle', 'Ingraham', 'Eastside Catholic', 'Seattle Academy'
]

# Normalize team names if needed
init_mountain = [normalize_team_name(t) for t in init_mountain]
init_sound = [normalize_team_name(t) for t in init_sound]

# Split played vs. future matches
played_df = matches_df[
    (matches_df["Season"] == "2024-25") &
    matches_df["Home Score"].notna() & matches_df["Away Score"].notna()
]
future_df = matches_df[
    (matches_df["Season"] == "2024-25") &
    matches_df["Home Score"].isna() & matches_df["Away Score"].isna()
]

# Number of simulations to run
n_simulations = 100000

# Run the simulation
results = monte_carlo_simulation_rr_detailed(
    played_df, init_mountain, init_sound, class_df, elo_log_df,
    season="2024-25", n_sims=n_simulations
)

mountain_seeding = placement_stats_to_df(results["Placement"], "Mountain", n_simulations, max_rank=7)
sound_seeding = placement_stats_to_df(results["Placement"], "Sound", n_simulations, max_rank=6)

# Display results
print("🔼 Mountain Seeding Probabilities")
print(mountain_seeding)

print("\n🌊 Sound Seeding Probabilities")
print(sound_seeding)

print("\n🏅 State Berth Odds")
print(results["State"])

print("\n🏆 Championship Odds")
print(results["Champion"])

print("\n📈 Playoff Advancement Breakdown")
print(results["Advancement"])


🌀 Simulation 0/100000...
🌀 Simulation 10000/100000...
🌀 Simulation 20000/100000...
🌀 Simulation 30000/100000...
🌀 Simulation 40000/100000...
🌀 Simulation 50000/100000...
🌀 Simulation 60000/100000...
🌀 Simulation 70000/100000...
🌀 Simulation 80000/100000...
🌀 Simulation 90000/100000...
🔼 Mountain Seeding Probabilities
                 Team  Seed 1 %  Seed 2 %  Seed 3 %  Seed 4 %  Seed 5 %  \
0        Seattle Prep    12.387    33.593    17.144    11.937     8.640   
1     Bishop Blanchet     2.646    13.274    18.810    19.084    16.842   
2  Lakeside (Seattle)     2.626    11.546    16.048    16.307    15.879   
3               O'Dea     2.410    12.180    17.059    16.780    15.729   
4            Garfield     1.120     6.467    11.067    13.682    15.837   
5             Ballard     0.798     7.041    12.507    14.954    17.204   
6           Roosevelt     0.153     1.186     3.043     5.468     9.070   

   Seed 6 %  Seed 7 %  
0     6.782     5.442  
1    14.583     9.961  
2    14.