In [1]:
import zipfile
import os

# Define the path to the ZIP file and extraction directory
zip_file_path = r"C:\Users\jthun\Downloads\archive (2).zip"  # Add .zip extension if not included
extraction_dir = r"C:\Users\jthun\Downloads\march_madness_data"

# Create extraction directory if it doesn't exist
os.makedirs(extraction_dir, exist_ok=True)

# Extract the contents of the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

# List the extracted files
extracted_files = os.listdir(extraction_dir)
print("Extracted files:", extracted_files)

Extracted files: ['DEV _ March Madness.csv', 'INT _ KenPom _ Defense.csv', 'INT _ KenPom _ Efficiency.csv', 'INT _ KenPom _ Height.csv', 'INT _ KenPom _ Miscellaneous Team Stats.csv', 'INT _ KenPom _ Offense.csv', 'INT _ KenPom _ Point Distribution.csv', 'INT _ KenPom _ Summary.csv', 'REF _ Current NCAAM Coaches (2025).csv', 'REF _ NCAAM Conference and ESPN Team Name Mapping.csv', 'REF _ Post-Season Tournament Teams.csv', 'REF _ Top 12 in AP Top 25 During Week 6.csv']


In [2]:
import pandas as pd

In [3]:
# List all CSV files in the extraction directory
csv_files = [f for f in os.listdir(extraction_dir) if f.endswith('.csv')]

# Read all CSV files into a dictionary of DataFrames
dataframes = {file.replace(".csv", ""): pd.read_csv(os.path.join(extraction_dir, file)) for file in csv_files}

# Print the names of the loaded DataFrames
print("Loaded DataFrames:", list(dataframes.keys()))

Loaded DataFrames: ['DEV _ March Madness', 'INT _ KenPom _ Defense', 'INT _ KenPom _ Efficiency', 'INT _ KenPom _ Height', 'INT _ KenPom _ Miscellaneous Team Stats', 'INT _ KenPom _ Offense', 'INT _ KenPom _ Point Distribution', 'INT _ KenPom _ Summary', 'REF _ Current NCAAM Coaches (2025)', 'REF _ NCAAM Conference and ESPN Team Name Mapping', 'REF _ Post-Season Tournament Teams', 'REF _ Top 12 in AP Top 25 During Week 6']


In [4]:
march_madness_df = pd.read_csv(os.path.join(extraction_dir, "DEV _ March Madness.csv"))


In [5]:
# Filter for only teams in March Madness
tournament_teams_df = march_madness_df[
    march_madness_df["Post-Season Tournament"] == "March Madness"
]

# Reset index for clarity
tournament_teams_df.reset_index(drop=True, inplace=True)

# Confirm the number of teams
print(f"Total tournament teams: {tournament_teams_df.shape[0]}")

Total tournament teams: 1535


In [6]:
tournament_teams_2025_df = tournament_teams_df[tournament_teams_df["Season"] == 2025]

# Reset index for clarity
tournament_teams_2025_df.reset_index(drop=True, inplace=True)

tournament_teams_2025_df

Unnamed: 0,Season,Short Conference Name,Adjusted Temo,Adjusted Tempo Rank,Raw Tempo,Raw Tempo Rank,Adjusted Offensive Efficiency,Adjusted Offensive Efficiency Rank,Raw Offensive Efficiency,Raw Offensive Efficiency Rank,...,Active Coaching Length Index,Seed,Region,Post-Season Tournament,Post-Season Tournament Sorting Index,Vulnerable Top 2 Seed?,Tournament Winner?,Tournament Championship?,Final Four?,Top 12 in AP Top 25 During Week 6?
0,2025,ACC,65.7,266,66.6,256,128.0,3,124.3,1,...,3.0,1,East,March Madness,1,No,No,No,No,Yes
1,2025,SEC,69.6,59,70.8,44,128.6,1,120.4,5,...,3.0,1,West,March Madness,1,No,No,No,No,Yes
2,2025,B12,61.4,360,62.4,360,123.2,10,116.9,18,...,11.0,1,Midwest,March Madness,1,No,No,No,No,No
3,2025,SEC,67.6,153,68.9,135,128.4,2,121.0,4,...,11.0,1,South,March Madness,1,No,No,No,No,Yes
4,2025,SEC,63.6,346,65.5,308,120.3,18,114.1,40,...,10.0,2,Midwest,March Madness,1,No,No,No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,2025,OVC,65.8,261,67.2,227,101.9,255,104.6,214,...,6.0,16,Midwest,March Madness,1,No,No,No,No,No
64,2025,PL,63.5,349,63.7,357,102.1,252,104.6,215,...,2.0,16,East,March Madness,1,No,No,No,No,No
65,2025,MAAC,67.4,170,67.5,204,100.1,292,101.6,282,...,1.0,16,East,March Madness,1,No,No,No,No,No
66,2025,SWAC,67.7,142,69.1,120,101.4,270,104.0,228,...,3.0,16,South,March Madness,1,No,No,No,No,No


In [7]:
numeric_columns = [
    "Seed", "AdjEM", "Adjusted Offensive Efficiency Rank", "Adjusted Defensive Efficiency Rank",
    "Adjusted Tempo Rank", "Raw Offensive Efficiency Rank", "Raw Defensive Efficiency Rank",
    "eFGPct", "RankeFGPct", "TOPct", "RankTOPct", "ORPct", "RankORPct", "FTRate", "RankFTRate",
    "FG3Pct", "RankFG3Pct", "FG3Rate", "RankFG3Rate", "Def3PtFG", "RankDef3PtFG",
    "OppFG3Pct", "RankOppFG3Pct", "OppFG3Rate", "RankOppFG3Rate",
    "AvgHeight", "RankAvgHeight", "EffectiveHeight", "RankEffectiveHeight",
    "Experience", "RankExperience", "Bench", "BenchRank", "Net Rating", "Net Rating Rank",
    "Active Coaching Length", "Active Coaching Length Index", "CenterHeight", "PFHeight", "SFHeight",
    "SGHeight", "PGHeight", "BlockPct", "RankBlockPct", "OppBlockPct", "RankOppBlockPct",
    "CenterPts", "PFPts", "SFPts", "SGPts", "PGPts", "CenterOR", "PFOR", "SFOR", "SGOR", "PGOR",
    "CenterDR", "PFDR", "SFDR", "SGDR", "PGDR"
]

for col in numeric_columns:
    if col in tournament_teams_2025_df.columns:
        tournament_teams_2025_df[col] = pd.to_numeric(tournament_teams_2025_df[col], errors="coerce")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tournament_teams_2025_df[col] = pd.to_numeric(tournament_teams_2025_df[col], errors="coerce")


In [8]:
tournament_teams_2025_df = tournament_teams_2025_df.copy()


In [9]:
for col in numeric_columns:
    if col in tournament_teams_2025_df.columns:
        tournament_teams_2025_df.loc[:, col] = pd.to_numeric(tournament_teams_2025_df[col], errors="coerce")


In [10]:
tournament_teams_2025_df["Elite Team"] = (
    (tournament_teams_2025_df["AdjEM"] > 18) &
    (tournament_teams_2025_df["RankOE"] <= 15) &
    (tournament_teams_2025_df["RankDE"] <= 15)
)

tournament_teams_2025_df["Balanced Team"] = (
    (tournament_teams_2025_df["Adjusted Offensive Efficiency Rank"] <= 40) &
    (tournament_teams_2025_df["Adjusted Defensive Efficiency Rank"] <= 40)
)

tournament_teams_2025_df["Overperforming Team"] = (
    (tournament_teams_2025_df["Seed"] >= 8) & 
    (tournament_teams_2025_df["AdjEM"] >= 12)
)

tournament_teams_2025_df["Underperforming High Seed"] = (
    (tournament_teams_2025_df["Seed"] <= 4) &
    (tournament_teams_2025_df["AdjEM"] < 10)
)


In [11]:
tournament_teams_2025_df["Fast-Paced Team"] = (tournament_teams_2025_df["RankAdjTempo"] <= 50)

tournament_teams_2025_df["Slow & Methodical Team"] = (tournament_teams_2025_df["RankAdjTempo"] >= 300)

tournament_teams_2025_df["Elite Defense Team"] = (
    (tournament_teams_2025_df["RankDE"] <= 20) &  
    (tournament_teams_2025_df["RankAdjTempo"] >= 250)
)

tournament_teams_2025_df["Turnover-Creating Team"] = (tournament_teams_2025_df["RankStlRate"] <= 30)


In [12]:
tournament_teams_2025_df["Three-Point Heavy Team"] = (
    (tournament_teams_2025_df["FG3Rate"] > 40) & 
    (tournament_teams_2025_df["FG3Pct"] > 37)
)

tournament_teams_2025_df["Inside Scoring Team"] = (tournament_teams_2025_df["FG2Pct"] >= 55)

tournament_teams_2025_df["Free Throw Reliant Team"] = (tournament_teams_2025_df["FTRate"] >= 40)

tournament_teams_2025_df["Poor Free Throw Team"] = (tournament_teams_2025_df["FTRate"] < 25)


In [13]:
tournament_teams_2025_df["Elite 3PT Defense"] = (tournament_teams_2025_df["Def3PtFG"] < 30)

tournament_teams_2025_df["Elite Interior Defense"] = (
    (tournament_teams_2025_df["RankDef2PtFG"] <= 20) & 
    (tournament_teams_2025_df["RankBlockPct"] <= 30)
)

tournament_teams_2025_df["Weak 3PT Defense"] = (tournament_teams_2025_df["Def3PtFG"] > 35)

tournament_teams_2025_df["Weak Interior Defense"] = (
    (tournament_teams_2025_df["RankDef2PtFG"] >= 250) & 
    (tournament_teams_2025_df["RankBlockPct"] >= 250)
)


In [14]:
tournament_teams_2025_df["Dominant Rebounding Team"] = (
    (tournament_teams_2025_df["ORPct"] >= 35))

tournament_teams_2025_df["Tallest Team"] = (tournament_teams_2025_df["AvgHeight"] >= 78)

tournament_teams_2025_df["Small Ball Team"] = (tournament_teams_2025_df["AvgHeight"] <= 75)

tournament_teams_2025_df["Elite Shot-Blocking Team"] = (tournament_teams_2025_df["BlockPct"] >= 12)


In [15]:
# Create a total defensive rebounding metric
tournament_teams_2025_df.loc[:, "Total Defensive Rebounding"] = (
    tournament_teams_2025_df["CenterDR"] +
    tournament_teams_2025_df["PFDR"] +
    tournament_teams_2025_df["SFDR"] +
    tournament_teams_2025_df["SGDR"] +
    tournament_teams_2025_df["PGDR"]
)

# Identify teams that are elite at defensive rebounding
tournament_teams_2025_df.loc[:, "Elite Defensive Rebounding Team"] = (
    tournament_teams_2025_df["Total Defensive Rebounding"] > tournament_teams_2025_df["Total Defensive Rebounding"].median()
)


In [16]:
tournament_teams_2025_df["Experienced Team"] = (
    (tournament_teams_2025_df["RankExperience"] <= 50) &  
    (tournament_teams_2025_df["BenchRank"] <= 50)
)

tournament_teams_2025_df["Elite Coaching Team"] = (
    (tournament_teams_2025_df["Active Coaching Length Index"] >= 10)
)

tournament_teams_2025_df["Deep Bench Team"] = (tournament_teams_2025_df["BenchRank"] <= 50)


In [17]:
tournament_teams_2025_df["Momentum Team"] = (tournament_teams_2025_df["Top 12 in AP Top 25 During Week 6?"] == "Yes")

tournament_teams_2025_df["Vulnerable Top Seed"] = (
    (tournament_teams_2025_df["Seed"] <= 2) & 
    (tournament_teams_2025_df["Vulnerable Top 2 Seed?"] == "Yes")
)

tournament_teams_2025_df["Undervalued Lower Seed"] = (
    (tournament_teams_2025_df["Seed"] >= 10) & 
    (tournament_teams_2025_df["AdjEM"] >= 10)
)


In [18]:
def predict_matchup(team1, team2, df):
    """
    Compares two teams and predicts a winner based on deep analytical selection criteria.

    Parameters:
    - team1 (str): Name of the first team (as in "Mapped ESPN Team Name").
    - team2 (str): Name of the second team.
    - df (DataFrame): The tournament dataset containing team stats.

    Returns:
    - str: The predicted winner.
    """

    # Ensure both teams exist in the dataset
    if team1 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team1} not found in dataset"
    if team2 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team2} not found in dataset"

    # Extract team data
    t1 = df[df["Mapped ESPN Team Name"] == team1].iloc[0]
    t2 = df[df["Mapped ESPN Team Name"] == team2].iloc[0]

    # Base Case: Higher seed wins unless overridden
    favored_team, underdog = (t1, t2) if t1["Seed"] < t2["Seed"] else (t2, t1)

    # 🔹 Efficiency & Strength Factors
    if underdog["AdjEM"] > favored_team["AdjEM"] + 3:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Elite Team"]:
        return favored_team["Mapped ESPN Team Name"]
    elif underdog["Elite Team"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Balanced Team"]:
        return favored_team["Mapped ESPN Team Name"]
    elif underdog["Balanced Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Playstyle Matchups
    if favored_team["Elite Defense Team"] and underdog["Fast-Paced Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Elite Defense Team"] and favored_team["Fast-Paced Team"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Slow & Methodical Team"] and underdog["Fast-Paced Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Slow & Methodical Team"] and favored_team["Fast-Paced Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Turnover Creation
    if favored_team["Turnover-Creating Team"] and not underdog["Turnover-Creating Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Turnover-Creating Team"] and not favored_team["Turnover-Creating Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Rebounding & Size Adjustments
    if favored_team["Elite Defensive Rebounding Team"] and not underdog["Elite Defensive Rebounding Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Elite Defensive Rebounding Team"] and not favored_team["Elite Defensive Rebounding Team"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Dominant Rebounding Team"] and not underdog["Dominant Rebounding Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Dominant Rebounding Team"] and not favored_team["Dominant Rebounding Team"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Tallest Team"] and underdog["Small Ball Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Tallest Team"] and favored_team["Small Ball Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Shooting & Matchups
    if underdog["Three-Point Heavy Team"] and favored_team["Elite 3PT Defense"]:
        return favored_team["Mapped ESPN Team Name"]
    if favored_team["Three-Point Heavy Team"] and underdog["Elite 3PT Defense"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Inside Scoring Team"] and not underdog["Elite Interior Defense"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Inside Scoring Team"] and not favored_team["Elite Interior Defense"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Free Throw Reliant Team"] and not underdog["Poor Free Throw Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Free Throw Reliant Team"] and not favored_team["Poor Free Throw Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Defensive Weakness Exploitation
    if favored_team["Elite 3PT Defense"] and underdog["Weak 3PT Defense"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Elite 3PT Defense"] and favored_team["Weak 3PT Defense"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Elite Interior Defense"] and underdog["Weak Interior Defense"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Elite Interior Defense"] and favored_team["Weak Interior Defense"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Shot-Blocking
    if favored_team["Elite Shot-Blocking Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Elite Shot-Blocking Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Coaching & Experience
    if favored_team["Elite Coaching Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Elite Coaching Team"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Experienced Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Experienced Team"]:
        return underdog["Mapped ESPN Team Name"]

    if favored_team["Deep Bench Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Deep Bench Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Upset Potential & Vulnerability
    if underdog["Undervalued Lower Seed"]:
        return underdog["Mapped ESPN Team Name"]
    if favored_team["Vulnerable Top Seed"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Momentum
    if favored_team["Momentum Team"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Momentum Team"]:
        return underdog["Mapped ESPN Team Name"]

    # 🔹 Overperforming vs. Underperforming Teams
    if favored_team["Overperforming Team"] and underdog["Underperforming High Seed"]:
        return favored_team["Mapped ESPN Team Name"]
    if underdog["Overperforming Team"] and favored_team["Underperforming High Seed"]:
        return underdog["Mapped ESPN Team Name"]

    # Default to higher-seeded team if all else is even
    return favored_team["Mapped ESPN Team Name"]


In [19]:
team1 = "UCLA"
team2 = "Utah State"

winner = predict_matchup(team1, team2, tournament_teams_2025_df)
print(f"Predicted Winner: {winner}")

Predicted Winner: UCLA


In [20]:
def simulate_bracket_standard(df):
    """
    Simulates a full NCAA 64-team bracket using standard seeding matchups.
    Assumes each region (e.g., East, West, South, Midwest) has exactly 16 teams.
    
    Parameters:
    - df (DataFrame): The tournament dataset with all necessary columns.
    
    Returns:
    - dict: A dictionary containing results for each round.
    """
    round_results = {}  # To store matchup outcomes for each round
    regions = df["Region"].unique()
    bracket = {}
    
    # Set up the first-round matchups for each region using standard seeding
    for region in regions:
        region_df = df[df["Region"] == region].sort_values("Seed")
        # Create a mapping of seed -> team name
        seed_to_team = {int(row["Seed"]): row["Mapped ESPN Team Name"] for _, row in region_df.iterrows()}
        
        # Check if the region has 16 teams
        if len(seed_to_team) != 16:
            return f"Error: Region {region} does not have 16 teams. Found: {len(seed_to_team)}"
        
        # Predetermined first-round matchups
        matchups = [
            (seed_to_team[1], seed_to_team[16]),
            (seed_to_team[8], seed_to_team[9]),
            (seed_to_team[5], seed_to_team[12]),
            (seed_to_team[4], seed_to_team[13]),
            (seed_to_team[6], seed_to_team[11]),
            (seed_to_team[3], seed_to_team[14]),
            (seed_to_team[7], seed_to_team[10]),
            (seed_to_team[2], seed_to_team[15])
        ]
        bracket[region] = matchups
    
    # Initialize round_results for Round of 64
    round_results["Round of 64"] = {}
    region_winners = {}
    
    # Process Round of 64 for each region
    for region in regions:
        winners = []
        for team1, team2 in bracket[region]:
            winner = predict_matchup(team1, team2, df)
            round_results["Round of 64"][f"{region}: {team1} vs {team2}"] = winner
            winners.append(winner)
        region_winners[region] = winners  # This should be 8 winners per region
    
    # Now simulate subsequent rounds within each region.
    # We'll create a helper function to simulate a single region until one team remains.
    def simulate_region(winners_list, round_name, region):
        matchups = [(winners_list[i], winners_list[i+1]) for i in range(0, len(winners_list), 2)]
        round_results[round_name] = round_results.get(round_name, {})
        new_winners = []
        for team1, team2 in matchups:
            winner = predict_matchup(team1, team2, df)
            round_results[round_name][f"{region}: {team1} vs {team2}"] = winner
            new_winners.append(winner)
        return new_winners

    # Round of 32
    for region in regions:
        region_winners[region] = simulate_region(region_winners[region], "Round of 32", region)
    # Sweet 16
    for region in regions:
        region_winners[region] = simulate_region(region_winners[region], "Sweet 16", region)
    # Elite 8 (Regional Finals) - now each region should have 1 winner
    final_four = {}
    for region in regions:
        winners = simulate_region(region_winners[region], "Elite 8", region)
        if len(winners) != 1:
            return f"Error: Region {region} did not produce a single winner in Elite 8."
        final_four[region] = winners[0]
    
    # Final Four: Combine the 4 region winners. Assume region order: East, West, South, Midwest.
    final_four_order = sorted(final_four.keys())  # or specify order explicitly
    final_four_teams = [final_four[region] for region in final_four_order]
    
    round_results["Final Four"] = {}
    ff_matchup1 = (final_four_teams[0], final_four_teams[1])
    ff_matchup2 = (final_four_teams[2], final_four_teams[3])
    winner_ff1 = predict_matchup(ff_matchup1[0], ff_matchup1[1], df)
    winner_ff2 = predict_matchup(ff_matchup2[0], ff_matchup2[1], df)
    round_results["Final Four"][f"{ff_matchup1[0]} vs {ff_matchup1[1]}"] = winner_ff1
    round_results["Final Four"][f"{ff_matchup2[0]} vs {ff_matchup2[1]}"] = winner_ff2
    
    # Championship
    round_results["Championship"] = {}
    champ_matchup = (winner_ff1, winner_ff2)
    champion = predict_matchup(champ_matchup[0], champ_matchup[1], df)
    round_results["Championship"][f"{champ_matchup[0]} vs {champ_matchup[1]}"] = champion

    return round_results

# Run the simulation
bracket_results = simulate_bracket_standard(tournament_teams_2025_df)

# Display the results:
print("\n🏆 March Madness Bracket Simulation Results 🏆")
for round_name, matchups in bracket_results.items():
    print(f"\n🔹 {round_name}")
    if isinstance(matchups, dict):
        for matchup, winner in matchups.items():
            print(f"{matchup} → 🏀 {winner}")
    else:
        print(matchups)



🏆 March Madness Bracket Simulation Results 🏆

🔹 Round of 64
East: Duke vs Mount St. Mary's → 🏀 Duke
East: Mississippi State vs Baylor → 🏀 Mississippi State
East: Oregon vs Liberty → 🏀 Oregon
East: Arizona vs Akron → 🏀 Arizona
East: BYU vs VCU → 🏀 VCU
East: Wisconsin vs Montana → 🏀 Wisconsin
East: Saint Mary's vs Vanderbilt → 🏀 Vanderbilt
East: Alabama vs Robert Morris → 🏀 Alabama
West: Florida vs Norfolk State → 🏀 Florida
West: UConn vs Oklahoma → 🏀 UConn
West: Memphis vs Colorado State → 🏀 Colorado State
West: Maryland vs Grand Canyon → 🏀 Maryland
West: Missouri vs Drake → 🏀 Drake
West: Texas Tech vs UNC Wilmington → 🏀 Texas Tech
West: Kansas vs Arkansas → 🏀 Kansas
West: St. John's vs Omaha → 🏀 St. John's
Midwest: Houston vs SIU Edwardsville → 🏀 Houston
Midwest: Gonzaga vs Georgia → 🏀 Gonzaga
Midwest: Clemson vs McNeese → 🏀 Clemson
Midwest: Purdue vs High Point → 🏀 Purdue
Midwest: Illinois vs Xavier → 🏀 Illinois
Midwest: Kentucky vs Troy → 🏀 Troy
Midwest: UCLA vs Utah State → 🏀 UCLA


In [21]:
def predict_matchup_weighted(team1, team2, df):
    """
    Compares two teams and predicts a winner based on a weighted scoring system
    that incorporates deep analytical selection criteria.

    Parameters:
    - team1 (str): Name of the first team (as in "Mapped ESPN Team Name").
    - team2 (str): Name of the second team.
    - df (DataFrame): The tournament dataset containing team stats.

    Returns:
    - str: The predicted winner.
    """
    # Ensure both teams exist in the dataset
    if team1 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team1} not found in dataset"
    if team2 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team2} not found in dataset"
    
    # Extract team data
    t1 = df[df["Mapped ESPN Team Name"] == team1].iloc[0]
    t2 = df[df["Mapped ESPN Team Name"] == team2].iloc[0]
    
    # Base: Favor the higher seed slightly
    # (Lower Seed value means better seed)
    base_weight = 2.0
    score1 = base_weight * (t2["Seed"] - t1["Seed"])  # positive if t1 is higher seeded
    score2 = base_weight * (t1["Seed"] - t2["Seed"])
    
    # Define weights for each category (these can be tuned)
    weights = {
        "efficiency": 1.0,
        "playstyle": 0.8,
        "rebounding": 0.7,
        "shooting": 0.8,
        "coaching": 0.5,
        "momentum": 0.6,
        "upset": 1.0
    }
    
    # Efficiency & Overall Strength (using AdjEM)
    med_eff = df["AdjEM"].median()  # Use median as baseline
    score1 += weights["efficiency"] * (t1["AdjEM"] - med_eff)
    score2 += weights["efficiency"] * (t2["AdjEM"] - med_eff)
    
    # Bonus for being flagged as Elite Team
    if t1.get("Elite Team", False):
        score1 += 3
    if t2.get("Elite Team", False):
        score2 += 3

    # Bonus for being a Balanced Team
    if t1.get("Balanced Team", False):
        score1 += 2
    if t2.get("Balanced Team", False):
        score2 += 2
    
    # Playstyle: Fast-Paced vs. Slow & Methodical & Elite Defense
    if t1.get("Fast-Paced Team", False):
        score1 += weights["playstyle"] * 1.5
    if t2.get("Fast-Paced Team", False):
        score2 += weights["playstyle"] * 1.5
    
    if t1.get("Slow & Methodical Team", False):
        score1 += weights["playstyle"] * 1.0
    if t2.get("Slow & Methodical Team", False):
        score2 += weights["playstyle"] * 1.0

    if t1.get("Elite Defense Team", False):
        score1 += weights["playstyle"] * 2
    if t2.get("Elite Defense Team", False):
        score2 += weights["playstyle"] * 2

    # Rebounding & Size (using ORPct and Total Defensive Rebounding)
    if t1.get("Dominant Rebounding Team", False):
        score1 += weights["rebounding"] * 2
    if t2.get("Dominant Rebounding Team", False):
        score2 += weights["rebounding"] * 2

    if t1.get("Tallest Team", False):
        score1 += weights["rebounding"] * 1.5
    if t2.get("Tallest Team", False):
        score2 += weights["rebounding"] * 1.5

    # Shooting & Scoring
    if t1.get("Three-Point Heavy Team", False):
        score1 += weights["shooting"] * 2
    if t2.get("Three-Point Heavy Team", False):
        score2 += weights["shooting"] * 2

    if t1.get("Inside Scoring Team", False):
        score1 += weights["shooting"] * 1.5
    if t2.get("Inside Scoring Team", False):
        score2 += weights["shooting"] * 1.5

    # Defensive shooting: if a team has Elite 3PT Defense against a Three-Point Heavy opponent
    if t1.get("Elite 3PT Defense", False) and t2.get("Three-Point Heavy Team", False):
        score1 += weights["shooting"] * 1.5
    if t2.get("Elite 3PT Defense", False) and t1.get("Three-Point Heavy Team", False):
        score2 += weights["shooting"] * 1.5

    # Coaching & Experience
    if t1.get("Elite Coaching Team", False):
        score1 += weights["coaching"] * 2
    if t2.get("Elite Coaching Team", False):
        score2 += weights["coaching"] * 2

    if t1.get("Experienced Team", False):
        score1 += weights["coaching"] * 1.5
    if t2.get("Experienced Team", False):
        score2 += weights["coaching"] * 1.5

    if t1.get("Deep Bench Team", False):
        score1 += weights["coaching"] * 1.0
    if t2.get("Deep Bench Team", False):
        score2 += weights["coaching"] * 1.0

    # Upset Potential & Vulnerability
    if t1.get("Undervalued Lower Seed", False):
        score1 += weights["upset"] * 2
    if t2.get("Undervalued Lower Seed", False):
        score2 += weights["upset"] * 2

    if t1.get("Vulnerable Top Seed", False):
        score2 += weights["upset"] * 2  # Opponent gets bonus
    if t2.get("Vulnerable Top Seed", False):
        score1 += weights["upset"] * 2

    # Momentum
    if t1.get("Momentum Team", False):
        score1 += weights["momentum"] * 2
    if t2.get("Momentum Team", False):
        score2 += weights["momentum"] * 2

    # Debug: Uncomment to see computed scores
    # print(f"{team1} score: {score1}, {team2} score: {score2}")

    # Final decision based on the weighted score
    if score1 > score2:
        return team1
    else:
        return team2


In [22]:
# 1. Ensure your 2025 tournament data is prepared
# (Assume tournament_teams_2025_df has been cleaned, play-in teams removed, and selection criteria computed)

# 2. Define your weighted predictor function (as defined in the previous step)
# (e.g., predict_matchup_weighted or predict_matchup_weighted_hybrid)

# 3. Define a bracket simulation function that uses the weighted predictor:
def simulate_bracket_standard_weighted(df):
    """
    Simulates a full NCAA 64-team bracket using standard seeding matchups and the weighted matchup function.
    Assumes each region (East, West, South, Midwest) has exactly 16 teams.
    
    Parameters:
    - df (DataFrame): The tournament dataset with all necessary columns.
    
    Returns:
    - dict: A dictionary containing results for each round.
    """
    round_results = {}  # To store matchup outcomes for each round
    regions = df["Region"].unique()
    bracket = {}

    # Set up first-round matchups for each region using standard seeding
    for region in regions:
        region_df = df[df["Region"] == region].sort_values("Seed")
        seed_to_team = {int(row["Seed"]): row["Mapped ESPN Team Name"] for _, row in region_df.iterrows()}
        if len(seed_to_team) != 16:
            return f"Error: Region {region} does not have 16 teams. Found: {len(seed_to_team)}"
        matchups = [
            (seed_to_team[1], seed_to_team[16]),
            (seed_to_team[8], seed_to_team[9]),
            (seed_to_team[5], seed_to_team[12]),
            (seed_to_team[4], seed_to_team[13]),
            (seed_to_team[6], seed_to_team[11]),
            (seed_to_team[3], seed_to_team[14]),
            (seed_to_team[7], seed_to_team[10]),
            (seed_to_team[2], seed_to_team[15])
        ]
        bracket[region] = matchups

    # Round of 64
    round_results["Round of 64"] = {}
    region_winners = {}
    for region in regions:
        winners = []
        for team1, team2 in bracket[region]:
            winner = predict_matchup_weighted(team1, team2, df)
            round_results["Round of 64"][f"{region}: {team1} vs {team2}"] = winner
            winners.append(winner)
        region_winners[region] = winners  # 8 winners per region

    # Helper function to simulate a region's subsequent round
    def simulate_region(winners_list, round_name, region):
        matchups = [(winners_list[i], winners_list[i+1]) for i in range(0, len(winners_list), 2)]
        round_results[round_name] = round_results.get(round_name, {})
        new_winners = []
        for team1, team2 in matchups:
            winner = predict_matchup_weighted(team1, team2, df)
            round_results[round_name][f"{region}: {team1} vs {team2}"] = winner
            new_winners.append(winner)
        return new_winners

    # Round of 32, Sweet 16, and Elite 8
    for round_name in ["Round of 32", "Sweet 16", "Elite 8"]:
        for region in regions:
            region_winners[region] = simulate_region(region_winners[region], round_name, region)
    
    # After Elite 8, each region should have 1 winner
    final_four = {region: region_winners[region][0] for region in regions}
    final_four_order = sorted(final_four.keys())  # or specify an explicit order
    final_four_teams = [final_four[region] for region in final_four_order]

    round_results["Final Four"] = {}
    ff_matchup1 = (final_four_teams[0], final_four_teams[1])
    ff_matchup2 = (final_four_teams[2], final_four_teams[3])
    winner_ff1 = predict_matchup_weighted(ff_matchup1[0], ff_matchup1[1], df)
    winner_ff2 = predict_matchup_weighted(ff_matchup2[0], ff_matchup2[1], df)
    round_results["Final Four"][f"{ff_matchup1[0]} vs {ff_matchup1[1]}"] = winner_ff1
    round_results["Final Four"][f"{ff_matchup2[0]} vs {ff_matchup2[1]}"] = winner_ff2

    round_results["Championship"] = {}
    champ_matchup = (winner_ff1, winner_ff2)
    champion = predict_matchup_weighted(champ_matchup[0], champ_matchup[1], df)
    round_results["Championship"][f"{champ_matchup[0]} vs {champ_matchup[1]}"] = champion

    return round_results

# 4. Run the full simulation for this year's tournament:
bracket_results = simulate_bracket_standard_weighted(tournament_teams_2025_df)

# 5. Display the results:
print("\n🏆 March Madness Bracket Simulation Results (Weighted) 🏆")
for round_name, matchups in bracket_results.items():
    print(f"\n🔹 {round_name}")
    if isinstance(matchups, dict):
        for matchup, winner in matchups.items():
            print(f"{matchup} → 🏀 {winner}")
    else:
        print(matchups)



🏆 March Madness Bracket Simulation Results (Weighted) 🏆

🔹 Round of 64
East: Duke vs Mount St. Mary's → 🏀 Duke
East: Mississippi State vs Baylor → 🏀 Mississippi State
East: Oregon vs Liberty → 🏀 Oregon
East: Arizona vs Akron → 🏀 Arizona
East: BYU vs VCU → 🏀 BYU
East: Wisconsin vs Montana → 🏀 Wisconsin
East: Saint Mary's vs Vanderbilt → 🏀 Saint Mary's
East: Alabama vs Robert Morris → 🏀 Alabama
West: Florida vs Norfolk State → 🏀 Florida
West: UConn vs Oklahoma → 🏀 UConn
West: Memphis vs Colorado State → 🏀 Memphis
West: Maryland vs Grand Canyon → 🏀 Maryland
West: Missouri vs Drake → 🏀 Missouri
West: Texas Tech vs UNC Wilmington → 🏀 Texas Tech
West: Kansas vs Arkansas → 🏀 Kansas
West: St. John's vs Omaha → 🏀 St. John's
Midwest: Houston vs SIU Edwardsville → 🏀 Houston
Midwest: Gonzaga vs Georgia → 🏀 Gonzaga
Midwest: Clemson vs McNeese → 🏀 Clemson
Midwest: Purdue vs High Point → 🏀 Purdue
Midwest: Illinois vs Xavier → 🏀 Illinois
Midwest: Kentucky vs Troy → 🏀 Kentucky
Midwest: UCLA vs Utah St

In [23]:
def predict_matchup_top_features(team1, team2, df):
    """
    Predicts the winner of a matchup using a composite score computed from the most
    important features identified from historical data.
    
    Parameters:
      - team1 (str): The team name (as in "Mapped ESPN Team Name") for the first team.
      - team2 (str): The team name for the second team.
      - df (DataFrame): The tournament dataset.
      
    Returns:
      - str: The predicted winner.
    """
    # Ensure both teams exist
    if team1 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team1} not found."
    if team2 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team2} not found."
    
    # Extract team data
    t1 = df[df["Mapped ESPN Team Name"] == team1].iloc[0]
    t2 = df[df["Mapped ESPN Team Name"] == team2].iloc[0]
    
    # Compute composite scores for each team.
    def compute_score(team):
        score = 0
        # Higher is better
        score += 1.5 * team["AdjEM"]
        score += 0.8 * team["ORPct"]
        score += 0.6 * team["CenterDR"]
        # Lower is better (subtract the weighted value)
        score -= 1.0 * team["Net Rating Rank"]
        score -= 0.3 * team["Seed"]
        score -= 1.0 * team["Adjusted Defensive Efficiency Rank"]
        score -= 1.0 * team["Adjusted Offensive Efficiency Rank"]
        score -= 0.8 * team["Raw Offensive Efficiency Rank"]
        score -= 0.8 * team["Raw Defensive Efficiency Rank"]
        return score

    score1 = compute_score(t1)
    score2 = compute_score(t2)
    
    # Debug: Uncomment to see scores during testing.
    # print(f"{team1} score: {score1}, {team2} score: {score2}")
    
    return team1 if score1 > score2 else team2

# Example test:
# winner_example = predict_matchup_top_features("Duke", "Michigan State", tournament_teams_2025_df)
# print(f"Predicted Winner: {winner_example}")


In [24]:
def simulate_bracket_top_features(df):
    """
    Simulates a full NCAA 64-team bracket using standard seeding matchups,
    with match outcomes determined by the weighted predictor using only the most
    important features.
    
    Assumes each region (e.g., East, West, South, Midwest) has exactly 16 teams.
    
    Parameters:
      - df (DataFrame): The tournament dataset with all necessary columns.
      
    Returns:
      - dict: A dictionary containing results for each round.
    """
    round_results = {}  # To store matchup outcomes for each round
    regions = df["Region"].unique()
    bracket = {}
    
    # Set up the first-round matchups for each region using standard seeding
    for region in regions:
        region_df = df[df["Region"] == region].sort_values("Seed")
        # Map seed to team
        seed_to_team = {int(row["Seed"]): row["Mapped ESPN Team Name"] for _, row in region_df.iterrows()}
        if len(seed_to_team) != 16:
            return f"Error: Region {region} does not have 16 teams. Found: {len(seed_to_team)}"
        
        # Standard matchups: 1 vs 16, 8 vs 9, 5 vs 12, 4 vs 13, 6 vs 11, 3 vs 14, 7 vs 10, 2 vs 15
        matchups = [
            (seed_to_team[1], seed_to_team[16]),
            (seed_to_team[8], seed_to_team[9]),
            (seed_to_team[5], seed_to_team[12]),
            (seed_to_team[4], seed_to_team[13]),
            (seed_to_team[6], seed_to_team[11]),
            (seed_to_team[3], seed_to_team[14]),
            (seed_to_team[7], seed_to_team[10]),
            (seed_to_team[2], seed_to_team[15])
        ]
        bracket[region] = matchups
    
    # Round of 64
    round_results["Round of 64"] = {}
    region_winners = {}
    for region in regions:
        winners = []
        for team1, team2 in bracket[region]:
            winner = predict_matchup_top_features(team1, team2, df)
            round_results["Round of 64"][f"{region}: {team1} vs {team2}"] = winner
            winners.append(winner)
        region_winners[region] = winners  # 8 winners per region
    
    # Helper function to simulate a round within a region
    def simulate_region(winners_list, round_name, region):
        matchups = [(winners_list[i], winners_list[i+1]) for i in range(0, len(winners_list), 2)]
        round_results[round_name] = round_results.get(round_name, {})
        new_winners = []
        for team1, team2 in matchups:
            winner = predict_matchup_top_features(team1, team2, df)
            round_results[round_name][f"{region}: {team1} vs {team2}"] = winner
            new_winners.append(winner)
        return new_winners
    
    # Round of 32
    for region in regions:
        region_winners[region] = simulate_region(region_winners[region], "Round of 32", region)
    # Sweet 16
    for region in regions:
        region_winners[region] = simulate_region(region_winners[region], "Sweet 16", region)
    # Elite 8 (Region Finals) - each region should have 1 winner now
    final_four = {}
    for region in regions:
        winners = simulate_region(region_winners[region], "Elite 8", region)
        if len(winners) != 1:
            return f"Error: Region {region} did not produce a single winner in Elite 8."
        final_four[region] = winners[0]
    
    # Final Four: Combine the 4 region winners.
    # Here, we assume regions can be sorted alphabetically or by a pre-defined order.
    final_four_order = sorted(final_four.keys())
    final_four_teams = [final_four[region] for region in final_four_order]
    
    round_results["Final Four"] = {}
    ff_matchup1 = (final_four_teams[0], final_four_teams[1])
    ff_matchup2 = (final_four_teams[2], final_four_teams[3])
    winner_ff1 = predict_matchup_top_features(ff_matchup1[0], ff_matchup1[1], df)
    winner_ff2 = predict_matchup_top_features(ff_matchup2[0], ff_matchup2[1], df)
    round_results["Final Four"][f"{ff_matchup1[0]} vs {ff_matchup1[1]}"] = winner_ff1
    round_results["Final Four"][f"{ff_matchup2[0]} vs {ff_matchup2[1]}"] = winner_ff2
    
    # Championship: Final matchup between the two Final Four winners
    round_results["Championship"] = {}
    champ_matchup = (winner_ff1, winner_ff2)
    champion = predict_matchup_top_features(champ_matchup[0], champ_matchup[1], df)
    round_results["Championship"][f"{champ_matchup[0]} vs {champ_matchup[1]}"] = champion
    
    return round_results

# Run the full bracket simulation using only the top features predictor
bracket_results_top = simulate_bracket_top_features(tournament_teams_2025_df)

# Display the results:
print("\n🏆 March Madness Bracket Simulation Results (Top Features Only) 🏆")
for round_name, matchups in bracket_results_top.items():
    print(f"\n🔹 {round_name}")
    if isinstance(matchups, dict):
        for matchup, winner in matchups.items():
            print(f"{matchup} → 🏀 {winner}")
    else:
        print(matchups)



🏆 March Madness Bracket Simulation Results (Top Features Only) 🏆

🔹 Round of 64
East: Duke vs Mount St. Mary's → 🏀 Duke
East: Mississippi State vs Baylor → 🏀 Baylor
East: Oregon vs Liberty → 🏀 Oregon
East: Arizona vs Akron → 🏀 Arizona
East: BYU vs VCU → 🏀 BYU
East: Wisconsin vs Montana → 🏀 Wisconsin
East: Saint Mary's vs Vanderbilt → 🏀 Saint Mary's
East: Alabama vs Robert Morris → 🏀 Alabama
West: Florida vs Norfolk State → 🏀 Florida
West: UConn vs Oklahoma → 🏀 UConn
West: Memphis vs Colorado State → 🏀 Colorado State
West: Maryland vs Grand Canyon → 🏀 Maryland
West: Missouri vs Drake → 🏀 Missouri
West: Texas Tech vs UNC Wilmington → 🏀 Texas Tech
West: Kansas vs Arkansas → 🏀 Kansas
West: St. John's vs Omaha → 🏀 St. John's
Midwest: Houston vs SIU Edwardsville → 🏀 Houston
Midwest: Gonzaga vs Georgia → 🏀 Gonzaga
Midwest: Clemson vs McNeese → 🏀 Clemson
Midwest: Purdue vs High Point → 🏀 Purdue
Midwest: Illinois vs Xavier → 🏀 Illinois
Midwest: Kentucky vs Troy → 🏀 Kentucky
Midwest: UCLA vs Ut

In [25]:
def construct_matchup_dataset(df):
    """
    Constructs a dataset for pairwise matchups from historical tournament data.
    This example uses first-round matchups for each region.
    
    Parameters:
    - df (DataFrame): Historical tournament data (e.g. March_madness_df from 2002 to 2025)
    
    Returns:
    - DataFrame: A new DataFrame where each row represents a matchup with feature differences and a target outcome.
    """
    matchup_rows = []
    seasons = df['Season'].unique()
    
    # Standard first-round seed matchups:
    standard_matchups = [(1, 16), (8, 9), (5, 12), (4, 13), (6, 11), (3, 14), (7, 10), (2, 15)]
    
    # Define the top features we are interested in:
    top_features = [
        "AdjEM",
        "Net Rating",         # Assumes this column exists
        "Net Rating Rank",    # Assumes this column exists
        "Seed",
        "Adjusted Defensive Efficiency Rank",
        "Adjusted Offensive Efficiency Rank",
        "Raw Offensive Efficiency Rank",
        "Raw Defensive Efficiency Rank",
        "ORPct",
        "CenterDR"
    ]
    
    for season in seasons:
        season_df = df[df['Season'] == season]
        regions = season_df['Region'].unique()
        for region in regions:
            region_df = season_df[season_df['Region'] == region].sort_values(by="Seed")
            # Filter out rows with non-numeric Seed values:
            region_df_numeric = region_df[pd.to_numeric(region_df["Seed"], errors="coerce").notnull()].copy()
            region_df_numeric["Seed"] = region_df_numeric["Seed"].astype(int)
            
            # Create a dictionary mapping seed to the row (as a Series)
            seed_to_team = {}
            for idx, row in region_df_numeric.iterrows():
                seed_to_team[int(row["Seed"])] = row
                
            # Ensure region has 16 teams
            if len(seed_to_team) != 16:
                continue  # Skip regions that don't have the full bracket
            
            for seed1, seed2 in standard_matchups:
                if seed1 in seed_to_team and seed2 in seed_to_team:
                    team1 = seed_to_team[seed1]
                    team2 = seed_to_team[seed2]
                    
                    # Determine outcome:
                    # For example, if team1's "Tournament Winner?" equals "Yes", then team1 wins.
                    # This is a placeholder; adjust according to your data.
                    outcome = 1 if team1["Tournament Winner?"] == "Yes" else 0
                    
                    # Calculate differences for each feature
                    features = {}
                    for feat in top_features:
                        value1 = pd.to_numeric(team1[feat], errors="coerce")
                        value2 = pd.to_numeric(team2[feat], errors="coerce")
                        features[f"diff_{feat}"] = value1 - value2
                    
                    # Also add a seed difference (convert seeds to numeric)
                    features["seed_diff"] = pd.to_numeric(team1["Seed"], errors="coerce") - pd.to_numeric(team2["Seed"], errors="coerce")
                    
                    matchup_rows.append({**features, "target": outcome})
                    
    return pd.DataFrame(matchup_rows)

# Construct historical matchup dataset:
matchup_data = construct_matchup_dataset(march_madness_df)
print("Matchup dataset preview:")
print(matchup_data.head())


Matchup dataset preview:
   diff_AdjEM  diff_Net Rating  diff_Net Rating Rank  diff_Seed  \
0    44.82361             44.9                 -5739        -15   
1    -1.04050             -1.0                    70         -1   
2     6.61260              6.5                  -764         -7   
3    19.14571             19.1                 -2244         -9   
4     1.78600              1.8                  -126         -5   

   diff_Adjusted Defensive Efficiency Rank  \
0                                     -172   
1                                      -11   
2                                      -17   
3                                     -150   
4                                       45   

   diff_Adjusted Offensive Efficiency Rank  \
0                                     -289   
1                                        7   
2                                      -41   
3                                      -54   
4                                      -30   

   diff_Raw Offens

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
# Assume matchup_data is already constructed from historical data
feature_cols = [col for col in matchup_data.columns if col != "target"]

X = matchup_data[feature_cols]
y = matchup_data["target"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using the median
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Check that there are no NaN values
print("Number of NaNs in X_train_imputed:", np.isnan(X_train_imputed).sum())

# Train the Logistic Regression model using the imputed data
ml_model = LogisticRegression(max_iter=1000)
ml_model.fit(X_train_imputed, y_train)

# Evaluate the model
train_acc = ml_model.score(X_train_imputed, y_train)
test_acc = ml_model.score(X_test_imputed, y_test)
cv_scores = cross_val_score(ml_model, X_train_imputed, y_train, cv=5)

print(f"Training Accuracy: {train_acc:.2f}")
print(f"Test Accuracy: {test_acc:.2f}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average CV Score: {np.mean(cv_scores):.2f}")


Number of NaNs in X_train_imputed: 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy: 0.97
Test Accuracy: 0.97
Cross-Validation Scores: [0.97709924 0.96946565 0.98461538 0.96923077 0.95384615]
Average CV Score: 0.97


In [28]:
ml_model.fit(X_train_imputed, y_train)


In [29]:
print("Model classes:", ml_model.classes_)


Model classes: [0 1]


In [30]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# -----------------------------
# Define the ML feature columns based on our top features
ml_feature_columns = [
    "AdjEM",
    "Net Rating",
    "Net Rating Rank",
    "Seed",
    "Adjusted Defensive Efficiency Rank",
    "Adjusted Offensive Efficiency Rank",
    "Raw Offensive Efficiency Rank",
    "Raw Defensive Efficiency Rank",
    "ORPct",
    "CenterDR"
]

# -----------------------------
# Assume your ML model is already trained on your matchup dataset.
# For example, you might have done:
# from sklearn.impute import SimpleImputer
# from sklearn.linear_model import LogisticRegression
# imputer = SimpleImputer(strategy="median")
# X_train_imputed = imputer.fit_transform(X_train)
# ml_model = LogisticRegression(max_iter=5000)
# ml_model.fit(X_train_imputed, y_train)
# (Here we assume ml_model is available and fitted.)
# -----------------------------

# Hybrid Predictor Functions

def predict_matchup_top_features(team1, team2, df):
    """
    Predicts the winner of a matchup using a composite score computed from the most important features.
    Uses weighted metrics based on historical research.
    """
    if team1 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team1} not found."
    if team2 not in df["Mapped ESPN Team Name"].values:
        return f"Error: {team2} not found."
        
    t1 = df[df["Mapped ESPN Team Name"] == team1].iloc[0]
    t2 = df[df["Mapped ESPN Team Name"] == team2].iloc[0]
    
    # Define weights – note: lower Seed is better, so we subtract it.
    # We reduce the Seed weight (0.3) to avoid overfavoring high seeds.
    weights = {
        "AdjEM": 1.5,
        "Net Rating": 0.8,
        "Net Rating Rank": 1.0,  # Lower is better
        "Seed": 0.3,             # Lower is better
        "Adjusted Defensive Efficiency Rank": 1.0,
        "Adjusted Offensive Efficiency Rank": 1.0,
        "Raw Offensive Efficiency Rank": 0.8,
        "Raw Defensive Efficiency Rank": 0.8,
        "ORPct": 0.8,
        "CenterDR": 0.6
    }
    
    def compute_score(team):
        score = 0
        score += weights["AdjEM"] * team["AdjEM"]
        score += weights["Net Rating"] * team["Net Rating"]
        score -= weights["Net Rating Rank"] * team["Net Rating Rank"]
        score -= weights["Seed"] * team["Seed"]
        score -= weights["Adjusted Defensive Efficiency Rank"] * team["Adjusted Defensive Efficiency Rank"]
        score -= weights["Adjusted Offensive Efficiency Rank"] * team["Adjusted Offensive Efficiency Rank"]
        score -= weights["Raw Offensive Efficiency Rank"] * team["Raw Offensive Efficiency Rank"]
        score -= weights["Raw Defensive Efficiency Rank"] * team["Raw Defensive Efficiency Rank"]
        score += weights["ORPct"] * team["ORPct"]
        score += weights["CenterDR"] * team["CenterDR"]
        return score

    score1 = compute_score(t1)
    score2 = compute_score(t2)
    # Debug: Uncomment to print scores
    # print(f"{team1} score: {score1}, {team2} score: {score2}")
    return team1 if score1 > score2 else team2

def hybrid_predict_matchup(team1, team2, df, model, feature_columns, rule_weight=0.5):
    """
    Combines the rule-based predictor and ML model predictions into a hybrid outcome.
    The parameter rule_weight (0.0 to 1.0) controls the influence of the rule-based system.
    """
    # Rule-based prediction:
    rule_winner = predict_matchup_top_features(team1, team2, df)
    rule_outcome = 1 if rule_winner == team1 else 0
    
    # Build feature differences (team1 minus team2) for ML model input:
    features = {}
    for col in feature_columns:
        features[f"diff_{col}"] = (df[df["Mapped ESPN Team Name"] == team1].iloc[0][col] -
                                     df[df["Mapped ESPN Team Name"] == team2].iloc[0][col])
    features["seed_diff"] = (df[df["Mapped ESPN Team Name"] == team1].iloc[0]["Seed"] -
                             df[df["Mapped ESPN Team Name"] == team2].iloc[0]["Seed"])
    
    # Convert features to a numpy array (ensure the order matches the training data)
    X_pred = np.array([list(features.values())])
    
    if not hasattr(model, "classes_"):
        raise ValueError("The ML model does not appear to be fitted. Please fit the model before prediction.")
    
    ml_prob = model.predict_proba(X_pred)[0][1]  # Probability that team1 wins
    
    # Combine predictions using a weighted average:
    final_score = rule_weight * rule_outcome + (1 - rule_weight) * ml_prob
    # Debug: Uncomment to print intermediate probabilities:
    # print(f"Rule outcome: {rule_outcome}, ML probability: {ml_prob}, Final Score: {final_score}")
    return team1 if final_score >= 0.5 else team2

def simulate_bracket_hybrid(df, model, feature_columns, rule_weight=0.5):
    """
    Simulates a full NCAA 64-team bracket using standard seeding matchups and the hybrid predictor.
    Each region must have exactly 16 teams.
    """
    round_results = {}
    regions = df["Region"].unique()
    bracket = {}

    # Set up first-round matchups for each region (using standard seeding)
    for region in regions:
        region_df = df[df["Region"] == region].sort_values("Seed")
        seed_to_team = {int(row["Seed"]): row["Mapped ESPN Team Name"] for _, row in region_df.iterrows()}
        if len(seed_to_team) != 16:
            return f"Error: Region {region} does not have 16 teams. Found: {len(seed_to_team)}"
        matchups = [
            (seed_to_team[1], seed_to_team[16]),
            (seed_to_team[8], seed_to_team[9]),
            (seed_to_team[5], seed_to_team[12]),
            (seed_to_team[4], seed_to_team[13]),
            (seed_to_team[6], seed_to_team[11]),
            (seed_to_team[3], seed_to_team[14]),
            (seed_to_team[7], seed_to_team[10]),
            (seed_to_team[2], seed_to_team[15])
        ]
        bracket[region] = matchups

    round_results["Round of 64"] = {}
    region_winners = {}
    for region in regions:
        winners = []
        for team1, team2 in bracket[region]:
            winner = hybrid_predict_matchup(team1, team2, df, model, feature_columns, rule_weight)
            round_results["Round of 64"][f"{region}: {team1} vs {team2}"] = winner
            winners.append(winner)
        region_winners[region] = winners  # Should be 8 winners per region

    def simulate_region(winners_list, round_name, region):
        # If an odd number of teams remains, give the last team a bye:
        if len(winners_list) % 2 != 0:
            bye_team = winners_list[-1]
            winners_list = winners_list[:-1]
            bye = [bye_team]
        else:
            bye = []
        matchups = [(winners_list[i], winners_list[i+1]) for i in range(0, len(winners_list), 2)]
        round_results[round_name] = round_results.get(round_name, {})
        new_winners = []
        for team1, team2 in matchups:
            winner = hybrid_predict_matchup(team1, team2, df, model, feature_columns, rule_weight)
            round_results[round_name][f"{region}: {team1} vs {team2}"] = winner
            new_winners.append(winner)
        return new_winners + bye

    # Simulate rounds: Round of 32, Sweet 16, and Elite 8 (regional finals)
    for round_name in ["Round of 32", "Sweet 16", "Elite 8"]:
        for region in regions:
            region_winners[region] = simulate_region(region_winners[region], round_name, region)
    
    # Each region should now produce 1 winner for the Final Four.
    final_four = {region: region_winners[region][0] for region in regions if len(region_winners[region]) == 1}
    if len(final_four) != 4:
        return f"Error: Expected 4 region winners for Final Four, got {len(final_four)}."
    
    final_four_order = sorted(final_four.keys())
    final_four_teams = [final_four[region] for region in final_four_order]
    
    round_results["Final Four"] = {}
    ff_matchup1 = (final_four_teams[0], final_four_teams[1])
    ff_matchup2 = (final_four_teams[2], final_four_teams[3])
    winner_ff1 = hybrid_predict_matchup(ff_matchup1[0], ff_matchup1[1], df, model, feature_columns, rule_weight)
    winner_ff2 = hybrid_predict_matchup(ff_matchup2[0], ff_matchup2[1], df, model, feature_columns, rule_weight)
    round_results["Final Four"][f"{ff_matchup1[0]} vs {ff_matchup1[1]}"] = winner_ff1
    round_results["Final Four"][f"{ff_matchup2[0]} vs {ff_matchup2[1]}"] = winner_ff2
    
    round_results["Championship"] = {}
    champ_matchup = (winner_ff1, winner_ff2)
    champion = hybrid_predict_matchup(champ_matchup[0], champ_matchup[1], df, model, feature_columns, rule_weight)
    round_results["Championship"][f"{champ_matchup[0]} vs {champ_matchup[1]}"] = champion
    
    return round_results

# --- Run the full bracket simulation using the hybrid predictor ---
hybrid_bracket_results = simulate_bracket_hybrid(tournament_teams_2025_df, ml_model, ml_feature_columns, rule_weight=0.5)

# --- Display the simulation results ---
print("\n🏆 March Madness Bracket Simulation Results (Hybrid Model) 🏆")
for round_name, matchups in hybrid_bracket_results.items():
    print(f"\n🔹 {round_name}")
    if isinstance(matchups, dict):
        for matchup, winner in matchups.items():
            print(f"{matchup} → 🏀 {winner}")
    else:
        print(matchups)



🏆 March Madness Bracket Simulation Results (Hybrid Model) 🏆

🔹 Round of 64
East: Duke vs Mount St. Mary's → 🏀 Duke
East: Mississippi State vs Baylor → 🏀 Baylor
East: Oregon vs Liberty → 🏀 Oregon
East: Arizona vs Akron → 🏀 Arizona
East: BYU vs VCU → 🏀 BYU
East: Wisconsin vs Montana → 🏀 Wisconsin
East: Saint Mary's vs Vanderbilt → 🏀 Saint Mary's
East: Alabama vs Robert Morris → 🏀 Alabama
West: Florida vs Norfolk State → 🏀 Florida
West: UConn vs Oklahoma → 🏀 UConn
West: Memphis vs Colorado State → 🏀 Colorado State
West: Maryland vs Grand Canyon → 🏀 Maryland
West: Missouri vs Drake → 🏀 Missouri
West: Texas Tech vs UNC Wilmington → 🏀 Texas Tech
West: Kansas vs Arkansas → 🏀 Kansas
West: St. John's vs Omaha → 🏀 St. John's
Midwest: Houston vs SIU Edwardsville → 🏀 Houston
Midwest: Gonzaga vs Georgia → 🏀 Gonzaga
Midwest: Clemson vs McNeese → 🏀 Clemson
Midwest: Purdue vs High Point → 🏀 Purdue
Midwest: Illinois vs Xavier → 🏀 Illinois
Midwest: Kentucky vs Troy → 🏀 Kentucky
Midwest: UCLA vs Utah St