In [4]:
# Step 1: Import libraries
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations
import re            # Regex for seed parsing
from sklearn.metrics import brier_score_loss  # Evaluation metric

# Step 2: Read the tournament seed tables and the sample submission
# The men's file is read first, then the women's; both tables stay available
# under their own names for any later cells.
men_seeds, women_seeds = (
    pd.read_csv(seed_path)
    for seed_path in (
        '/kaggle/input/march-machine-learning-mania-2025/MNCAATourneySeeds.csv',
        '/kaggle/input/march-machine-learning-mania-2025/WNCAATourneySeeds.csv',
    )
)

# Stack both seed tables into one frame with a fresh 0..n-1 index
all_seeds = pd.concat((men_seeds, women_seeds), ignore_index=True)

# The sample submission supplies the matchup IDs that need a prediction
submission = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/SampleSubmissionStage2.csv')

# Step 3: Helper functions

# Split an 'ID' string into its Season, TeamID1, and TeamID2 integers
def parse_match_id(match_id):
    """Return ``(season, team_id_1, team_id_2)`` parsed from *match_id*.

    *match_id* is an underscore-separated string with exactly three integer
    fields, e.g. ``'2025_1101_1102'``. A ``ValueError`` is raised when the
    field count is not three or a field is not an integer.
    """
    fields = match_id.split('_')
    # Unpacking enforces the three-field shape before any int conversion
    season, first_team, second_team = fields
    return tuple(map(int, (season, first_team, second_team)))

# Pull the first run of digits out of a seed label (e.g., 'W05' -> 5)
def extract_numeric_seed(seed_str):
    """Return the first integer found in *seed_str*, or 16 if there is none.

    The value is converted with ``str()`` before searching, so non-string
    inputs (``None``, NaN, numbers) are accepted. 16 is the fallback used
    for the worst seed.
    """
    digits = re.search(r'\d+', str(seed_str))
    if digits is None:
        return 16  # No digits at all: fall back to the worst seed
    return int(digits.group())

# Step 4: Extract season and team IDs from match ID
# Each ID is parsed once into a tuple and the three columns are built with a
# single DataFrame construction. Chaining `.apply(pd.Series)` instead would
# create one Series object per row, which is very slow on a large submission.
submission[['Season', 'TeamID1', 'TeamID2']] = pd.DataFrame(
    submission['ID'].map(parse_match_id).tolist(),
    columns=['Season', 'TeamID1', 'TeamID2'],
    index=submission.index,  # keep the parsed rows aligned with `submission`
)

# Extract seed numbers for all teams (e.g., 'W05' -> 5)
all_seeds['SeedNum'] = all_seeds['Seed'].map(extract_numeric_seed)

# Step 5: Merge seed info for both teams into the submission
# The same left merge is run once per team column. `how='left'` keeps every
# submission row even when a team has no seed entry for that season.
seed_lookup = all_seeds[['Season', 'TeamID', 'SeedNum']]
for team_col, seed_col in (('TeamID1', 'Seed1'), ('TeamID2', 'Seed2')):
    submission = submission.merge(
        seed_lookup,
        how='left',
        left_on=['Season', team_col],
        right_on=['Season', 'TeamID'],
        # A duplicated (Season, TeamID) seed row would silently multiply
        # submission rows; fail loudly instead.
        validate='many_to_one',
    ).rename(columns={'SeedNum': seed_col}).drop(columns='TeamID')

# Fill any missing seeds with 16 (worst possible seed)
submission[['Seed1', 'Seed2']] = submission[['Seed1', 'Seed2']].fillna(16)

# Step 6: Convert the seed difference into a prediction
# SeedGap is Seed2 minus Seed1, so a positive gap means Team 1 has the lower seed number
submission['SeedGap'] = submission['Seed2'].sub(submission['Seed1'])
# Linear rule: 0.5 at equal seeds, 0.03 per seed of gap, bounded to [0.05, 0.95]
submission['Pred'] = (
    submission['SeedGap']
    .mul(0.03)
    .add(0.5)
    .clip(lower=0.05, upper=0.95)
)

# Step 7: Score the predictions against placeholder outcomes
# Every label is 1 (TeamID1 wins every game). These are not real results, so
# the printed score is only a check that the scoring call runs.
fake_labels = np.full(submission.shape[0], 1.0)
brier = brier_score_loss(fake_labels, submission['Pred'])
print(f"Simulated Brier Score (if TeamID1 always won): {brier:.4f}")

# Step 8: Write only the ID and Pred columns, without the index, as the submission file
submission.to_csv('/kaggle/working/submission.csv', columns=['ID', 'Pred'], index=False)


Simulated Brier Score (if TeamID1 always won): 0.2677
