# Women's NCAA Tournament Model Predictions
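This notebook generates win predictions for tournament matchups from 2021–2025, using the Early model (logistic regression) for the first two rounds and the Elite model (calibrated XGBoost) for the Sweet 16 onward.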

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

## Load Training Data

In [None]:
# Read the two training sets; the first CSV column is restored as the index.
games_early, games_elite = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('games_early.csv', 'games_elite.csv')
)

# Quick sanity check on what was loaded.
print("Early Games Shape:", games_early.shape)
print("Elite Games Shape:", games_elite.shape)

## Load Validation Matchups and Team Data

In [None]:
# Matchups to be scored (Team A / Team B layout).
validation_matchups = pd.read_csv('womens_matchups_validation_1_.csv')

# Team lookup table and Torvik ratings; these are joined in the next step.
teams, torvik = (
    pd.read_csv(csv_path)
    for csv_path in ('wncaat_teams_historical_1_.csv', 'torvik_women_historical_1_.csv')
)

print("Validation Matchups Shape:", validation_matchups.shape)
print("\nFirst few validation matchups:", validation_matchups.head(), sep="\n")

## Prepare Team Stats Data

In [None]:
# Inner-join the team table to the Torvik ratings on their shared key, so only
# rows whose torvik_id appears in both files are kept.
df = teams.merge(torvik, on='torvik_id', how='inner')
print("Merged Team Data Shape:", df.shape)

# Keep the team identifier plus the rating columns used downstream.
kept_stat_columns = [
    'wab', 'barthag',
    'adj_oe', 'adj_de',
    'efg_pct', 'efgd_pct',
    'tor', 'tord',
    'orb_pct', 'drb_pct',
    'ftr', 'ftrd',
    '2p_pct', '2pd_pct',
    '3p_pct', '3pd_pct',
    '3pr', '3prd',
    'adj_tempo',
]
df = df[['team_id'] + kept_stat_columns]

In [None]:
# Flip every defensive stat about a fixed pivot so that a higher value is
# better: adj_de is subtracted from 200, the percentage-style stats from 100.
# NOTE: the flip is its own inverse, so running this cell twice restores the
# original values.
flip_pivots = {
    'adj_de': 200,
    'efgd_pct': 100,
    'tord': 100,
    'drb_pct': 100,
    'ftrd': 100,
    '2pd_pct': 100,
    '3pd_pct': 100,
    '3prd': 100,
}
for stat_name, pivot in flip_pivots.items():
    df[stat_name] = pivot - df[stat_name]

print("Team stats prepared")

## Restructure Validation Data

In [None]:
# Restructure the validation data from Team A/Team B format to
# high_team_id/low_team_id format. The "high" seed is the numerically smaller
# seed number.
seed_a = validation_matchups['Seed A']
seed_b = validation_matchups['Seed B']
team_a = validation_matchups['Team A']
team_b = validation_matchups['Team B']

# On equal seeds Team A takes the high-seed slot (hence <=, not <).
a_is_high = seed_a <= seed_b

# Vectorised selection instead of a row-wise apply: the result is always an
# index-aligned Series, regardless of how many rows the input has.
validation_restructured = pd.DataFrame({
    'year': validation_matchups['Year'],
    'region': validation_matchups['Region'],
    'round': validation_matchups['Round'],
    'high_bracket_seed': validation_matchups[['Seed A', 'Seed B']].min(axis=1),
    'low_bracket_seed': validation_matchups[['Seed A', 'Seed B']].max(axis=1),
    'high_team_id': team_a.where(a_is_high, team_b),
    'low_team_id': team_b.where(a_is_high, team_a),
})

print("Validation data restructured:")
print(validation_restructured.head(10))
print(f"\nShape: {validation_restructured.shape}")
print(f"\nYears: {sorted(validation_restructured['year'].unique())}")

## Merge Validation Data with Team Stats

In [None]:
# Stat columns that get a 'high_' / 'low_' prefix once attached to a matchup.
stat_names = [
    'wab', 'barthag', 'adj_oe', 'adj_de', 'efg_pct', 'efgd_pct',
    'tor', 'tord', 'orb_pct', 'drb_pct', 'ftr', 'ftrd',
    '2p_pct', '2pd_pct', '3p_pct', '3pd_pct', '3pr', '3prd', 'adj_tempo',
]

# Attach team stats twice: first for the high seed, then for the low seed.
# A left join keeps every matchup; teams without stats end up with NaNs.
# NOTE(review): the join key is team_id alone, with no season column. This
# assumes each team_id row in df is unique - confirm, otherwise matchups will
# be duplicated by the merge.
matchups = validation_restructured
for side in ('high', 'low'):
    matchups = (
        pd.merge(matchups, df, left_on=f'{side}_team_id', right_on='team_id', how='left')
        .rename(columns={stat: f'{side}_{stat}' for stat in stat_names})
        .drop('team_id', axis=1)
    )

print("Validation matchups merged. Shape:", matchups.shape)
print(f"\nMatches with missing data: {matchups.isnull().any(axis=1).sum()}")

In [None]:
# Remove rows with missing team stats (teams not in the Torvik dataset).
# Only the merged 'high_*' / 'low_*' stat columns are checked: a blank in a
# descriptive column such as region must not discard an otherwise usable game.
non_stat_columns = {'high_team_id', 'low_team_id', 'high_bracket_seed', 'low_bracket_seed'}
stat_columns = [
    col for col in matchups.columns
    if col.startswith(('high_', 'low_')) and col not in non_stat_columns
]
has_missing_stats = matchups[stat_columns].isnull().any(axis=1)

matchups_clean = matchups.dropna(subset=stat_columns)
print(f"Clean matchups: {len(matchups_clean)} (removed {len(matchups) - len(matchups_clean)} with missing data)")

# List the teams involved in the dropped games so they can be traced.
if has_missing_stats.any():
    print("\nTeams with missing stats:")
    missing = matchups[has_missing_stats]
    print("High seeds:", missing['high_team_id'].unique())
    print("Low seeds:", missing['low_team_id'].unique())

## Create Differential Features

In [None]:
# Create differential features (following Games_Data_Prep.ipynb).
# Start from the identifying columns, keeping the index of matchups_clean.
context_columns = [
    'year', 'region', 'round',
    'high_bracket_seed', 'high_team_id',
    'low_bracket_seed', 'low_team_id',
]
matchups_final = matchups_clean[context_columns].copy()

# Like-for-like differences: high seed's value minus low seed's value.
for stat in ('wab', 'barthag'):
    matchups_final[stat] = matchups_clean[f'high_{stat}'] - matchups_clean[f'low_{stat}']

# Cross-matched differences: the high seed's offensive stat is compared with
# the low seed's defensive counterpart, and the high seed's defensive stat
# with the low seed's offensive one.
offense_defense_pairs = [
    ('adj_oe', 'adj_de'),
    ('efg_pct', 'efgd_pct'),
    ('tor', 'tord'),
    ('orb_pct', 'drb_pct'),
    ('ftr', 'ftrd'),
    ('2p_pct', '2pd_pct'),
    ('3p_pct', '3pd_pct'),
    ('3pr', '3prd'),
]
for offense, defense in offense_defense_pairs:
    matchups_final[offense] = matchups_clean[f'high_{offense}'] - matchups_clean[f'low_{defense}']
    matchups_final[defense] = matchups_clean[f'high_{defense}'] - matchups_clean[f'low_{offense}']

matchups_final['adj_tempo'] = matchups_clean['high_adj_tempo'] - matchups_clean['low_adj_tempo']

print("Differential features created")
print(f"Final dataset shape: {matchups_final.shape}")

## Map Round Names and Split Data

In [None]:
# Map round names to match training data format
round_mapping = {
    'Round 1': 'First Round',
    'Round 2': 'Second Round',
    'Round 3': 'Sweet 16',
    'Round 4': 'Elite Eight',
    'Round 5': 'Final Four',
    'Round 6': 'Championship'
}
early_rounds = ['First Round', 'Second Round']
elite_rounds = ['Sweet 16', 'Elite Eight', 'Final Four', 'Championship']

# Series.map yields NaN for any label missing from the mapping. Falling back
# to the original label keeps already-mapped names intact (so re-running this
# cell is harmless) and keeps unexpected labels visible instead of blanking them.
raw_rounds = matchups_final['round']
matchups_final['round'] = raw_rounds.map(round_mapping).fillna(raw_rounds)

# Games whose round is not recognised would land in neither split; report
# them rather than dropping them silently.
unrecognized = ~matchups_final['round'].isin(early_rounds + elite_rounds)
if unrecognized.any():
    unknown_labels = sorted(matchups_final.loc[unrecognized, 'round'].astype(str).unique())
    print(f"WARNING: {unrecognized.sum()} games have an unrecognized round and are in neither split: {unknown_labels}")

# Split validation data
validation_early = matchups_final[matchups_final['round'].isin(early_rounds)].copy()
validation_elite = matchups_final[matchups_final['round'].isin(elite_rounds)].copy()

print(f"Early Round Games: {len(validation_early)}")
print(f"Elite Round Games: {len(validation_elite)}")
print(f"\nYears in dataset: {sorted(matchups_final['year'].unique())}")

## Train Early Model (Logistic Regression)

In [None]:
# Feature set for the early-round model (taken from WOMEN'S EARLY.txt).
early_features = ['barthag', 'adj_oe', 'adj_de', 'orb_pct', 'drb_pct', 'ftr', '2p_pct']

# Training matrix and the 'win' target column.
X_early_train, y_early_train = games_early[early_features], games_early['win']

# Fit the logistic regression; fit() returns the estimator itself.
early_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_early_train, y_early_train
)

print("Early Model (Logistic Regression) trained")
early_train_accuracy = early_model.score(X_early_train, y_early_train)
print(f"Training accuracy: {early_train_accuracy:.4f}")

## Train Elite Model (XGBoost)

In [None]:
# Feature set for the elite-round model (taken from WOMEN'S ELITE.txt).
elite_features = [
    'wab', 'barthag', 'adj_oe', 'adj_de', 'efg_pct', 'efgd_pct',
    'orb_pct', 'drb_pct', '2p_pct', '2pd_pct', '3p_pct', '3pd_pct', '3pr',
]

# Training matrix and the 'win' target column.
X_elite_train, y_elite_train = games_elite[elite_features], games_elite['win']

# XGBoost hyperparameters, as recorded in WOMEN'S ELITE.txt.
elite_params = {
    'learning_rate': 0.2997738363859162,
    'max_depth': 9,
    'min_child_weight': 8.623522034407337,
    'subsample': 0.8324211691115178,
    'colsample_bytree': 0.9988769480719698,
    'gamma': 2.017715776385069,
    'reg_alpha': 0.9692563913308194,
    'reg_lambda': 2.5910989850621258,
    'n_estimators': 335,
    'random_state': 42,
    'eval_metric': 'logloss',
}
elite_model = XGBClassifier(**elite_params)
elite_model.fit(X_elite_train, y_elite_train)

print("Elite Model (XGBoost) trained")
elite_train_accuracy = elite_model.score(X_elite_train, y_elite_train)
print(f"Training accuracy: {elite_train_accuracy:.4f}")

## Apply Platt Scaling to Elite Model

In [None]:
# XGBoost can produce poorly calibrated probabilities, so wrap it in Platt
# scaling (a sigmoid fitted on the model's output).
from sklearn.calibration import CalibratedClassifierCV

# 5-fold cross-validated calibration on the elite training data.
elite_model_calibrated = CalibratedClassifierCV(elite_model, cv=5, method='sigmoid')
elite_model_calibrated.fit(X_elite_train, y_elite_train)

print("Elite Model calibrated with Platt Scaling")
calibrated_train_accuracy = elite_model_calibrated.score(X_elite_train, y_elite_train)
print(f"Calibrated training accuracy: {calibrated_train_accuracy:.4f}")

# Side-by-side look at raw vs calibrated win probabilities on the same rows.
sample_rows = X_elite_train[:10]
sample_probs_uncalibrated = elite_model.predict_proba(sample_rows)[:, 1]
sample_probs_calibrated = elite_model_calibrated.predict_proba(sample_rows)[:, 1]

print("\nSample probability comparison (first 10 training examples):")
print("Uncalibrated:", [f"{p:.3f}" for p in sample_probs_uncalibrated])
print("Calibrated:  ", [f"{p:.3f}" for p in sample_probs_calibrated])

## Generate Early Round Predictions

In [None]:
# Feature matrix for the early-round validation games.
X_early_val = validation_early[early_features]

# Class-1 probability (second predict_proba column) and the hard label.
y_early_pred_proba = early_model.predict_proba(X_early_val)[:, 1]
y_early_pred = early_model.predict(X_early_val)

# Store both next to the matchup they belong to.
validation_early['predicted_win'] = y_early_pred
validation_early['win_probability'] = y_early_pred_proba

print(f"Early round predictions complete: {len(validation_early)} games")
print("\nSample predictions:")
early_preview_columns = ['year', 'round', 'high_team_id', 'low_team_id', 'predicted_win', 'win_probability']
print(validation_early[early_preview_columns].head(10))

## Generate Elite Round Predictions

In [None]:
# Feature matrix for the elite-round validation games.
X_elite_val = validation_elite[elite_features]

# Score with the CALIBRATED model: class-1 probability and the hard label.
y_elite_pred_proba = elite_model_calibrated.predict_proba(X_elite_val)[:, 1]
y_elite_pred = elite_model_calibrated.predict(X_elite_val)

# Store both next to the matchup they belong to.
validation_elite['predicted_win'] = y_elite_pred
validation_elite['win_probability'] = y_elite_pred_proba

print(f"Elite round predictions complete: {len(validation_elite)} games")
print("\nSample predictions:")
elite_preview_columns = ['year', 'round', 'high_team_id', 'low_team_id', 'predicted_win', 'win_probability']
print(validation_elite[elite_preview_columns].head(10))

## Combine and Save All Predictions

In [None]:
# Combine early- and elite-round predictions into one table.
all_predictions = pd.concat([validation_early, validation_elite], ignore_index=True)

# Sort by year, then by tournament progression, then by region. Sorting on the
# round name itself would be alphabetical ('Championship' before 'First
# Round'), so a temporary numeric rank is used as the sort key instead.
round_order = ['First Round', 'Second Round', 'Sweet 16', 'Elite Eight', 'Final Four', 'Championship']
round_rank = {round_name: position for position, round_name in enumerate(round_order)}
all_predictions = (
    all_predictions
    .assign(_round_rank=all_predictions['round'].map(round_rank))
    .sort_values(['year', '_round_rank', 'region'])
    .drop(columns='_round_rank')
)

print(f"Total predictions: {len(all_predictions)}")
print(f"Years covered: {sorted(all_predictions['year'].unique())}")

In [None]:
# Columns shown in the on-screen preview below.
display_cols = [
    'year', 'round',
    'high_team_id', 'high_bracket_seed',
    'low_team_id', 'low_bracket_seed',
    'predicted_win', 'win_probability',
]

# Write the full prediction table without the row index.
all_predictions.to_csv('tournament_predictions.csv', index=False)
print("Predictions saved to 'tournament_predictions.csv'")

# Preview the first 20 rows.
print("\nSample of predictions:")
print(all_predictions.loc[:, display_cols].head(20))

## Create Readable Predictions Output

In [None]:
# Create a more readable output showing which team is predicted to win.
predictions_readable = all_predictions.copy()

# predicted_win == 1 means the high seed is the predicted winner; anything
# else hands the win to the low seed. Vectorised selection replaces the
# row-by-row apply and always yields an index-aligned Series.
high_seed_predicted = predictions_readable['predicted_win'] == 1
high_ids = predictions_readable['high_team_id']
low_ids = predictions_readable['low_team_id']
predictions_readable['predicted_winner'] = high_ids.where(high_seed_predicted, low_ids)
predictions_readable['predicted_loser'] = low_ids.where(high_seed_predicted, high_ids)

# Save readable version
predictions_readable.to_csv('tournament_predictions_readable.csv', index=False)
print("Readable predictions saved to 'tournament_predictions_readable.csv'")

# Display sample
print("\nReadable predictions sample:")
readable_cols = ['year', 'round', 'region', 'predicted_winner', 'predicted_loser', 'win_probability']
print(predictions_readable[readable_cols].head(20))

## Summary Statistics

In [None]:
# Console report summarising the combined predictions.
banner = "=" * 60
total_games = len(all_predictions)
years_present = sorted(all_predictions['year'].unique())

print(banner)
print("PREDICTION SUMMARY")
print(banner)

print(f"\nTotal games predicted: {total_games}")
print(f"Years: {years_present}")

# Per-round counts, listed in tournament order; empty rounds are skipped.
print("\nPredictions by round:")
for round_name in ['First Round', 'Second Round', 'Sweet 16', 'Elite Eight', 'Final Four', 'Championship']:
    count = (all_predictions['round'] == round_name).sum()
    if count > 0:
        print(f"  {round_name}: {count} games")

# Per-year counts.
print("\nPredictions by year:")
for year in years_present:
    count = (all_predictions['year'] == year).sum()
    print(f"  {year}: {count} games")

# How often the model sides with the higher seed.
print("\nHigher seed predicted to win:")
higher_seed_wins = (all_predictions['predicted_win'] == 1).sum()
print(f"  {higher_seed_wins} / {total_games} ({100*higher_seed_wins/total_games:.1f}%)")

# Spread of the predicted win probabilities.
print("\nWin probability statistics:")
win_probability = all_predictions['win_probability']
probability_stats = (
    ('Mean', win_probability.mean()),
    ('Median', win_probability.median()),
    ('Min', win_probability.min()),
    ('Max', win_probability.max()),
)
for stat_label, stat_value in probability_stats:
    print(f"  {stat_label}: {stat_value:.3f}")