In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import xgboost as xgb
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Read the validation matchups; the models trained below are applied to this frame.
matchups = pd.read_csv(filepath_or_buffer='womens_matchups_validation.csv')

In [None]:
# Train one classifier per tournament stage.
# Each model is fit on 70% of its own games file. The remaining 30% is scored
# immediately after fitting, because X / y / X_train / X_test / y_train / y_test
# are rebound by the next model and the early-round hold-out would otherwise be
# lost without ever being used.

# TRAIN EARLY ROUNDS MODEL
df_early = pd.read_csv('games_early_2_.csv')
features_early = ['barthag', 'adj_oe', 'adj_de', 'orb_pct', 'drb_pct', 'ftr', '2p_pct']
X = df_early[features_early]
y = df_early['win']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
model_early = LogisticRegression(max_iter=1000, random_state=42)
model_early.fit(X_train, y_train)
# Mean accuracy on the 30% of early-round games the model did not see.
holdout_acc_early = model_early.score(X_test, y_test)
print(f"Early-rounds model hold-out accuracy: {holdout_acc_early:.4f}")

# TRAIN ELITE ROUNDS MODEL
df_elite = pd.read_csv('games_elite_2_.csv')
features_elite = ['wab', 'barthag', 'adj_oe', 'adj_de', 'efg_pct', 'efgd_pct', 'orb_pct', 'drb_pct', '2p_pct', '2pd_pct', '3p_pct', '3pd_pct', '3pr']
X = df_elite[features_elite]
y = df_elite['win']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
# NOTE(review): the hyperparameter values look like the output of an automated
# search that is not part of this notebook -- confirm their provenance.
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases; it is
# kept here so behavior on the installed version is unchanged -- confirm whether
# it can be dropped.
model_elite = xgb.XGBClassifier(learning_rate=0.2997738363859162,
                                max_depth=9,
                                min_child_weight=8.623522034407337,
                                subsample=0.8324211691115178,
                                colsample_bytree=0.9988769480719698,
                                gamma=2.017715776385069,
                                reg_alpha=0.9692563913308194,
                                reg_lambda=2.5910989850621258,
                                n_estimators=335,
                                objective='binary:logistic',
                                random_state=42,
                                use_label_encoder=False)
model_elite.fit(X_train, y_train, verbose=False)
# Mean accuracy on the 30% of elite-round games the model did not see.
holdout_acc_elite = model_elite.score(X_test, y_test)
print(f"Elite-rounds model hold-out accuracy: {holdout_acc_elite:.4f}")

In [None]:
# Route every matchup to a round-specific model and record its predictions.
# Rows whose `round` is 'First Round' or 'Second Round' use the early-rounds
# model; every other value (including missing or unexpected labels) falls
# through to the elite-rounds model.
early_rounds = ['First Round', 'Second Round']
matchups['model_type'] = np.where(matchups['round'].isin(early_rounds), 'early', 'elite')

early_mask = matchups['model_type'] == 'early'
elite_mask = matchups['model_type'] == 'elite'

# (row mask, fitted model, feature columns) per stage; early rounds are handled first.
stage_plan = [
    (early_mask, model_early, features_early),
    (elite_mask, model_elite, features_elite),
]
for stage_mask, stage_model, stage_features in stage_plan:
    if not stage_mask.any():
        # Nothing routed to this stage; skip it, as the original guards did.
        continue
    X_pred = matchups.loc[stage_mask, stage_features]
    matchups.loc[stage_mask, 'predicted_win'] = stage_model.predict(X_pred)
    # Column 1 of predict_proba is the probability of the model's second class
    # (presumably win == 1 -- confirm against the model's classes_).
    matchups.loc[stage_mask, 'high_seed_win_prob'] = stage_model.predict_proba(X_pred)[:, 1]

# If no rows were routed to either stage (e.g. an empty matchups file) the loop
# above never creates the output columns; add them as NaN so later cells that
# read 'predicted_win' do not fail with a KeyError.
for out_col in ('predicted_win', 'high_seed_win_prob'):
    if out_col not in matchups.columns:
        matchups[out_col] = np.nan


In [None]:
# Mark each matchup 1 when the predicted label equals the recorded `win` value, else 0.
matchups['correct'] = matchups['predicted_win'].eq(matchups['win']).astype(int)

# Write the full frame (predictions included) to disk without the index, then
# report how many rows were scored and the share predicted correctly.
matchups.to_csv(path_or_buf='all_matchups_predictions.csv', index=False)
print(f"Predicted {len(matchups)} matchups")
print(f"Overall Accuracy: {matchups['correct'].mean():.4f}")
print("Saved to all_matchups_predictions.csv")