In [3]:
from itertools import combinations, product

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# == Load data ==
# The CSV names below are relative paths; adjust them if the files live elsewhere.
m_reg, m_teams, w_reg, w_teams = (
    pd.read_csv(csv_name)
    for csv_name in (
        "MRegularSeasonCompactResults.csv",
        "MTeams.csv",
        "WRegularSeasonCompactResults.csv",
        "WTeams.csv",
    )
)

# Report the (rows, columns) shape of each loaded table.
for description, frame in (
    ("Men's regular season results", m_reg),
    ("Men's teams", m_teams),
    ("Women's regular season results", w_reg),
    ("Women's teams", w_teams),
):
    print(f"{description}: {frame.shape}")


Men's regular season results: (190771, 8)
Men's teams: (380, 4)
Women's regular season results: (134961, 8)
Women's teams: (378, 2)


In [4]:
def prepare_training_data(regular_season_results):
    """
    Expand every game into two mirrored training rows.

    For each input game the output holds, in this order:
      - the winner's perspective (Label=1)
      - the loser's perspective  (Label=0)

    Parameters
    ----------
    regular_season_results : pandas.DataFrame
        One row per game. Must provide the columns 'WTeamID', 'LTeamID',
        'WScore', 'LScore' and 'WLoc' ('H', 'A', anything else is treated
        as a neutral site).

    Returns
    -------
    pandas.DataFrame
        Columns 'TeamID', 'OppTeamID', 'ScoreDiff', 'HomeAwayIndicator'
        and 'Label', with a fresh 0..2n-1 index. An empty input yields an
        empty frame that still carries these columns.

    Notes
    -----
    'ScoreDiff' is computed from the final score (own score minus the
    opponent's), so it is positive on winner rows and negative on loser rows
    whenever WScore > LScore. It therefore encodes 'Label' and is not
    available before a game is played.
    """
    columns = ['TeamID', 'OppTeamID', 'ScoreDiff', 'HomeAwayIndicator', 'Label']
    n_games = len(regular_season_results)
    if n_games == 0:
        # Keep the schema so downstream column selection works on empty input.
        return pd.DataFrame(columns=columns)

    winner_ids = regular_season_results['WTeamID'].to_numpy()
    loser_ids = regular_season_results['LTeamID'].to_numpy()
    margin = (regular_season_results['WScore'] - regular_season_results['LScore']).to_numpy()

    # 1 = the row's team played at home, 0 = away, 0.5 = neutral/unknown site.
    winner_loc = regular_season_results['WLoc']
    winner_home = np.where(winner_loc.eq('H'), 1.0,
                           np.where(winner_loc.eq('A'), 0.0, 0.5))
    # The loser's venue is the mirror image of the winner's (0.5 stays 0.5).
    loser_home = 1.0 - winner_home

    def _interleave(winner_values, loser_values):
        # Produces w0, l0, w1, l1, ... so both rows of a game stay adjacent.
        return np.column_stack([winner_values, loser_values]).ravel()

    return pd.DataFrame({
        'TeamID': _interleave(winner_ids, loser_ids),
        'OppTeamID': _interleave(loser_ids, winner_ids),
        'ScoreDiff': _interleave(margin, -margin),
        'HomeAwayIndicator': _interleave(winner_home, loser_home),
        'Label': np.tile([1, 0], n_games),
    }, columns=columns)

# Build the two-rows-per-game training tables for each bracket.
m_train = prepare_training_data(m_reg)
w_train = prepare_training_data(w_reg)

# Preview the first rows of each table (men's first, then women's).
for training_frame in (m_train, w_train):
    print(training_frame.head())


   TeamID  OppTeamID  ScoreDiff  HomeAwayIndicator  Label
0    1228       1328         17                0.5      1
1    1328       1228        -17                0.5      0
2    1106       1354          7                1.0      1
3    1354       1106         -7                0.0      0
4    1112       1223          7                1.0      1
   TeamID  OppTeamID  ScoreDiff  HomeAwayIndicator  Label
0    3104       3202         50                1.0      1
1    3202       3104        -50                0.0      0
2    3163       3221         11                1.0      1
3    3221       3163        -11                0.0      0
4    3222       3261          7                1.0      1


In [5]:
feature_cols = ['ScoreDiff', 'HomeAwayIndicator']

# NOTE(review): 'ScoreDiff' is computed from the final score of the very game
# being labelled, so its sign matches 'Label'. The validation Brier scores
# printed below therefore measure that leakage, not predictive skill. Replacing
# it needs pre-game features plus a matching change wherever prediction rows
# are built, because the model expects the same columns at predict time.


def _fit_and_score(X_train, y_train, X_val, y_val):
    """
    Fit a default LogisticRegression and score it on the validation split.

    Returns a tuple (model, val_preds, brier) where val_preds is the second
    column of predict_proba on X_val and brier is the mean squared difference
    between val_preds and y_val.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    val_preds = model.predict_proba(X_val)[:, 1]
    brier = np.mean((val_preds - y_val) ** 2)
    return model, val_preds, brier


# --- Men's model ---
X_m = m_train[feature_cols]
y_m = m_train['Label']

# 80/20 split with a fixed seed so the validation score is reproducible.
X_m_train, X_m_val, y_m_train, y_m_val = train_test_split(X_m, y_m, test_size=0.2, random_state=42)
model_m, val_preds_m, m_val_brier_score = _fit_and_score(X_m_train, y_m_train, X_m_val, y_m_val)
print(f"Men's Brier Score (validation): {m_val_brier_score:.4f}")

# --- Women's model ---
X_w = w_train[feature_cols]
y_w = w_train['Label']

X_w_train, X_w_val, y_w_train, y_w_val = train_test_split(X_w, y_w, test_size=0.2, random_state=42)
model_w, val_preds_w, w_val_brier_score = _fit_and_score(X_w_train, y_w_train, X_w_val, y_w_val)
print(f"Women's Brier Score (validation): {w_val_brier_score:.4f}")


Men's Brier Score (validation): 0.0000
Women's Brier Score (validation): 0.0000


In [6]:
# Sorted, de-duplicated team IDs for each bracket.
men_team_ids = sorted(m_teams['TeamID'].unique())
women_team_ids = sorted(w_teams['TeamID'].unique())

# Every unordered matchup (t1, t2) with t1 < t2. Because the ID lists are
# sorted and unique, combinations() yields exactly those pairs, in ascending
# order, without first materialising the full n x n product.
men_pairs = list(combinations(men_team_ids, 2))
women_pairs = list(combinations(women_team_ids, 2))

print(f"Number of men’s pairs: {len(men_pairs)}")
print(f"Number of women’s pairs: {len(women_pairs)}")


Number of men’s pairs: 72010
Number of women’s pairs: 71253


In [7]:
def predict_matchup_probability(model, team1, team2, score_diff=0, home_away_indicator=0.5):
    """
    Return the model's second-class probability for a single feature row.

    Parameters
    ----------
    model : object exposing predict_proba(X)
        Expected to accept a DataFrame with the columns 'ScoreDiff' and
        'HomeAwayIndicator' and to return one probability row per input row.
    team1, team2 : team identifiers
        Accepted for the caller's convenience but NOT used: no team-specific
        feature is built, so the result is the same for every pair given the
        same model, score_diff and home_away_indicator.
    score_diff : number, default 0
        Value placed in the 'ScoreDiff' column.
    home_away_indicator : number, default 0.5
        Value placed in the 'HomeAwayIndicator' column (0.5 is the value the
        training data uses for a neutral site).

    Returns
    -------
    The entry at column 1 of model.predict_proba for the single row, intended
    to be read as P(team1 beats team2).
    """
    X_row = pd.DataFrame({
        'ScoreDiff': [score_diff],
        'HomeAwayIndicator': [home_away_indicator],
    })
    # Column 1 of predict_proba corresponds to the model's second class.
    prob = model.predict_proba(X_row)[:, 1]
    return prob[0]

# Example usage: score the first two men's team IDs against each other.
first_team_id = men_team_ids[0]
second_team_id = men_team_ids[1]
example_prob = predict_matchup_probability(model_m, first_team_id, second_team_id)
print(f"Probability that team {first_team_id} beats {second_team_id} = {example_prob:.3f}")


Probability that team 1101 beats 1102 = 0.501


In [8]:
submission_rows = []

# One pass per bracket: (ID prefix, candidate pairs, fitted model).
# Men's rows are appended first, then women's.
# IDs look like "M_<t1>_<t2>" / "W_<t1>_<t2>"; a season-qualified form such as
# "M_2025_<t1>_<t2>" may be needed instead, depending on the competition requirement.
for bracket_prefix, bracket_pairs, bracket_model in (
    ("M", men_pairs, model_m),
    ("W", women_pairs, model_w),
):
    for low_id, high_id in bracket_pairs:
        win_probability = predict_matchup_probability(bracket_model, low_id, high_id)
        submission_rows.append({
            'ID': f"{bracket_prefix}_{low_id}_{high_id}",
            'Pred': win_probability,
        })

submission_df = pd.DataFrame(submission_rows)
print(submission_df.head(10))
print("Total submission rows:", submission_df.shape[0])


            ID      Pred
0  M_1101_1102  0.500561
1  M_1101_1103  0.500561
2  M_1101_1104  0.500561
3  M_1101_1105  0.500561
4  M_1101_1106  0.500561
5  M_1101_1107  0.500561
6  M_1101_1108  0.500561
7  M_1101_1109  0.500561
8  M_1101_1110  0.500561
9  M_1101_1111  0.500561
Total submission rows: 143263


In [9]:
# Write only the ID and Pred columns, in that order, without the row index.
submission_df.to_csv("submission.csv", columns=['ID', 'Pred'], index=False)
print("Submission file created: submission.csv")


Submission file created: submission.csv
