Generate Hypothetical Matchups
==============================

Load the generated wins / strength-of-schedule summary files, train a model on past tournament results, and generate predictions for the full list of hypothetical matchups. The resulting predictions file is what will be submitted to Kaggle for evaluation.

In [1]:
import pandas as pd
import statistics as st

import sklearn.ensemble as en
import sklearn.model_selection as mds
import sklearn.metrics as ms

from datetime import datetime


### CONSTANTS ###

# Directory prefix for every CSV this notebook reads (summary files, tournament results).
DATA_DIR = '../data/kaggle/'
# Seasons whose summary files and tournament results are used to train the model.
YEARS    = [ 2021, 2022, 2023, 2024 ]

# Season whose summary file is used to build the matchups that get predicted and submitted.
SUBMIT_YEAR = 2025

# Feature columns fed to the classifier; each is a Team1-minus-Team2 difference computed below.
X_COLNAMES = [ "RankDiff", "WinPctDiff", "SoSDiff", "SeedDiff", "PointsDiff" ]
#X_COLNAMES = [ "RankDiff", "WinPctDiff", "SoSDiff" ]
# Target column: 1 when the winner is the first team named in the GameID, else 0.
Y_COLNAME = "Team1Win"

# Metric name and fold count passed to cross_val_score in print_confusion_matrix_info.
SCORING = "accuracy"
NUM_FOLDS = 6



### FUNCTIONS ###

def get_datetime_str( ):
    """Return the current local time formatted as 'YYYYMMDD.HHMMSS'.

    Used to make output filenames unique per run.
    """
    # A single strftime call on one timestamp yields the date and time parts together.
    return datetime.now().strftime("%Y%m%d.%H%M%S")

def get_cross_game_id(row):
    """Build the 'Season_TeamX1_TeamX2' identifier for one hypothetical matchup row.

    `row` must provide "Season", "TeamID_X1" and "TeamID_X2"; each value is
    converted with int() before being joined with underscores.
    """
    id_fields = ("Season", "TeamID_X1", "TeamID_X2")
    return "_".join( str(int(row[field])) for field in id_fields )

def get_tournament_game_id(row):
    """Build the 'Season_lowID_highID' identifier for one played tournament game.

    `row` must provide "Season", "WTeamID" and "LTeamID". The two team ids are
    written in ascending order regardless of which team won, so the result has
    the same layout as the ids produced by get_cross_game_id.
    """
    season = int(row["Season"])
    low_id, high_id = sorted( (int(row["WTeamID"]), int(row["LTeamID"])) )

    return f"{season}_{low_id}_{high_id}"

def load_mens_and_womens( filename ):
    """Read the men's ("M" prefix) and women's ("W" prefix) versions of a CSV and stack them.

    Each file is read from DATA_DIR + prefix + filename and tagged with a
    "Gender" column holding that prefix. The frames are concatenated men's
    first, keeping each file's own row index.
    """
    l_frames = []

    for gender in ( "M", "W" ):
        df_gender = pd.read_csv( DATA_DIR + gender + filename )
        df_gender["Gender"] = gender
        l_frames.append( df_gender )

    return pd.concat( l_frames )

def extract_game_info(id_str):
    """Split a 'year_team1_team2' id string into a tuple of three ints.

    Only the first three underscore-separated fields are read; any further
    fields are ignored.
    """
    tokens = id_str.split('_')
    return int(tokens[0]), int(tokens[1]), int(tokens[2])

def does_team1_win(row):
    """Return 1 if the game's winner is the first team named in its GameID, else 0.

    `row` must provide "GameID" (parsed with extract_game_info) and "WTeamID".
    """
    _, first_team, _ = extract_game_info(row["GameID"])
    return int( row["WTeamID"] == first_team )

def cross_dataframe( df ):
    """Pair every row of `df` with every other row, keeping each pair once.

    The frame is cross-joined with itself; columns from the left copy get the
    suffix "_X1" and those from the right copy "_X2". Rows where
    TeamID_X1 >= TeamID_X2 are dropped, which removes self-pairings and the
    mirrored duplicate of each pair. Surviving rows keep their index labels
    from the cross join.
    """
    df_pairs = df.merge( df, how="cross", suffixes=["_X1", "_X2"] )

    # Self-pairs and pairs already represented with the ids in ascending order
    is_redundant = df_pairs["TeamID_X1"] >= df_pairs["TeamID_X2"]

    return df_pairs.drop( index=df_pairs.index[is_redundant] )

def print_confusion_matrix_info( m, X, y ):
    """Print a confusion matrix, summary metrics and cross-validation scores.

    Parameters
    ----------
    m : fitted classifier
        Must expose `predict`; it is also handed to `cross_val_score`.
    X : feature matrix to evaluate on.
    y : true binary labels for the rows of X.

    Notes
    -----
    The confusion matrix and metrics use m's predictions on X. The
    cross-validation section calls `cross_val_score(m, X, y, ...)`, so those
    scores come from NUM_FOLDS-fold cross-validation over the X and y passed
    in, not from m's existing fit.
    """

    # Predict with the estimator that was passed in, not the notebook-global
    # `model`, so the report always describes `m`.
    y_preds  = m.predict(X)
    # Only used as a label in the header line below.
    test_threshold = 0.5

    tn, fp, fn, tp = ms.confusion_matrix(y, y_preds).ravel()
    print(f"Confusion Matrix, at Threshold {test_threshold:.3f}")
    print(f"  tn {tn:<3}  fp {fp:<3}")
    print(f"  fn {fn:<3}  tp {tp:<3}")
    print()

    bal_acc   = round( ms.balanced_accuracy_score( y, y_preds ), 3 )
    precision = round( ms.precision_score( y, y_preds ), 3 )
    recall    = round( ms.recall_score( y, y_preds ), 3 )
    f1        = round( ms.f1_score( y, y_preds, average='micro' ), 3 )

    s_scores = f"Bal Acc: {bal_acc}, Pre: {precision}, Rec: {recall}, F1 Micro: {f1}"
    print(s_scores)
    print()

    # Plain floats so the printed list is not cluttered with numpy reprs
    l_raw_scores = [ float(s) for s in mds.cross_val_score(m, X, y, scoring=SCORING, cv=NUM_FOLDS) ]

    # Mean and spread come from the unrounded fold scores; rounding is for display only
    mean_score = round(st.mean(l_raw_scores), 3)
    std_score = round(st.stdev(l_raw_scores), 3)
    l_scores = [ round(s, 2) for s in l_raw_scores ]

    print(f"{NUM_FOLDS}-fold {SCORING} mean: {mean_score}, stdev: {std_score}")
    print(f"  each: {l_scores}")

# Load Summary Data

In [2]:
# Read one summary file per training season and stack them in a single concat
# (row indexes from the individual files are kept as-is).
l_summaries = [ pd.read_csv( DATA_DIR + f"Summary.{y}.csv" ) for y in YEARS ]
df_ = pd.concat( l_summaries )

df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2850 entries, 0 to 721
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TeamID         2850 non-null   int64  
 1   Wins           2850 non-null   float64
 2   Losses         2850 non-null   float64
 3   TeamPoints     2850 non-null   float64
 4   OppPoints      2850 non-null   float64
 5   WinDiff        2850 non-null   float64
 6   PointsDiff     2850 non-null   float64
 7   TotalGames     2850 non-null   float64
 8   WinPercentage  2850 non-null   float64
 9   Season         2850 non-null   int64  
 10  TeamName       2850 non-null   object 
 11  Gender         2850 non-null   object 
 12  Seed           540 non-null    object 
 13  SeedValue      484 non-null    float64
 14  SOS            2850 non-null   float64
dtypes: float64(10), int64(2), object(3)
memory usage: 356.2+ KB


# Split Men's and Women's Teams

In [3]:
# Men's rows, best record first (ties on WinPercentage broken by SOS).
# Rank is the 1-based position in that ordering. It is assigned over all
# seasons in YEARS pooled together, not per season.
# NOTE(review): the submission cell ranks a single season, so the Rank scale
# differs between training and submission data; confirm this is intended.
is_mens = df_["Gender"].eq("M")
df_m_ = df_.loc[ is_mens ].sort_values( by=['WinPercentage','SOS'], ascending=False )
df_m_.insert( 0, 'Rank', range(1, len(df_m_) + 1) )
df_m_.shape

(1430, 16)

In [4]:
# Women's rows, best record first (ties on WinPercentage broken by SOS).
# Rank is the 1-based position in that ordering. It is assigned over all
# seasons in YEARS pooled together, not per season.
# NOTE(review): the submission cell ranks a single season, so the Rank scale
# differs between training and submission data; confirm this is intended.
is_womens = df_["Gender"].eq("W")
df_w_ = df_.loc[ is_womens ].sort_values( by=['WinPercentage','SOS'], ascending=False )
df_w_.insert( 0, 'Rank', range(1, len(df_w_) + 1) )
df_w_.shape

(1420, 16)

In [5]:
# Spot check: every season's row for TeamID 3376 in the ranked women's frame
df_w_.loc[ df_w_["TeamID"].eq(3376) ]

Unnamed: 0,Rank,TeamID,Wins,Losses,TeamPoints,OppPoints,WinDiff,PointsDiff,TotalGames,WinPercentage,Season,TeamName,Gender,Seed,SeedValue,SOS
0,1,3376,32.0,0.0,2606.0,1635.0,32.0,971.0,32.0,1.0,2023,South Carolina,W,W01,1.0,0.6281
0,2,3376,32.0,0.0,2754.0,1802.0,32.0,952.0,32.0,1.0,2024,South Carolina,W,W01,1.0,0.6073
0,8,3376,29.0,2.0,2210.0,1602.0,27.0,608.0,31.0,0.935,2022,South Carolina,W,Y01,1.0,0.6658
28,66,3376,22.0,4.0,1997.0,1542.0,18.0,455.0,26.0,0.846,2021,South Carolina,W,X01,1.0,0.6541


# Generate Matchups

In [6]:
# Columns carried into each side (_X1 / _X2) of a matchup
cross_col_names = [ "TeamID", "TeamName", "WinPercentage", "SOS", "Seed", "SeedValue", "Rank", "PointsDiff" ]

# One frame of pairings per season, stacked once after the loop instead of
# re-concatenating the growing result on every iteration.
l_season_frames = []

for y in YEARS:

    # Men and women are crossed separately so no mixed-gender pairings are generated
    df_m_sub = df_m_[ df_m_[ "Season" ] == y ]
    df_m_cross = cross_dataframe( df_m_sub[cross_col_names] )

    df_w_sub = df_w_[ df_w_[ "Season" ] == y ]
    df_w_cross = cross_dataframe( df_w_sub[cross_col_names] )

    df_combine = pd.concat([df_m_cross, df_w_cross], ignore_index=True)
    df_combine["Season"] = y

    l_season_frames.append( df_combine )

# Index labels restart at 0 for every season (no ignore_index here), as before
df_cross = pd.concat( l_season_frames )

print(df_cross.shape)

(506421, 17)


In [7]:
# Preview the first ten generated matchups
df_cross.head( n=10 )

Unnamed: 0,TeamID_X1,TeamName_X1,WinPercentage_X1,SOS_X1,Seed_X1,SeedValue_X1,Rank_X1,PointsDiff_X1,TeamID_X2,TeamName_X2,WinPercentage_X2,SOS_X2,Seed_X2,SeedValue_X2,Rank_X2,PointsDiff_X2,Season
0,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1457,Winthrop,0.958,0.4664,Z12,12.0,2,306.0,2021
1,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1222,Houston,0.885,0.535,Y02,2.0,14,468.0,2021
2,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1361,San Diego St,0.846,0.567,Y06,6.0,26,323.0,2021
3,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1260,Loyola-Chicago,0.846,0.4876,Y08,8.0,28,382.0,2021
4,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1276,Michigan,0.833,0.5567,W01,1.0,34,261.0,2021
5,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1298,Navy,0.833,0.4843,,,35,81.0,2021
6,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1364,UC Santa Barbara,0.833,0.48,X12,12.0,36,248.0,2021
7,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1382,St Bonaventure,0.8,0.5205,W09,9.0,53,203.0,2021
8,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1228,Illinois,0.793,0.5671,Y01,1.0,61,353.0,2021
9,1211,Gonzaga,1.0,0.5407,X01,1.0,1,598.0,1251,Liberty,0.792,0.4157,Y13,13.0,63,264.0,2021


In [8]:
# Seed difference; pairings where either team has no seed value get 0
df_cross["SeedDiff"] = df_cross["SeedValue_X1"].sub( df_cross["SeedValue_X2"] ).fillna(0)

# Remaining features are plain Team1-minus-Team2 differences of one summary column each
diff_sources = {
    "RankDiff":   "Rank",
    "WinPctDiff": "WinPercentage",
    "SoSDiff":    "SOS",
    "PointsDiff": "PointsDiff",
}
for diff_col, base_col in diff_sources.items():
    df_cross[diff_col] = df_cross[f"{base_col}_X1"] - df_cross[f"{base_col}_X2"]

# Key used to join these hypothetical matchups to played tournament games
df_cross["GameID"] = df_cross.apply( get_cross_game_id, axis=1 )

df_cross.info()

<class 'pandas.core.frame.DataFrame'>
Index: 506421 entries, 0 to 129960
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   TeamID_X1         506421 non-null  int64  
 1   TeamName_X1       506421 non-null  object 
 2   WinPercentage_X1  506421 non-null  float64
 3   SOS_X1            506421 non-null  float64
 4   Seed_X1           98001 non-null   object 
 5   SeedValue_X1      88119 non-null   float64
 6   Rank_X1           506421 non-null  int64  
 7   PointsDiff_X1     506421 non-null  float64
 8   TeamID_X2         506421 non-null  int64  
 9   TeamName_X2       506421 non-null  object 
 10  WinPercentage_X2  506421 non-null  float64
 11  SOS_X2            506421 non-null  float64
 12  Seed_X2           93887 non-null   object 
 13  SeedValue_X2      83769 non-null   float64
 14  Rank_X2           506421 non-null  int64  
 15  PointsDiff_X2     506421 non-null  float64
 16  Season            506421 

In [9]:
# Spot check: first few matchups in which TeamID 3376 is the second team
df_cross.loc[ df_cross["TeamID_X2"].eq(3376) ].head( n=5 )

Unnamed: 0,TeamID_X1,TeamName_X1,WinPercentage_X1,SOS_X1,Seed_X1,SeedValue_X1,Rank_X1,PointsDiff_X1,TeamID_X2,TeamName_X2,...,SeedValue_X2,Rank_X2,PointsDiff_X2,Season,SeedDiff,RankDiff,WinPctDiff,SoSDiff,PointsDiff,GameID
60054,3163,Connecticut,0.96,0.566,Z01,1.0,5,788.0,3376,South Carolina,...,1.0,66,455.0,2021,0.0,-61,0.114,-0.0881,333.0,2021_3163_3376
60423,3124,Baylor,0.926,0.5376,Z02,2.0,12,732.0,3376,South Carolina,...,1.0,66,455.0,2021,1.0,-54,0.08,-0.1165,277.0,2021_3124_3376
60735,3268,Maryland,0.923,0.5898,X02,2.0,13,546.0,3376,South Carolina,...,1.0,66,455.0,2021,1.0,-53,0.077,-0.0643,91.0,2021_3268_3376
60932,3195,FGCU,0.923,0.5088,Z11,11.0,14,578.0,3376,South Carolina,...,1.0,66,455.0,2021,10.0,-52,0.077,-0.1453,123.0,2021_3195_3376
61247,3372,SF Austin,0.917,0.4613,X12,12.0,16,689.0,3376,South Carolina,...,1.0,66,455.0,2021,11.0,-50,0.071,-0.1928,234.0,2021_3372_3376


In [10]:
# Sanity check: per-season matchup counts next to a hard-coded reference count
# for the 2025 submission.
print("Expected: Year 2025, Lines 131407")
print()

for y in YEARS:
    # Counted outside the f-string: reusing double quotes inside a replacement
    # field is a SyntaxError on Python versions before 3.12.
    n_rows = len( df_cross[ df_cross["Season"] == y ] )
    print(f"Actual:   Year {y}, Lines {n_rows}")

Expected: Year 2025, Lines 131407

Actual:   Year 2021, Lines 118684
Actual:   Year 2022, Lines 127093
Actual:   Year 2023, Lines 130683
Actual:   Year 2024, Lines 129961


# Load Tournament Results

In [11]:
# Played tournament games (men's and women's) restricted to the training seasons
df_ = load_mens_and_womens( "NCAATourneyCompactResults.csv" )
df_tourney = df_.loc[ df_["Season"].isin( YEARS ) ].copy()

# GameID uses the same Season_lowID_highID layout as df_cross; Team1Win is the label
df_tourney["GameID"] = df_tourney.apply( get_tournament_game_id, axis=1 )
df_tourney["Team1Win"] = df_tourney.apply( does_team1_win, axis=1 )

# Attach the matchup features to each played game; the left join keeps every tournament row
merge_cols = [
    "GameID", "RankDiff", "TeamID_X1", "TeamID_X2",
    "WinPercentage_X1", "WinPercentage_X2", "WinPctDiff",
    "SoSDiff", "SeedDiff", "PointsDiff",
]
df_merged = df_tourney.merge( df_cross[merge_cols], how="left", on="GameID" )

# Train the Model

In [12]:
X = df_merged[ X_COLNAMES ]
y = df_merged[ Y_COLNAME ].values.ravel()

# info() prints its report itself and returns None, so it is not wrapped in print()
X.info()

# Hold out 20% of the played games for testing. The fixed random_state makes
# the split (and every score derived from it) reproducible across reruns.
X_train, X_test, y_train, y_test = mds.train_test_split(X, y, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RankDiff    531 non-null    int64  
 1   WinPctDiff  531 non-null    float64
 2   SoSDiff     531 non-null    float64
 3   SeedDiff    531 non-null    float64
 4   PointsDiff  531 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 20.9 KB
None


In [13]:
# max_features is capped at the number of feature columns so that a shorter
# X_COLNAMES (e.g. the 3-column alternative) does not make the fit fail.
# random_state pins the per-split feature subsampling so reruns give the same model.
# NOTE(review): the recorded outputs show ~0.99 balanced accuracy on the training
# split versus ~0.70 on the test split; max_depth=10 may be overfitting.
clf = en.GradientBoostingClassifier( n_estimators=20,
                                     min_samples_split=10,
                                     min_samples_leaf=5,
                                     max_features=min(4, X_train.shape[1]),
                                     max_depth=10,
                                     random_state=42 )
model = clf.fit(X_train, y_train)

In [14]:
# Feature importances of the fitted model, largest first
importances = model.feature_importances_

feature_imp_df = (
    pd.DataFrame( {'Feature': X_COLNAMES, 'Gini Importance': importances} )
      .sort_values( by='Gini Importance', ascending=False )
)

feature_imp_df

Unnamed: 0,Feature,Gini Importance
3,SeedDiff,0.332699
4,PointsDiff,0.238974
2,SoSDiff,0.161465
0,RankDiff,0.144201
1,WinPctDiff,0.122662


# Validate the Model

In [15]:
# Report on the training split
print_confusion_matrix_info( m=model, X=X_train, y=y_train )

Confusion Matrix, at Threshold 0.500
  tn 204  fp 4  
  fn 1    tp 215

Bal Acc: 0.988, Pre: 0.982, Rec: 0.995, F1 Micro: 0.988

6-fold accuracy mean: 0.657, stdev: 0.052
  each: [np.float64(0.72), np.float64(0.58), np.float64(0.68), np.float64(0.63), np.float64(0.63), np.float64(0.7)]


In [16]:
# Report on the held-out split. The cross-validation part of the report runs
# cross_val_score on X_test / y_test themselves.
print_confusion_matrix_info( m=model, X=X_test, y=y_test )

Confusion Matrix, at Threshold 0.500
  tn 37   fp 13 
  fn 20   tp 37 

Bal Acc: 0.695, Pre: 0.74, Rec: 0.649, F1 Micro: 0.692

6-fold accuracy mean: 0.71, stdev: 0.046
  each: [np.float64(0.72), np.float64(0.72), np.float64(0.78), np.float64(0.67), np.float64(0.72), np.float64(0.65)]


# Write the Submission File

Load the summary file for the submission year

In [17]:
df_ = pd.read_csv( DATA_DIR + f"Summary.{SUBMIT_YEAR}.csv" )

# Rank must be derived the same way as for the training data: sort by
# WinPercentage then SOS (descending) before numbering, rather than numbering
# rows in file order. sort_values also returns a new frame, so insert() is not
# applied to a slice of df_.
# NOTE(review): training ranks are assigned over all training seasons pooled,
# while this is a single season, so the Rank scale still differs; confirm.
df_m_ = df_[ df_[ "Gender" ] == "M" ].sort_values(by=['WinPercentage','SOS'], ascending=False)
df_m_.insert(0, 'Rank', range(1, 1 + len(df_m_)))

df_w_ = df_[ df_[ "Gender" ] == "W" ].sort_values(by=['WinPercentage','SOS'], ascending=False)
df_w_.insert(0, 'Rank', range(1, 1 + len(df_w_)))

# Every men's pairing and every women's pairing for the submission season
df_m_cross = cross_dataframe( df_m_[cross_col_names] )
df_w_cross = cross_dataframe( df_w_[cross_col_names] )

df_combine = pd.concat([df_m_cross, df_w_cross], ignore_index=True)
print(df_combine.shape)

(131407, 16)


In [18]:
# Seed difference; pairings where either team has no seed value get 0
df_combine["SeedDiff"] = df_combine["SeedValue_X1"].sub( df_combine["SeedValue_X2"] ).fillna(0)

# Remaining features are plain Team1-minus-Team2 differences of one summary column each
submit_diff_sources = {
    "RankDiff":   "Rank",
    "WinPctDiff": "WinPercentage",
    "SoSDiff":    "SOS",
    "PointsDiff": "PointsDiff",
}
for diff_col, base_col in submit_diff_sources.items():
    df_combine[diff_col] = df_combine[f"{base_col}_X1"] - df_combine[f"{base_col}_X2"]

df_combine["Season"] = SUBMIT_YEAR

# Identifier in Season_TeamX1_TeamX2 form for each hypothetical matchup
df_combine["GameID"] = df_combine.apply( get_cross_game_id, axis=1 )

Apply the trained model to the entire submission dataset to predict win probabilities

In [19]:
# Feature matrix for every submission matchup, columns in the order used for training
X_submit = df_combine.loc[ :, X_COLNAMES ]
X_submit.info()

# Per-class probabilities for each matchup, from the already-fitted model
y_submit = model.predict_proba( X_submit )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131407 entries, 0 to 131406
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   RankDiff    131407 non-null  int64  
 1   WinPctDiff  131407 non-null  float64
 2   SoSDiff     131407 non-null  float64
 3   SeedDiff    131407 non-null  float64
 4   PointsDiff  131407 non-null  int64  
dtypes: float64(3), int64(2)
memory usage: 5.0 MB


Write submission dataset to a file

In [20]:
# predict_proba columns follow model.classes_; look up the column for class 1
# (Team1Win == 1) instead of assuming its position.
pos_col = list(model.classes_).index(1)
df_combine["Pred"] = y_submit[:, pos_col]
df_combine.rename( columns={'GameID': 'ID'}, inplace=True )

# Filename carries a free-text note and a timestamp so earlier submissions are not overwritten
submission_note = "2025TweakModelv1"
datetime_str = get_datetime_str()
filename = f"submit.{submission_note}.{datetime_str}.csv"

print(f"Writing {len(df_combine)} lines to {filename} ...")
# Written next to the input data, using the shared DATA_DIR rather than a repeated literal path
df_combine[["ID", "Pred"]].to_csv(DATA_DIR + filename, index=False)

Writing 131407 lines to submit.2025TweakModelv1.20250317.130006.csv ...
