Calculate Strength of Schedule
==============================

This notebook calculates strength of schedule for both mens and womens NCAA Division 1 leagues, outputting the results to a file.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import sklearn.ensemble as en
import sklearn.model_selection as mds
import sklearn.metrics as ms
import sklearn.preprocessing as pre

from scipy import interpolate
from functools import reduce


### CONSTANTS ###

DATA_DIR = '../data/kaggle/'
YEAR     = 2025


### FUNCTIONS ###

def extract_seed_value(seed_str):
    try:
        return int(seed_str[1:])
    except:
        np.nan

def load_mens_and_womens( filename ):

    df_m_ = pd.read_csv( DATA_DIR + "M" + filename )
    df_m_["Gender"] = "M"
    
    df_w_ = pd.read_csv( DATA_DIR + "W" + filename )
    df_w_["Gender"] = "W"
    
    df_concat = pd.concat([ df_m_, df_w_ ])
    
    return df_concat

# Load Teams and Regular Season Data

Load regular season data for both mens and womens teams

In [2]:
df_reg_ = load_mens_and_womens( "RegularSeasonCompactResults.csv" )
df_reg_ = df_reg_[ df_reg_["Season"] == YEAR ]

In [3]:
df_reg_.head(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,Gender
187289,2025,0,1104,110,1421,54,H,0,M
187290,2025,0,1112,93,1145,64,H,0,M
187291,2025,0,1117,80,1103,75,H,1,M
187292,2025,0,1119,67,1107,59,H,0,M
187293,2025,0,1130,69,1154,60,H,0,M


In [4]:
df_teams_ = load_mens_and_womens( "Teams.csv" )
df_teams_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 758 entries, 0 to 377
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TeamID         758 non-null    int64  
 1   TeamName       758 non-null    object 
 2   FirstD1Season  380 non-null    float64
 3   LastD1Season   380 non-null    float64
 4   Gender         758 non-null    object 
dtypes: float64(2), int64(1), object(2)
memory usage: 35.5+ KB


# Compute Team Records



In [5]:
df_wins        = df_reg_.groupby(['WTeamID']).size().reset_index(name='Wins')
df_win_points  = df_reg_.groupby('WTeamID')["WScore"].sum()

df_wins = df_wins.merge( df_win_points, on="WTeamID" )
df_wins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726 entries, 0 to 725
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   WTeamID  726 non-null    int64
 1   Wins     726 non-null    int64
 2   WScore   726 non-null    int64
dtypes: int64(3)
memory usage: 17.1 KB


In [6]:
df_losses      = df_reg_.groupby(['LTeamID']).size().reset_index(name='Losses')
df_loss_points = df_reg_.groupby('LTeamID')["LScore"].sum()

df_losses = df_losses.merge( df_loss_points, on="LTeamID" )
df_losses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726 entries, 0 to 725
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   LTeamID  726 non-null    int64
 1   Losses   726 non-null    int64
 2   LScore   726 non-null    int64
dtypes: int64(3)
memory usage: 17.1 KB


In [7]:
df_summary = df_wins.merge( df_losses, left_on='WTeamID', right_on='LTeamID', how='inner' )
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726 entries, 0 to 725
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   WTeamID  726 non-null    int64
 1   Wins     726 non-null    int64
 2   WScore   726 non-null    int64
 3   LTeamID  726 non-null    int64
 4   Losses   726 non-null    int64
 5   LScore   726 non-null    int64
dtypes: int64(6)
memory usage: 34.2 KB


In [8]:
df_summary.drop( "LTeamID", axis=1, inplace=True )
df_summary.rename( columns={ "WTeamID": "TeamID" }, inplace=True )

df_summary["WinDiff"] = df_summary["Wins"] - df_summary["Losses"]
df_summary["PointsDiff"] = df_summary["WScore"] - df_summary["LScore"]
df_summary["TotalGames"] = df_summary["Wins"] + df_summary["Losses"]
df_summary["WinPercentage"] = round(df_summary["Wins"] / df_summary["TotalGames"], 3)
df_summary["Season"] = YEAR

In [9]:
df_summary = df_summary.merge( df_teams_[["TeamID", "TeamName", "Gender"]], on="TeamID" )
df_summary.head(5)

Unnamed: 0,TeamID,Wins,WScore,Losses,LScore,WinDiff,PointsDiff,TotalGames,WinPercentage,Season,TeamName,Gender
0,1101,12,909,14,845,-2,64,26,0.462,2025,Abilene Chr,M
1,1102,4,296,26,1580,-22,-1284,30,0.133,2025,Air Force,M
2,1103,22,1903,6,424,16,1479,28,0.786,2025,Akron,M
3,1104,23,2157,6,482,17,1675,29,0.793,2025,Alabama,M
4,1105,7,577,19,1269,-12,-692,26,0.269,2025,Alabama A&M,M


# Merge with Tournament Seed Data

In [10]:
df_ = load_mens_and_womens( "NCAATourneySeeds.csv" )
df_seeds = df_[ df_["Season"] == YEAR ]
df_seeds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Season  0 non-null      int64 
 1   Seed    0 non-null      object
 2   TeamID  0 non-null      int64 
 3   Gender  0 non-null      object
dtypes: int64(2), object(2)
memory usage: 0.0+ bytes


In [11]:
df_summary = df_summary.merge( df_seeds, on=['TeamID', 'Gender', 'Season'], how='left' )
df_summary['SeedValue'] = df_summary['Seed'].apply(extract_seed_value)
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726 entries, 0 to 725
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TeamID         726 non-null    int64  
 1   Wins           726 non-null    int64  
 2   WScore         726 non-null    int64  
 3   Losses         726 non-null    int64  
 4   LScore         726 non-null    int64  
 5   WinDiff        726 non-null    int64  
 6   PointsDiff     726 non-null    int64  
 7   TotalGames     726 non-null    int64  
 8   WinPercentage  726 non-null    float64
 9   Season         726 non-null    int64  
 10  TeamName       726 non-null    object 
 11  Gender         726 non-null    object 
 12  Seed           0 non-null      object 
 13  SeedValue      0 non-null      object 
dtypes: float64(1), int64(9), object(4)
memory usage: 79.5+ KB


# Compute Team Stength of Schedule

In [12]:
teams = set(df_reg_['WTeamID'].unique()).union(df_reg_['LTeamID'].unique())
print(f"Number of teams: {len(teams)}")

Number of teams: 726


In [13]:
# Function to calculate win percentage
def win_percentage(team, df):
    wins = len(df[((df['WTeamID'] == team) & (df['WScore'] > df['LScore'])) | 
                   ((df['LTeamID'] == team) & (df['LScore'] > df['WScore']))])
    losses = len(df[((df['WTeamID'] == team) & (df['WScore'] < df['LScore'])) | 
                     ((df['LTeamID'] == team) & (df['LScore'] < df['WScore']))])
    return wins / (wins + losses) if (wins + losses) > 0 else 0

# Function to calculate OWP
def calculate_owp(team, df):
    opponents = set()
    for index, row in df.iterrows():
        if row['WTeamID'] == team:
            opponents.add(row['LTeamID'])
        elif row['LTeamID'] == team:
            opponents.add(row['WTeamID'])
    
    total_owp = 0
    for opponent in opponents:
        # Filter out games between team and opponent to avoid circularity
        temp_df = df[~((df['WTeamID'] == team) & (df['LTeamID'] == opponent) | (df['WTeamID'] == opponent) & (df['LTeamID'] == team))]
        total_owp += win_percentage(opponent, temp_df)
    return total_owp / len(opponents) if opponents else 0

# Function to calculate OOWP
def calculate_oowp(team, df):
    opponents = set()
    for index, row in df.iterrows():
        if row['WTeamID'] == team:
            opponents.add(row['LTeamID'])
        elif row['LTeamID'] == team:
            opponents.add(row['WTeamID'])
    
    total_oowp = 0
    for opponent in opponents:
        total_oowp += calculate_owp(opponent, df)
    return total_oowp / len(opponents) if opponents else 0

In [14]:
sos_data = []
for team in list(teams):
  owp = round(calculate_owp(team, df_reg_),4)
  #oowp = calculate_oowp(team, df_reg_)
  #sos = (owp * (2/3)) + (oowp * (1/3))
  sos_data.append({'TeamID': team, 'SOS': owp})

sos_df = pd.DataFrame(sos_data)
print(sos_df)

     TeamID     SOS
0      3257  0.6082
1      3258  0.4581
2      3259  0.4894
3      3260  0.4381
4      3262  0.4545
..      ...     ...
721    3251  0.5408
722    3252  0.4578
723    3253  0.4840
724    3254  0.3715
725    3255  0.3754

[726 rows x 2 columns]


# Merge SoS with Season Results

In [15]:
df_summary = df_summary.merge( sos_df, on='TeamID', how='left' )
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726 entries, 0 to 725
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TeamID         726 non-null    int64  
 1   Wins           726 non-null    int64  
 2   WScore         726 non-null    int64  
 3   Losses         726 non-null    int64  
 4   LScore         726 non-null    int64  
 5   WinDiff        726 non-null    int64  
 6   PointsDiff     726 non-null    int64  
 7   TotalGames     726 non-null    int64  
 8   WinPercentage  726 non-null    float64
 9   Season         726 non-null    int64  
 10  TeamName       726 non-null    object 
 11  Gender         726 non-null    object 
 12  Seed           0 non-null      object 
 13  SeedValue      0 non-null      object 
 14  SOS            726 non-null    float64
dtypes: float64(2), int64(9), object(4)
memory usage: 85.2+ KB


In [16]:
df_summary.sort_values(by=['WinPercentage','SOS'], ascending=False, inplace=True)
df_summary.head(10)

Unnamed: 0,TeamID,Wins,WScore,Losses,LScore,WinDiff,PointsDiff,TotalGames,WinPercentage,Season,TeamName,Gender,Seed,SeedValue,SOS
649,3400,29,2393,2,120,27,2273,31,0.935,2025,Texas,W,,,0.6366
666,3417,27,2157,2,127,25,2030,29,0.931,2025,UCLA,W,,,0.634
674,3425,26,2162,2,130,24,2032,28,0.929,2025,USC,W,,,0.6144
469,3213,25,1962,2,101,23,1861,27,0.926,2025,Grand Canyon,W,,,0.456
419,3163,28,2311,3,214,25,2097,31,0.903,2025,Connecticut,W,,,0.6573
644,3395,28,2273,3,161,25,2112,31,0.903,2025,TCU,W,,,0.5624
17,1120,27,2318,3,231,24,2087,30,0.9,2025,Auburn,M,,,0.6682
626,3376,27,2233,3,182,24,2051,30,0.9,2025,South Carolina,W,,,0.6608
74,1181,27,2290,3,215,24,2075,30,0.9,2025,Duke,M,,,0.578
590,3339,27,2101,3,208,24,1893,30,0.9,2025,Portland,W,,,0.428


# Output to File

In [17]:
df_summary.to_csv(f"../data/kaggle/Summary.{YEAR}.csv", index=False)