# **Load Data**

In [1]:
import pandas as pd
import os
import re
from scipy.stats import spearmanr
import joblib
from sklearn.preprocessing import StandardScaler
import numpy as np
import warnings
from scripts import modeling_functions as mf

def read_csv_files(directory):
    """
    Load every ``.csv`` file found directly inside a directory.

    Each file is parsed with ``pd.read_csv``. A file that cannot be read is
    reported on stdout and skipped, so one bad file does not abort the load.

    Args:
        directory (str): Path of the directory to scan.

    Returns:
        dict: Maps each file name with its ``.csv`` suffix removed to the
        DataFrame parsed from that file. Empty if the directory could not
        be listed.
    """
    suffix = ".csv"
    loaded = {}

    try:
        for filename in os.listdir(directory):
            # Anything that is not a CSV file is silently ignored.
            if not filename.endswith(suffix):
                continue

            filepath = os.path.join(directory, filename)
            try:
                frame = pd.read_csv(filepath)
            except FileNotFoundError:
                print(f"Error: File not found - {filepath}")
            except pd.errors.EmptyDataError:
                print(f"Error: Empty CSV file - {filepath}")
            except pd.errors.ParserError:
                print(f"Error: Could not parse CSV file - {filepath}")
            except Exception as e:
                print(f"An unexpected error occurred while reading {filename}: {e}")
            else:
                # Key the frame by the file name without its extension.
                loaded[filename[:-len(suffix)]] = frame
                print(f"Successfully read: {filename}")
    except FileNotFoundError:
        print(f"Error: Directory not found - {directory}")
    except Exception as e:
        print(f"An unexpected error occurred while processing directory {directory}: {e}")

    return loaded

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Directory holding the raw CSV files; each one becomes an entry in data_dict,
# keyed by its file name without the ".csv" suffix.
directory_path = "data/raw/"  
data_dict = read_csv_files(directory_path)

Successfully read: MNCAATourneyDetailedResults.csv
Successfully read: SampleSubmissionStage2.csv
Successfully read: WSecondaryTourneyTeams.csv
Successfully read: WNCAATourneySlots.csv
Successfully read: MNCAATourneyCompactResults.csv
Successfully read: MSeasons.csv
Successfully read: SampleSubmissionStage1.csv
Successfully read: WTeams.csv
Successfully read: MRegularSeasonDetailedResults.csv
Successfully read: WNCAATourneyDetailedResults.csv
Successfully read: MNCAATourneySlots.csv
Successfully read: MGameCities.csv
Successfully read: MConferenceTourneyGames.csv
Successfully read: WNCAATourneyCompactResults.csv
Successfully read: WSecondaryTourneyCompactResults.csv
Successfully read: WSeasons.csv
Successfully read: Cities.csv
Successfully read: WRegularSeasonCompactResults.csv
Successfully read: WTeamSpellings.csv
Successfully read: WRegularSeasonDetailedResults.csv
Successfully read: MRegularSeasonCompactResults.csv
Successfully read: WNCAATourneySeeds.csv
Successfully read: MNCAATour

# **Pipeline Methods/Functions**

In [2]:
def impute_missing_with_median(df,features):
    """
    Fill missing values in the selected columns with each column's median.

    The frame is modified in place and is also returned.

    Args:
        df (pd.DataFrame): The DataFrame to impute.
        features (iterable of str): Names of the columns to impute.

    Returns:
        pd.DataFrame: The same ``df`` object, with gaps in ``features`` filled.
    """

    for name in features:
        column = df[name]
        # Columns without gaps are left untouched.
        if not column.isnull().any():
            continue
        df[name] = column.fillna(column.median())
    return df
def load_svm_model_and_predict(data, svm_model_path="svm_model_women.joblib", scaler_path="svm_scaler_women.joblib",
                               features_path="features_svm_model_women.npy"):
    """
    Loads an SVM classifier, its scaler and its feature list from disk and predicts on the given data.

    Missing values in the model's feature columns are imputed with the column
    median before scaling; this imputation modifies ``data`` in place.

    Args:
        data (pd.DataFrame): The DataFrame containing team statistics.
        svm_model_path (str): Path to the SVM model.
        scaler_path (str): Path to the scaler object.
        features_path (str): Path to the ``.npy`` file listing the feature
            column names the model expects. Defaults to the file name that
            was previously hard-coded.

    Returns:
        pd.DataFrame or None: ``data`` with an added 'Pred' column, computed as
        one minus the second column of ``predict_proba``. None is returned
        (after printing the error) if loading or prediction fails.
    """
    
    try:
        # Load Model and Scaler
        svm_model = joblib.load(svm_model_path)
        scaler = joblib.load(scaler_path)

        # Prepare data for SVM
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
        svm_features = np.load(features_path, allow_pickle=True)
        data = impute_missing_with_median(data, svm_features)
        X_svm = data[svm_features]

        # Scale the data
        X_svm_scaled = scaler.transform(X_svm)

        # Predict probabilities
        predicted_probabilities = svm_model.predict_proba(X_svm_scaled)[:, 1]
        data['Pred'] = 1 - predicted_probabilities

        return data

    except FileNotFoundError as e:
        print(f"Error: Model or scaler file not found - {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


def load_xgb_model_and_predict(data, xgb_model_path="xgb_model.joblib"):
    """
    Load a saved XGBoost classifier and score the given rows with it.

    The feature columns are taken from the model's ``feature_names_in_``.

    Args:
        data (pd.DataFrame): The DataFrame containing team statistics.
        xgb_model_path (str): Path to the XGBoost model.

    Returns:
        pd.DataFrame or None: ``data`` with an added 'Pred' column, computed as
        one minus the second column of ``predict_proba``. None is returned
        (after printing the error) if loading or prediction fails.
    """

    try:
        model = joblib.load(xgb_model_path)

        # Select exactly the columns the model was fitted on.
        model_inputs = data[model.feature_names_in_.tolist()]

        second_class_proba = model.predict_proba(model_inputs)[:, 1]
        data['Pred'] = 1 - second_class_proba
    except FileNotFoundError as e:
        print(f"Error: Model file not found - {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

    return data
def add_tournament_data(data_with_momentum, tourney_seeds_df):
    """
    Left-join tournament seed rows onto the team-season feature table.

    Rows are matched on 'Season' and 'TeamID'; team-seasons without a match
    keep their row and get missing values in the seed columns.

    Args:
        data_with_momentum (pd.DataFrame): Team-season feature table.
        tourney_seeds_df (pd.DataFrame): Table with 'Season', 'TeamID' and seed columns.

    Returns:
        pd.DataFrame: The feature table with the seed columns appended.
    """
    join_keys = ['Season', 'TeamID']
    return data_with_momentum.merge(tourney_seeds_df, how='left', on=join_keys)
def calculate_momentum(group):
    """
    Spearman correlation between ranking day and ordinal rank over the
    40 rows with the highest 'RankingDayNum'.

    Args:
        group (pd.DataFrame): Rows with 'RankingDayNum' and 'OrdinalRank' columns.

    Returns:
        The correlation coefficient, or 0 when fewer than two rows are
        available (a correlation needs at least two points).
    """

    recent = group.sort_values('RankingDayNum', ascending=False).iloc[:40]

    if len(recent) >= 2:
        result = spearmanr(recent['RankingDayNum'], recent['OrdinalRank'])
        return result.correlation

    return 0

def get_mean_stats(regular_season_results):
    """
    Compute per-season averages of the box-score statistics.

    The winner-side and loser-side columns of every game are stacked into one
    table with the 'W'/'L' prefix removed, then averaged per 'Season'.

    Args:
        regular_season_results (pd.DataFrame): DataFrame containing regular season detailed results.

    Returns:
        tuple: ``(average_stats, features)`` where ``average_stats`` has one
        row per season with a 'Reg_Avg_<stat>' column for each statistic, and
        ``features`` is the list of un-prefixed statistic names.
    """

    stats = ['Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

    # 'NumOT' carries no prefix and is only selected together with the winner columns.
    winner_cols = ['Season', 'WTeamID', 'WScore', 'NumOT'] + [f'W{stat}' for stat in stats[1:]]
    loser_cols = ['Season', 'LTeamID'] + [f'L{stat}' for stat in stats]

    winners = regular_season_results[winner_cols].rename(
        columns=lambda name: name[1:] if name.startswith('W') else name
    )
    losers = regular_season_results[loser_cols].rename(
        columns=lambda name: name[1:] if name.startswith('L') else name
    )

    stacked = pd.concat([winners, losers])
    season_means = stacked.groupby(['Season'])[stats].mean().reset_index()
    season_means = season_means.rename(columns={stat: f'Reg_Avg_{stat}' for stat in stats})
    return season_means, stats
def create_initial_features(regular_season_results, conferences):
    """
    Creates initial team statistics features from regular season results.

    For every (Season, TeamID) the box-score statistics are averaged separately
    over the team's wins and over its losses. The two averages are then
    combined, weighted by the share of games won and lost. Win/loss counts,
    conference information and differences to the season-wide averages are
    appended. Two progress lines (expected and actual row counts) are printed.

    Args:
        regular_season_results (pd.DataFrame): Regular season detailed results.
        conferences (pd.DataFrame): Team conference information, joined on
            'Season' and 'TeamID'.

    Returns:
        pd.DataFrame: Team statistics features, one row per team-season that
        also appears in ``conferences``.
    """

    # Number of distinct team-seasons; printed so it can be compared with the final row count.
    expected_rows = calculate_expected_rows_after_aggregation(regular_season_results )
    print('Expected Rows: ', expected_rows)
    win_team_features = ['WScore', 'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
    lose_team_features = ['LScore', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']

    winning_agg_dict = {col: 'mean' for col in win_team_features}
    losing_agg_dict = {col: 'mean' for col in lose_team_features}

    # Per-team averages, computed separately over games won and games lost.
    win_games_data = regular_season_results.groupby(['Season', 'WTeamID']).agg(winning_agg_dict).reset_index()
    lose_games_data = regular_season_results.groupby(['Season', 'LTeamID']).agg(losing_agg_dict).reset_index()

    # Remove the 'W' / 'L' characters so both frames share the same column names.
    win_games_data.columns = [re.sub('W', "", col) for col in win_games_data.columns]
    win_games_data['Game_Kind'] = 'W'

    lose_games_data.columns = [re.sub('L', "", col) for col in lose_games_data.columns]
    lose_games_data['Game_Kind'] = 'L'

    wins_count = regular_season_results.groupby(['Season', 'WTeamID']).size().reset_index(name='Wins').rename(columns={'WTeamID': 'TeamID'})
    loss_count = regular_season_results.groupby(['Season', 'LTeamID']).size().reset_index(name='Losses').rename(columns={'LTeamID': 'TeamID'})

    # Outer join so teams with no wins or no losses are kept (missing count -> 0).
    record = pd.merge(wins_count, loss_count, on=['Season', 'TeamID'], how='outer').reset_index().fillna(0)
    record['Games'] = record['Wins'] + record['Losses']
    record['Win_Weight'] = record['Wins'] / record['Games']
    record['Loss_Weight'] = record['Losses'] / record['Games']

    all_game_data = pd.concat([win_games_data, lose_games_data])
    
    all_game_data = pd.merge(all_game_data, record[['Season', 'TeamID', 'Win_Weight', 'Loss_Weight']], on=['Season', 'TeamID'])
    # Each row gets the weight matching its kind: win-average rows use the win share, loss-average rows the loss share.
    all_game_data['Weight'] = all_game_data.apply(lambda x: x['Win_Weight'] if x['Game_Kind'] == 'W' else x['Loss_Weight'], axis=1)

    # 'NumOT' is deliberately left out of the weighting; it only exists on the win-average rows.
    features = [col for col in all_game_data.columns if col not in ['Season', 'TeamID', 'Win_Weight', 'Loss_Weight', 'Weight', 'Game_Kind', 'NumOT']]
    for col in features:
        all_game_data[col] = all_game_data[col] * all_game_data['Weight']

    # Summing the weighted win and loss averages gives the weighted average over both kinds of game.
    agg_dict = {col: 'sum' for col in win_games_data.columns if col not in ['Season', 'TeamID', 'Win_Weight', 'Loss_Weight', 'Game_Kind', 'Win_Weight', 'Loss_Weight']}
    team_stats = all_game_data.groupby(['Season', 'TeamID']).agg(agg_dict).reset_index()

    team_stats = pd.merge(team_stats, wins_count, on=['Season', 'TeamID'], how='outer').fillna(0)
    team_stats = pd.merge(team_stats, loss_count, on=['Season', 'TeamID'], how='outer').fillna(0)
    # Inner join: team-seasons missing from `conferences` are dropped here.
    team_stats = pd.merge(team_stats, conferences, on=['Season', 'TeamID'])

    average_stats,diff_features=get_mean_stats(regular_season_results)
    team_stats_final=pd.merge(team_stats,average_stats, on='Season')

    # Difference between each team statistic and the season-wide average of that statistic.
    for col in diff_features:
        team_stats_final[f'{col}_League_Diff']=team_stats_final[col]-team_stats_final[f'Reg_Avg_{col}']

    print('length after initial features added: ',len(team_stats_final))
    return team_stats_final
def create_conference_features(rs_team_stats):
    """
    Append conference-level averages and team-vs-conference differences.

    For every column that is neither an identifier, 'Tournament_Wins', nor a
    league-difference column, the mean over all teams sharing the same
    'ConfAbbrev' and 'Season' is added as '<col>_Conference', and the team's
    value minus that mean as '<col>_ConfDiff'.

    Args:
        rs_team_stats (pd.DataFrame): Team statistics DataFrame.

    Returns:
        pd.DataFrame: Team statistics with conference features.
    """

    skipped = ['Season', 'TeamID', 'ConfAbbrev', 'Tournament_Wins']
    features = [
        name for name in rs_team_stats.columns
        if name not in skipped and 'League' not in name
    ]
    group_keys = ['ConfAbbrev', 'Season']

    conference_stats = (
        rs_team_stats.groupby(group_keys)
        .agg(dict.fromkeys(features, 'mean'))
        .add_suffix('_Conference')
        .reset_index()
    )
    merged = pd.merge(rs_team_stats, conference_stats, on=group_keys)

    # Team value minus the mean of its conference, one new column per feature.
    for name in features:
        merged[f'{name}_ConfDiff'] = merged[name].values - merged[f'{name}_Conference'].values

    return merged
def add_rankings_features(rankings, team_stats_conf):
    """
    Append each team's mean ordinal rank for the season.

    Only ranking rows with 'RankingDayNum' <= 121 are used. The join is an
    inner join, so team-seasons without any such ranking row are dropped.

    Args:
        rankings (pd.DataFrame): DataFrame containing team rankings.
        team_stats_conf (pd.DataFrame): DataFrame containing team statistics with conference features.

    Returns:
        pd.DataFrame: ``team_stats_conf`` with an 'AverageOrdinalRank' column.
    """
    keys = ['Season', 'TeamID']

    # Day 121 is the cut-off used throughout this file (presumably the last pre-tournament day; confirm).
    in_window = rankings.loc[rankings['RankingDayNum'] <= 121]

    mean_rank = (
        in_window.groupby(keys)['OrdinalRank']
        .mean()
        .rename('AverageOrdinalRank')
        .reset_index()
    )

    return team_stats_conf.merge(mean_rank, on=keys)

def filter_rankings_by_system(rankings_df, system_name):
    """
    Keep the rows of one ranking system, up to and including day 121.

    Args:
        rankings_df (pd.DataFrame): The rankings DataFrame.
        system_name (str): The name of the ranking system to filter by.

    Returns:
        pd.DataFrame: Rows whose 'SystemName' equals ``system_name`` and whose
        'RankingDayNum' is at most 121.
    """
    within_cutoff = rankings_df['RankingDayNum'] <= 121
    from_system = rankings_df['SystemName'] == system_name
    return rankings_df[within_cutoff & from_system]


def add_selected_rankings_features(rankings, team_stats_conf_rank, selected_systems=('POM', 'SAG', 'WOL')):
    """
    Adds latest rankings features from selected systems to team statistics.

    For each system, the ranking with the highest 'RankingDayNum' (at most 121)
    per team-season is added as 'LatestOrdinalRank_<system>'. The join is a left
    join, so team-seasons a system did not rank get a missing value.

    Args:
        rankings (pd.DataFrame): DataFrame containing team rankings.
        team_stats_conf_rank (pd.DataFrame): DataFrame containing team statistics with conference and rankings features.
        selected_systems (iterable of str): Ranking system names to include.
            The default is an immutable tuple so it cannot be altered between calls.

    Returns:
        pd.DataFrame: DataFrame containing team statistics with selected rankings features.
    """
    keys = ['Season', 'TeamID']
    rankings = rankings[rankings['RankingDayNum'] <= 121]

    for system in selected_systems:
        rank_col = f'LatestOrdinalRank_{system}'

        # Filter rankings for the current system
        filtered_rankings = rankings[rankings['SystemName'] == system]

        # Newest ranking first, so first() picks the latest value per team-season
        sorted_rankings = filtered_rankings.sort_values('RankingDayNum', ascending=False)
        latest_rankings = sorted_rankings.groupby(keys).first().reset_index()
        latest_rankings = latest_rankings.rename(columns={'OrdinalRank': rank_col})

        # Merge with team_stats_conf_rank
        team_stats_conf_rank = pd.merge(
            team_stats_conf_rank,
            latest_rankings[keys + [rank_col]],
            on=keys,
            how='left',
        )

    return team_stats_conf_rank
def create_opponent_features(sample_submission,is_rankings=True):
    """
    Add team-minus-opponent difference columns.

    For every statistic a '<stat>_Diff' column is written as
    'Reg_<stat>' minus 'Opponent_reg_<stat>'. The frame is modified in place
    and also returned.

    Args:
        sample_submission (pd.DataFrame): Matchup rows holding the 'Reg_' and
            'Opponent_reg_' columns for each statistic.
        is_rankings (bool): When True, the ranking-based statistics
            (average rank, latest POM rank, momentum) are included as well.

    Returns:
        pd.DataFrame: The same frame with the difference columns added.
    """
    box_score_stats = ['Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                       'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'Wins', 'Losses']
    ranking_stats = ['AverageOrdinalRank', 'LatestOrdinalRank_POM', 'Momentum']

    selected = box_score_stats + ranking_stats if is_rankings else box_score_stats

    for stat in selected:
        own = sample_submission[f'Reg_{stat}']
        other = sample_submission[f'Opponent_reg_{stat}']
        sample_submission[f'{stat}_Diff'] = own - other

    return sample_submission
def training_data_reg_season(regular_season_results,conference,rankings_df,tourney_seeds,teams,add_rankings=True):
    """
    Build the per-team, per-season feature table from regular season data.

    Steps: initial box-score features, conference features, optional ranking
    features (average rank, latest rank per selected system, POM momentum),
    team names, and finally the tournament seeds.

    Args:
        regular_season_results (pd.DataFrame): Regular season detailed results.
        conference (pd.DataFrame): Team conference information.
        rankings_df (pd.DataFrame): Team rankings; only used when ``add_rankings`` is True.
        tourney_seeds (pd.DataFrame): Tournament seeds, left-joined on 'Season' and 'TeamID'.
        teams (pd.DataFrame): Table with at least 'TeamID' and 'TeamName'.
        add_rankings (bool): Whether to add the ranking-based features.

    Returns:
        pd.DataFrame: The team-season feature table with seed columns.
    """
    keys = ['Season', 'TeamID']
    conf_stats = create_conference_features(create_initial_features(regular_season_results, conference))

    if not add_rankings:
        season_features = conf_stats
    else:
        ranked = add_rankings_features(rankings_df, conf_stats)
        ranked = add_selected_rankings_features(rankings_df, ranked)

        # Momentum is computed from the POM ranking history of each team-season.
        pom_history = filter_rankings_by_system(rankings_df, 'POM')
        joined = pd.merge(conf_stats, pom_history, on=keys, how='left')
        momentum = joined.groupby(keys).apply(calculate_momentum).reset_index(name='Momentum')

        # A constant rank yields NaN from the correlation; treat that as no momentum.
        momentum['Momentum'] = momentum['Momentum'].fillna(0)

        season_features = pd.merge(ranked, momentum, on=keys)

    named = pd.merge(season_features, teams[['TeamID', 'TeamName']])
    seeded = add_tournament_data(named, tourney_seeds)

    print('Length at the end of reg season data prep',len(seeded))
    return seeded
def create_training_data(tournament_results, with_tourney_seeds,is_rankings=True):
    """
    Creates training data from tournament results and team statistics.

    Every tournament game is expanded into rows seen from the winner's side
    (``efs`` = 0) and from the loser's side (``efs`` = 1). Each row carries the
    team's regular season features ('Reg_' prefix), the opponent's features
    ('Opponent_reg_' prefix), and their differences. All joins with the
    feature table are inner joins on season and team id.

    Args:
        tournament_results (pd.DataFrame): DataFrame containing tournament results.
        with_tourney_seeds (pd.DataFrame): DataFrame containing team statistics with tournament seed information.
        is_rankings (bool): Forwarded to ``create_opponent_features``; whether
            ranking-based difference columns are computed.

    Returns:
        pd.DataFrame: DataFrame containing training data.
    """

    tournament_results = tournament_results.copy() #prevent SettingWithCopyWarning
    # The row index doubles as a game identifier shared by both perspectives of a game.
    tournament_results['Game_ID'] = tournament_results.index

    # Winning Teams
    winning_teams = tournament_results[['Game_ID', 'Season', 'DayNum', 'LTeamID'] + [col for col in tournament_results.columns if col.startswith('W')]]
    first_columns = ['Game_ID', 'Season', 'DayNum', 'LTeamID', 'WTeamID']
    # Positional rename: relies on 'WTeamID' being the first 'W...' column of tournament_results.
    winning_teams.columns = first_columns + [re.sub('W', 'Game_', col) for col in winning_teams.columns if col not in first_columns]
    # NOTE(review): unlike the losing-side dict below, 'TeamName' is not excluded here, so winner
    # rows end up with 'Reg_TeamName' while loser rows keep 'TeamName' - confirm this is intended.
    rename_dict_rs = {col: f'Reg_{col}' for col in with_tourney_seeds.columns if col not in ['Season', 'WTeamID', 'LTeamID', 'DayNum', 'TeamID']}
    all_tournament_data_win = pd.merge(winning_teams, with_tourney_seeds.rename(columns=rename_dict_rs).copy(), left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])

    # Losing Teams
    losing_teams = tournament_results[['Game_ID', 'Season', 'DayNum', 'WTeamID'] + [col for col in tournament_results.columns if col.startswith('L')]]
    first_columns = ['Game_ID', 'Season', 'DayNum', 'WTeamID', 'LTeamID']
    # Positional rename: relies on 'LTeamID' being the first 'L...' column of tournament_results.
    losing_teams.columns = first_columns + [re.sub('L', 'Game_', col) for col in losing_teams.columns if col not in first_columns]
    rename_dict_rs = {col: f'Reg_{col}' for col in with_tourney_seeds.columns if col not in ['Season', 'WTeamID', 'LTeamID', 'DayNum', 'TeamID', 'Game_ID','TeamName']}
    all_tournament_data_lose = pd.merge(losing_teams, with_tourney_seeds.rename(columns=rename_dict_rs).copy(), left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])

    # Combine Winning and Losing Teams
    all_tournament_data_win['TeamID'] = all_tournament_data_win['WTeamID']
    all_tournament_data_win['Opponent_TeamID'] = all_tournament_data_win['LTeamID']
    # efs is the target flag: 0 on rows seen from the winner's side.
    all_tournament_data_win['efs'] = 0
    
    # Opponent-side copy of the feature table, with every feature prefixed 'Opponent_reg_'.
    rename_dict_opponent = {col: f'Opponent_reg_{col}' for col in with_tourney_seeds.columns if col not in ['Season', 'WTeamID', 'LTeamID', 'DayNum', 'TeamID', 'Game_ID','TeamName']}
    rename_dict_opponent['TeamName']='Opponent_TeamName'
    to_join = with_tourney_seeds.rename(columns=rename_dict_opponent).copy()
    all_tournament_data_win_opponent = pd.merge(all_tournament_data_win, to_join, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    all_tournament_data_win_opponent.drop(columns=['WTeamID', 'LTeamID'], inplace=True)

    all_tournament_data_lose['TeamID'] = all_tournament_data_lose['LTeamID']
    all_tournament_data_lose['Opponent_TeamID'] = all_tournament_data_lose['WTeamID']
    # efs = 1 on rows seen from the loser's side.
    all_tournament_data_lose['efs'] = 1

    all_tournament_data_lose_opponent = pd.merge(all_tournament_data_lose, to_join, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    all_tournament_data_lose_opponent.drop(columns=['WTeamID', 'LTeamID'], inplace=True)

    all_tournament_data_survival = pd.concat([all_tournament_data_win_opponent, all_tournament_data_lose_opponent])

    # Days elapsed since the earliest 'DayNum' found anywhere in the combined data.
    all_tournament_data_survival['efs_time'] = all_tournament_data_survival['DayNum'] - all_tournament_data_survival['DayNum'].min()
    # The opponent merge produced TeamID_x (the team) and TeamID_y (the opponent); keep only the former.
    all_tournament_data_survival = all_tournament_data_survival.drop(columns=['TeamID_y']).rename(columns={'TeamID_x': 'TeamID'})
    all_tournament_data_survival=create_opponent_features(all_tournament_data_survival,is_rankings=is_rankings)
    return all_tournament_data_survival
def calculate_expected_rows_after_aggregation(df, season_col='Season', team_id_cols=('WTeamID', 'LTeamID')):
    """
    Calculates the expected number of rows after aggregating a DataFrame by Season and TeamID.

    This is the number of distinct (season, team) pairs, where a team counts
    for a season if its id appears in any of ``team_id_cols``.

    Args:
        df (pd.DataFrame): The DataFrame containing regular season game data.
        season_col (str): The name of the season column.
        team_id_cols (iterable of str): Column names containing TeamIDs. These
            columns are now actually used; previously the argument was ignored
            and 'WTeamID'/'LTeamID' were hard-coded.

    Returns:
        int: The expected number of rows after aggregation.
    """

    team_id_cols = list(team_id_cols)

    expected_rows = 0
    for season in df[season_col].unique():
        season_df = df[df[season_col] == season]

        # Union of the ids seen in any of the team columns during this season.
        teams = set()
        for col in team_id_cols:
            teams.update(season_df[col].unique())
        expected_rows += len(teams)

    return expected_rows
def prepare_submission_with_features(sample_submission, with_tourney_seeds,kind='men'):
    """
    Prepares the sample submission DataFrame by:
    1. Splitting the 'ID' column into Season, TeamID1, TeamID2.
    2. Creating TeamID (lower ID) and Opponent_TeamID (higher ID).
    3. Merging regular season statistics for the team (inner join, 'Reg_' prefix)
       and for the opponent (left join, 'Opponent_reg_' prefix).
    4. Creating a Team_To_Predict column (a copy of TeamID).
    5. Dropping the duplicate TeamID columns produced by the opponent merge.
    6. Adding team-minus-opponent difference columns and one-hot encoding both
       conference columns.

    The input frame is not modified; a copy is processed.

    Args:
        sample_submission (pd.DataFrame): The sample submission DataFrame with an
            'ID' column of the form '<Season>_<TeamID1>_<TeamID2>'.
        with_tourney_seeds (pd.DataFrame): DataFrame containing team statistics with tournament seed information.
        kind (str): 'men' includes the ranking-based difference columns. Any
            other value skips them and additionally adds, as all-False columns,
            every name from the stored k-means feature list that is missing
            from the result (each added name is printed).

    Returns:
        pd.DataFrame: The prepared submission DataFrame.
    """
    submission = sample_submission.copy()

    # 'ID' has the form '<Season>_<TeamID1>_<TeamID2>'.
    id_parts = submission['ID'].str.split('_')
    submission['TeamID1'] = id_parts.str.get(1).astype(int)
    submission['TeamID2'] = id_parts.str.get(2).astype(int)
    submission['Season'] = id_parts.str.get(0).astype(int)

    # The team is the lower id. The opponent is "the other one", i.e. the
    # higher id; computed column-wise instead of with a row-by-row apply.
    id_pair = submission[['TeamID1', 'TeamID2']]
    submission['TeamID'] = id_pair.min(axis=1)
    submission['Opponent_TeamID'] = id_pair.max(axis=1)

    # Team-side features, prefixed 'Reg_'.
    rename_dict_rs = {col: f'Reg_{col}' for col in with_tourney_seeds.columns if col not in ['Season', 'WTeamID', 'LTeamID', 'DayNum', 'TeamID','TeamName']}
    submission = pd.merge(
        submission,
        with_tourney_seeds.rename(columns=rename_dict_rs).copy(),
        left_on=['Season', 'TeamID'],
        right_on=['Season', 'TeamID'],
    )
    submission['Team_To_Predict'] = submission['TeamID']

    # Opponent-side features, prefixed 'Opponent_reg_'.
    rename_dict_opponent = {col: f'Opponent_reg_{col}' for col in with_tourney_seeds.columns if col not in ['Season', 'WTeamID', 'LTeamID', 'DayNum', 'TeamID', 'GameID','TeamName']}
    rename_dict_opponent['TeamName']='Opponent_TeamName'
    to_join = with_tourney_seeds.rename(columns=rename_dict_opponent).copy()
    submission = pd.merge(
        submission,
        to_join,
        left_on=['Season', 'Opponent_TeamID'],
        right_on=['Season', 'TeamID'],
        how='left',
    )

    # The opponent merge split 'TeamID' into _x/_y; 'Team_To_Predict' and
    # 'Opponent_TeamID' already hold both ids.
    submission = submission.drop(['TeamID_x', 'TeamID_y'], axis=1)

    is_men = kind == 'men'
    submission = create_opponent_features(submission, is_rankings=is_men)
    submission = pd.get_dummies(submission, columns=['Reg_ConfAbbrev','Opponent_reg_ConfAbbrev'])

    if not is_men:
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
        kmeans_features = np.load("data/processed/kmeans_features_women.joblib",allow_pickle=True)
        # Add, as all-False, any column from the stored feature list that is absent from this data.
        for col in kmeans_features:
            if col not in submission.columns:
                submission[col] = False
                print(col)

    return submission



# Create Training Data

In [3]:
# Mens training data
warnings.filterwarnings('ignore')
# Pick the men's ("M"-prefixed) tables out of the loaded CSV files.
tournament_results_men = data_dict['MNCAATourneyDetailedResults']
regular_season_results_men = data_dict['MRegularSeasonDetailedResults']
conferences_men = data_dict['MTeamConferences'] 
rankings_df = data_dict['MMasseyOrdinals']
tourney_seeds_men = data_dict['MNCAATourneySeeds']  
# The Stage 1 sample submission is reused by the submission cells further down.
sample_submission=data_dict['SampleSubmissionStage1']
teams_men=data_dict['MTeams']

# Regular season features per team-season, then one row per team per tournament game.
with_tourney_seeds_men=training_data_reg_season(regular_season_results_men,conferences_men,rankings_df,tourney_seeds_men,teams_men)
training_data_men=create_training_data(tournament_results_men,with_tourney_seeds_men)
training_data_men.to_csv('data/processed/mens_processed_data.csv',index=False)
print('Mens training data saved to csv')

Expected Rows:  7981
length after initial features added:  7981
Length at the end of reg season data prep 7981
Mens training data saved to csv


In [4]:
# Womens training data
# Pick the women's ("W"-prefixed) tables out of the loaded CSV files.
tournament_results_women = data_dict['WNCAATourneyDetailedResults']
regular_season_results_women = data_dict['WRegularSeasonDetailedResults']
conferences_women = data_dict['WTeamConferences'] 
# No rankings table is used for the women's data, so ranking features are switched off below.
rankings_df_women = None
tourney_seeds_women = data_dict['WNCAATourneySeeds']  
teams_women=data_dict['WTeams']

# Same pipeline as for the men, with add_rankings / is_rankings disabled.
with_tourney_seeds_women=training_data_reg_season(regular_season_results_women,conferences_women,rankings_df_women,tourney_seeds_women,teams_women,add_rankings=False)
training_data_women=create_training_data(tournament_results_women,with_tourney_seeds_women,is_rankings=False)
training_data_women.to_csv('data/processed/womens_processed_data.csv',index=False)
print('Womens training data saved to csv')

Expected Rows:  5602
length after initial features added:  5602
Length at the end of reg season data prep 5602
Womens training data saved to csv


# **Prepare Submissions (Historical Data)**

In [5]:
# Mens Submission
# Rebuild the men's regular season feature table (same call as in the men's training-data cell).
with_tourney_seeds_men=training_data_reg_season(regular_season_results_men,conferences_men,rankings_df,tourney_seeds_men,teams_men)

# Expand the Stage 1 sample submission into one feature row per matchup.
df_men=prepare_submission_with_features(sample_submission, with_tourney_seeds_men)
# Prediction helper comes from scripts.modeling_functions; its result is expected to carry 'ID' and 'Pred'.
mens_submission=mf.load_xgb_model_and_predict_with_gnn(df_men) # Make Predictions
mens_submission[['ID','Pred']].head()

Expected Rows:  7981
length after initial features added:  7981
Length at the end of reg season data prep 7981


Unnamed: 0,ID,Pred
0,2021_1101_1102,0.686233
1,2021_1101_1103,0.544308
2,2021_1101_1104,0.264565
3,2021_1101_1105,0.695259
4,2021_1101_1106,0.702361


In [6]:
# Womens Submission
# Rebuild the women's regular season feature table without ranking features.
with_tourney_seeds_women=training_data_reg_season(regular_season_results_women,conferences_women,rankings_df_women,tourney_seeds_women,teams_women,add_rankings=False)
df_women=prepare_submission_with_features(sample_submission, with_tourney_seeds_women,kind='women')
# Rows with any missing value are removed before prediction.
df_women=df_women.dropna()
# Prediction helper comes from scripts.modeling_functions; its result is expected to carry 'ID' and 'Pred'.
womens_submission=mf.load_svm_model_and_predict_women(df_women)
womens_submission[['ID','Pred']].head(5)

Expected Rows:  5602
length after initial features added:  5602
Length at the end of reg season data prep 5602
Reg_ConfAbbrev_pac_ten
Opponent_reg_ConfAbbrev_pac_ten


Unnamed: 0,ID,Pred
1035,2021_3104_3112,0.435293
1039,2021_3104_3116,0.448779
1045,2021_3104_3124,0.104502
1046,2021_3104_3125,0.429347
1052,2021_3104_3133,0.402923


# **Submission (2025)**


In [7]:
# Mens Submission 2025
# The Stage 2 sample submission lists the matchups to predict for this section.
submission2=data_dict['SampleSubmissionStage2']
# Same feature pipeline and prediction helper as the historical men's submission.
with_tourney_seeds_men_2025=training_data_reg_season(regular_season_results_men,conferences_men,rankings_df,tourney_seeds_men,teams_men)
df_men_2025=prepare_submission_with_features(submission2, with_tourney_seeds_men_2025)
mens_submission_2025=mf.load_xgb_model_and_predict_with_gnn(df_men_2025) 
mens_submission_2025[['ID','Pred']].head()

Expected Rows:  7981
length after initial features added:  7981
Length at the end of reg season data prep 7981


Unnamed: 0,ID,Pred
0,2025_1101_1102,0.556379
1,2025_1101_1103,0.303741
2,2025_1101_1104,0.24159
3,2025_1101_1105,0.62693
4,2025_1101_1106,0.556135


In [8]:
# Womens Submission 2025
# Same feature pipeline as the historical women's submission, applied to the Stage 2 matchups.
# Unlike the historical cell, rows with missing values are not dropped here.
with_tourney_seeds_women_2025=training_data_reg_season(regular_season_results_women,conferences_women,rankings_df_women,tourney_seeds_women,teams_women,add_rankings=False)
df_women_2025=prepare_submission_with_features(submission2, with_tourney_seeds_women_2025,kind='women')
womens_submission_2025=mf.load_svm_model_and_predict_women(df_women_2025)# Make Predictions
womens_submission_2025[['ID','Pred']].head(5)

Expected Rows:  5602
length after initial features added:  5602
Length at the end of reg season data prep 5602
Reg_ConfAbbrev_pac_ten
Reg_ConfAbbrev_pac_twelve
Opponent_reg_ConfAbbrev_pac_ten
Opponent_reg_ConfAbbrev_pac_twelve


Unnamed: 0,ID,Pred
0,2025_3101_3102,0.45077
1,2025_3101_3103,0.588369
2,2025_3101_3104,0.103213
3,2025_3101_3105,0.520492
4,2025_3101_3106,0.768232


# Check Results
- Enter the name (or part of the name) of a team and of its opponent to view the predicted probability for every matching pairing.
- Compare the predictions with the actual results, e.g. via a Google search.

In [13]:
def find_games_with_teams(df, team1_substring, team2_substring):
    """
    Finds games in a DataFrame where either TeamName or Opponent_TeamName contains
    the specified substrings, checking both directions.

    Matching is case-insensitive and literal: the arguments are plain
    substrings, not regular expressions, so names containing characters such
    as '.', '(' or '&' are matched as written. Missing names never match.

    Args:
        df (pd.DataFrame): The DataFrame containing game data, with 'TeamName'
            and 'Opponent_TeamName' columns.
        team1_substring (str): Substring identifying the first team.
        team2_substring (str): Substring identifying the second team.

    Returns:
        pd.DataFrame: A DataFrame containing the matching games.
    """

    def name_contains(column, text):
        # regex=False: treat the text literally instead of as a pattern.
        return df[column].str.contains(text, case=False, na=False, regex=False)

    # team1 listed as the team and team2 as the opponent ...
    forward = name_contains('TeamName', team1_substring) & name_contains('Opponent_TeamName', team2_substring)
    # ... or the other way round.
    reverse = name_contains('TeamName', team2_substring) & name_contains('Opponent_TeamName', team1_substring)

    matching_games = df[forward | reverse]

    return matching_games

# Look up the men's 2025 predictions for pairings whose names contain these two strings.
# Partial matches are included (e.g. 'Arizona' also matches 'Arizona St').
team='Arizona'
opponent='Akron'
matching_games = find_games_with_teams(mens_submission_2025, team, opponent)
matching_games[['TeamName', 'Opponent_TeamName', 'Pred']]

Unnamed: 0,TeamName,Opponent_TeamName,Pred
732,Akron,Arizona,0.314478
733,Akron,Arizona St,0.414473
930,Akron,Northern Arizona,0.699685


In [19]:
# Look up the women's 2025 predictions for pairings whose names contain these two strings.
# Partial matches are included (e.g. 'Iowa' also matches 'Iowa St').
team='Oklahoma'
opponent='Iowa'
matching_games = find_games_with_teams(womens_submission_2025, team, opponent)
matching_games[['TeamName', 'Opponent_TeamName', 'Pred']]

Unnamed: 0,TeamName,Opponent_TeamName,Pred
37228,Iowa,Oklahoma,0.328743
37229,Iowa,Oklahoma St,0.430614
37464,Iowa St,Oklahoma,0.353443
37465,Iowa St,Oklahoma St,0.455127
53566,Northern Iowa,Oklahoma,0.25011
53567,Northern Iowa,Oklahoma St,0.355708
