In [1]:
!pip install pycaret[full]



In [2]:
import re

In [3]:
import pandas as pd

In [4]:
import xgboost

In [5]:
from pycaret.classification import *

  defaults = yaml.load(f)


In [6]:
season_data = pd.read_csv('MRegularSeasonCompactResults.csv')
seeds = pd.read_csv('MNCAATourneySeeds.csv')
public_rating = pd.read_csv('MMasseyOrdinals.csv')

In [7]:
def score_gap(df):
    df['ScoreGap'] = df['WScore'] - df['LScore']
    return df

In [8]:
season_data = score_gap(season_data)

In [9]:
def create_team_list(df, group_list = ['Season', 'WTeamID'], team_id = 'WTeamID'):
    group = df.groupby(group_list).count().reset_index()
    group = group[group_list].rename(columns={team_id: "TeamID"})
    return group

In [10]:
winners = create_team_list(season_data, group_list = ['Season', 'WTeamID'], team_id = 'WTeamID')
lossers = create_team_list(season_data, group_list = ['Season', 'LTeamID'], team_id = 'LTeamID')
team_agg_features = pd.concat([winners, lossers], axis = 0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop = True)

In [11]:
def winner_aggregated_features(df, group_list = ['Season', 'WTeamID']):
    tmp = df.groupby(group_list).agg(NumWins       = ('WTeamID', 'count'), 
                                     AvgWinsGap    = ('ScoreGap', 'mean'),
                                     W_TotalPoints = ('WScore', 'sum'),
                                     W_MaxPoints   = ('WScore', 'max'),
                                     W_MinPoints   = ('WScore', 'min'),
                                    )
    tmp = tmp.reset_index()
    tmp = tmp.rename(columns={"WTeamID": "TeamID"})
    return tmp

In [12]:
def losser_aggregated_features(df, group_list = ['Season', 'LTeamID']):
    tmp = df.groupby(group_list).agg(NumLosses       = ('LTeamID', 'count'), 
                                     AvgLossesGap    = ('ScoreGap', 'mean'),
                                     L_TotalPoints = ('LScore', 'sum'),
                                     L_MaxPoints   = ('LScore', 'max'),
                                     L_MinPoints   = ('LScore', 'min'),
                                    )
    tmp = tmp.reset_index()
    tmp = tmp.rename(columns={"LTeamID": "TeamID"})
    return tmp

In [13]:
winner_team_aggregation = winner_aggregated_features(season_data)
losser_team_aggregation = losser_aggregated_features(season_data)

In [14]:
def merge_back(df):
    df = df.merge(winner_team_aggregation, on = ['Season', 'TeamID'], how = 'left')
    df = df.merge(losser_team_aggregation, on = ['Season', 'TeamID'], how = 'left')
    df.fillna(0, inplace = True) 
    return df

In [15]:
team_agg_features = merge_back(team_agg_features)

In [16]:
def calculate_features(df):
    """
    
    """
    df['WinRatio'] = df['NumWins'] / (df['NumWins'] + df['NumLosses'])
    df['AvgScoreGap'] = ((df['NumWins'] * df['AvgWinsGap'] - df['NumLosses'] * df['AvgLossesGap']) / (df['NumWins'] + df['NumLosses']))
    df['PointsRatio'] = df['W_TotalPoints'] / (df['L_TotalPoints'] + df['W_TotalPoints'])
    return df

team_agg_features = calculate_features(team_agg_features)

In [17]:
team_agg_features = team_agg_features[['Season','TeamID','WinRatio', 'AvgScoreGap','PointsRatio']]

In [18]:
tournament_data = pd.read_csv('MNCAATourneyCompactResults.csv')

In [19]:
tournament_data = tournament_data.rename(columns = {'WTeamID' : 'W_TeamID', 'LTeamID' : 'L_TeamID', 'WScore' : 'W_Score', 'LScore' : 'L_Score'}) 

In [20]:
tournament_data.drop(['NumOT', 'WLoc'], axis = 1, inplace = True)

In [21]:
MIN_SEASON = 2015
tournament_data = tournament_data[tournament_data['Season'] >= MIN_SEASON].reset_index(drop = True)

In [22]:
def merge_seed(df, seed_df, left_on = ['Season', 'W_TeamID'], field_name = 'SeedW'):
    df = pd.merge(df,seed_df, how = 'left', left_on = left_on, right_on = ['Season', 'TeamID'])
    df = df.drop('TeamID', axis = 1).rename(columns = {'Seed': field_name})
    return df

In [23]:
tournament_data = merge_seed(tournament_data, seeds, left_on = ['Season', 'W_TeamID'], field_name = 'W_Seed')
tournament_data = merge_seed(tournament_data, seeds, left_on = ['Season', 'L_TeamID'], field_name = 'L_Seed')

In [24]:
def seed_number(row):
    return int(re.sub("[^0-9]", "", row))

tournament_data['W_Seed'] = tournament_data['W_Seed'].apply(seed_number)
tournament_data['L_Seed'] = tournament_data['L_Seed'].apply(seed_number)

In [25]:
def merge_agg_features(df, agg_features):
    for result in ['W', 'L']:
        df = pd.merge(df, agg_features, how = 'left', left_on = ['Season', result +'_'+ 'TeamID'], right_on = ['Season', 'TeamID'])
        avoid = ['Season', 'TeamID']
        new_names = {col: result +'_'+ col for col in agg_features.columns if col not in avoid}
        df = df.rename(columns = new_names)        
        df = df.drop(columns = 'TeamID', axis = 1)
    return df

tournament_data = merge_agg_features(tournament_data, team_agg_features)

In [26]:
def replace_win_loser(df):
    team_a = df.copy()
    team_b = df.copy()
    
    team_a_dict, team_b_dict = {}, {}
    
    for col in team_a.columns:
        if col.find('W_') == 0:
            new_col_name = str(col).replace('W_', 'A_')
            team_a_dict[col] = new_col_name
        if col.find('L_') == 0:
            new_col_name = col.replace('L_', 'B_')    
            team_a_dict[col] = new_col_name
            
    for col in team_b.columns:
        if col.find('W_') == 0:
            new_col_name = str(col).replace('W_', 'B_')
            team_b_dict[col] = new_col_name
        if col.find('L_') == 0:
            new_col_name = col.replace('L_', 'A_')
            team_b_dict[col] = new_col_name

    team_a = team_a.rename(columns = team_a_dict)
    team_b = team_b.rename(columns = team_b_dict)
    
    merged_df = pd.concat([team_a, team_b], axis = 0, sort = False)
    return merged_df

In [27]:
tournament_data = replace_win_loser(tournament_data)

In [28]:
def calculate_differences(df):
    df['SeedDiff'] = df['A_Seed'] - df['B_Seed']
    df['WinRatioDiff'] = df['A_WinRatio'] - df['B_WinRatio']
    df['GapAvgDiff'] = df['A_AvgScoreGap'] - df['B_AvgScoreGap']    
    df['PointsRatioDiff'] = df['A_PointsRatio'] - df['A_PointsRatio']
    return df

tournament_data = calculate_differences(tournament_data)

In [29]:
tournament_data['ScoreDiff'] = tournament_data['A_Score'] - tournament_data['B_Score']
tournament_data['A_Win'] = (tournament_data['ScoreDiff'] > 0).astype(int)
tournament_data = tournament_data.drop(columns=['A_Score', 'B_Score'])

In [30]:
sub_stage_one = pd.read_csv('MSampleSubmissionStage1.csv')
tst_data = sub_stage_one.copy()

In [31]:
def separate_id(df):
    df['Season']  = df['ID'].apply(lambda x: int(x.split('_')[0]))
    df['TeamIdA'] = df['ID'].apply(lambda x: int(x.split('_')[1]))
    df['TeamIdB'] = df['ID'].apply(lambda x: int(x.split('_')[2]))
    return df
tst_data = separate_id(tst_data)

In [32]:
tst_data = merge_seed(tst_data, seeds, left_on = ['Season', 'TeamIdA'], field_name = 'A_Seed')
tst_data = merge_seed(tst_data, seeds, left_on = ['Season', 'TeamIdB'], field_name = 'B_Seed')

In [33]:
tst_data['A_Seed'] = tst_data['A_Seed'].apply(seed_number)
tst_data['B_Seed'] = tst_data['B_Seed'].apply(seed_number)

In [34]:
tst_data = tst_data.rename(columns = {'TeamIdA': 'A_TeamID', 'TeamIdB': 'B_TeamID'})

In [35]:
def merge_agg_features(df, agg_features):
    for result in ['A', 'B']:
        df = pd.merge(df, agg_features, how = 'left', left_on = ['Season', result +'_'+ 'TeamID'], right_on = ['Season', 'TeamID'])
        avoid = ['Season', 'TeamID']
        new_names = {col: result +'_'+ col for col in agg_features.columns if col not in avoid}
        df = df.rename(columns = new_names)        
        df = df.drop(columns = 'TeamID', axis = 1)
    return df

tst_data = merge_agg_features(tst_data, team_agg_features)

In [36]:
tst_data = calculate_differences(tst_data)

In [50]:
tournament_data = tournament_data.drop(['DayNum', 'ScoreDiff'],axis=1)

In [51]:
s = setup(tournament_data, target = 'A_Win', session_id = 123, feature_selection = True, remove_multicollinearity = True, multicollinearity_threshold = 0.6, pca = True, pca_components = 10)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,A_Win
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(802, 16)"
5,Missing Values,False
6,Numeric Features,12
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


In [52]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7094,0.0,0.6885,0.7098,0.6976,0.4176,0.4192,0.012
lr,Logistic Regression,0.7076,0.7899,0.6922,0.7063,0.6978,0.4143,0.4159,0.721
lda,Linear Discriminant Analysis,0.7076,0.7891,0.6848,0.7087,0.6951,0.414,0.4157,0.014
nb,Naive Bayes,0.697,0.777,0.6885,0.6917,0.6886,0.3929,0.3946,0.016
gbc,Gradient Boosting Classifier,0.6844,0.7405,0.6516,0.6841,0.665,0.367,0.3697,0.181
catboost,CatBoost Classifier,0.6702,0.7443,0.6519,0.6655,0.6568,0.3392,0.341,4.308
ada,Ada Boost Classifier,0.6701,0.7255,0.6259,0.6753,0.6487,0.338,0.3398,0.141
rf,Random Forest Classifier,0.6666,0.7394,0.6336,0.6644,0.6479,0.3317,0.3325,0.529
dt,Decision Tree Classifier,0.6543,0.6546,0.6626,0.6442,0.6504,0.3088,0.3114,0.017
qda,Quadratic Discriminant Analysis,0.6542,0.7528,0.6038,0.6584,0.6269,0.3059,0.3094,0.016


In [53]:
lr = create_model('lr')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7368,0.8116,0.7143,0.7407,0.7273,0.4732,0.4735
1,0.5893,0.7398,0.6071,0.5862,0.5965,0.1786,0.1787
2,0.7321,0.8023,0.7857,0.7097,0.7458,0.4643,0.467
3,0.7321,0.7842,0.6667,0.75,0.7059,0.4615,0.4642
4,0.6429,0.7165,0.6667,0.6207,0.6429,0.2866,0.2874
5,0.7679,0.8966,0.7407,0.7692,0.7547,0.5345,0.5349
6,0.7143,0.7739,0.6296,0.7391,0.68,0.4249,0.4294
7,0.7321,0.7369,0.6667,0.75,0.7059,0.4615,0.4642
8,0.6964,0.8301,0.7407,0.6667,0.7018,0.3944,0.3967
9,0.7321,0.8072,0.7037,0.7308,0.717,0.4629,0.4632


In [54]:
tuned_lr = tune_model(lr, search_library = 'optuna', n_iter = 50)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7544,0.8103,0.75,0.75,0.75,0.5086,0.5086
1,0.6071,0.7564,0.6429,0.6,0.6207,0.2143,0.2148
2,0.7143,0.7832,0.7857,0.6875,0.7333,0.4286,0.433
3,0.7321,0.7714,0.7037,0.7308,0.717,0.4629,0.4632
4,0.6429,0.7165,0.6667,0.6207,0.6429,0.2866,0.2874
5,0.8036,0.8902,0.8148,0.7857,0.8,0.6071,0.6075
6,0.7321,0.765,0.6667,0.75,0.7059,0.4615,0.4642
7,0.7143,0.728,0.6296,0.7391,0.68,0.4249,0.4294
8,0.7143,0.811,0.7778,0.6774,0.7241,0.4307,0.4352
9,0.7321,0.8327,0.7037,0.7308,0.717,0.4629,0.4632


In [55]:
nb = create_model('nb')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7018,0.8005,0.7143,0.6897,0.7018,0.4037,0.4039
1,0.5893,0.7385,0.6071,0.5862,0.5965,0.1786,0.1787
2,0.6964,0.787,0.7857,0.6667,0.7213,0.3929,0.3993
3,0.6964,0.7739,0.6667,0.6923,0.6792,0.3913,0.3916
4,0.625,0.6973,0.6296,0.6071,0.6182,0.25,0.2502
5,0.7857,0.8799,0.7778,0.7778,0.7778,0.5709,0.5709
6,0.75,0.7688,0.7037,0.76,0.7308,0.4981,0.4994
7,0.7143,0.7254,0.6296,0.7391,0.68,0.4249,0.4294
8,0.7143,0.8072,0.7407,0.6897,0.7143,0.4293,0.4304
9,0.6964,0.7918,0.6296,0.7083,0.6667,0.3897,0.392


In [56]:
tuned_nb = tune_model(nb, search_library = 'optuna', n_iter = 50)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7368,0.803,0.75,0.7241,0.7368,0.4738,0.4741
1,0.6071,0.7474,0.6071,0.6071,0.6071,0.2143,0.2143
2,0.75,0.7819,0.7857,0.7333,0.7586,0.5,0.5013
3,0.7321,0.7727,0.7037,0.7308,0.717,0.4629,0.4632
4,0.6071,0.6999,0.6296,0.5862,0.6071,0.2153,0.2158
5,0.7857,0.8889,0.7407,0.8,0.7692,0.5698,0.5713
6,0.6964,0.765,0.5926,0.7273,0.6531,0.3882,0.3946
7,0.7143,0.7178,0.6296,0.7391,0.68,0.4249,0.4294
8,0.7321,0.811,0.7407,0.7143,0.7273,0.4643,0.4646
9,0.75,0.8365,0.7037,0.76,0.7308,0.4981,0.4994


In [57]:
predictions = predict_model(tuned_lr, data=tst_data)

In [62]:
tournament_data.head()

Unnamed: 0,Season,A_TeamID,B_TeamID,A_Seed,B_Seed,A_WinRatio,A_AvgScoreGap,A_PointsRatio,B_WinRatio,B_AvgScoreGap,B_PointsRatio,SeedDiff,WinRatioDiff,GapAvgDiff,PointsRatioDiff,A_Win
0,2015,1214,1264,16,16,0.484848,-0.848485,0.530686,0.59375,2.375,0.628354,0,-0.108902,-3.223485,0.0,1
1,2015,1279,1140,11,11,0.625,5.125,0.643718,0.71875,8.90625,0.74265,0,-0.09375,-3.78125,0.0,1
2,2015,1173,1129,11,11,0.757576,7.30303,0.778765,0.741935,8.935484,0.786242,0,0.01564,-1.632454,0.0,1
3,2015,1352,1316,16,16,0.575758,1.272727,0.618797,0.645161,5.419355,0.676702,0,-0.069404,-4.146628,0.0,1
4,2015,1112,1411,2,15,0.911765,17.823529,0.922663,0.647059,0.735294,0.701898,-13,0.264706,17.088235,0.0,1


In [61]:
tst_data

Unnamed: 0,ID,Pred,Season,A_TeamID,B_TeamID,A_Seed,B_Seed,A_WinRatio,A_AvgScoreGap,A_PointsRatio,B_WinRatio,B_AvgScoreGap,B_PointsRatio,SeedDiff,WinRatioDiff,GapAvgDiff,PointsRatioDiff
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121,0.767537,0.870968,9.935484,0.875115,-6,-0.113392,2.276637,0.0
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121,0.767537,0.484848,-2.363636,0.540957,-10,0.272727,14.575758,0.0
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121,0.767537,0.656250,6.687500,0.703931,1,0.101326,5.524621,0.0
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121,0.767537,0.575758,0.666667,0.622627,-8,0.181818,11.545455,0.0
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121,0.767537,0.677419,9.419355,0.736990,-3,0.080156,2.792766,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,3,12,0.666667,5.185185,0.666507,0.958333,12.750000,0.971189,-9,-0.291667,-7.564815,0.0
11386,2021_1452_1458,0.5,2021,1452,1458,3,9,0.666667,5.185185,0.666507,0.586207,5.310345,0.621100,-6,0.080460,-0.125160,0.0
11387,2021_1455_1457,0.5,2021,1455,1457,11,12,0.736842,2.631579,0.780381,0.958333,12.750000,0.971189,-1,-0.221491,-10.118421,0.0
11388,2021_1455_1458,0.5,2021,1455,1458,11,9,0.736842,2.631579,0.780381,0.586207,5.310345,0.621100,2,0.150635,-2.678766,0.0


In [60]:
predictions.head()

Unnamed: 0,ID,Pred,Season,A_TeamID,B_TeamID,A_Seed,B_Seed,A_WinRatio,A_AvgScoreGap,A_PointsRatio,B_WinRatio,B_AvgScoreGap,B_PointsRatio,SeedDiff,WinRatioDiff,GapAvgDiff,PointsRatioDiff,Label,Score
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121,0.767537,0.870968,9.935484,0.875115,-6,-0.113392,2.276637,0.0,1,0.7336
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121,0.767537,0.484848,-2.363636,0.540957,-10,0.272727,14.575758,0.0,1,0.8522
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121,0.767537,0.65625,6.6875,0.703931,1,0.101326,5.524621,0.0,0,0.5156
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121,0.767537,0.575758,0.666667,0.622627,-8,0.181818,11.545455,0.0,1,0.7921
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121,0.767537,0.677419,9.419355,0.73699,-3,0.080156,2.792766,0.0,1,0.6365


In [67]:
sub = predictions[['ID', 'Score']].rename(columns={"Score": "Pred"})
sub.to_csv('submission.csv', index = False)