## License 

Copyright 2021-2025 Patrick Hall (jphall@gwu.edu)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

*DISCLAIMER*: This notebook is not legal or compliance advice.

# Model Evaluation Notebook

#### Imports and inits

In [1]:
import os              # for directory and file manipulation
import numpy as np     # for basic array manipulation
import pandas as pd    # for dataframe manipulation
import datetime        # for timestamp

# for model eval
from sklearn.metrics import accuracy_score, f1_score, log_loss, mean_squared_error, roc_auc_score

# global constants
ROUND = 3              # decimal places passed to np.round when reporting metrics; more precision is not needed
SEED = 12345           # seed for better reproducibility

# seed NumPy's global random number generator for better reproducibility
np.random.seed(SEED)

#### Set basic metadata

In [2]:
y_name = 'high_priced'  # name of the column that holds the known y values
scores_dir = 'data/scores'  # directory holding key.csv and the submitted score files

#### Read in score files 

In [3]:
# every file read below lives directly under the scores directory
dir_prefix = scores_dir + os.sep

# start the score frame from the key file, which carries the known test y values
scores_frame = pd.read_csv(dir_prefix + 'key.csv', index_col='Unnamed: 0')

# draw a random fold id (0-4) for every row; re-seeding immediately before
# the draw keeps the fold assignment reproducible
np.random.seed(SEED)
n_rows = scores_frame.shape[0]
scores_frame['fold'] = np.random.choice(5, n_rows)

# every other CSV in the directory is treated as a score file;
# visit them in sorted filename order
score_files = [entry for entry in sorted(os.listdir(scores_dir))
               if entry.endswith('.csv') and entry != 'key.csv']

# add each file's 'phat' column under the file name minus its '.csv' suffix
for entry in score_files:
    model_scores = pd.read_csv(dir_prefix + entry)
    scores_frame[entry[:-4]] = model_scores['phat']

# sanity check
scores_frame

Unnamed: 0,high_priced,fold,group10_best_ebm,group10_best_glm,group10_best_mxgb,group11_ebm_1,group11_elasticnet,group11_xgboost_1,group11_xgboost_500,group1_ebm,...,group7_glm,group7_mxgb,group8_best_ebm,group8_best_mxgb,group9_best_ebm,group9_best_glm,group9_best_mxgb,ph_advval_ebm50,ph_baseline_ebm50,ph_fe_ebm50
0,0.0,2,0.079616,0.142090,0.072924,0.084039,0.142090,0.068479,0.068761,0.084039,...,0.147085,0.065060,0.081542,0.076050,0.079389,0.142090,0.068479,0.082402,0.084039,0.098831
1,0.0,1,0.027952,0.081674,0.035975,0.029255,0.081674,0.036151,0.034514,0.029255,...,0.071244,0.034284,0.029341,0.048150,0.027289,0.081674,0.036151,0.029524,0.029255,0.026936
2,1.0,4,0.194671,0.125823,0.171891,0.188205,0.125823,0.171173,0.169482,0.188205,...,0.151160,0.165456,0.193636,0.157743,0.192532,0.125823,0.171173,0.185242,0.188205,0.197683
3,0.0,1,0.012200,0.006973,0.031537,0.032708,0.006973,0.027247,0.032396,0.032708,...,0.002481,0.027143,0.027803,0.020836,0.011499,0.006973,0.027247,0.018287,0.032708,0.000855
4,1.0,2,0.205166,0.130426,0.174384,0.204401,0.130426,0.179143,0.173536,0.204401,...,0.153105,0.183033,0.204067,0.182161,0.206485,0.130426,0.179143,0.204797,0.204401,0.200064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19826,0.0,3,0.231769,0.160032,0.262694,0.223766,0.160032,0.264472,0.260247,0.223766,...,0.191735,0.263752,0.223769,0.168159,0.226770,0.160032,0.264472,0.224724,0.223766,0.241551
19827,0.0,1,0.255524,0.123836,0.188144,0.252186,0.123836,0.195559,0.184990,0.252186,...,0.135641,0.183118,0.254049,0.197256,0.254323,0.123836,0.195559,0.227610,0.252186,0.247632
19828,1.0,3,0.224751,0.169604,0.222151,0.222928,0.169604,0.218367,0.219813,0.222928,...,0.206735,0.223238,0.218768,0.251273,0.224295,0.169604,0.218367,0.240433,0.222928,0.246630
19829,0.0,1,0.000631,0.002538,0.001531,0.000307,0.002538,0.001704,0.001731,0.000307,...,0.000045,0.001233,0.000441,0.020065,0.000407,0.002538,0.001704,0.000214,0.000307,0.000417


#### Utility function for max. accuracy

In [4]:
def max_acc(y, phat, res=0.01):

    """ Utility function for finding max. accuracy at some cutoff.

        Scores strictly greater than a cutoff are treated as a positive (1)
        decision and all other scores as a negative (0) decision. Accuracy is
        computed for every cutoff in np.arange(0, 1 + res, res) and the
        largest value found is returned.

        :param y: Known y values.
        :param phat: Model scores.
        :param res: Resolution over which to search for max. accuracy, default 0.01.
        :return: Max. accuracy for model scores.

    """

    # copy known y and score values into a temporary frame;
    # the column-wise concat aligns the two inputs on their index
    temp_df = pd.concat([y, phat], axis=1)
    y_true = temp_df.iloc[:, 0]
    scores = temp_df.iloc[:, 1]

    # find accuracy at different cutoffs; values are collected in a list
    # because DataFrame.append was removed in pandas 2.0 (and copied the
    # whole frame on every iteration before that)
    acc_list = [accuracy_score(y_true, np.where(scores > cut, 1, 0))
                for cut in np.arange(0, 1 + res, res)]

    # find max accuracy across all cutoffs
    # (Series.max returns NaN, not an error, when no cutoff was evaluated)
    return pd.Series(acc_list, dtype='float64').max()

####  Utility function for max. F1

In [5]:
def max_f1(y, phat, res=0.01):

    """ Utility function for finding max. F1 at some cutoff.

        Scores strictly greater than a cutoff are treated as a positive (1)
        decision and all other scores as a negative (0) decision. F1 is
        computed for every cutoff in np.arange(0, 1 + res, res) and the
        largest value found is returned.

        :param y: Known y values.
        :param phat: Model scores.
        :param res: Resolution over which to search for max. F1, default 0.01.
        :return: Max. F1 for model scores.

    """

    # copy known y and score values into a temporary frame;
    # the column-wise concat aligns the two inputs on their index
    temp_df = pd.concat([y, phat], axis=1)
    y_true = temp_df.iloc[:, 0]
    scores = temp_df.iloc[:, 1]

    # find f1 at different cutoffs; values are collected in a list
    # because DataFrame.append was removed in pandas 2.0 (and copied the
    # whole frame on every iteration before that)
    f1_list = [f1_score(y_true, np.where(scores > cut, 1, 0))
               for cut in np.arange(0, 1 + res, res)]

    # find max f1 across all cutoffs
    # (Series.max returns NaN, not an error, when no cutoff was evaluated)
    return pd.Series(f1_list, dtype='float64').max()

#### Rank all submitted scores 

In [6]:
metric_list = ['acc', 'auc', 'f1', 'logloss', 'mse'] # metrics to use for evaluation

# metrics where a smaller value is better; every other metric is ranked
# so that the largest value receives rank 1
lower_is_better = ['logloss', 'mse']

# map each metric name to the function that computes it from (y, scores)
metric_funcs = {'acc': max_acc,
                'auc': roc_auc_score,
                'f1': max_f1,
                'logloss': log_loss,
                'mse': mean_squared_error}

# score columns are everything after the first two columns of scores_frame
# NOTE(review): assumes those two columns are the known y values and 'fold' -- confirm
score_names = list(scores_frame.columns[2:])

# build one row per (fold, metric) pair; rows are collected in a list and
# converted to a frame once because DataFrame.append was removed in pandas 2.0
eval_rows = []
for fold in sorted(scores_frame['fold'].unique()): # loop through folds

    # cache the fold's rows and known y values once per fold
    fold_frame = scores_frame.loc[scores_frame['fold'] == fold]
    fold_y = fold_frame[y_name]

    for metric_name in metric_list: # loop through metrics

        metric_func = metric_funcs[metric_name]

        # init row dict to hold each row's values
        row_dict = {'fold': fold,
                    'metric': metric_name}

        # calculate evaluation metric for fold with reasonable precision
        for col_name in score_names:
            fold_scores = fold_frame[col_name]
            row_dict[col_name] = np.round(metric_func(fold_y, fold_scores), ROUND)

        eval_rows.append(row_dict)

eval_frame = pd.DataFrame(eval_rows) # frame to hold score ranking

# set columns to necessary order: identifiers first, then models alphabetically
model_names = sorted(name for name in eval_frame.columns if name not in ['fold', 'metric'])
rank_names = [name + '_rank' for name in model_names]
eval_frame = eval_frame[['fold', 'metric'] + model_names]

# rank models within each row in both directions, then pick the direction
# that matches the row's metric (ties receive the average rank)
metric_values = eval_frame[model_names]
ascending_ranks = metric_values.rank(axis=1).to_numpy()
descending_ranks = metric_values.rank(axis=1, ascending=False).to_numpy()
use_ascending = eval_frame['metric'].isin(lower_is_better).to_numpy()[:, np.newaxis]
rank_frame = pd.DataFrame(np.where(use_ascending, ascending_ranks, descending_ranks),
                          columns=rank_names,
                          index=eval_frame.index)

# merge ranks onto eval_frame
eval_frame = pd.concat([eval_frame, rank_frame], axis=1)

# house keeping
del rank_frame

eval_frame

Unnamed: 0,fold,metric,group10_best_ebm,group10_best_glm,group10_best_mxgb,group11_ebm_1,group11_elasticnet,group11_xgboost_1,group11_xgboost_500,group1_ebm,...,group7_glm_rank,group7_mxgb_rank,group8_best_ebm_rank,group8_best_mxgb_rank,group9_best_ebm_rank,group9_best_glm_rank,group9_best_mxgb_rank,ph_advval_ebm50_rank,ph_baseline_ebm50_rank,ph_fe_ebm50_rank
0,0.0,acc,0.901,0.9,0.901,0.901,0.9,0.901,0.901,0.901,...,29.0,13.5,13.5,29.0,29.0,29.0,13.5,29.0,13.5,29.0
1,0.0,auc,0.84,0.775,0.813,0.84,0.775,0.815,0.813,0.84,...,27.0,23.5,10.0,34.0,1.0,31.0,18.0,12.5,5.5,14.0
2,0.0,f1,0.408,0.335,0.378,0.402,0.335,0.379,0.379,0.402,...,27.0,15.5,5.0,34.0,1.5,30.0,20.5,9.5,9.5,3.5
3,0.0,logloss,0.251,0.291,0.264,0.251,0.291,0.263,0.263,0.251,...,28.0,20.5,6.5,27.0,1.0,31.0,20.5,12.5,6.5,14.0
4,0.0,mse,0.077,0.084,0.079,0.077,0.084,0.078,0.079,0.077,...,27.5,23.5,7.5,27.5,7.5,31.0,17.5,7.5,7.5,7.5
5,1.0,acc,0.907,0.906,0.906,0.907,0.906,0.906,0.906,0.907,...,22.0,22.0,5.0,22.0,22.0,22.0,22.0,22.0,5.0,5.0
6,1.0,auc,0.83,0.757,0.791,0.829,0.757,0.792,0.791,0.829,...,27.0,23.5,8.0,30.5,2.5,30.5,18.0,2.5,8.0,12.5
7,1.0,f1,0.371,0.302,0.335,0.369,0.302,0.337,0.336,0.369,...,27.0,24.0,10.0,33.0,2.0,30.0,19.0,4.5,10.0,4.5
8,1.0,logloss,0.246,0.281,0.263,0.246,0.281,0.263,0.263,0.246,...,28.0,25.5,7.0,27.0,7.0,31.0,20.0,7.0,7.0,7.0
9,1.0,mse,0.074,0.08,0.078,0.075,0.08,0.078,0.078,0.075,...,30.5,20.5,10.0,27.0,3.0,30.5,20.5,3.0,10.0,3.0


#### Save `eval_frame` as CSV

In [7]:
# stamp the output file name with the current local time so that
# repeated runs write to separate files
run_stamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
out_name = 'model_eval_' + run_stamp + '.csv'

# write the evaluation frame without its row index
eval_frame.to_csv(out_name, index=False)

#### Display simple ranked score list 

In [8]:
# average every rank column over all (fold, metric) rows and list the
# columns from lowest mean rank to highest
rank_cols = [col for col in eval_frame.columns if col.endswith('rank')]
eval_frame[rank_cols].mean().sort_values()

ph_advval_ebm50_rank                     7.20
group10_best_ebm_rank                    7.54
group9_best_ebm_rank                     7.56
group8_best_ebm_rank                     8.08
group11_ebm_1_rank                       8.68
group7_ebm_rank                          8.68
group6_best_ebm1_rank                    8.68
group1_ebm_rank                          8.68
ph_baseline_ebm50_rank                   8.68
group2_ebm_rank                          8.68
group4_ebm_rank                          8.76
group3_best_ebm_rank                     9.28
group5_best_ebm_rank                     9.54
ph_fe_ebm50_rank                        11.16
group5_best_mxgb_rank                   16.64
group3_best_mgbm_rank                   19.00
group9_best_mxgb_rank                   20.08
group6_best_mxgb1_rank                  20.08
group11_xgboost_500_rank                20.08
group11_xgboost_1_rank                  20.08
group1_mxgb_rank                        20.08
group2_mxgb_2025_03_26_19_15_54_ra