Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# Confidence Interval of Model Performance

In [1]:
from supervised.automl import AutoML
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, balanced_accuracy_score
from sklearn.utils import resample

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Lookup table: metric name (the value passed as `eval_metric`) -> the
# scikit-learn scoring function used to compute that metric.
EVAL_FUN_MAPPING = dict(
    r2=r2_score,
    balanced_accuracy=balanced_accuracy_score,
)


def get_model_performance_ci(X_test, y_test, eval_metric, model_results_path,
                             confidence_level=0.95, bootstrap_iters=1000, sample_size_perct=0.5,
                             random_state=None):
    """Estimate a bootstrap confidence interval for a model's test-set score.

    The test set is resampled with replacement `bootstrap_iters` times; each
    resample is scored with the metric named by `eval_metric`, and the interval
    is read off the percentiles of the resulting score distribution. The model
    is never re-fitted here: an `AutoML` object is created once from
    `model_results_path` and only `predict` is called on it.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features. Rows are selected by position, so any index is accepted.
    y_test : pandas.Series
        Test targets, aligned row-for-row (by position) with `X_test`.
    eval_metric : str
        Key into `EVAL_FUN_MAPPING` selecting the scoring function.
    model_results_path : str
        Passed as `results_path` to `AutoML`; presumably the directory of a
        previously trained model (confirm against the AutoML documentation).
    confidence_level : float, default 0.95
        Width of the interval, as a fraction in (0, 1].
    bootstrap_iters : int, default 1000
        Number of bootstrap resamples to score.
    sample_size_perct : float, default 0.5
        Size of each resample as a fraction of `len(X_test)`.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the resampling. `None` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    (float, float)
        Lower and upper percentile bounds of the bootstrapped scores.

    Raises
    ------
    KeyError
        If `eval_metric` is not a key of `EVAL_FUN_MAPPING`.
    ValueError
        If `X_test` and `y_test` differ in length, if `bootstrap_iters` is
        less than 1, or if the requested resample size truncates to 0 rows.
    """
    eval_fun = EVAL_FUN_MAPPING[eval_metric]

    n_rows = len(X_test)
    if len(y_test) != n_rows:
        raise ValueError(
            f"X_test and y_test must have the same length, got {n_rows} and {len(y_test)}")
    if bootstrap_iters < 1:
        raise ValueError(f"bootstrap_iters must be at least 1, got {bootstrap_iters}")
    sample_size = int(n_rows * sample_size_perct)
    if sample_size < 1:
        raise ValueError(
            f"sample_size_perct={sample_size_perct} selects no rows from {n_rows} test rows")

    loaded_automl = AutoML(results_path=model_results_path)

    # Build ONE generator up front and share it across iterations: passing the
    # same integer seed to every `resample` call would draw the identical
    # sample each time. `None` is forwarded untouched (global NumPy state).
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Resample row *positions* and apply them to both X and y with `.iloc`.
    # Looking y up through the sampled frame's index labels is only correct
    # when the index happens to be 0..n-1.
    row_positions = np.arange(n_rows)
    sample_performance_lst = []
    for _ in tqdm(range(bootstrap_iters)):
        sampled_positions = resample(row_positions, n_samples=sample_size, random_state=rng)
        X_test_sample = X_test.iloc[sampled_positions]
        y_test_sample = y_test.iloc[sampled_positions]

        y_pred_sample = loaded_automl.predict(X_test_sample)
        sample_performance_lst.append(eval_fun(y_test_sample, y_pred_sample))

    alpha = 1 - confidence_level
    lower_p = alpha * 100 / 2.0
    upper_p = (confidence_level + alpha / 2.0) * 100
    # No clamping to [0, 1]: percentiles of observed scores already lie inside
    # the metric's own range, and r2 can legitimately be negative, so forcing
    # the lower bound up to 0 would misreport a poor model.
    lower_bound = float(np.percentile(sample_performance_lst, lower_p))
    upper_bound = float(np.percentile(sample_performance_lst, upper_p))

    print(f"""There's {confidence_level*100}% likelihood that {eval_metric} score
            between [{lower_bound}, {upper_bound}] covers the true model performance""")

    return lower_bound, upper_bound

## Confidence Interval (CI)

* The bootstrap method is used to calculate the CI of the model performance.
* Reference: https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/
  * The difference in this notebook is that, instead of re-fitting the model on a resampled training set in each bootstrap iteration, the model is trained once on all the training data, and each bootstrap iteration only runs prediction on a resample of the test data.

### Regression

In [3]:
# NOTE: unpickling can execute arbitrary code; only load files produced by a trusted source.
df = pd.read_pickle('luigi_pipeline/output/preprocessed_data.pkl')
print(df.shape)

# Keep only the numerical features: drop every categorical column except
# 'Year', which is still needed to select the test rows.
cat_cols = [
    col
    for col in df.select_dtypes(include='category').columns
    if col != 'Year'
]
df = df.drop(columns=cat_cols)

# The 2015 rows form the hold-out test set.
test_df = df.loc[df['Year'].astype(str) == '2015']

# Split target from features and re-number both from 0.
y_test = test_df['Sales'].reset_index(drop=True)
X_test = test_df.drop(columns=['Sales', 'Date', 'Year']).reset_index(drop=True)

print(X_test.shape, y_test.shape)
X_test.head()

(693861, 22)
(161332, 3) (161332,)


Unnamed: 0,Customers_larger_than_3000,CompetitionDistance,Customers
0,0.0,1270.0,555
1,0.0,1270.0,546
2,0.0,1270.0,523
3,0.0,1270.0,560
4,0.0,1270.0,612


In [8]:
# Bootstrap CI of the regression model's r2 on the test set:
# 20 resamples, each half the size of the test set.
lower_bound, upper_bound = get_model_performance_ci(
    X_test,
    y_test,
    eval_metric='r2',
    model_results_path='luigi_pipeline/output/mljar_regression_sales/',
    confidence_level=0.95,
    bootstrap_iters=20,
    sample_size_perct=0.5,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [08:05<00:00, 24.28s/it]

There's 95.0% likelihood that r2 score between [0.936296712842666, 0.938749454646744] covers the true model performance





### Classification

In [9]:
df30 = pd.read_csv('../crystal_ball/data_collector/structured_data/leaf.csv')

# Separate the target column ('species') from the features.
X30 = df30.drop(columns='species')
y30 = df30['species']

# Shuffled 80/20 split, stratified on the target, with a fixed seed.
X_train30, X_test30, y_train30, y_test30 = train_test_split(
    X30,
    y30,
    test_size=0.2,
    random_state=10,
    shuffle=True,
    stratify=y30,
)

# Re-number the held-out rows from 0 for both features and labels.
X_test30 = X_test30.reset_index(drop=True)
y_test30 = y_test30.reset_index(drop=True)

print(X_test30.shape, y_test30.shape)
print(y_test30.nunique())
X_test30.head()

(68, 15) (68,)
30


Unnamed: 0,specimen_number,eccentricity,aspect_ratio,elongation,solidity,stochastic_convexity,isoperimetric_factor,maximal_indentation_depth,lobedness,average_intensity,average_contrast,smoothness,third_moment,uniformity,entropy
0,7,0.58637,1.1419,0.30339,0.93305,0.92105,0.57323,0.041282,0.31016,0.022886,0.093704,0.008704,0.003229,0.000104,0.54481
1,7,0.98717,6.5173,0.84726,0.96846,1.0,0.28899,0.022056,0.08854,0.042124,0.11842,0.013829,0.004382,0.000202,1.098
2,2,0.90557,2.3423,0.58487,0.95943,0.97368,0.55537,0.023542,0.10087,0.045897,0.13433,0.017724,0.006676,0.000245,0.9943
3,6,0.91296,2.4862,0.62315,0.96188,1.0,0.51041,0.010684,0.020775,0.064539,0.13678,0.018365,0.004864,0.000528,1.6875
4,9,0.88172,1.774,0.63974,0.8499,0.87368,0.34354,0.051776,0.4879,0.06103,0.1519,0.022554,0.008081,0.00025,1.3805


In [10]:
# Bootstrap CI of the classifier's balanced accuracy on the test set:
# 10 resamples, each 80% of the test set size.
lower_bound, upper_bound = get_model_performance_ci(
    X_test30,
    y_test30,
    eval_metric='balanced_accuracy',
    model_results_path='luigi_pipeline/output/mljar_classification/',
    confidence_level=0.95,
    bootstrap_iters=10,
    sample_size_perct=0.8,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.75it/s]

There's 95.0% likelihood that balanced_accuracy score between [0.7896666666666667, 0.9066061253561254] covers the true model performance



