In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

from load_fdb_datasets import prepare_noisy_dataset, dataset_stats
from cleanlab.filter import find_label_issues
from catboost import CatBoostClassifier
from micro_models import MicroModelEnsemble

In [None]:
# some auxiliary functions

def validate(clf, X, y, fpr_target):
    """Pick a score threshold on a validation set for a target FPR.

    Scores are taken from the second column of ``clf.predict_proba(X)``.
    The ROC curve of those scores against ``y`` is computed, and the
    threshold belonging to the highest-index ROC point whose false-positive
    rate is strictly below ``fpr_target`` is returned.

    Raises:
        ValueError: if no ROC point has an FPR below ``fpr_target``.
    """
    scores = clf.predict_proba(X)[:, 1]
    fpr, _tpr, thresholds = roc_curve(y, scores)
    # indices of all ROC points that stay under the target rate
    below_target = np.where(fpr < fpr_target)[0]
    return thresholds[below_target.max()]

def evaluate(clf, X, y, threshold):
    """Report FPR, TPR and accuracy of ``clf`` on ``(X, y)`` at ``threshold``.

    Scores are the second column of ``clf.predict_proba(X)``. FPR and TPR are
    read off the ROC curve at the lowest-index point whose threshold is
    strictly below ``threshold``; accuracy is computed for the hard decision
    ``score > threshold``.

    NOTE(review): the ROC point used for FPR/TPR and the ``score > threshold``
    rule used for accuracy are not necessarily the same operating point.
    Confirm this is intended before changing either.

    Returns:
        (fpr, tpr, acc)

    Raises:
        ValueError: if no ROC threshold lies below ``threshold``.
    """
    scores = clf.predict_proba(X)[:, 1]
    fpr_curve, tpr_curve, thresholds = roc_curve(y, scores)
    # first ROC point whose threshold drops under the requested one
    first_below = np.where(thresholds < threshold)[0].min()
    hard_preds = 1.0 * (scores > threshold)
    acc = accuracy_score(y, hard_preds)

    return fpr_curve[first_below], tpr_curve[first_below], acc

def validate_and_evaluate(clf, X_vl, y_vl, X_ts, y_ts, fpr_target):
    """Choose a threshold on the validation set, then measure it on the test set.

    The threshold comes from ``validate(clf, X_vl, y_vl, fpr_target)`` and is
    passed to ``evaluate`` on ``(X_ts, y_ts)``. The resulting metrics are
    printed; only the test-set FPR is returned.
    """
    chosen = validate(clf, X_vl, y_vl, fpr_target)
    test_fpr, test_tpr, test_acc = evaluate(clf, X_ts, y_ts, chosen)
    print(f"fpr: {test_fpr:.3f}, tpr: {test_tpr:.3f}, acc: {test_acc:.3f} at thresh {chosen:.3f}")
    return test_fpr

def _drop_top_scored_negatives(X_vl, y_vl, scores, num):
    """Remove up to ``num`` rows labelled 0 that have the highest positive score.

    Shared implementation behind ``mm_clean`` and ``direct_clean``.

    Args:
        X_vl: feature DataFrame of the validation set.
        y_vl: labels, positionally matching ``X_vl`` (the callers in this
            notebook pass a numpy array).
        scores: one score per row of ``X_vl``; a higher score means the row
            is more likely a mislabelled positive.
        num: maximum number of rows to remove.

    Returns:
        (X_clean, y_clean, idx) where ``idx`` is a boolean array over the rows
        of ``X_vl`` (True = kept), ``X_clean`` is ``X_vl[idx]`` with a fresh
        0..n-1 index and ``y_clean`` is ``y_vl[idx]``.
    """
    # Only the scores are needed for ranking, so attach them to an index-only
    # frame instead of copying every feature column.
    scored = pd.DataFrame(index=X_vl.index)
    scored['clean_preds'] = scores

    # Candidates are rows currently labelled 0 that received a non-zero score.
    candidates = scored[(y_vl == 0) & (scored['clean_preds'] > 0)]
    noise_idx = candidates.sort_values(by='clean_preds', ascending=False)[:num].index

    idx = ~X_vl.index.isin(noise_idx)
    print(f"{int((~idx).sum())} examples filtered")
    return X_vl[idx].reset_index(drop=True), y_vl[idx], idx


def mm_clean(mm_cleaner, X_vl, y_vl, num):
    """Clean a validation set with a micro-model ensemble.

    Rows are scored with ``mm_cleaner.predict_proba(X_vl)`` (used as-is, one
    value per row) and up to ``num`` of the highest-scored rows labelled 0 are
    removed. See ``_drop_top_scored_negatives`` for the return value.
    """
    return _drop_top_scored_negatives(X_vl, y_vl, mm_cleaner.predict_proba(X_vl), num)


def direct_clean(clf, X_vl, y_vl, num):
    """Clean a validation set directly with a trained classifier.

    Rows are scored with the second column of ``clf.predict_proba(X_vl)`` and
    up to ``num`` of the highest-scored rows labelled 0 are removed. See
    ``_drop_top_scored_negatives`` for the return value.
    """
    return _drop_top_scored_negatives(X_vl, y_vl, clf.predict_proba(X_vl)[:, 1], num)

def cl_clean(clf, X_vl, y_vl, num=0):
    """Clean a validation set using Cleanlab's label-issue detection.

    ``find_label_issues`` is run on ``y_vl`` with ``clf.predict_proba(X_vl)``
    as predicted probabilities. A row is kept when its label is 1 or when it
    was not flagged as a label issue.

    ``num`` is accepted for signature parity with the other cleaners but is
    not used: the number of removed rows is decided by Cleanlab alone.

    Returns:
        (X_clean, y_clean, idx) where ``idx`` is the boolean keep-mask. Unlike
        ``mm_clean`` / ``direct_clean`` the index of ``X_clean`` is not reset.
    """
    pred_probs = clf.predict_proba(X_vl)
    flagged = find_label_issues(labels=y_vl, pred_probs=pred_probs)
    # rows labelled 1 are never dropped, whatever Cleanlab says about them
    idx = (y_vl == 1) | (~flagged)
    return X_vl[idx], y_vl[idx], idx

def error(valid, idx):
    """Compute and print the type-1 / type-2 error of a cleaning method.

    Args:
        valid: validation DataFrame with a ``noise`` column (1 marks a noisy
            row, 0 a row whose label is correct).
        idx: boolean keep-mask over the rows of ``valid`` (True = not removed).

    Returns:
        (e1, e2), both expressed as a fraction of all rows in ``valid``:
        * e1 (type 1): noisy rows the cleaner failed to remove, i.e. kept rows
          with ``noise == 1``. Kept rows whose label is truly 1 cannot
          contribute because they are not noisy.
        * e2 (type 2): correctly labelled rows the cleaner removed, i.e.
          removed rows with ``noise == 0``.
    """
    total = valid.shape[0]

    kept_but_noisy = valid[(idx) & (valid.noise == 1)]
    e1 = kept_but_noisy.shape[0] / total

    removed_but_clean = valid[(~idx) & (valid.noise == 0)]
    e2 = removed_but_clean.shape[0] / total

    print(f"type 1 error: {e1:.4f}, type 2 error: {e2:.4f}")
    return e1, e2

In [None]:
def generate_experiment(key, fprs, noise_type='time-dependent', noise_amount=0.3,
                        split=0.7, shuffle=True, sort_key='creation_date'):
    """Build an "experiment dict" for one dataset.

    The dict bundles a dataset (with noise added while it is loaded) together
    with empty slots for every result the experiment produces, so that all
    experiments can be iterated over uniformly and their results stored in
    place once completed.

    Args:
        key: dataset identifier passed to ``prepare_noisy_dataset``; also
            stored as the experiment description.
        fprs: iterable of target false-positive rates; one result slot is
            created per target.
        noise_type, noise_amount, split, shuffle, sort_key: forwarded to
            ``prepare_noisy_dataset``. The defaults are the values this
            notebook has always used.

    Returns:
        dict with keys ``'description'``, ``'dataset'``, ``'cleaned_data'``,
        ``'cleaning_error'`` and ``'results'`` (the latter keyed by FPR target).
        Every result slot maps each cleaning type to its own empty list.
    """
    dataset = prepare_noisy_dataset(key, noise_type, noise_amount, split=split, shuffle=shuffle, sort_key=sort_key)
    dataset_stats(dataset)

    cleaning_types = ('none', 'cleanlab', 'micromodel', 'direct')

    def _empty_results():
        # Build fresh lists on every call: a shallow dict.copy() would make all
        # slots share the same list objects.
        return {cleaning_type: [] for cleaning_type in cleaning_types}

    experiment = {'description': key,
                  'dataset': dataset,
                  'cleaned_data': _empty_results(),
                  'cleaning_error': _empty_results(),
                  'results': {fpr: _empty_results() for fpr in fprs}
                 }

    return experiment
    

In [None]:
# Main experimental process. For each dataset we:
#   1. load the dataset/metadata from FDB and add noise (done in generate_experiment)
#   2. train a base model and a micro-model ensemble on the training split
#   3. clean the validation split with each method (none, cleanlab, micro-model, direct)
#      and record the type-1/type-2 cleaning error
#   4. for each FPR target, determine the "true" threshold (the threshold chosen
#      against the true validation labels), then read the FPR estimate each
#      cleaning method yields at that threshold and record its relative error

datasets = ['cloud', 'ieeecis', 'ccfraud', 'fraudecom', 'sparknov'] 
fpr_targets = [0.01, 0.02, 0.04, 0.08]

# one experiment dict per dataset; results are written back into these dicts below
experiments = {key: generate_experiment(key, fpr_targets) for key in datasets}

for dataset_key in experiments:
    
    experiment = experiments[dataset_key]
    dataset = experiment['dataset']
    
    # extract features and subsets from main dataset
    # (the test split is unpacked but not used anywhere in this cell)
    features, cat_features, label = dataset['features'], dataset['cat_features'], dataset['label']
    train, valid, test = dataset['train'], dataset['valid'], dataset['test']
    X_train, y_train = train[features], train[label].values.reshape(-1)
    X_valid, y_valid = valid[features], valid[label].values.reshape(-1)
    
    # y_true is true labels for validation data: start from the (noisy) labels
    # and set every row flagged valid.noise == 1 back to 1
    # NOTE(review): this presumes the injected noise only turns 1-labels into 0 -- confirm
    y_true = y_valid.copy()
    y_true[valid.noise==1] = 1
    
    # train base model on the (noisy) training split
    print(f"training model / cleaner for {dataset_key}\n")
    model_params = {
                'cat_features':cat_features,
                'verbose':False,
                'iterations':500
            }
    base_model = CatBoostClassifier(**model_params)
    base_model.fit(X_train, y_train)
    
    # create micro-model cleaning ensemble; the same model_params are passed
    # through as keyword arguments
    mm_cleaner = MicroModelEnsemble(CatBoostClassifier, num_clfs=16, score_type='preds_avg', **model_params)
    mm_cleaner.fit(X_train, y_train)
            
    # analyze total error in untouched noisy validation data:
    # valid.index.isin(valid.index) is an all-True keep-mask, i.e. nothing removed
    print(f"cleaning {dataset_key} dataset with 'none'\n")
    experiment['cleaning_error']['none'] = error(valid,valid.index.isin(valid.index))

    # clean dataset using various methods and record t1/t2 error
    # (idx is reused: it always holds the keep-mask of the most recent method)
    print(f"cleaning {dataset_key} dataset with 'cleanlab'\n")
    X_vl_cl, y_vl_cl, idx = cl_clean(base_model, X_valid, y_valid)
    experiment['cleaning_error']['cleanlab'] = error(valid,idx)

    # micro-model and direct cleaning may remove at most valid.noise.sum() rows
    print(f"cleaning {dataset_key} dataset with 'micromodel'\n")
    X_vl_mm, y_vl_mm, idx = mm_clean(mm_cleaner, X_valid, y_valid, valid.noise.sum())
    experiment['cleaning_error']['micromodel'] = error(valid,idx)

    print(f"cleaning {dataset_key} dataset with 'direct'\n")
    X_vl_dr, y_vl_dr, idx = direct_clean(base_model, X_valid, y_valid, valid.noise.sum())
    experiment['cleaning_error']['direct'] = error(valid,idx)

    # store cleaned datasets in result dict (replaces the placeholder dict
    # created by generate_experiment)
    experiment['cleaned_data'] = {
        'none': (X_valid, y_valid), 
        'cleanlab': (X_vl_cl, y_vl_cl),
        'micromodel': (X_vl_mm, y_vl_mm), 
        'direct': (X_vl_dr, y_vl_dr)
    }
    print(f"doing fpr validation for {dataset_key}\n")

    for fpr_target in experiment['results'].keys():
        print(f"\ntarget fpr: {fpr_target:.3f}")
        # threshold chosen against the true labels of the full validation set
        true_thresh = validate(base_model, X_valid, y_true, fpr_target)

        for cleaning_type, (X_clean, y_clean) in experiment['cleaned_data'].items():
            # FPR the base model shows at true_thresh on this cleaned set;
            # tpr and acc are returned by evaluate but not recorded
            fpr, tpr, acc = evaluate(base_model, X_clean, y_clean, true_thresh)
            # relative absolute deviation of the estimate from the target FPR
            err = np.abs((fpr - fpr_target)/fpr_target)
            experiment['results'][fpr_target][cleaning_type] = (fpr, err)
            print(f"{cleaning_type} estimate: {fpr:.3f}, err: {err:.3f}")
            

In [None]:
# helper functions to highlight best performance among experiment results

def _comparable_values(s):
    """Return ``s`` as numbers when every non-null entry parses as one.

    The result tables in this notebook store formatted strings such as
    ``'0.05'``; comparing those as text orders ``'10.00'`` before ``'2.00'``.
    If any non-null entry is not numeric, ``s`` is returned unchanged so the
    caller falls back to comparing the raw values.
    """
    if isinstance(s, pd.DataFrame):
        numeric = s.apply(pd.to_numeric, errors='coerce')
    else:
        numeric = pd.to_numeric(s, errors='coerce')
    # coercion must not have turned any real value into NaN
    parsed_everything = np.array_equal(numeric.isna().to_numpy(), s.isna().to_numpy())
    return numeric if parsed_everything else s


def highlight_max(s, props=''):
    """Styler helper: return ``props`` where ``s`` is at its maximum, '' elsewhere.

    Numeric strings are compared by value; NaN entries are ignored.
    """
    values = _comparable_values(s)
    return np.where(values == np.nanmax(values.values), props, '')

def highlight_min(s, props=''):
    """Styler helper: return ``props`` where ``s`` is at its minimum, '' elsewhere.

    Numeric strings are compared by value; NaN entries are ignored.
    """
    values = _comparable_values(s)
    return np.where(values == np.nanmin(values.values), props, '')

def bold_extreme_values(data, data_max=-1):
    r"""Wrap ``data`` in LaTeX ``\textbf{...}`` when it equals ``data_max``.

    Args:
        data: a single table cell value.
        data_max: the value to emphasise. Despite the name it is simply the
            value compared against; callers may pass a column minimum.

    Returns:
        ``'\textbf{<data>}'`` if ``data == data_max``, otherwise ``data``
        unchanged.
    """
    if data == data_max:
        # the backslash must be escaped: "\t" would be a TAB character
        return "\\textbf{%s}" % data

    return data

In [None]:
# load results of experiments into dataframes for examination

# internal cleaning-method key -> row label used in the result tables
row_key = {
    'none': 'None',
    'cleanlab': 'CleanLab',
    'micromodel': 'MicroModel',
    'direct': 'Direct',
}
rows = pd.Index(['None','CleanLab','MicroModel','Direct'])

# per-dataset FPR table: one (fpr, err) column pair per target FPR
fpr_cols = pd.MultiIndex.from_product(
    [fpr_targets, ['fpr', 'err']],
    names=['Target FPR', 'Metric'],
)
# single cleaning-error table: one (type 1, type 2) column pair per dataset
cln_error_cols = pd.MultiIndex.from_product(
    [datasets, ['type 1', 'type 2']],
    names=['dataset', 'Error Type'],
)

cln_error_df = pd.DataFrame(index=rows, columns=cln_error_cols)
fpr_dfs = {}

for dataset_key, experiment in experiments.items():
    fpr_df = pd.DataFrame(index=rows, columns=fpr_cols)

    # cleaning errors are stored as strings formatted to three decimals
    for cleaning_type, (e1, e2) in experiment['cleaning_error'].items():
        cln_error_df.loc[row_key[cleaning_type], (dataset_key,'type 1')] = f"{e1:.3f}"
        cln_error_df.loc[row_key[cleaning_type], (dataset_key,'type 2')] = f"{e2:.3f}"

    # FPR estimates use three decimals, relative errors two
    for fpr_target in experiment['results']:
        for cleaning_type, (fpr, err) in experiment['results'][fpr_target].items():
            fpr_df.loc[row_key[cleaning_type], (fpr_target, 'fpr')] = f"{fpr:.3f}"
            fpr_df.loc[row_key[cleaning_type], (fpr_target, 'err')] = f"{err:.2f}"

    fpr_dfs[dataset_key] = fpr_df

In [None]:
# display results in dataframes

# the cleaning-error table is shown without any highlighting
display(cln_error_df)

# one FPR table per dataset, with the smallest value in each 'err' column highlighted
for experiment_key in experiments:
    fpr_df = fpr_dfs[experiment_key]

    print(f"\n={experiment_key}=\n")

    display(
        fpr_df.style.apply(
            highlight_min,
            props='font-weight:bold;background-color:lightblue',
            axis=0,
            subset=[[t,'err'] for t in fpr_targets],
        )
    )

In [None]:
# export type-1/type-2 error results to LaTeX

# Every LaTeX backslash is written as '\\': sequences such as '\l' and '\e' are
# not valid Python escapes and trigger a warning on recent Python versions.
s = '\\begin{table}[!htbp]\n'
s = s + '\\caption {Cleaning Errors}\n'
s = s + '\\label{tab:cleaning_error}\n'
s = s + cln_error_df.to_latex(escape=False) + '\n'
s = s + '\\end{table}'
# Insert a vertical rule after each dataset's column pair. This assumes
# to_latex emitted a column spec of one 'l' per column (index + 2 per dataset).
print(s.replace('l'*(2*len(datasets)+1),('l'+'ll|'*(len(datasets)))[:-1]))


In [None]:
# export FPR estimation error results to LaTeX

columns = [(t,'err') for t in fpr_targets]

# Column spec rewrite: assumes to_latex emits one 'l' per column (index plus a
# fpr/err pair per target); a vertical rule is inserted after each pair.
plain_spec = 'l' * (2 * len(fpr_targets) + 1)
ruled_spec = ('l' + 'll|' * len(fpr_targets))[:-1]

for dataset_key in experiments:
    # Work on a copy so the frames kept in fpr_dfs stay free of LaTeX markup
    # and this cell gives the same output when it is run again.
    fpr_df = fpr_dfs[dataset_key].copy()
    for col in columns:
        # Cells hold formatted strings, so find the smallest entry by numeric
        # value (once per column) and bold every cell equal to it.
        numeric = pd.to_numeric(fpr_df[col], errors='coerce')
        if numeric.notna().any():
            best = fpr_df.loc[numeric.idxmin(), col]
        else:
            best = fpr_df[col].min()
        fpr_df[col] = fpr_df[col].apply(lambda data, best=best: bold_extreme_values(data, data_max=best))
    # LaTeX backslashes are doubled: '\l' and '\e' are not valid Python escapes.
    s = '\\begin{table}[!htbp]\n'
    s = s + '\\caption {{Results for Dataset \\texttt{{{}}}}}\n'.format(dataset_key)
    s = s + '\\label{{tab:exact_{}}}\n'.format(dataset_key)
    s = s + fpr_df.to_latex(escape=False) + '\n'
    s = s + '\\end{table}\n'
    print(s.replace(plain_spec, ruled_spec))