In [4]:
%load_ext autoreload

%autoreload 2

import numpy as np
import pandas as pd
import json
import utils
import itertools

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
config_df = pd.DataFrame(utils.get_config_dicts())
base_models = sorted(set(config_df["base_model"].values))
datasets = sorted(set(config_df["dataset"].values))
drop_scoring_base_bools = [False, True]
ensemble_configs = list(dict(zip(("base_model", "drop_scoring_base", "dataset"), p)) for p in itertools.product(base_models, drop_scoring_base_bools, datasets))

In [121]:
def multi_step(df_train, df_test):
    def cond_mean(scores, thresholds):
        i = 0
        while i < len(thresholds) and scores[i] < thresholds[i]:
            i+=1
        return np.mean(scores[:(i+1)])
    
    df_test["ms"] = 0
    auc_scores = []
    for z in np.arange(0, 3, 0.1):
        thresholds = [np.mean(df_train.loc[df_train['y'] == 1, model]) - z*np.std(df_train.loc[df_train['y'] == 1, model]) for model in df_train.columns.values[:-2]]
        for i in range(df_train.shape[0]):
            df_train.loc[i, "ms"] = cond_mean(df_train.iloc[i, :df_train.shape[1]-2].to_list(), thresholds)
        auc_scores.append(roc_auc_score(df_train["y"], df_train["ms"]))
    
    z_star = 0.1*np.argmax(auc_scores)
    thresholds = [np.mean(df_train.loc[df_train['y'] == 1, model]) - z_star*np.std(df_train.loc[df_train['y'] == 1, model]) for model in df_train.columns.values[:-2]]
    for i in range(df_test.shape[0]):
        df_test.loc[i, "ms"] = cond_mean(df_test.iloc[i, :df_test.shape[1]-2].to_list(), thresholds)
    return df_test["ms"]

def get_aucs(**kwargs):
    df = utils.get_pred_df(**kwargs)
    
    y = df["y"]
    X = df.drop("y", axis=1).values

    auc_dict = {}

    auc_dict["max"] = roc_auc_score(y, X.max(axis=1))
    auc_dict["mean"] =  roc_auc_score(y, X.mean(axis=1))
    auc_dict["median"] = roc_auc_score(y, np.median(X, axis=1))

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model_lr = LogisticRegression()
    model_lr.fit(X_train, y_train)
    auc_dict["lr"] = roc_auc_score(y_test, model_lr.predict_proba(X_test)[:, 1])

    model_rf = RandomForestClassifier()
    model_rf.fit(X_train, y_train)
    auc_dict["rf"] = roc_auc_score(y_test, model_rf.predict_proba(X_test)[:, 1])
    
    df_train = pd.concat([pd.DataFrame(X_train, columns=df.columns.values[:-1]), pd.DataFrame(np.array(y_train), columns=["y"])], axis=1)
    df_test = pd.concat([pd.DataFrame(X_test, columns=df.columns.values[:-1]), pd.DataFrame(np.array(y_test), columns=["y"])], axis=1)
    auc_dict["ms"] = roc_auc_score(y_test, multi_step(df_train, df_test))
    
    return auc_dict

In [126]:
df_ensemble_results_nested = pd.DataFrame([{**c, "aucs":get_aucs(**c)} for c in ensemble_configs])
df_ensemble_results = df_ensemble_results_nested.join(df_ensemble_results_nested["aucs"].apply(pd.Series)).drop("aucs", axis=1)
print(df_ensemble_results_nested['aucs'][0])
print(df_ensemble_results)
print(np.mean(df_ensemble_results, axis=0))

{'max': 0.7833, 'mean': 0.80465, 'median': 0.732375, 'lr': 1.0, 'rf': 1.0, 'ms': 0.9895330112721417}
                base_model  drop_scoring_base dataset      max      mean  \
0  EleutherAI-gpt-neo-125m              False    xsum  0.78330  0.804650   
1  EleutherAI-gpt-neo-125m               True    xsum  0.31355  0.464975   
2                     gpt2              False    xsum  0.81760  0.898925   
3                     gpt2               True    xsum  0.55780  0.785875   
4              gpt2-medium              False    xsum  0.92520  0.929075   
5              gpt2-medium               True    xsum  0.89935  0.911500   

     median        lr        rf        ms  
0  0.732375  1.000000  1.000000  0.989533  
1  0.464975  0.991971  0.975110  0.269771  
2  0.890325  1.000000  1.000000  0.948379  
3  0.785875  0.962424  0.948485  0.933737  
4  0.899400  0.898638  0.893229  0.863381  
5  0.911500  0.903266  0.896652  0.892104  
drop_scoring_base    0.500000
max                  0.71613

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [125]:
print(ensemble_configs.sort_values(["base_model", "scoring_model"]))

AttributeError: 'list' object has no attribute 'sort_values'

In [75]:
utils.get_auc_single().sort_values(["base_model", "scoring_model"])

Unnamed: 0,base_model,scoring_model,dataset,auc
0,EleutherAI-gpt-neo-125m,EleutherAI-gpt-neo-125m,xsum,0.9982
2,EleutherAI-gpt-neo-125m,gpt2,xsum,0.69605
1,EleutherAI-gpt-neo-125m,gpt2-medium,xsum,0.247875
6,gpt2,EleutherAI-gpt-neo-125m,xsum,0.93255
8,gpt2,gpt2,xsum,0.978875
7,gpt2,gpt2-medium,xsum,0.536425
3,gpt2-medium,EleutherAI-gpt-neo-125m,xsum,0.899875
5,gpt2-medium,gpt2,xsum,0.901775
4,gpt2-medium,gpt2-medium,xsum,0.9252


In [119]:
np.mean(df_ensemble_results, axis=0)

drop_scoring_base    0.500000
max                  0.670846
mean                 0.771654
median               0.758104
lr                   0.951975
rf                   0.961240
ms                   0.815915
dtype: float64