In [63]:
%load_ext autoreload

%autoreload 2

import numpy as np
import pandas as pd
import json
import utils
import itertools

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Build the evaluation grid: every (base_model, drop_scoring_base, dataset)
# combination found in the experiment configs returned by utils.
config_df = pd.DataFrame(utils.get_config_dicts())

base_models = sorted(set(config_df["base_model"].values))
datasets = sorted(set(config_df["dataset"].values))
drop_scoring_base_bools = [False, True]

# One keyword-argument dict per grid point; key order matches the original
# ("base_model", "drop_scoring_base", "dataset").
ensemble_configs = [
    {
        "base_model": base_model_name,
        "drop_scoring_base": drop_flag,
        "dataset": dataset_name,
    }
    for base_model_name, drop_flag, dataset_name in itertools.product(
        base_models, drop_scoring_base_bools, datasets
    )
]

In [69]:
def multi_step(df_train, df_test):
    """Score test rows with a threshold-gated cascade ("multi-step") ensemble.

    The model columns are consulted in the order they appear in ``df_train``.
    For each model a threshold ``mean - z * std`` is computed from that
    model's scores on the positive (``y == 1``) training rows. A row moves on
    to the next model only while the current model's score is below its
    threshold; the row's ensemble score is the mean of the scores of every
    model consulted (including the one that stopped the cascade).

    ``z`` is picked from ``np.arange(0, 3, 0.1)`` as the value that maximises
    ROC AUC on the training rows, and is then applied to the test rows.

    Args:
        df_train: Frame with one score column per model plus a label column
            ``"y"``. A pre-existing ``"ms"`` column is ignored.
        df_test: Frame containing (at least) the same model columns.

    Returns:
        pandas.Series named ``"ms"``, indexed like ``df_test``, holding the
        cascade score for each test row.

    Notes:
        Neither input frame is modified. Earlier revisions wrote an ``"ms"``
        column into both frames via ``.loc[i, "ms"]``, which (a) left the last
        model out of the first sweep iteration because ``df_train`` had no
        ``"ms"`` column yet, and (b) appended rows whenever the index was not
        ``0..n-1``.
    """
    def cond_mean(scores, thresholds):
        # Advance through the cascade while the current model is "unsure"
        # (score below its threshold), then average everything consulted.
        i = 0
        while i < len(thresholds) and scores[i] < thresholds[i]:
            i += 1
        return np.mean(scores[:(i + 1)])

    # Select model columns by name so the choice cannot drift with the
    # presence or absence of helper columns.
    model_cols = [col for col in df_train.columns if col not in ("y", "ms")]
    train_scores = df_train[model_cols].to_numpy(dtype=float)
    test_scores = df_test[model_cols].to_numpy(dtype=float)
    y_train = df_train["y"].to_numpy()

    # Per-model statistics of the positive class; loop-invariant across z.
    positive_scores = train_scores[y_train == 1]
    positive_mean = positive_scores.mean(axis=0)
    positive_std = positive_scores.std(axis=0)  # population std (ddof=0)

    def cascade(score_rows, z):
        thresholds = positive_mean - z * positive_std
        return np.array([cond_mean(row, thresholds) for row in score_rows], dtype=float)

    # Sweep z and keep the value with the best training AUC (first one on ties).
    z_grid = np.arange(0, 3, 0.1)
    auc_scores = [roc_auc_score(y_train, cascade(train_scores, z)) for z in z_grid]
    z_star = z_grid[int(np.argmax(auc_scores))]

    return pd.Series(cascade(test_scores, z_star), index=df_test.index, name="ms")

def get_aucs(**kwargs):
    """Compute ROC AUC for every individual model and for several ensembles.

    Args:
        **kwargs: Forwarded unchanged to ``utils.get_pred_df``. The returned
            frame must contain a label column ``"y"``; every other column is
            treated as one model's score.

    Returns:
        dict mapping a name to its ROC AUC:
          * one entry per score column (evaluated on all rows);
          * ``"max"``, ``"mean"``, ``"median"``: row-wise aggregates of the
            score columns (evaluated on all rows);
          * ``"lr"``, ``"rf"``, ``"nb"``, ``"sv"``: classifiers fitted on the
            training split and evaluated on the held-out split;
          * ``"ms"``: the ``multi_step`` cascade, tuned on the training split
            and evaluated on the held-out split.

    Notes:
        The aggregate baselines and the per-model AUCs use the full data,
        whereas the learned ensembles are scored on the held-out split only.
    """
    # One seed for the split and every stochastic estimator so that repeated
    # runs give identical numbers (the forest and the SVC probability
    # calibration were previously unseeded).
    seed = 1

    df = utils.get_pred_df(**kwargs)

    y = df["y"]
    # Derive feature columns by name instead of assuming "y" is the last one.
    feature_cols = [col for col in df.columns if col != "y"]
    X = df[feature_cols].values

    auc_dict = {}

    for model in feature_cols:
        auc_dict[model] = roc_auc_score(y, df.loc[:, model])

    auc_dict["max"] = roc_auc_score(y, X.max(axis=1))
    auc_dict["mean"] = roc_auc_score(y, X.mean(axis=1))
    auc_dict["median"] = roc_auc_score(y, np.median(X, axis=1))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    # Learned ensembles: fit on the training split, score the held-out split.
    classifiers = {
        "lr": LogisticRegression(),
        "rf": RandomForestClassifier(random_state=seed),
        "nb": GaussianNB(),
        "sv": svm.SVC(probability=True, random_state=seed),
    }
    for key, clf in classifiers.items():
        clf.fit(X_train, y_train)
        auc_dict[key] = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    # Cascade order for multi_step; the trailing comment gives the figures
    # recorded by the original author (presumably parameter counts in
    # millions - TODO confirm). "y" stays last so the label column is kept.
    model_list = ["gpt2-medium", "EleutherAI-gpt-neo-125m", "roberta-base", "gpt2", "bert-base-cased", "y"] #355, 125, 125, 124, 110

    def to_cascade_frame(X_part, y_part):
        # Fresh frame with a default 0..n-1 index, restricted to and ordered
        # by model_list. The copy makes it independent, not a slice.
        frame = pd.DataFrame(X_part, columns=feature_cols)
        frame["y"] = np.asarray(y_part)
        return frame[[name for name in model_list if name in frame.columns]].copy()

    df_train = to_cascade_frame(X_train, y_train)
    df_test = to_cascade_frame(X_test, y_test)
    auc_dict["ms"] = roc_auc_score(y_test, multi_step(df_train, df_test))

    return auc_dict

In [70]:
# Evaluate every configuration; each row keeps its config keys plus a nested
# dict of AUCs under "aucs".
df_ensemble_results_nested = pd.DataFrame([{**c, "aucs": get_aucs(**c)} for c in ensemble_configs])

# Flatten the nested AUC dicts into one column per metric (missing keys -> NaN).
df_ensemble_results = pd.concat(
    [
        df_ensemble_results_nested.drop("aucs", axis=1),
        pd.DataFrame(
            df_ensemble_results_nested["aucs"].tolist(),
            index=df_ensemble_results_nested.index,
        ),
    ],
    axis=1,
)
print(df_ensemble_results)

# Column means over the runs where the scoring base model was dropped.
# numeric_only=True skips the string columns (base_model, dataset), which
# np.mean(frame, axis=0) would otherwise try to average.
dropped_base_rows = df_ensemble_results[df_ensemble_results["drop_scoring_base"].astype(bool)]
print(dropped_base_rows.mean(numeric_only=True))

                 base_model  drop_scoring_base dataset  \
0   EleutherAI-gpt-neo-125m              False  german   
1   EleutherAI-gpt-neo-125m              False    xsum   
2   EleutherAI-gpt-neo-125m               True  german   
3   EleutherAI-gpt-neo-125m               True    xsum   
4                      gpt2              False  german   
5                      gpt2              False    xsum   
6                      gpt2               True  german   
7                      gpt2               True    xsum   
8               gpt2-medium              False  german   
9               gpt2-medium              False    xsum   
10              gpt2-medium               True  german   
11              gpt2-medium               True    xsum   

    EleutherAI-gpt-neo-125m  bert-base-cased  gpt2-medium      gpt2  \
0                  0.953675         0.369700     0.775925  0.823425   
1                  0.998200         0.628175     0.247875  0.696050   
2                       NaN     

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [72]:
# Display the flattened results table: one row per ensemble configuration,
# one column per individual-model or ensemble AUC.
df_ensemble_results

Unnamed: 0,base_model,drop_scoring_base,dataset,EleutherAI-gpt-neo-125m,bert-base-cased,gpt2-medium,gpt2,roberta-base,max,mean,median,lr,rf,nb,sv,ms
0,EleutherAI-gpt-neo-125m,False,german,0.953675,0.3697,0.775925,0.823425,0.46575,0.94855,0.89105,0.780375,0.95197,0.975164,0.923645,0.96757,0.818555
1,EleutherAI-gpt-neo-125m,False,xsum,0.9982,0.628175,0.247875,0.69605,0.84035,0.7833,0.92295,0.846525,1.0,1.0,1.0,1.0,0.425287
2,EleutherAI-gpt-neo-125m,True,german,,0.3697,0.775925,0.823425,0.46575,0.6336,0.730225,0.765425,0.780378,0.801314,0.773399,0.816913,0.741379
3,EleutherAI-gpt-neo-125m,True,xsum,,0.628175,0.247875,0.69605,0.84035,0.318825,0.721425,0.73975,0.981117,0.974343,0.945402,0.981527,0.360427
4,gpt2,False,german,0.2814,0.274925,0.83685,0.988675,0.5585,0.91195,0.825625,0.719425,0.999589,0.999179,0.995895,1.0,0.827997
5,gpt2,False,xsum,0.93255,0.56375,0.536425,0.978875,0.7572,0.8176,0.946425,0.93375,1.0,1.0,0.997126,1.0,0.676108
6,gpt2,True,german,0.2814,0.274925,0.83685,,0.5585,0.5452,0.434125,0.4527,0.934319,0.921182,0.899015,0.921182,0.827997
7,gpt2,True,xsum,0.93255,0.56375,0.536425,,0.7572,0.557825,0.88455,0.9109,0.996305,0.98789,0.982759,0.995074,0.671593
8,gpt2-medium,False,german,0.305825,0.2839,0.979325,0.9663,0.594575,0.930225,0.912325,0.811975,0.99179,0.989122,0.980706,0.98358,0.974138
9,gpt2-medium,False,xsum,0.899875,0.55375,0.9252,0.901775,0.648475,0.9252,0.95685,0.90695,0.969622,0.954228,0.93555,0.970854,0.947455
