In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
%load_ext autoreload

%autoreload 2

import numpy as np
import pandas as pd
import json
import utils
import itertools
import warnings

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.linear_model')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
config_dicts = utils.get_config_dicts()

In [4]:
pd.DataFrame(config_dicts).sort_values(["dataset", "base_model"])

Unnamed: 0,folder,base_model,scoring_model,mask_model,dataset,sampling,d_path,z_path
0,EleutherAI-gpt-neo-125m_EleutherAI-gpt-neo-125...,EleutherAI-gpt-neo-125m,EleutherAI-gpt-neo-125m,t5-base,german,temp,results/permute/EleutherAI-gpt-neo-125m_Eleuth...,results/permute/EleutherAI-gpt-neo-125m_Eleuth...
2,EleutherAI-gpt-neo-125m_bert-base-cased_t5-bas...,EleutherAI-gpt-neo-125m,bert-base-cased,t5-base,german,temp,results/permute/EleutherAI-gpt-neo-125m_bert-b...,results/permute/EleutherAI-gpt-neo-125m_bert-b...
4,EleutherAI-gpt-neo-125m_gpt2-medium_t5-base_ge...,EleutherAI-gpt-neo-125m,gpt2-medium,t5-base,german,temp,results/permute/EleutherAI-gpt-neo-125m_gpt2-m...,results/permute/EleutherAI-gpt-neo-125m_gpt2-m...
6,EleutherAI-gpt-neo-125m_gpt2_t5-base_german_temp,EleutherAI-gpt-neo-125m,gpt2,t5-base,german,temp,results/permute/EleutherAI-gpt-neo-125m_gpt2_t...,results/permute/EleutherAI-gpt-neo-125m_gpt2_t...
8,EleutherAI-gpt-neo-125m_roberta-base_t5-base_g...,EleutherAI-gpt-neo-125m,roberta-base,t5-base,german,temp,results/permute/EleutherAI-gpt-neo-125m_robert...,results/permute/EleutherAI-gpt-neo-125m_robert...
20,gpt2_EleutherAI-gpt-neo-125m_t5-base_german_temp,gpt2,EleutherAI-gpt-neo-125m,t5-base,german,temp,results/permute/gpt2_EleutherAI-gpt-neo-125m_t...,results/permute/gpt2_EleutherAI-gpt-neo-125m_t...
22,gpt2_bert-base-cased_t5-base_german_temp,gpt2,bert-base-cased,t5-base,german,temp,results/permute/gpt2_bert-base-cased_t5-base_g...,results/permute/gpt2_bert-base-cased_t5-base_g...
24,gpt2_gpt2-medium_t5-base_german_temp,gpt2,gpt2-medium,t5-base,german,temp,results/permute/gpt2_gpt2-medium_t5-base_germa...,results/permute/gpt2_gpt2-medium_t5-base_germa...
26,gpt2_gpt2_t5-base_german_temp,gpt2,gpt2,t5-base,german,temp,results/permute/gpt2_gpt2_t5-base_german_temp/...,results/permute/gpt2_gpt2_t5-base_german_temp/...
28,gpt2_roberta-base_t5-base_german_temp,gpt2,roberta-base,t5-base,german,temp,results/permute/gpt2_roberta-base_t5-base_germ...,results/permute/gpt2_roberta-base_t5-base_germ...


In [5]:
config_df = pd.DataFrame(config_dicts)

In [6]:
base_models = sorted(set(config_df["base_model"].values))
datasets = sorted(set(config_df["dataset"].values))
drop_scoring_base_bools = [False, True]

In [7]:
ensemble_configs = list(dict(zip(("base_model", "drop_scoring_base", "dataset"), p)) for p in itertools.product(base_models, drop_scoring_base_bools, datasets))

In [8]:
def multi_step(df_train, df_test):
    def cond_mean(scores, thresholds):
        i = 0
        while i < len(thresholds) and scores[i] < thresholds[i]:
            i+=1
        return np.mean(scores[:(i+1)])
    
    df_test["ms"] = 0
    auc_scores = []
    for z in np.arange(0, 3, 0.1):
        thresholds = [np.mean(df_train.loc[df_train['y'] == 1, model]) - z*np.std(df_train.loc[df_train['y'] == 1, model]) for model in df_train.columns.values[:-2]]
        for i in range(df_train.shape[0]):
            df_train.loc[i, "ms"] = cond_mean(df_train.iloc[i, :df_train.shape[1]-2].to_list(), thresholds)
        auc_scores.append(roc_auc_score(df_train["y"], df_train["ms"]))
    
    z_star = 0.1*np.argmax(auc_scores)
    thresholds = [np.mean(df_train.loc[df_train['y'] == 1, model]) - z_star*np.std(df_train.loc[df_train['y'] == 1, model]) for model in df_train.columns.values[:-2]]
    for i in range(df_test.shape[0]):
        df_test.loc[i, "ms"] = cond_mean(df_test.iloc[i, :df_test.shape[1]-2].to_list(), thresholds)
    return df_test["ms"]

def get_aucs(**kwargs):
    df = utils.get_pred_df(**kwargs)
    
    y = df["y"]
    X = df.drop("y", axis=1).values

    auc_dict = {}
    best_params = {}
    
    for model in df.columns.values[:-1]:
        auc_dict[model] = roc_auc_score(y, df.loc[:, model])

    auc_dict["max"] = roc_auc_score(y, X.max(axis=1))
    auc_dict["mean"] =  roc_auc_score(y, X.mean(axis=1))
    auc_dict["median"] = roc_auc_score(y, np.median(X, axis=1))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    print("Training Logistic Regression")
    model_lr = LogisticRegression()
    param_grid_lr = {
        'C': [100, 10, 1, 0.1, 0.01],
        'penalty': ['none', 'l2']
    }
    gs_lr = GridSearchCV(estimator=model_lr, param_grid=param_grid_lr, cv=5)
    gs_lr.fit(X_train, y_train)
    auc_dict["lr"] = roc_auc_score(y_test, gs_lr.best_estimator_.predict_proba(X_test)[:, 1])
    print(kwargs, gs_lr.best_estimator_.coef_)
    
    print("Training Random Forest")
    model_rf = RandomForestClassifier()
    param_grid_rf = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30],
        'min_samples_split': [4, 6, 8],
        'n_estimators': [100, 300, 500, 1000]
    }
    gs_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, cv=3)
    gs_rf.fit(X_train, y_train)
    auc_dict["rf"] = roc_auc_score(y_test, gs_rf.best_estimator_.predict_proba(X_test)[:, 1])
    best_params["rf"] = gs_rf.best_params_
    
    print("Training Gaussian Naive Bayes")
    model_nb = GaussianNB()
    param_grid_nb = {'var_smoothing': np.logspace(0,-9, num=100)}
    gs_nb = GridSearchCV(estimator=model_nb, param_grid = param_grid_nb, cv=3)
    gs_nb.fit(X_train, y_train)
    auc_dict["nb"] = roc_auc_score(y_test, gs_nb.best_estimator_.predict_proba(X_test)[:, 1])
    best_params["nb"] = gs_nb.best_params_
    
    print("Training Support Vector Machine")
    model_sv = svm.SVC(probability=True)
    param_grid_sv = {'C': [0.1, 1, 10, 100, 1000], 
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf']
    }
    gs_sv = GridSearchCV(estimator=model_sv, param_grid=param_grid_sv, cv=3)
    gs_sv.fit(X_train, y_train)
    auc_dict["sv"] = roc_auc_score(y_test, gs_sv.best_estimator_.predict_proba(X_test)[:, 1])
    best_params["nb"] = gs_sv.best_params_
    
    print("Training Multi-step")
    model_list = ["gpt2-medium", "EleutherAI-gpt-neo-125m", "roberta-base", "gpt2", "bert-base-cased", "y"] #355, 125, 125, 124, 110
    df_train = pd.concat([pd.DataFrame(X_train, columns=df.columns.values[:-1]), pd.DataFrame(np.array(y_train), columns=["y"])], axis=1)
    df_test = pd.concat([pd.DataFrame(X_test, columns=df.columns.values[:-1]), pd.DataFrame(np.array(y_test), columns=["y"])], axis=1)
    df_train = df_train[[model for model in model_list if model in df_train.columns.values]]
    df_test = df_test[[model for model in model_list if model in df_train.columns.values]]
    auc_dict["ms"] = roc_auc_score(y_test, multi_step(df_train, df_test))
    
    return auc_dict, best_params

In [9]:
auc_results = [[c, get_aucs(**c)] for c in ensemble_configs]
best_params = [[r[0], r[1][1]] for r in auc_results]
df_ensemble_results_nested = pd.DataFrame([{**(r[0]), "aucs":r[1][0]} for r in auc_results])
##df_ensemble_results_nested = pd.DataFrame([{**c, "aucs":get_aucs(**c)} for c in ensemble_configs])

Training Logistic Regression
{'base_model': 'EleutherAI-gpt-neo-125m', 'drop_scoring_base': False, 'dataset': 'german'} [[ 6.68863842 -0.3663044  -5.98632602  6.56389637  2.2765553 ]]
Training Random Forest
Training Gaussian Naive Bayes
Training Support Vector Machine
Training Multi-step
Training Logistic Regression
{'base_model': 'EleutherAI-gpt-neo-125m', 'drop_scoring_base': False, 'dataset': 'xsum'} [[ 40.73669723  -6.29479348 -25.67268641  -6.01410881  10.24906843]]
Training Random Forest
Training Gaussian Naive Bayes
Training Support Vector Machine
Training Multi-step
Training Logistic Regression
{'base_model': 'EleutherAI-gpt-neo-125m', 'drop_scoring_base': True, 'dataset': 'german'} [[-0.35443463 -1.7137634   5.74343978  2.01387073]]
Training Random Forest
Training Gaussian Naive Bayes
Training Support Vector Machine
Training Multi-step
Training Logistic Regression
{'base_model': 'EleutherAI-gpt-neo-125m', 'drop_scoring_base': True, 'dataset': 'xsum'} [[-0.41043512 -8.62998649 

In [10]:
df_ensemble_results = df_ensemble_results_nested.join(df_ensemble_results_nested["aucs"].apply(pd.Series)).drop("aucs", axis=1)

In [11]:
best_params

[[{'base_model': 'EleutherAI-gpt-neo-125m',
   'drop_scoring_base': False,
   'dataset': 'german'},
  {'rf': {'bootstrap': True,
    'max_depth': 20,
    'min_samples_split': 8,
    'n_estimators': 100},
   'nb': {'C': 1, 'gamma': 1, 'kernel': 'rbf'}}],
 [{'base_model': 'EleutherAI-gpt-neo-125m',
   'drop_scoring_base': False,
   'dataset': 'xsum'},
  {'rf': {'bootstrap': True,
    'max_depth': 10,
    'min_samples_split': 4,
    'n_estimators': 300},
   'nb': {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}}],
 [{'base_model': 'EleutherAI-gpt-neo-125m',
   'drop_scoring_base': True,
   'dataset': 'german'},
  {'rf': {'bootstrap': True,
    'max_depth': 20,
    'min_samples_split': 6,
    'n_estimators': 100},
   'nb': {'C': 1, 'gamma': 1, 'kernel': 'rbf'}}],
 [{'base_model': 'EleutherAI-gpt-neo-125m',
   'drop_scoring_base': True,
   'dataset': 'xsum'},
  {'rf': {'bootstrap': True,
    'max_depth': 10,
    'min_samples_split': 4,
    'n_estimators': 500},
   'nb': {'C': 10, 'gamma': 0.01, 'ker

In [12]:
utils.get_auc_single().sort_values(["base_model", "scoring_model"])

Unnamed: 0,base_model,scoring_model,dataset,auc
0,EleutherAI-gpt-neo-125m,EleutherAI-gpt-neo-125m,german,0.953675
1,EleutherAI-gpt-neo-125m,EleutherAI-gpt-neo-125m,xsum,0.9982
2,EleutherAI-gpt-neo-125m,bert-base-cased,german,0.3697
3,EleutherAI-gpt-neo-125m,bert-base-cased,xsum,0.628175
6,EleutherAI-gpt-neo-125m,gpt2,german,0.823425
7,EleutherAI-gpt-neo-125m,gpt2,xsum,0.69605
4,EleutherAI-gpt-neo-125m,gpt2-medium,german,0.775925
5,EleutherAI-gpt-neo-125m,gpt2-medium,xsum,0.247875
8,EleutherAI-gpt-neo-125m,roberta-base,german,0.46575
9,EleutherAI-gpt-neo-125m,roberta-base,xsum,0.84035


In [13]:
table_3 = df_ensemble_results[df_ensemble_results['drop_scoring_base']==False].iloc[:, 3:8]
table_3_avg = np.mean(df_ensemble_results[df_ensemble_results['drop_scoring_base']==False].iloc[:, 3:8], axis=0)
print(table_3)
print(table_3_avg)

   EleutherAI-gpt-neo-125m  bert-base-cased  gpt2-medium      gpt2  \
0                 0.953675         0.369700     0.775925  0.823425   
1                 0.998200         0.628175     0.247875  0.696050   
4                 0.281400         0.274925     0.836850  0.988675   
5                 0.932550         0.563750     0.536425  0.978875   
8                 0.305825         0.283900     0.979325  0.966300   
9                 0.899875         0.553750     0.925200  0.901775   

   roberta-base  
0      0.465750  
1      0.840350  
4      0.558500  
5      0.757200  
8      0.594575  
9      0.648475  
EleutherAI-gpt-neo-125m    0.728588
bert-base-cased            0.445700
gpt2-medium                0.716933
gpt2                       0.892517
roberta-base               0.644142
dtype: float64


In [14]:
table_4_baseline = np.mean(df_ensemble_results[df_ensemble_results['drop_scoring_base']==True].iloc[:, 3:8], axis=1)
table_4_baseline_avg = np.mean(table_4_baseline)
print(table_4_baseline)
print(table_4_baseline_avg)
table_4_sum_stats = df_ensemble_results[df_ensemble_results['drop_scoring_base']==True].iloc[:, 8:11]
print(table_4_sum_stats)
print(np.mean(table_4_sum_stats, axis=0))

2     0.608700
3     0.603113
6     0.487919
7     0.697481
10    0.537650
11    0.750969
dtype: float64
0.6143052083333335
         max      mean    median
2   0.633600  0.730225  0.765425
3   0.318825  0.721425  0.739750
6   0.545200  0.434125  0.452700
7   0.557825  0.884550  0.910900
10  0.776175  0.683425  0.637700
11  0.899350  0.932275  0.878725
max       0.621829
mean      0.731004
median    0.730867
dtype: float64


In [15]:
print(df_ensemble_results)
table_5_supervised = df_ensemble_results[df_ensemble_results['drop_scoring_base']==True].iloc[:, 11:16]
print(table_5_supervised)
print(np.mean(table_5_supervised, axis=0))

                 base_model  drop_scoring_base dataset  \
0   EleutherAI-gpt-neo-125m              False  german   
1   EleutherAI-gpt-neo-125m              False    xsum   
2   EleutherAI-gpt-neo-125m               True  german   
3   EleutherAI-gpt-neo-125m               True    xsum   
4                      gpt2              False  german   
5                      gpt2              False    xsum   
6                      gpt2               True  german   
7                      gpt2               True    xsum   
8               gpt2-medium              False  german   
9               gpt2-medium              False    xsum   
10              gpt2-medium               True  german   
11              gpt2-medium               True    xsum   

    EleutherAI-gpt-neo-125m  bert-base-cased  gpt2-medium      gpt2  \
0                  0.953675         0.369700     0.775925  0.823425   
1                  0.998200         0.628175     0.247875  0.696050   
2                       NaN     