# Build Ensembles
This Notebook collects select target model epochs into ensembles, aggregates their predictions using a majority vote strategy, scores their predictions against the target task test set, and saves the results.

## Imports & Settings

In [1]:
# Update working directory to parent so that we may use our custom functions
import os
os.chdir('..')
# os.getcwd( )

In [2]:
import re
import itertools
import pandas as pd
import matplotlib.pyplot as plt

from ast import literal_eval
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def most_common(lst):
    return max(set(lst), key=lst.count)

## Load Test Data

In [3]:
# load test data
dataset_path = "data/target_iSarcasmEval/itesd_iSarcasmEval_balanced.hf"
datasets = load_from_disk(dataset_path)

iSarcasm_test_df = datasets['test'].to_pandas()
true_preds = iSarcasm_test_df['label'].to_list()

## Load Model Results Dataset

In [4]:
results_df = pd.read_csv('05_results/results_target.csv', converters={'predictions': pd.eval})
results_df.head()

Unnamed: 0,model_name,model_epoch,test_accuracy,test_f1,predictions
0,control_iSarcasm_01,E01_A0.75_F0.41,0.835714,0.454976,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,control_iSarcasm_01,E02_A0.77_F0.39,0.825714,0.452915,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
2,control_iSarcasm_01,E03_A0.75_F0.45,0.784286,0.430189,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,control_iSarcasm_01,E04_A0.76_F0.5,0.757857,0.402116,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,control_iSarcasm_01,E05_A0.74_F0.45,0.768571,0.408759,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [5]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   model_name     210 non-null    object 
 1   model_epoch    210 non-null    object 
 2   test_accuracy  210 non-null    float64
 3   test_f1        210 non-null    float64
 4   predictions    210 non-null    object 
dtypes: float64(2), object(3)
memory usage: 8.3+ KB


## Create Filtered Datasets

We will construct two different sets of ensembles. Recall that we trained each model for 10 epochs. The first will use model epochs as inducers, selected according to best F1 scores. The second will use model epochs as inducers, selected according to best accuracy scores. As such, we need four datasets:
1. The best epoch from each model_name by highest F1 score
2. The best epoch from each model_name by highest Accuracy score
3. The best epoch from each control model by highest F1 score
4. The best epoch from each control model by highest Accuracy score

### F1 Filtering
Below, we filter the dataset to retrieve only the best models by F1 score.

In [6]:
# get row index for each epoch with best f1 score for each model_name
best_f1_idx = results_df.groupby(['model_name'])['test_f1'].transform(max) == results_df['test_f1']

best_f1_df = results_df[best_f1_idx]

# remove control models
patternDel = ".*control.*"
f1_control_filter = best_f1_df['model_name'].str.contains(patternDel)
f1_estimators_df = best_f1_df[~f1_control_filter]

f1_estimators_df

Unnamed: 0,model_name,model_epoch,test_accuracy,test_f1,predictions
36,target-iSarcasm_inter-SARC_01,E07_A0.77_F0.45,0.810714,0.455852,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."
43,target-iSarcasm_inter-SARC_02,E04_A0.77_F0.4,0.805,0.457256,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
53,target-iSarcasm_inter-SARC_03,E04_A0.8_F0.4,0.839286,0.494382,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
69,target-iSarcasm_inter-IMDB_01,E10_A0.73_F0.44,0.730714,0.370618,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
79,target-iSarcasm_inter-IMDB_02,E10_A0.73_F0.32,0.81,0.40625,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
87,target-iSarcasm_inter-IMDB_03,E08_A0.75_F0.32,0.812857,0.382075,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
96,target-iSarcasm_inter-HellaSwag_01,E07_A0.75_F0.34,0.820714,0.386308,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
108,target-iSarcasm_inter-HellaSwag_02,E09_A0.76_F0.36,0.828571,0.402985,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
117,target-iSarcasm_inter-HellaSwag_03,E08_A0.74_F0.27,0.847143,0.40884,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
127,target-iSarcasm_inter-CosmosQA_01,E08_A0.74_F0.28,0.830714,0.4,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Accuracy Filtering
Below, we filter the dataset to retrieve only the best models by accuracy score.

In [7]:
# get row index for each epoch with best accuracy score for each model_name
best_acc_idx = results_df.groupby(['model_name'])['test_accuracy'].transform(max) == results_df['test_accuracy']

best_acc_df = results_df[best_acc_idx]

# remove control models
patternDel = ".*control.*"
acc_control_filter = best_acc_df['model_name'].str.contains(patternDel)
acc_estimators_df = best_acc_df[~acc_control_filter]

acc_estimators_df

Unnamed: 0,model_name,model_epoch,test_accuracy,test_f1,predictions
35,target-iSarcasm_inter-SARC_01,E06_A0.77_F0.38,0.822857,0.431193,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
47,target-iSarcasm_inter-SARC_02,E08_A0.78_F0.42,0.826429,0.456376,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
53,target-iSarcasm_inter-SARC_03,E04_A0.8_F0.4,0.839286,0.494382,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
62,target-iSarcasm_inter-IMDB_01,E03_A0.73_F0.36,0.765714,0.330612,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
77,target-iSarcasm_inter-IMDB_02,E08_A0.76_F0.3,0.814286,0.375,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
87,target-iSarcasm_inter-IMDB_03,E08_A0.75_F0.32,0.812857,0.382075,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
98,target-iSarcasm_inter-HellaSwag_01,E09_A0.75_F0.2,0.847143,0.33125,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
104,target-iSarcasm_inter-HellaSwag_02,E05_A0.75_F0.21,0.853571,0.373089,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
117,target-iSarcasm_inter-HellaSwag_03,E08_A0.74_F0.27,0.847143,0.40884,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
126,target-iSarcasm_inter-CosmosQA_01,E07_A0.76_F0.26,0.855714,0.356688,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Control Filtering
Below, we get the control models to display.

In [8]:
# we can use the previously-created filter to get the control models by best F1
f1_control_df = best_f1_df[f1_control_filter]
f1_control_df

Unnamed: 0,model_name,model_epoch,test_accuracy,test_f1,predictions
0,control_iSarcasm_01,E01_A0.75_F0.41,0.835714,0.454976,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14,control_iSarcasm_02,E05_A0.75_F0.3,0.842857,0.441624,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
22,control_iSarcasm_03,E03_A0.78_F0.32,0.85,0.435484,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [9]:
# we can use the previously-created filter to get the control models by best F1
acc_control_df = best_acc_df[acc_control_filter]
acc_control_df

Unnamed: 0,model_name,model_epoch,test_accuracy,test_f1,predictions
8,control_iSarcasm_01,E09_A0.76_F0.33,0.841429,0.415789,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14,control_iSarcasm_02,E05_A0.75_F0.3,0.842857,0.441624,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
22,control_iSarcasm_03,E03_A0.78_F0.32,0.85,0.435484,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


# Prep Ensembles' Inducer Combinations

In order to get sets of three models, each containing all three top epochs for each model iteration, we follow these steps:
1. make list of core model names
2. Get a superset of all unique combinations of 3 and 5 core model names - itertools.combinations
3. Triplicate each core model name and add _01, _02, _03 (as per our naming convention)
4. Collect sets of predictions via each unique collection, and count most frequent predictions per index.

In [10]:
# 1. make list of core model names
estimator_names = ['target-iSarcasm_inter-SARC', 
                   'target-iSarcasm_inter-XED-fine', 
                   'target-iSarcasm_inter-XED-binary', 
                   'target-iSarcasm_inter-IMDB', 
                   'target-iSarcasm_inter-HellaSwag',
                   'target-iSarcasm_inter-CosmosQA']

# 2. Create a superset of all unique combinations of 3 and 5 core model names - itertools.combinations
combinations = []

# get all possible unique combinations of estimator names
for r in range(len(estimator_names)+1):
    for combination in itertools.combinations(set(estimator_names), r):
        combinations.append(combination)

# extract those combinations that are 3 or 5 models long
three_estimators = []
five_estimators = []
for i in combinations:
    if len(i) == 3: 
        three_estimators.append(i)
    if len(i) == 5: 
        five_estimators.append(i)

In [11]:
# double-check length and contents
print(f"three_estimators Example: {three_estimators[0]}")
print(f"Length of three_estimators: {len(three_estimators)}")
print(f"Length of five_estimators: {len(five_estimators)}")

three_estimators Example: ('target-iSarcasm_inter-IMDB', 'target-iSarcasm_inter-XED-fine', 'target-iSarcasm_inter-HellaSwag')
Length of three_estimators: 20
Length of five_estimators: 6


In [12]:
# 3. we are working from best df, so for each list created in step 2, triplicate each core model name and add _01, _02, _03
# this allows us to query the df for those specific model predictions in unique sets of three, containing all three best iterations of each model
named_three_estimators = []
for i in three_estimators:
    triplets = []
    for j in i:
        triplets.append(j + "_01")
        triplets.append(j + "_02")
        triplets.append(j + "_03")
    named_three_estimators.append(tuple(triplets))
    
named_five_estimators = []
for i in five_estimators:
    triplets = []
    for j in i:
        triplets.append(j + "_01")
        triplets.append(j + "_02")
        triplets.append(j + "_03")
    named_five_estimators.append(tuple(triplets))

In [13]:
# double-check length
print(f"Length of named_three_estimators: {len(named_three_estimators)}")
print(f"Length of named_five_estimators: {len(named_five_estimators)}")

Length of named_three_estimators: 20
Length of named_five_estimators: 6


## Get Ensembles' Results

In [14]:
# takes the best estimators dataframe, the list of tuples containing the named estimators
# and the list of of lists containing core estimator names for each combination (those without the _01, _02, _03)
def get_ensemble_results(estimators_df, named_estimator_groups_list, estimator_groups_list):
    results_dict = {"ensemble_id":[], 
                    "inducers": [], 
                    "inducers_spec":[], 
                    "dataset":[], 
                    "test_accuracy":[], 
                    "test_f1":[]}
    
    # for each set of inducers in estimator_groups_list
    for i, v in enumerate(named_estimator_groups_list):
        # collect the predictions of inducers within set (v) into a list of lists
        predictions = [estimators_df.loc[estimators_df['model_name']==m, 'predictions'].tolist()[0] for m in v]

        # collect the predictions of inducers within set (v) into groups by index
        # ie, predictions at index 0 from each of the 9 inducers will be grouped together
        pred_groups = []
        for j in range(len(true_preds)):
            pred_stage = []
            for prediction in predictions:
                pred_stage.append(prediction[j])
            pred_groups.append(pred_stage)

        # count must common prediction at each index
        ensemble_preds = []
        for k in pred_groups:
            ensemble_preds.append(most_common(k))
        
        # calculate accuracy and f1
        acc = accuracy_score(true_preds, ensemble_preds)
        f1 = f1_score(true_preds, ensemble_preds, average='binary')
        
        # store results
        results_dict["ensemble_id"].append(f"ensemble_{i}")
        results_dict["inducers"].append(estimator_groups_list[i])
        results_dict["inducers_spec"].append(v)
        results_dict["dataset"].append("iSarcasm_test")
        results_dict["test_accuracy"].append(acc)
        results_dict["test_f1"].append(f1)
        
    return results_dict

### Best F1 Ensembles Results

In [15]:
f1_TM_ensemble_results = get_ensemble_results(f1_estimators_df, named_three_estimators, three_estimators)
f1_FM_ensemble_results = get_ensemble_results(f1_estimators_df, named_five_estimators, five_estimators)

### Best Accuracy Ensembles Results

In [16]:
acc_TM_ensemble_results = get_ensemble_results(acc_estimators_df, named_three_estimators, three_estimators)
acc_FM_ensemble_results = get_ensemble_results(acc_estimators_df, named_five_estimators, five_estimators)

### Prep Results for Exporting

#### Best F1 Ensembles Results Prep

In [17]:
# three-model ensembles csv prep
f1_TM_ensemble_results_df = pd.DataFrame.from_dict(f1_TM_ensemble_results)
f1_TM_ensemble_results_df = f1_TM_ensemble_results_df.sort_values(by=['test_f1'], ascending=False)
f1_TM_ensemble_results_df['ensemble_type'] = "best-F1_three-model"

# five-model ensembles csv prep
f1_FM_ensemble_results_df = pd.DataFrame.from_dict(f1_FM_ensemble_results)
f1_FM_ensemble_results_df = f1_FM_ensemble_results_df.sort_values(by=['test_f1'], ascending=False)
f1_FM_ensemble_results_df['ensemble_type'] = "best-F1_five-model"

#### Best Accuracy Ensembles Results Prep

In [18]:
# three-model ensembles csv prep
acc_TM_ensemble_results_df = pd.DataFrame.from_dict(acc_TM_ensemble_results)
acc_TM_ensemble_results_df = acc_TM_ensemble_results_df.sort_values(by=['test_accuracy'], ascending=False)
acc_TM_ensemble_results_df['ensemble_type'] = "best-acc_three-model"

# five-model ensembles csv prep
acc_FM_ensemble_results_df = pd.DataFrame.from_dict(acc_FM_ensemble_results)
acc_FM_ensemble_results_df = acc_FM_ensemble_results_df.sort_values(by=['test_accuracy'], ascending=False)
acc_FM_ensemble_results_df['ensemble_type'] = "best-acc_five-model"

In [19]:
# double-check results are as expected
pd.set_option('display.max_colwidth', None)
f1_TM_ensemble_results_df.head()

Unnamed: 0,ensemble_id,inducers,inducers_spec,dataset,test_accuracy,test_f1,ensemble_type
19,ensemble_19,"(target-iSarcasm_inter-SARC, target-iSarcasm_inter-CosmosQA, target-iSarcasm_inter-XED-binary)","(target-iSarcasm_inter-SARC_01, target-iSarcasm_inter-SARC_02, target-iSarcasm_inter-SARC_03, target-iSarcasm_inter-CosmosQA_01, target-iSarcasm_inter-CosmosQA_02, target-iSarcasm_inter-CosmosQA_03, target-iSarcasm_inter-XED-binary_01, target-iSarcasm_inter-XED-binary_02, target-iSarcasm_inter-XED-binary_03)",iSarcasm_test,0.863571,0.516456,best-F1_three-model
13,ensemble_13,"(target-iSarcasm_inter-XED-fine, target-iSarcasm_inter-SARC, target-iSarcasm_inter-CosmosQA)","(target-iSarcasm_inter-XED-fine_01, target-iSarcasm_inter-XED-fine_02, target-iSarcasm_inter-XED-fine_03, target-iSarcasm_inter-SARC_01, target-iSarcasm_inter-SARC_02, target-iSarcasm_inter-SARC_03, target-iSarcasm_inter-CosmosQA_01, target-iSarcasm_inter-CosmosQA_02, target-iSarcasm_inter-CosmosQA_03)",iSarcasm_test,0.857857,0.511057,best-F1_three-model
14,ensemble_14,"(target-iSarcasm_inter-XED-fine, target-iSarcasm_inter-SARC, target-iSarcasm_inter-XED-binary)","(target-iSarcasm_inter-XED-fine_01, target-iSarcasm_inter-XED-fine_02, target-iSarcasm_inter-XED-fine_03, target-iSarcasm_inter-SARC_01, target-iSarcasm_inter-SARC_02, target-iSarcasm_inter-SARC_03, target-iSarcasm_inter-XED-binary_01, target-iSarcasm_inter-XED-binary_02, target-iSarcasm_inter-XED-binary_03)",iSarcasm_test,0.848571,0.506977,best-F1_three-model
17,ensemble_17,"(target-iSarcasm_inter-HellaSwag, target-iSarcasm_inter-SARC, target-iSarcasm_inter-XED-binary)","(target-iSarcasm_inter-HellaSwag_01, target-iSarcasm_inter-HellaSwag_02, target-iSarcasm_inter-HellaSwag_03, target-iSarcasm_inter-SARC_01, target-iSarcasm_inter-SARC_02, target-iSarcasm_inter-SARC_03, target-iSarcasm_inter-XED-binary_01, target-iSarcasm_inter-XED-binary_02, target-iSarcasm_inter-XED-binary_03)",iSarcasm_test,0.861429,0.502564,best-F1_three-model
10,ensemble_10,"(target-iSarcasm_inter-XED-fine, target-iSarcasm_inter-HellaSwag, target-iSarcasm_inter-SARC)","(target-iSarcasm_inter-XED-fine_01, target-iSarcasm_inter-XED-fine_02, target-iSarcasm_inter-XED-fine_03, target-iSarcasm_inter-HellaSwag_01, target-iSarcasm_inter-HellaSwag_02, target-iSarcasm_inter-HellaSwag_03, target-iSarcasm_inter-SARC_01, target-iSarcasm_inter-SARC_02, target-iSarcasm_inter-SARC_03)",iSarcasm_test,0.855714,0.502463,best-F1_three-model


#### Concat & Export

In [20]:
# concat
frames = [f1_TM_ensemble_results_df, 
          f1_FM_ensemble_results_df, 
          acc_TM_ensemble_results_df,
          acc_FM_ensemble_results_df]

ensembles_results_df = pd.concat(frames)

In [21]:
ensembles_results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52 entries, 19 to 1
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ensemble_id    52 non-null     object 
 1   inducers       52 non-null     object 
 2   inducers_spec  52 non-null     object 
 3   dataset        52 non-null     object 
 4   test_accuracy  52 non-null     float64
 5   test_f1        52 non-null     float64
 6   ensemble_type  52 non-null     object 
dtypes: float64(2), object(5)
memory usage: 3.2+ KB


In [22]:
# export result CSVs
ensembles_results_df.to_csv("05_results/results_ensemble.csv", index=False)