Leung Wai Liu <br>
JPMC-SMM4H <br>
July 14, 2022 <br>
Task 2a Unweighted Average Ensembling

In [11]:
import pandas as pd
import numpy as np
from collections import Counter
from labels_to_ids import task7_labels_to_ids
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, confusion_matrix
from training_code import calculate_overall_performance_metrics
import os

In [12]:
# Shared settings for the ensemble: run counts, model names and data locations.

# Number of saved rounds per model, and the models taking part in the ensemble.
n_rnds = 5
models = ['bert-large-uncased', 'roberta-large']
n_models = len(models)

# Root folder of the saved predictions; files live under <model>/<round>/.
epoch_string = '../20_epochs_large_model/eval_testing/saved_eval_test_result_2a'

# Reference rows (dev.tsv) that the ensemble predictions are compared against.
original_df = pd.read_csv('../Datasets/dev.tsv', sep='\t')
n_rows = len(original_df)

# Label -> id mapping from the project, plus its inverse for turning an id
# back into a label string.
labels_to_ids = task7_labels_to_ids
ids_to_labels = {label_id: label for label, label_id in labels_to_ids.items()}



In [13]:
# Read every saved prediction file so that list_of_df[model_index][round_index]
# holds the DataFrame produced by that model in that round.
list_of_df = []
list_of_f1_score = []
sum_of_all_f1_score = 0

for model in models:
    # One frame per round, read from <epoch_string>/<model>/<round>/.
    frames_for_model = [
        pd.read_csv(
            f'{epoch_string}/{model}/{rnd}/unformatted_eval_test_result.tsv',
            sep='\t',
        )
        for rnd in range(n_rnds)
    ]
    list_of_df.append(frames_for_model)

In [14]:
# TAKING THE UNWEIGHTED AVERAGE OF DATA
#
# For every row of original_df, look up the stance predicted by each
# (model, round) run, convert each predicted label to its id, average the ids
# with equal weight, round the mean to an id and map that id back to a label.
#
# NOTE(review): averaging ids treats the labels as points on a numeric scale,
# so the result depends on how task7_labels_to_ids orders the labels -- confirm
# that ordering is intended for this task.

uw_avg_original_tweet_id_list = []
uw_avg_original_sentence_list = []
uw_avg_original_claim_list = []
uw_avg_original_label_list = []
uw_avg_original_numbered_label_list = []

uw_avg_predicted_number_results = []
uw_avg_predicted_results = []

# Divisor for the mean: derived from the configuration rather than hard-coded,
# so changing `models` or `n_rnds` cannot silently skew the average.
n_predictions = n_models * n_rnds
if n_predictions == 0:
    raise ValueError("No predictions to average: n_models * n_rnds is 0")

for index, original_row in original_df.iterrows():
    # getting the original values in the tweet
    original_tweet_id = original_row['id']
    original_sentence = original_row['Tweet']
    original_claim = original_row['Claim']
    original_label = original_row['Stance']

    # transferring the labels over to final list
    uw_avg_original_tweet_id_list.append(original_tweet_id)
    uw_avg_original_sentence_list.append(original_sentence)
    uw_avg_original_claim_list.append(original_claim)
    uw_avg_original_label_list.append(original_label)

    # running sum of the predicted label ids across every model and round
    specific_row_value = 0

    for model_num in range(n_models):
        for rnd_num in range(n_rnds):
            prediction_frame = list_of_df[model_num][rnd_num]

            # Match on both id and text; named `matches` so the outer
            # iterrows() variable is not overwritten.
            matches = prediction_frame.loc[
                (prediction_frame['id'] == original_tweet_id)
                & (prediction_frame['text'] == original_sentence)
            ]
            if matches.empty:
                raise ValueError(
                    "No prediction found for tweet id " + repr(original_tweet_id)
                    + " in model index " + str(model_num)
                    + ", round " + str(rnd_num)
                )

            # If several rows match, the first one is used.
            prediction = labels_to_ids[matches['Stance'].values[0]]
            specific_row_value += prediction

    specific_row_value = specific_row_value / n_predictions
    # Python's round() sends exact .5 ties to the nearest even integer
    # (0.5 -> 0, 1.5 -> 2), so tied votes resolve toward the even id.
    specific_row_result = int(round(specific_row_value))
    uw_avg_predicted_results.append(ids_to_labels[specific_row_result])
    

In [15]:
# Score the ensemble predictions and write the statistics to disk.

# Convert both label lists to ids before scoring.
uw_avg_original_numbered_label_list = [labels_to_ids[gold] for gold in uw_avg_original_label_list]
uw_avg_predicted_number_results = [labels_to_ids[pred] for pred in uw_avg_predicted_results]

num_overall_prediction_data = pd.DataFrame(
    zip(
        uw_avg_original_tweet_id_list,
        uw_avg_original_sentence_list,
        uw_avg_original_claim_list,
        uw_avg_original_numbered_label_list,
        uw_avg_predicted_number_results,
    ),
    columns=['tweet_id', 'text', 'Claim', 'Orig', 'Stance'],
)

# Project helper returning F1 / precision / recall for three groups (fm, saho, sc).
(
    fm_f1_score, fm_precision, fm_recall,
    saho_f1_score, saho_precision, saho_recall,
    sc_f1_score, sc_precision, sc_recall,
) = calculate_overall_performance_metrics(num_overall_prediction_data)

accuracy_result = accuracy_score(uw_avg_original_numbered_label_list, uw_avg_predicted_number_results)

# Equal-weight mean of the three F1 scores.
net_f1 = (1.0/3.0) * (fm_f1_score + saho_f1_score + sc_f1_score)

print("ACCURACY:", accuracy_result)
print("F1:", net_f1)

# Saving results to file
stats_dir = '../20_epochs_large_model/eval_testing/eval_validation_statistics/uw_avg_ensemble'
os.makedirs(stats_dir, exist_ok=True)

stats_lines = [
    "Accuracy: " + str(accuracy_result),
    "Net F1: " + str(net_f1),
    "Ind F1 Score: " + " , ".join(map(str, (fm_f1_score, saho_f1_score, sc_f1_score))),
    "Ind Precision Score: " + " , ".join(map(str, (fm_precision, saho_precision, sc_precision))),
    "Ind Recall Score: " + " , ".join(map(str, (fm_recall, saho_recall, sc_recall))),
]

with open(stats_dir + '/uw_avg_ensemble_valid_stats.txt', 'w') as file:
    # One statistic per line, each terminated by a newline.
    file.writelines(line + "\n" for line in stats_lines)


Running performance metrics
Finished running performance metrics
ACCURACY: 0.7683333333333333
F1: 0.7387175248890282


In [16]:
# Saving it as a dataframe
# The unformatted frame keeps the original dev-set label in 'Orig' next to the
# ensemble prediction in 'Stance'; the formatted frame drops 'Orig'.
unformatted_uw_avg_prediction_data = pd.DataFrame(
    zip(
        uw_avg_original_tweet_id_list,
        uw_avg_original_sentence_list,
        uw_avg_original_claim_list,
        uw_avg_original_label_list,
        uw_avg_predicted_results,
    ),
    columns=['id', 'text', 'Claim', 'Orig', 'Stance'],
)
formatted_uw_avg_prediction_data = unformatted_uw_avg_prediction_data.drop(columns=['Orig'])

# Saving it as a tsv file
# Create the directory the files are actually written to (uw_avg_ensemble), not
# just its parent, so this cell does not depend on another cell having made it.
uw_avg_output_dir = '../20_epochs_large_model/eval_testing/eval_validation_statistics/uw_avg_ensemble'
os.makedirs(uw_avg_output_dir, exist_ok=True)

unformatted_uw_avg_prediction_data.to_csv(uw_avg_output_dir + '/unformatted_uw_avg_data.tsv', sep='\t', index=False)
formatted_uw_avg_prediction_data.to_csv(uw_avg_output_dir + '/formatted_uw_avg_data.tsv', sep='\t', index=False)