# Candidate Cleaning and Election Platform
This notebook facilitates model ensembling based on hard labels (label election) and soft labels (post-submission experiment). Furthermore, it postprocesses (cleans) the candidates.

## Cleaning
An `I-XX` tag is only valid directly after a `B-XX` or `I-XX` tag of the same entity type. Therefore, every `I-XX` that is preceded by `O` (or by a tag of a different entity type, i.e. `B-YY` or `I-YY`) gets changed to `B-XX`.


In [None]:
!pip install pandas

In [None]:
import numpy as np
import scipy
import csv

import scipy.special

In [None]:
# Mount Google Drive at /content/drive; the BASEPATH and FINAL directories
# configured in this notebook are located underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Alternative location, kept for reference:
#BASEPATH = '/content/drive/MyDrive/e_ML4NLP2/niclas_models_and_data/outputs/'
#FINAL = '/content/drive/MyDrive/e_ML4NLP2/niclas_models_and_data/outputs/Final/'

# Folder that contains one sub-directory of candidate TSV files per
# dataset/language combination.
BASEPATH = (
    '/content/drive/MyDrive/e_ML4NLP2/Anjas_models/'
    'Results_Post-submission_experiment/newseye_second_try/Election/'
)
# The ensembled output files are written to the "Final" sub-folder of BASEPATH.
FINAL = BASEPATH + 'Final/'

In [None]:
# Run configuration: selects the candidate folder that is read and is used
# to build the names of the output files.
dataset, lang, run = "newseye", "sv", "multilang"

# Sub-folder (appended to BASEPATH) holding the candidates of this
# dataset/language combination, e.g. "newseye_sv/".
datasetpath = dataset + "_" + lang + "/"

In [None]:
class Candidate:
    """Hard-label candidate: a TSV file with a token and its label per line.

    The first line of the file is treated as a header and skipped. Iterating
    over a candidate yields one ``(form, label)`` tuple per remaining line,
    where ``form`` is the first and ``label`` the second tab-separated field.

    Args:
        path: Path of the TSV file to read.
        clean: If true, every ``I-XX`` label that cannot continue the
            previously yielded label is rewritten to ``B-XX`` while iterating.
    """

    def __init__(self, path, clean=False):
        self.path = path
        self.clean = clean

        # Label of the previously yielded token ('O' before the first token).
        self.prev_label = 'O'

    def _clean(self, label):
        """Return ``label`` with an invalid ``I-XX`` turned into ``B-XX``.

        An ``I-XX`` label is invalid when the previous label is ``O`` or
        belongs to a different category (``B-YY`` / ``I-YY``). All other
        labels are returned unchanged.
        """
        if not label.startswith('I'):
            return label

        category = label[2:]
        if self.prev_label == 'O' or self.prev_label[2:] != category:
            return 'B-' + category
        return label

    def __iter__(self):
        """Yield ``(form, label)`` for every line after the header."""
        # Reset the state so that a second pass over the same candidate does
        # not start from the last label of the previous pass.
        self.prev_label = 'O'

        with open(self.path, 'r') as infile:
            infile.readline()  # skip the header line

            for line in infile:
                fields = line.strip().split('\t')
                form = fields[0]
                label = fields[1]

                if self.clean:
                    label = self._clean(label)

                # Remember the (possibly cleaned) label for the next token.
                self.prev_label = label
                yield (form, label)

class LogitCandidate:
    """Soft-label candidate: a TSV file with a token and its logits per line.

    The first line is a header; every following line holds the token in the
    first field and one numeric score per remaining field.
    """

    def __init__(self, path):
        self.path = path

    def header(self):
        """Return the tab-separated fields of the file's first line as a list."""
        with open(self.path, 'r') as handle:
            first_line = handle.readline()
            return first_line.strip().split("\t")

    def __iter__(self):
        """Yield ``(form, probs)`` per data line.

        ``probs`` is the softmax of the line's scores as a NumPy array.
        """
        with open(self.path, 'r') as handle:
            handle.readline()  # the header line carries no scores

            for raw in handle:
                form, *score_fields = raw.strip().split("\t")
                scores = np.array([float(field) for field in score_fields])
                yield (form, scipy.special.softmax(scores))



## Soft Label Ensembling (Post-Submission Experiment)

In [None]:
import os

# Folder with the logit files of the selected dataset/language.
candidate_dir = BASEPATH + datasetpath

# os.listdir returns the entries in arbitrary order; sorting makes the order
# of the candidates (and of the list displayed below) reproducible.
candidate_names = sorted(os.listdir(candidate_dir))
candidate_paths = [candidate_dir + name for name in candidate_names if name.endswith(".tsv")]
candidates = [LogitCandidate(path) for path in candidate_paths]
candidate_paths

In [None]:
# All candidate files must expose the same columns in the same order,
# otherwise the per-label scores cannot be added up position by position.
headers = [c.header() for c in candidates]
reference_header = headers[0]
if any(h != reference_header for h in headers):
    raise Exception("Header mismatch in the candidate Files")

# Label names are the header fields after the token column.
labels = candidates[0].header()[1:]
header = ["TOKEN", "NE-COARSE-LIT", "NE-COARSE-METO", "NE-FINE-LIT", "NE-FINE-METO", "NE-FINE-COMP", "NE-NESTED", "NEL-LIT", "NEL-METO", "MISC"]

outfilepath = FINAL + f"aauzh_bundle4_{dataset}_{lang}_softlabel.tsv"

# Label written for the previous token; used to repair invalid I- tags.
prev_label = 'O'

with open(outfilepath, "w", encoding="utf8") as outfile:
    writer = csv.writer(outfile, delimiter="\t")
    cols = len(header)
    # Only the first label column is predicted, the rest is filled with "_".
    filler = ["_"] * (cols - 2)

    writer.writerow(header)

    for i, votes in enumerate(zip(*candidates)):

        # All candidates have to agree on the token at this position.
        forms = {form for form, probs in votes}
        if len(forms) != 1:
            raise Exception(f"Mismatch on token {i}")

        # All candidates have to provide the same number of class scores.
        sizes = {len(probs) for form, probs in votes}
        if len(sizes) != 1:
            raise Exception(f"Probs/Logits don't add up at token {i}")

        token = votes[0][0]

        # Add up the probability vectors of all candidates and pick the
        # label with the highest total.
        votum_sacrum = np.zeros(votes[0][1].shape)
        for form, probs in votes:
            votum_sacrum += probs
        index = int(np.argmax(votum_sacrum))

        label = labels[index]

        # An I-XX tag that follows 'O' or a tag of another category is
        # turned into B-XX.
        if label.startswith('I') and (prev_label == 'O' or prev_label[2:] != label[2:]):
            label = 'B-' + label[2:]

        writer.writerow([token, label] + filler)
        prev_label = label


## Hard Label Election (Submission)

In [None]:
import os

# Folder with the hard-label files of the selected dataset/language.
candidate_dir = BASEPATH + datasetpath

# os.listdir returns the entries in arbitrary order; sorting makes the order
# in which the candidates cast their votes (and the list displayed below)
# reproducible.
candidate_names = sorted(os.listdir(candidate_dir))
candidate_paths = [candidate_dir + name for name in candidate_names if name.endswith(".tsv")]
candidates = [Candidate(path) for path in candidate_paths]
candidate_paths

In [None]:
import random
from collections import Counter

# Majority vote over the hard labels of all candidates. For every token the
# label with the most votes is written; ties are broken in favour of entity
# labels (i.e. 'O' is dropped from the tied labels) and then at random.
# Call random.seed(...) beforehand if reproducible output is required.

outfilepath = FINAL + f"aauzh_bundle4_{dataset}_{lang}_{run}.tsv"

# Only the first label column is predicted; the other eight are "_".
padding = '\t'.join(['_'] * 8)

# Label written for the previous token; used to repair invalid I- tags.
prev_label = 'O'

# The context manager closes the file even if an exception is raised below.
with open(outfilepath, 'w') as outfile:
    outfile.write("TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC")

    for i, votes in enumerate(zip(*candidates)):
        # All candidates have to agree on the token at this position.
        if len({vote[0] for vote in votes}) != 1:
            raise Exception(f"Mismatch on token {i}")

        token = votes[0][0]

        counter = Counter(vote[1] for vote in votes)
        if not counter:
            raise Exception(f"No label chosen at token on position {i}")

        # Collect every label that reached the highest number of votes.
        max_votes = max(counter.values())
        highest_labels = [clabel for clabel, n_votes in counter.items() if n_votes == max_votes]

        if len(highest_labels) == 1:
            label = highest_labels[0]
        else:
            # Tie: prefer an entity label over 'O' (which is not necessarily
            # among the tied labels), then pick one of the rest at random.
            if 'O' in highest_labels:
                highest_labels.remove('O')
            label = random.choice(highest_labels)

        # An I-XX tag that follows 'O' or a tag of another category is
        # turned into B-XX.
        if label.startswith('I') and (prev_label == 'O' or prev_label[2:] != label[2:]):
            label = 'B-' + label[2:]

        outfile.write("\n" + f"{token}\t{label}\t" + padding)
        prev_label = label