In [40]:
import math
import json
from time import time
from random import shuffle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt


In [41]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [42]:
# Root of the mounted Google Drive. Paths elsewhere in this notebook are built
# by plain string concatenation (link + 'data/...'), so the trailing slash is
# required.
link = '/content/gdrive/MyDrive/'

### NN ARCHITECTURES

In [43]:
# Learn semantic relatedness between two feature vectors
class SemRelNN(nn.Module):
    """Map the dot product of two feature vectors to score log-probabilities.

    The row-wise dot product of the two inputs is fed, as a single scalar
    feature, to a linear layer with ``max_score + 1`` outputs followed by a
    log-softmax over the class dimension.
    """

    def __init__(self, max_score=3):
        super().__init__()
        # One class per integer score in 0..max_score.
        self.classes = max_score + 1
        self.dense = nn.Linear(1, self.classes, bias=True)
        self.soft = nn.LogSoftmax(dim=1)

    def forward(self, ins1, ins2):
        """Return a (rows, classes) tensor of log-probabilities."""
        # Element-wise product summed over axis 1 == per-row dot product,
        # reshaped to a column so it matches the 1-feature linear layer.
        dot = (ins1 * ins2).sum(axis=1).reshape((-1, 1))
        return self.soft(self.dense(dot))


In [44]:
# Transfer learning from the pre-trained models
class FineTuneNN(nn.Module):
    """Two stacked linear layers, each followed by the same activation.

    Args:
        input_dim: size of the incoming feature vectors.
        output_dim: size of both the hidden and the output representation.
        act: activation callable applied after each linear layer.
    """

    def __init__(self, input_dim, output_dim=512, act=torch.tanh):
        super().__init__()
        self.in_dim, self.out_dim = input_dim, output_dim
        self.dense1 = nn.Linear(input_dim, output_dim, bias=True)
        self.dense2 = nn.Linear(output_dim, output_dim, bias=True)
        self.act_fn = act

    def forward(self, inputs):
        """Return act(dense2(act(dense1(inputs))))."""
        hidden = self.act_fn(self.dense1(inputs))
        return self.act_fn(self.dense2(hidden))


In [45]:
class QBESciAR(nn.Module):
    """Query-by-example scorer: a shared fine-tuner followed by a score head.

    Both inputs pass through the same ``FineTuneNN`` (which keeps the feature
    dimension unchanged); ``SemRelNN`` then turns the pair into
    log-probabilities over the ``max_score + 1`` score classes. The instance
    also carries the training / validation history lists that ``Train``
    fills in.
    """

    def __init__(self, baseline, facet, input_dim, max_score=3, act=torch.tanh):
        super().__init__()
        self.baseline, self.facet = baseline, facet
        self.fine_tuner = FineTuneNN(input_dim, input_dim, act)
        self.score_gen = SemRelNN(max_score)
        # History containers (per optimisation step / per epoch).
        self.train_acc_step, self.train_acc_epoch = [], []
        self.train_loss_step, self.train_loss_epoch = [], []
        self.valid_acc, self.valid_loss = [], []

    def forward(self, ins1, ins2):
        """Score each (ins1[i], ins2[i]) pair; returns log-probabilities."""
        return self.score_gen(self.fine_tuner(ins1), self.fine_tuner(ins2))


### DATASET UTILITIES

In [46]:
def ResampleDataset(dataset):
    """Balance the score classes of ``dataset`` with SMOTE oversampling.

    Args:
        dataset: non-empty sequence of ``(vec1, vec2, score)`` triples where
            both vectors have the same length.

    Returns:
        A list of ``(vec1, vec2, score)`` triples (NumPy slices / scalars)
        produced by SMOTE, i.e. the resampled, class-balanced data.
    """
    dim = len(dataset[0][0])

    # Join the two vectors of every pair into one row for SMOTE.
    # np.concatenate joins lists and ndarrays alike, whereas `+` would add
    # ndarrays element-wise instead of joining them.
    X = np.array([np.concatenate((s[0], s[1])) for s in dataset])
    Y = np.array([s[2] for s in dataset])
    X, Y = SMOTE(random_state=42).fit_resample(X, Y)

    # Split every resampled row back into its two halves.
    return [(X[i][:dim], X[i][dim:], Y[i]) for i in range(Y.shape[0])]


In [47]:
def PrepareDataset(baseline, facet):
    """Build the (query vector, candidate vector, score) dataset for a facet.

    Reads the relevance annotations for ``facet`` and the feature vectors
    produced by ``baseline`` from ``link + 'data/'``, pairs every query with
    each of its candidates, and returns the result of ``ResampleDataset``.

    NOTE(review): resampling happens here, before ``Implement`` splits the
    data, so synthetic samples can end up in a different split than the
    samples they were interpolated from — confirm this is intended.
    """
    data_dir = link + 'data/'

    # Context managers guarantee the files are closed even if parsing fails.
    with open(data_dir + 'test-pid2anns-csfcube-' + facet + '.json') as rank_data_file:
        rank_data = json.load(rank_data_file)
    with open(data_dir + baseline + '/' + facet + '.json') as query_feature_data_file:
        query_feature_data = json.load(query_feature_data_file)
    with open(data_dir + baseline + '/all.json') as cand_feature_data_file:
        cand_feature_data = json.load(cand_feature_data_file)

    data = []
    for q, annotations in rank_data.items():
        qvec = query_feature_data[q]
        for i, cand in enumerate(annotations['cands']):
            score = annotations['relevance_adju'][i]
            data.append((qvec, cand_feature_data[cand], score))

    return ResampleDataset(data)


In [48]:
def ReFormatAbstractData():
    """Rewrite the abstracts JSONL file as one JSON object keyed by paper id.

    Each line of ``abstracts-csfcube-preds.jsonl`` is parsed as a JSON record
    and stored under its ``paper_id``; the resulting mapping is dumped to
    ``abstracts-csfcube-preds.json`` in the same directory.
    """
    with open(link+'data/abstracts-csfcube-preds.jsonl', 'r') as abstract_file:
        raw_lines = abstract_file.readlines()

    records = [json.loads(raw) for raw in raw_lines]
    abstract_data = {record['paper_id']: record for record in records}

    with open(link+'data/abstracts-csfcube-preds.json', 'w') as formatted_abstract_file:
        json.dump(abstract_data, formatted_abstract_file)


In [49]:
# ReFormatAbstractData()

In [50]:
def SplitDataset(data, tr_v_te_ratio=(0.40, 0.15, 0.45)):
    """Randomly split ``data`` into train / validation / test lists.

    Args:
        data: sequence of samples. It is NOT modified; a shuffled copy is
            split instead.
        tr_v_te_ratio: three non-negative weights (train, valid, test); they
            are normalised, so they need not sum to 1.

    Returns:
        ``(train_data, valid_data, test_data)``. Train and validation sizes
        are rounded down; the test split receives the remainder.
    """
    total = sum(tr_v_te_ratio)
    fractions = [part / total for part in tr_v_te_ratio]
    tr_s = int(fractions[0] * len(data))
    vl_s = int(fractions[1] * len(data))

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    shuffle(shuffled)

    train_data = shuffled[:tr_s]
    valid_data = shuffled[tr_s:(tr_s + vl_s)]
    test_data = shuffled[(tr_s + vl_s):]
    return train_data, valid_data, test_data


### TRAINING UTILITIES

In [51]:
# Standard deviation of the Gaussian kernel that MakeBatches passes to
# gaussian_filter1d to soften the one-hot score targets.
sigma = np.sqrt(2/np.pi)
from scipy.ndimage import gaussian_filter1d

In [52]:
def MakeBatches(dataset, batch_size=16):
    """Shuffle ``dataset`` in place and cut it into tensor mini-batches.

    Args:
        dataset: list of ``(vec1, vec2, score)`` triples; scores are integer
            class indices in ``0..3``.
        batch_size: maximum number of samples per batch (the last batch may
            be smaller).

    Returns:
        A list of ``(ins1, ins2, outs)`` float tensors. ``outs`` holds the
        one-hot targets after Gaussian smoothing along the class axis
        (module-level ``sigma``).
    """
    shuffle(dataset)
    n_values = 4  # number of score classes (0..3)
    batches = list()
    for start in range(0, len(dataset), batch_size):
        ins1, ins2, outs = zip(*dataset[start:start + batch_size])
        one_hot = np.eye(n_values)[np.array(outs)]
        soft_targets = gaussian_filter1d(one_hot, sigma)
        # Stack with NumPy first: torch.tensor() on a sequence of ndarrays
        # falls back to a very slow element-by-element conversion.
        batches.append((
            torch.tensor(np.asarray(ins1)).float(),
            torch.tensor(np.asarray(ins2)).float(),
            torch.tensor(soft_targets).float(),
        ))
    return batches


In [53]:
def Train(train_data, val_data, baseline, facet,
          epochs=5, lr=0.01, batch_size=16,
          early_stopping_tolerance=4,
          desirable_train_acc=0.95):
    """Train a ``QBESciAR`` model with KL-divergence loss and save it.

    Args:
        train_data, val_data: lists of ``(vec1, vec2, score)`` triples.
        baseline, facet: names used for the model and its checkpoint path.
        epochs: maximum number of epochs.
        lr: Adam learning rate.
        batch_size: mini-batch size for training.
        early_stopping_tolerance: stop after this many consecutive epochs
            with increasing validation loss.
        desirable_train_acc: stop as soon as the epoch training accuracy
            reaches this value.

    Returns:
        The trained model (in eval mode). Its history lists hold plain Python
        floats. The model is also saved to
        ``link + 'data/models-KLDivLoss/<baseline>/<facet>.qbe'``, the
        location every loader in this notebook reads from.
    """
    import os  # local import: only needed to create the checkpoint directory

    input_dim = len(train_data[0][0])
    MODEL = QBESciAR(baseline, facet, input_dim, max_score=3)

    loss_fn = nn.KLDivLoss(reduction="batchmean")
    optimizer = torch.optim.Adam(MODEL.parameters(), lr=lr)
    consec_inc = 0
    prev_valid_loss = float('inf')

    for epoch in range(epochs):
        total_loss = 0.0
        batches = MakeBatches(train_data, batch_size)
        epoch_start_time = time()
        correct_preds = 0
        total_samples = 0

        for step, (ins1, ins2, true) in enumerate(batches):
            pred = MODEL(ins1, ins2)
            pred_labels = torch.argmax(pred, dim=1)
            true_labels = torch.argmax(true, dim=1)

            loss = loss_fn(pred, true)

            # Record plain numbers: keeping the loss tensor would retain
            # autograd state in the history (which is pickled with the model
            # and later handed to matplotlib).
            step_loss = loss.item()
            step_correct = (pred_labels == true_labels).sum().item()
            step_size = true.shape[0]

            total_loss += step_loss
            correct_preds += step_correct
            total_samples += step_size

            MODEL.train_loss_step.append(step_loss)
            MODEL.train_acc_step.append(step_correct / step_size)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        mean_loss = total_loss / len(batches)
        acc = correct_preds / total_samples
        ti = time() - epoch_start_time

        val_acc, val_loss = AccLoss(val_data, loss_fn, MODEL)
        # AccLoss returns the accuracy as a 0-dim tensor.
        val_acc = float(val_acc)
        val_loss = float(val_loss)

        MODEL.train_acc_epoch.append(acc)
        MODEL.train_loss_epoch.append(mean_loss)
        MODEL.valid_acc.append(val_acc)
        MODEL.valid_loss.append(val_loss)

        logg = ' EPOCH {:3d} | T.LOSS : {:2.6f} | T.ACC : {:3f} | V.LOSS : {:.6f} | V.ACC : {:3f} | DUR : {:.4f}'
        print(logg.format(epoch+1, mean_loss, acc, val_loss, val_acc, ti))

        if (acc >= desirable_train_acc):
            print('\n [ EARLY STOPPING : DESIRABLE TRAINING ACCURACY REACHED ! ]')
            break

        if (prev_valid_loss < val_loss):
            consec_inc += 1
        else:
            consec_inc = 0
        prev_valid_loss = val_loss

        if (consec_inc == early_stopping_tolerance):
            print('\n [ EARLY STOPPING : VALIDATION LOSS NOT DECREASING ! ]')
            break

    MODEL.eval()
    # Save where the evaluation helpers load from (link + 'data/models-KLDivLoss/').
    model_path = link + 'data/models-KLDivLoss/' + baseline + '/' + facet + '.qbe'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(MODEL, model_path)
    return MODEL


### PLOT UTILITIES

In [54]:
def GetContingencyMatrix(true, pred, n_classes=4):
    """Count (predicted, true) label pairs.

    Args:
        true: sequence of true integer labels.
        pred: sequence of predicted integer labels, aligned with ``true``.
        n_classes: number of distinct labels (defaults to the 4 score
            classes used throughout this notebook).

    Returns:
        An ``(n_classes, n_classes)`` float array whose entry ``[p][t]`` is
        the number of samples with predicted label ``p`` and true label ``t``.
    """
    mat = np.zeros((n_classes, n_classes))
    for i, true_label in enumerate(true):
        mat[pred[i]][true_label] += 1
    return mat


In [55]:
def PlotAndSaveContingencyMatrix(true, pred, loc):
    """Draw the contingency matrix as a heatmap, save it to ``loc`` and show it.

    Args:
        true: sequence of true integer labels.
        pred: sequence of predicted integer labels.
        loc: output path handed to ``Figure.savefig``.
    """
    contingency_mat = GetContingencyMatrix(true, pred)
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than on whatever is "current".
    sns.heatmap(contingency_mat, ax=ax)
    ax.set_xlabel('TRUE SCORES', fontsize=13)
    ax.set_ylabel('PREDICTED SCORES', fontsize=13)
    # Save before showing: some backends release the figure on show().
    fig.savefig(loc)
    plt.show()
    plt.close(fig)


In [56]:
def PlotAccuracyProgress(train_step_hist, train_epoch_hist, valid_hist, loc):
    """Plot step-wise / epoch-wise training accuracy and validation accuracy.

    Args:
        train_step_hist: accuracy after every optimisation step.
        train_epoch_hist: training accuracy per epoch (must be non-empty).
        valid_hist: validation accuracy per epoch.
        loc: file name relative to ``link + 'data/plots-KLDivLoss/'`` (the
            same directory the contingency-matrix plot is written to).
    """
    # History entries may be 0-dim tensors; matplotlib needs plain numbers.
    train_step_hist = [float(v) for v in train_step_hist]
    train_epoch_hist = [float(v) for v in train_epoch_hist]
    valid_hist = [float(v) for v in valid_hist]

    steps = len(train_step_hist)
    epochs = len(train_epoch_hist)
    r = steps / epochs  # steps per epoch, to put both series on one x-axis

    xe = np.arange(epochs) * r
    xs = np.arange(steps)

    fontsize = 12
    # 'seaborn-whitegrid' was renamed 'seaborn-v0_8-whitegrid' in matplotlib
    # 3.6 and the old name was later removed; use whichever is installed.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    currentAxis = plt.gca()
    currentAxis.plot(xe, train_epoch_hist, linewidth=2.5)
    currentAxis.plot(xe, valid_hist, linewidth=1.8)
    currentAxis.plot(xs, train_step_hist, linewidth=1, color='C3', alpha=0.5)

    plt.legend(["TRAIN-SET (epoch-wise)", "VALID-SET (epoch-wise)",
                "TRAIN-SET (step-wise)"], loc="lower right")

    plt.xlabel('STEP / EPOCH', fontsize=fontsize)
    plt.ylabel('ACCURACY', fontsize=fontsize)
    plt.title('PROGRESS OF ACCURACY', fontsize=13, fontweight='bold')
    plt.savefig(link + 'data/plots-KLDivLoss/' + loc)
    plt.show()


In [57]:
def PlotLossProgress(train_step_hist, train_epoch_hist, valid_hist, loc):
    """Plot step-wise / epoch-wise training loss and validation loss.

    Args:
        train_step_hist: loss after every optimisation step.
        train_epoch_hist: mean training loss per epoch (must be non-empty).
        valid_hist: validation loss per epoch.
        loc: file name relative to ``link + 'data/plots-KLDivLoss/'`` (the
            same directory the contingency-matrix plot is written to).
    """
    # History entries may be 0-dim tensors; matplotlib needs plain numbers.
    train_step_hist = [float(v) for v in train_step_hist]
    train_epoch_hist = [float(v) for v in train_epoch_hist]
    valid_hist = [float(v) for v in valid_hist]

    steps = len(train_step_hist)
    epochs = len(train_epoch_hist)
    r = steps / epochs  # steps per epoch, to put both series on one x-axis

    xe = np.arange(epochs) * r
    xs = np.arange(steps)

    fontsize = 12
    # 'seaborn-whitegrid' was renamed 'seaborn-v0_8-whitegrid' in matplotlib
    # 3.6 and the old name was later removed; use whichever is installed.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    currentAxis = plt.gca()
    currentAxis.plot(xe, train_epoch_hist, linewidth=2.5)
    currentAxis.plot(xe, valid_hist, linewidth=1.8)
    currentAxis.plot(xs, train_step_hist, linewidth=1, color='C3', alpha=0.5)

    plt.legend(["TRAIN-SET (epoch-wise)", "VALID-SET (epoch-wise)",
                "TRAIN-SET (step-wise)"], loc="upper right")

    plt.xlabel('STEP / EPOCH', fontsize=fontsize)
    plt.ylabel('LOSS', fontsize=fontsize)
    plt.title('PROGRESS OF LOSS', fontsize=13, fontweight='bold')
    plt.savefig(link + 'data/plots-KLDivLoss/' + loc)
    plt.show()


### COGNITIVE EVALUATION

In [58]:
def GetQueryFeature(baseline, facet, paper_id):
    """Return the ``baseline`` feature vector of query ``paper_id`` for ``facet``.

    Raises:
        KeyError: if ``paper_id`` is not present in the query feature file.
    """
    query_feature_data_file_name = link+'data/' + baseline + '/' + facet + '.json'
    # `with` closes the file even when json.load raises.
    with open(query_feature_data_file_name) as query_feature_data_file:
        query_feature_data = json.load(query_feature_data_file)
    return query_feature_data[paper_id]


def GetCandidateFeatures(baseline, facet, paper_id):
    """Return the candidate ids of a query and their ``baseline`` features.

    Returns:
        ``(cand_pids, cand_features)``: the candidate paper ids listed for
        ``paper_id`` in the facet's annotation file, and the matching feature
        vectors in the same order.
    """
    rank_data_file_name = link+'data/test-pid2anns-csfcube-' + facet + '.json'
    cand_feature_data_file_name = link+'data/' + baseline + '/all.json'

    # `with` closes each file even when json.load raises.
    with open(rank_data_file_name) as rank_data_file:
        rank_data = json.load(rank_data_file)
    with open(cand_feature_data_file_name) as cand_feature_data_file:
        cand_feature_data = json.load(cand_feature_data_file)

    cand_pids = rank_data[paper_id]['cands']
    cand_features = [cand_feature_data[cand_pid] for cand_pid in cand_pids]
    return cand_pids, cand_features


In [59]:
def QBERetrieveSciArticles(baseline, facet, paper_id, top=True, ret_k=15):
    """Rank the candidates of a query paper and print the best / worst ones.

    Args:
        baseline, facet: select the feature files and the saved model.
        paper_id: id of the query paper.
        top: print the ``ret_k`` highest-ranked candidates when True,
            otherwise the ``ret_k`` lowest-ranked ones (worst first).
        ret_k: number of candidates to print.

    Every candidate gets a graded score built from its predicted label ``l``
    and the model's probability ``p`` for that label:
    ``l + p[l]`` for ``l != 0`` and ``1 - p[0]`` for ``l == 0``.
    """
    qf = GetQueryFeature(baseline, facet, paper_id)
    cand_pids, cand_f = GetCandidateFeatures(baseline, facet, paper_id)
    cand_c = len(cand_f)

    model_name = baseline + '/' + facet + '.qbe'
    model = torch.load(link+'data/models-KLDivLoss/' + model_name)
    model.eval()

    ins1, ins2 = [qf] * cand_c, cand_f
    ins1, ins2 = torch.tensor(ins1).float(), torch.tensor(ins2).float()

    with torch.no_grad():  # inference only
        log_probs = model(ins1, ins2)
    pred_labels = torch.argmax(log_probs, dim=1).numpy()
    # The score head ends in LogSoftmax, so the model output is a
    # LOG-probability. Exponentiate so the confidence term lies in [0, 1];
    # with raw log-probabilities a label-0 candidate would score >= 1 and
    # outrank higher labels.
    probs = log_probs.exp().numpy()
    rank_scores = [
        (1 - probs[i][0]) if l == 0 else (l + probs[i][l])
        for i, l in enumerate(pred_labels)
    ]

    scored_cands = list(zip(cand_pids, rank_scores))
    scored_cands.sort(key=lambda x: -1 * x[1])

    with open(link+'data/abstracts-csfcube-preds.json') as abstract_data_file:
        abstract_data = json.load(abstract_data_file)

    if top:
        results = scored_cands[:ret_k]
    else:
        results = scored_cands[-ret_k:]
        results.reverse()

    print('\n >>> BASELINE : ' + baseline + ' <<<')
    print(' >>> FACET : ' + facet + ' <<<')

    print('\n +++ QUERY PAPER +++\n')
    print(' [ TITLE ] ' + abstract_data[paper_id]['title'])
    print(' [ ABSTRACT ] ')
    for sent in abstract_data[paper_id]['abstract']:
        print('', sent)

    if top:
        print('\n +++ TOP {} PAPERS +++\n'.format(ret_k))
    else:
        print('\n +++ BOTTOM {} PAPERS +++\n'.format(ret_k))

    for rank, (pid, score) in enumerate(results):
        print(' RANK : {} | PAPER I.D. : {} | GRADED RELEVANCE SCORE : {:.4f}'.format(
            rank+1, pid, score))
        print(' [ TITLE ] ' + abstract_data[pid]['title'])
        print(' [ ABSTRACT ] ')
        for sent in abstract_data[pid]['abstract']:
            print('', sent)
        print('\n')


### ANALYTICAL EVALUATION

In [60]:
def AccLoss(data, loss_fn, MODEL):
    """Compute accuracy and mean batch loss of ``MODEL`` on ``data``.

    Args:
        data: list of ``(vec1, vec2, score)`` triples; it is shuffled in
            place by ``MakeBatches``.
        loss_fn: loss taking ``(log-probabilities, target distribution)``.
        MODEL: callable model returning per-class scores.

    Returns:
        ``(acc, mean_loss)``: ``acc`` is a 0-dim tensor (callers use
        ``acc.item()``), ``mean_loss`` a Python float averaged over batches.
    """
    batches = MakeBatches(data)
    total_loss = 0
    correct_preds = 0
    total_samples = 0

    # Pure evaluation: skip autograd bookkeeping.
    with torch.no_grad():
        for ins1, ins2, true in batches:
            pred = MODEL(ins1, ins2)
            pred_labels = torch.argmax(pred, dim=1)
            true_labels = torch.argmax(true, dim=1)

            loss = loss_fn(pred, true)

            total_loss += loss.item()
            correct_preds += (pred_labels == true_labels).sum()
            total_samples += true.shape[0]

    mean_loss = total_loss / len(batches)
    acc = correct_preds / total_samples
    return acc, mean_loss


def Pred(data, MODEL):
    """Predict a label for every sample, one sample at a time.

    Args:
        data: iterable of ``(vec1, vec2, score)`` triples; the vectors may be
            lists or NumPy arrays. The score is ignored.
        MODEL: callable taking two ``(1, dim)`` float tensors and returning a
            ``(1, classes)`` tensor of scores.

    Returns:
        A list with the arg-max class index (Python int) of every sample.
    """
    preds = []

    with torch.no_grad():  # inference only
        for in1, in2, __ in data:
            # np.asarray lets plain lists through as well as ndarrays.
            in1 = torch.tensor(np.asarray(in1)).float().reshape((1, -1))
            in2 = torch.tensor(np.asarray(in2)).float().reshape((1, -1))
            pred = MODEL(in1, in2)
            preds.append(torch.argmax(pred, dim=1).item())

    return preds


In [61]:
def GetMSE(x, y):
    """Return the mean squared difference between two equally-shaped sequences."""
    diff = np.array(x) - np.array(y)
    return np.mean(diff ** 2)


In [62]:
def DCG(rank_scores, K=100):
    """Discounted cumulative gain over the top ``K`` percent of a ranking.

    The first item is counted undiscounted; the item at 0-based index ``i``
    (``i >= 1``) is divided by ``log2(i + 1)``. ``rank_scores`` must be
    non-empty.
    """
    cutoff = int(len(rank_scores) * (K/100))
    total = rank_scores[0]
    position = 1
    while position < cutoff:
        total += rank_scores[position] / math.log2(position + 1)
        position += 1
    return total


def NDCG(pred_scores, ideal_scores, K=100):
    """Normalised DCG: ``DCG(pred_scores) / DCG(ideal_scores)`` at ``K`` percent.

    Returns 0.0 when the ideal DCG is zero (no gain is achievable), instead
    of raising ZeroDivisionError.
    """
    ideal_dcg = DCG(ideal_scores, K)
    if ideal_dcg == 0:
        return 0.0
    return DCG(pred_scores, K) / ideal_dcg


def AllNDCG(baseline, facet, K=100):
    """Compute the NDCG (at ``K`` percent) of every query of a facet.

    Candidates of each query are ranked by the label predicted by the saved
    ``baseline``/``facet`` model (stable sort, so ties keep the order of the
    annotation file) and compared against the ideal ordering of the true
    relevance labels.

    Returns:
        A NumPy array with one NDCG value per query.
    """
    data_dir = link + 'data/'
    with open(data_dir + 'test-pid2anns-csfcube-' + facet + '.json') as rank_data_file:
        rank_data = json.load(rank_data_file)
    # Feature files are read once here instead of once per query.
    with open(data_dir + baseline + '/' + facet + '.json') as query_feature_file:
        query_features = json.load(query_feature_file)
    with open(data_dir + baseline + '/all.json') as cand_feature_file:
        cand_features = json.load(cand_feature_file)

    model = torch.load(data_dir + 'models-KLDivLoss/' + baseline + '/' + facet + '.qbe')
    model.eval()

    all_ndcg = []
    for pid, annotations in rank_data.items():
        true_labels = annotations['relevance_adju']
        cand_f = [cand_features[cand_pid] for cand_pid in annotations['cands']]

        ins1 = torch.tensor([query_features[pid]] * len(cand_f)).float()
        ins2 = torch.tensor(cand_f).float()

        with torch.no_grad():  # inference only
            pred = model(ins1, ins2)
        pred_labels = torch.argmax(pred, dim=1).tolist()

        ranked = sorted(zip(pred_labels, true_labels), key=lambda pair: -pair[0])
        pred_ranking_scores = tuple(true_label for _, true_label in ranked)
        # sorted() leaves the loaded annotation list untouched.
        ideal_ranking_scores = sorted(true_labels, reverse=True)

        all_ndcg.append(NDCG(pred_ranking_scores, ideal_ranking_scores, K))

    return np.array(all_ndcg)


In [63]:
def PrecisionAtK(pred_ranking_scores, K):
    """Fraction of the first ``K`` ranked items whose relevance exceeds 1.

    When fewer than ``K`` items exist, the denominator is the number of
    items actually available.
    """
    head = pred_ranking_scores[:K]
    hits = sum(1 for rel in head if rel > 1)
    return hits / len(head)


def MeanPrecisionAtK(baseline, facet, K=20, need_list=False):
    """Precision@K of the saved model, averaged over all queries of a facet.

    Candidates of each query are ranked by predicted label (stable sort, so
    ties keep the order of the annotation file).

    Args:
        baseline, facet: select the feature files and the saved model.
        K: cut-off rank.
        need_list: return the per-query values instead of their mean.
    """
    data_dir = link + 'data/'
    with open(data_dir + 'test-pid2anns-csfcube-' + facet + '.json') as rank_data_file:
        rank_data = json.load(rank_data_file)
    # Feature files are read once here instead of once per query.
    with open(data_dir + baseline + '/' + facet + '.json') as query_feature_file:
        query_features = json.load(query_feature_file)
    with open(data_dir + baseline + '/all.json') as cand_feature_file:
        cand_features = json.load(cand_feature_file)

    model = torch.load(data_dir + 'models-KLDivLoss/' + baseline + '/' + facet + '.qbe')
    model.eval()

    values = []
    for pid, annotations in rank_data.items():
        true_labels = annotations['relevance_adju']
        cand_f = [cand_features[cand_pid] for cand_pid in annotations['cands']]

        ins1 = torch.tensor([query_features[pid]] * len(cand_f)).float()
        ins2 = torch.tensor(cand_f).float()

        with torch.no_grad():  # inference only
            pred = model(ins1, ins2)
        pred_labels = torch.argmax(pred, dim=1).tolist()

        ranked = sorted(zip(pred_labels, true_labels), key=lambda pair: -pair[0])
        pred_ranking_scores = tuple(true_label for _, true_label in ranked)

        values.append(PrecisionAtK(pred_ranking_scores, K))

    if (not need_list):
        return sum(values) / len(values)
    return values


In [64]:
def RecallAtK(pred_ranking_scores, K):
    """Share of all relevant items (relevance > 1) found in the first ``K``.

    Returns 0.0 when the ranking contains no relevant item at all, instead
    of raising ZeroDivisionError.
    """
    tot = sum(1 for rel in pred_ranking_scores if rel > 1)
    if tot == 0:
        return 0.0
    found = sum(1 for rel in pred_ranking_scores[:K] if rel > 1)
    return found / tot


def MeanRecallAtK(baseline, facet, K=20, need_list=False):
    """Recall@K of the saved model, averaged over all queries of a facet.

    Candidates of each query are ranked by predicted label (stable sort, so
    ties keep the order of the annotation file).

    Args:
        baseline, facet: select the feature files and the saved model.
        K: cut-off rank.
        need_list: return the per-query values instead of their mean.
    """
    data_dir = link + 'data/'
    with open(data_dir + 'test-pid2anns-csfcube-' + facet + '.json') as rank_data_file:
        rank_data = json.load(rank_data_file)
    # Feature files are read once here instead of once per query.
    with open(data_dir + baseline + '/' + facet + '.json') as query_feature_file:
        query_features = json.load(query_feature_file)
    with open(data_dir + baseline + '/all.json') as cand_feature_file:
        cand_features = json.load(cand_feature_file)

    model = torch.load(data_dir + 'models-KLDivLoss/' + baseline + '/' + facet + '.qbe')
    model.eval()

    values = []
    for pid, annotations in rank_data.items():
        true_labels = annotations['relevance_adju']
        cand_f = [cand_features[cand_pid] for cand_pid in annotations['cands']]

        ins1 = torch.tensor([query_features[pid]] * len(cand_f)).float()
        ins2 = torch.tensor(cand_f).float()

        with torch.no_grad():  # inference only
            pred = model(ins1, ins2)
        pred_labels = torch.argmax(pred, dim=1).tolist()

        ranked = sorted(zip(pred_labels, true_labels), key=lambda pair: -pair[0])
        pred_ranking_scores = tuple(true_label for _, true_label in ranked)

        values.append(RecallAtK(pred_ranking_scores, K))

    if (not need_list):
        return sum(values) / len(values)
    return values


In [65]:
def AveragePrecision(pred_ranking_scores):
    """Average precision of a ranking given as true relevance grades.

    An item counts as relevant when its grade exceeds 1 — the same criterion
    PrecisionAtK, RecallAtK and RPrecision use. (Skipping only grade 1 would
    wrongly treat grade-0 items as relevant.)

    Returns:
        The mean of precision@rank over the ranks holding a relevant item,
        or 0.0 when the ranking contains no relevant item.
    """
    precs = []
    count = 0
    for it, rel in enumerate(pred_ranking_scores):
        if rel <= 1:
            continue
        count += 1
        precs.append(count / (it + 1))
    if not precs:
        return 0.0
    return sum(precs) / len(precs)


def MeanAveragePrecision(baseline, facet, need_list=False):
    """Average precision of the saved model, averaged over a facet's queries.

    Candidates of each query are ranked by predicted label (stable sort, so
    ties keep the order of the annotation file).

    Args:
        baseline, facet: select the feature files and the saved model.
        need_list: return the per-query values instead of their mean.
    """
    data_dir = link + 'data/'
    with open(data_dir + 'test-pid2anns-csfcube-' + facet + '.json') as rank_data_file:
        rank_data = json.load(rank_data_file)
    # Feature files are read once here instead of once per query.
    with open(data_dir + baseline + '/' + facet + '.json') as query_feature_file:
        query_features = json.load(query_feature_file)
    with open(data_dir + baseline + '/all.json') as cand_feature_file:
        cand_features = json.load(cand_feature_file)

    model = torch.load(data_dir + 'models-KLDivLoss/' + baseline + '/' + facet + '.qbe')
    model.eval()

    values = []
    for pid, annotations in rank_data.items():
        true_labels = annotations['relevance_adju']
        cand_f = [cand_features[cand_pid] for cand_pid in annotations['cands']]

        ins1 = torch.tensor([query_features[pid]] * len(cand_f)).float()
        ins2 = torch.tensor(cand_f).float()

        with torch.no_grad():  # inference only
            pred = model(ins1, ins2)
        pred_labels = torch.argmax(pred, dim=1).tolist()

        ranked = sorted(zip(pred_labels, true_labels), key=lambda pair: -pair[0])
        pred_ranking_scores = tuple(true_label for _, true_label in ranked)

        values.append(AveragePrecision(pred_ranking_scores))

    if (not need_list):
        return sum(values) / len(values)
    return values


In [66]:
def RPrecision(pred_ranking_scores):
    """Precision at rank R, where R is the number of relevant items (> 1).

    Returns 0.0 when the ranking contains no relevant item, instead of
    raising ZeroDivisionError.
    """
    tot = sum(1 for rel in pred_ranking_scores if rel > 1)
    if tot == 0:
        return 0.0
    found = sum(1 for rel in pred_ranking_scores[:tot] if rel > 1)
    return found / tot


def MeanRPrecision(baseline, facet, need_list=False):
    """R-precision of the saved model, averaged over all queries of a facet.

    Candidates of each query are ranked by predicted label (stable sort, so
    ties keep the order of the annotation file).

    Args:
        baseline, facet: select the feature files and the saved model.
        need_list: return the per-query values instead of their mean.
    """
    data_dir = link + 'data/'
    with open(data_dir + 'test-pid2anns-csfcube-' + facet + '.json') as rank_data_file:
        rank_data = json.load(rank_data_file)
    # Feature files are read once here instead of once per query.
    with open(data_dir + baseline + '/' + facet + '.json') as query_feature_file:
        query_features = json.load(query_feature_file)
    with open(data_dir + baseline + '/all.json') as cand_feature_file:
        cand_features = json.load(cand_feature_file)

    model = torch.load(data_dir + 'models-KLDivLoss/' + baseline + '/' + facet + '.qbe')
    model.eval()

    values = []
    for pid, annotations in rank_data.items():
        true_labels = annotations['relevance_adju']
        cand_f = [cand_features[cand_pid] for cand_pid in annotations['cands']]

        ins1 = torch.tensor([query_features[pid]] * len(cand_f)).float()
        ins2 = torch.tensor(cand_f).float()

        with torch.no_grad():  # inference only
            pred = model(ins1, ins2)
        pred_labels = torch.argmax(pred, dim=1).tolist()

        ranked = sorted(zip(pred_labels, true_labels), key=lambda pair: -pair[0])
        pred_ranking_scores = tuple(true_label for _, true_label in ranked)

        values.append(RPrecision(pred_ranking_scores))

    if (not need_list):
        return sum(values) / len(values)
    return values


### AGGREGATED EVALUATION

In [67]:
def AggregatedNDCG(baseline, K=100):
    """Pool the per-query NDCG values of all three facets into one array."""
    per_facet = [AllNDCG(baseline, facet, K)
                 for facet in ('background', 'result', 'method')]
    # The leading empty list mirrors the original accumulator's start value.
    return np.concatenate([[]] + per_facet)


In [68]:
def AggregatedMeanPrecisionAtK(baseline, K=20):
    """Mean Precision@K over the queries of all three facets pooled together."""
    pooled = [value
              for facet in ('background', 'result', 'method')
              for value in MeanPrecisionAtK(baseline, facet, K, True)]
    return sum(pooled) / len(pooled)


def AggregatedMeanRecallAtK(baseline, K=20):
    """Mean Recall@K over the queries of all three facets pooled together."""
    pooled = [value
              for facet in ('background', 'result', 'method')
              for value in MeanRecallAtK(baseline, facet, K, True)]
    return sum(pooled) / len(pooled)


def AggregatedMeanRPrecision(baseline):
    """Mean R-precision over the queries of all three facets pooled together."""
    pooled = [value
              for facet in ('background', 'result', 'method')
              for value in MeanRPrecision(baseline, facet, True)]
    return sum(pooled) / len(pooled)


def AggregatedMeanAveragePrecision(baseline):
    """Mean average precision over the queries of all three facets pooled together."""
    pooled = [value
              for facet in ('background', 'result', 'method')
              for value in MeanAveragePrecision(baseline, facet, True)]
    return sum(pooled) / len(pooled)


In [69]:
def EvaluateOnAllQueries(baseline):
    """Print the aggregated (all-facet) ranking metrics of one baseline.

    Reports mean and standard deviation of NDCG at 100% and 20%, followed by
    Precision@20, Recall@20, R-precision and average precision, all as
    percentages.
    """
    ndcg_full = AggregatedNDCG(baseline)
    print(' > (AGG) MEAN NDCG (@100%) : {:.2f} %'.format(100 * ndcg_full.mean()))
    print(' > (AGG) STD-DEV in NDCG (@100%) : {:.2f} %'.format(100 * ndcg_full.std()))

    ndcg_top20 = AggregatedNDCG(baseline, K=20)
    print(' > (AGG) MEAN NDCG (@20%) : {:.2f} %'.format(100 * ndcg_top20.mean()))
    print(' > (AGG) STD-DEV in NDCG (@20%) : {:.2f} %'.format(100 * ndcg_top20.std()))

    # Each metric is computed lazily, right before its line is printed.
    scalar_metrics = (
        (' > (AGG) MEAN PRECISION@20 : {:.2f} %',
         lambda: AggregatedMeanPrecisionAtK(baseline, K=20)),
        (' > (AGG) MEAN RECALL@20 : {:.2f} %',
         lambda: AggregatedMeanRecallAtK(baseline, K=20)),
        (' > (AGG) MEAN R PRECISION : {:.2f} %',
         lambda: AggregatedMeanRPrecision(baseline)),
        (' > (AGG) MEAN AVG. PRECISION : {:.2f} %',
         lambda: AggregatedMeanAveragePrecision(baseline)),
    )
    for template, compute in scalar_metrics:
        print(template.format(100 * compute()))


### TESTING & EVALUATION UTILITIES

In [70]:
def EvaluateOnTestData01(test_data, model, baseline, facet):
    """Print the contingency matrix of ``model`` on ``test_data`` and plot it.

    The heatmap is saved under ``link + 'data/plots-KLDivLoss/<baseline>/'``
    as ``<facet>_conting_mat``.
    """
    predicted = Pred(test_data, model)
    # Third column of the (vec1, vec2, score) triples.
    actual = list(tuple(zip(*test_data))[2])

    print('\n +++ CONTINGENCY MATRIX +++')
    print(GetContingencyMatrix(actual, predicted))

    plot_dir = link+'data/plots-KLDivLoss/' + baseline + '/'
    PlotAndSaveContingencyMatrix(actual, predicted, plot_dir + facet + '_conting_mat')


In [71]:
def EvaluateOnTestData02(test_data, model):
    """Print KL-divergence loss, accuracy and label MSE of ``model`` on ``test_data``."""
    # Predictions and labels are collected before AccLoss runs, because
    # AccLoss reshuffles test_data in place.
    predicted = Pred(test_data, model)
    actual = list(tuple(zip(*test_data))[2])

    accuracy, mean_loss = AccLoss(
        test_data, nn.KLDivLoss(reduction="batchmean"), model)

    print(' > LOSS : {:.4f}'.format(mean_loss))
    print(' > ACCURACY : {:.4f}'.format(accuracy.item()))
    print(' > MEAN SQUARED ERR : {:.4f}'.format(GetMSE(predicted, actual)))


In [72]:
def EvaluateOnAllFacetQueries(baseline, facet):
    """Print Precision@20, Recall@20, R-precision and MAP (in %) for one facet."""
    # Each metric is computed lazily, right before its line is printed.
    metrics = (
        (' > MEAN PRECISION@20 : {:.2f} %',
         lambda: MeanPrecisionAtK(baseline, facet, K=20)),
        (' > MEAN RECALL@20 : {:.2f} %',
         lambda: MeanRecallAtK(baseline, facet, K=20)),
        (' > MEAN R PRECISION : {:.2f} %',
         lambda: MeanRPrecision(baseline, facet)),
        (' > MEAN AVG. PRECISION : {:.2f} %',
         lambda: MeanAveragePrecision(baseline, facet)),
    )
    for template, compute in metrics:
        print(template.format(100 * compute()))


### IMPLEMENTATION (INDIVIDUAL)

In [73]:
def Implement(baseline, facet, train=False):
    """Run the full pipeline for one baseline / facet pair.

    Builds and splits the dataset, trains a model (``train=True``) or loads
    the saved one, then plots the training history and prints test-set and
    ranking metrics.
    """
    train_data, valid_data, test_data = SplitDataset(PrepareDataset(baseline, facet))

    print('\n > BASELINE :', baseline)
    print(' > FACET :', facet)
    print('\n', end='')

    if train:
        model = Train(train_data, valid_data, baseline, facet, lr=0.001,
                      epochs=300, batch_size=16, early_stopping_tolerance=4)
    else:
        model_name = baseline + '/' + facet + '.qbe'
        model = torch.load(link+'data/models-KLDivLoss/' + model_name)

    print('\n', model.eval())

    plot_prefix = baseline + '/' + facet
    PlotAccuracyProgress(model.train_acc_step, model.train_acc_epoch,
                         model.valid_acc, plot_prefix + '_acc')
    PlotLossProgress(model.train_loss_step, model.train_loss_epoch,
                     model.valid_loss, plot_prefix + '_loss')

    print('\n', end='')
    EvaluateOnTestData01(test_data, model, baseline, facet)

    print('\n', end='')
    EvaluateOnTestData02(test_data, model)

    print('\n', end='')
    EvaluateOnAllFacetQueries(baseline, facet)

    ndcg_full = AllNDCG(baseline, facet)
    print('\n > MEAN NDCG (@100%) : {:5f}'.format(ndcg_full.mean()))
    print(' > (STD) DEV in NDCG (@100%) : {:5f}'.format(ndcg_full.std()))

    ndcg_top20 = AllNDCG(baseline, facet, K=20)
    print('\n > MEAN NDCG (@20%) : {:5f}'.format(ndcg_top20.mean()))
    print(' > (STD) DEV in NDCG (@20%) : {:5f}'.format(ndcg_top20.std()))


### IMPLEMENTATION (ENSEMBLE)

In [74]:
def EnsembleNDCG(baselines, K=100):
    """Pool per-query NDCG values, using ``baselines[i]`` for the i-th facet.

    Facet order is background, result, method.
    """
    facets = ('background', 'result', 'method')
    per_facet = [AllNDCG(baselines[t], facet, K) for t, facet in enumerate(facets)]
    # The leading empty float array mirrors the original accumulator's start value.
    return np.concatenate([np.array([])] + per_facet)


In [75]:
def EnsembleMeanPrecisionAtK(baselines, K=20):
    """Mean Precision@K over all queries, using ``baselines[i]`` for the i-th facet."""
    facets = ('background', 'result', 'method')
    pooled = [value
              for t, facet in enumerate(facets)
              for value in MeanPrecisionAtK(baselines[t], facet, K, True)]
    return sum(pooled) / len(pooled)


def EnsembleMeanRecallAtK(baselines, K=20):
    """Mean Recall@K over all queries, using ``baselines[i]`` for the i-th facet."""
    facets = ('background', 'result', 'method')
    pooled = [value
              for t, facet in enumerate(facets)
              for value in MeanRecallAtK(baselines[t], facet, K, True)]
    return sum(pooled) / len(pooled)


def EnsembleMeanRPrecision(baselines):
    """Mean R-precision over all queries, using ``baselines[i]`` for the i-th facet."""
    facets = ('background', 'result', 'method')
    pooled = [value
              for t, facet in enumerate(facets)
              for value in MeanRPrecision(baselines[t], facet, True)]
    return sum(pooled) / len(pooled)


def EnsembleMeanAveragePrecision(baselines):
    """Mean average precision over all queries, using ``baselines[i]`` for the i-th facet."""
    facets = ('background', 'result', 'method')
    pooled = [value
              for t, facet in enumerate(facets)
              for value in MeanAveragePrecision(baselines[t], facet, True)]
    return sum(pooled) / len(pooled)


In [76]:
def EnsembleEvaluateOnAllQueries(baselines):
    """Print the aggregated ranking metrics of a per-facet ensemble.

    ``baselines`` names the baseline to use for the background, result and
    method facets, in that order.
    """
    ndcg_full = EnsembleNDCG(baselines)
    print(' > (AGG) MEAN NDCG (@100%) : {:.2f} %'.format(100*ndcg_full.mean()))
    print(' > (AGG) STD-DEV in NDCG (@100%) : {:.2f} %'.format(100*ndcg_full.std()))

    ndcg_top20 = EnsembleNDCG(baselines, K=20)
    print(' > (AGG) MEAN NDCG (@20%) : {:.2f} %'.format(100*ndcg_top20.mean()))
    print(' > (AGG) STD-DEV in NDCG (@20%) : {:.2f} %'.format(100*ndcg_top20.std()))

    # Each metric is computed lazily, right before its line is printed.
    scalar_metrics = (
        (' > (AGG) MEAN PRECISION@20 : {:.2f} %',
         lambda: EnsembleMeanPrecisionAtK(baselines, K=20)),
        (' > (AGG) MEAN RECALL@20 : {:.2f} %',
         lambda: EnsembleMeanRecallAtK(baselines, K=20)),
        (' > (AGG) MEAN R PRECISION : {:.2f} %',
         lambda: EnsembleMeanRPrecision(baselines)),
        (' > (AGG) MEAN AVG. PRECISION : {:.2f} %',
         lambda: EnsembleMeanAveragePrecision(baselines)),
    )
    for template, compute in scalar_metrics:
        print(template.format(100*compute()))


## BERT-based NLI Baseline (bert_nli)

### [ BACKGROUND ]

In [None]:
baseline = 'bert_nli'
facet = 'background'
Implement(baseline, facet, train=True)


 > BASELINE : bert_nli
 > FACET : background

 EPOCH   1 | T.LOSS : 2.038194 | T.ACC : 0.313120 | V.LOSS : 1.428711 | V.ACC : 0.275272 | DUR : 1.4452
 EPOCH   2 | T.LOSS : 0.906542 | T.ACC : 0.357434 | V.LOSS : 0.819086 | V.ACC : 0.360809 | DUR : 1.5986
 EPOCH   3 | T.LOSS : 0.690758 | T.ACC : 0.400000 | V.LOSS : 0.645771 | V.ACC : 0.376361 | DUR : 5.2059
 EPOCH   4 | T.LOSS : 0.543093 | T.ACC : 0.437318 | V.LOSS : 0.617651 | V.ACC : 0.335925 | DUR : 1.2289


In [None]:
QBERetrieveSciArticles(baseline, facet, '10695055')


### [ METHOD ]

In [None]:
baseline = 'bert_nli'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1198964')


### [ RESULTS ]

In [None]:
baseline = 'bert_nli'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10052042')


### [ AGGREGATED ]

In [None]:
baseline = 'bert_nli'
EvaluateOnAllQueries(baseline)


## BERT-based PP Baseline (bert_pp)

### [ BACKGROUND ]

In [None]:
baseline = 'bert_pp'
facet = 'background'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10695055')


### [ METHOD ]

In [None]:
baseline = 'bert_pp'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1198964')


### [ RESULTS ]

In [None]:
baseline = 'bert_pp'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10052042')


### [ AGGREGATED ]

In [None]:
baseline = 'bert_pp'
EvaluateOnAllQueries(baseline)


## SciBERT Cased Baseline (scibert_cased)

### [ BACKGROUND ]

In [None]:
baseline = 'scibert_cased'
facet = 'background'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10695055')


### [ METHOD ]

In [None]:
baseline = 'scibert_cased'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1198964')


### [ RESULTS ]

In [None]:
baseline = 'scibert_cased'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10052042')


### [ AGGREGATED ]

In [None]:
baseline = 'scibert_cased'
EvaluateOnAllQueries(baseline)


## SciBERT Uncased Baseline (scibert_uncased)

### [ BACKGROUND ]

In [None]:
baseline = 'scibert_uncased'
facet = 'background'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10695055')


### [ METHOD ]

In [None]:
baseline = 'scibert_uncased'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1198964')


### [ RESULTS ]

In [None]:
baseline = 'scibert_uncased'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10052042')


### [ AGGREGATED ]

In [None]:
baseline = 'scibert_uncased'
EvaluateOnAllQueries(baseline)


## SPECTER Baseline (specter)

### [ BACKGROUND ]

In [None]:
# Train and evaluate the SPECTER baseline on the background facet.
baseline = 'specter'
facet = 'background'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10695055')


### [ METHOD ]

In [None]:
baseline = 'specter'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10010426')


### [ RESULTS ]

In [None]:
baseline = 'specter'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10052042')


### [ AGGREGATED ]

In [None]:
baseline = 'specter'
EvaluateOnAllQueries(baseline)


## Supervised SimCSE Baseline (susimcse)

### [ BACKGROUND ]

In [None]:
baseline = 'susimcse'
facet = 'background'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10695055')


### [ METHOD ]

In [None]:
baseline = 'susimcse'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1198964')


### [ RESULTS ]

In [None]:
baseline = 'susimcse'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '10052042')


### [ AGGREGATED ]

In [None]:
baseline = 'susimcse'
EvaluateOnAllQueries(baseline)


## UnSimCSE Baseline (unsimcse)

### [ BACKGROUND ]

In [None]:
baseline = 'unsimcse'
facet = 'background'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1791179')


### [ METHOD ]

In [None]:
baseline = 'unsimcse'
facet = 'method'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '1198964')


### [ RESULTS ]

In [None]:
baseline = 'unsimcse'
facet = 'result'
Implement(baseline, facet, train=True)


In [None]:
QBERetrieveSciArticles(baseline, facet, '5052952')


### [ AGGREGATED ]

In [None]:
baseline = 'unsimcse'
EvaluateOnAllQueries(baseline)


## ENSEMBLE CONSTRUCTION

###### The following table lists the NDCG values in % (computed at 100% of the ranked candidates unless stated otherwise) for every baseline–facet pair, and for the aggregated pool of queries.

| <i>FACET / BASELINE</i> | BERT_NLI | BERT_PP | SCIBERT_CASED | SCIBERT_UNCASED | SPECTER | SUSIMCSE | UNSIMCSE |
| :- | :-: | :-: | :-: | :-: | :-: | :-: | :-: |
| <b>BACKGROUND</b> | 75.017 | 68.497 | 85.603 | 88.070 | 86.388 | 87.703 | 88.799 |
| <b>METHOD</b> | 70.511 | 80.637 | 82.161 | 81.937 | 84.748 | 78.283 | 63.308 |
| <b>RESULTS</b> | 81.576 | 67.120 | 84.506 | 83.280 | 68.023 | 76.383 | 89.558 |
| <b>AGG@100%</b> | 88.280 | 86.020 | 89.250 | 87.680 | 90.820 | 84.840 | 85.530 |
| <b>AGG@20%</b> | 77.870 | 74.530 | 80.220 | 75.780 | 82.000 | 72.890 | 73.680 |

In [None]:
EnsembleEvaluateOnAllQueries(['unsimcse', 'unsimcse', 'specter'])