# Better

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
from models.Ensemble import LogisticRegressionModel
import torch.nn as nn
import random

# Step up one directory unless the current working directory already ends in
# 'BachelorProject' (presumably the repository root the relative paths below
# are written against).
if not os.getcwd().endswith('BachelorProject'):
    os.chdir('../')

from utils.metrics import get_link_prediction_metrics
from tgb.linkproppred.evaluate import Evaluator


# Import data
folder_name = 'DTU_Test/Test_folder'
logits = {}   # file name -> saved logits
labels = {}   # file name -> saved labels
MRRs = {}     # model folder name -> array loaded from the 'all_val_metric' file


def _load_saved_tensors(path, flatten):
    """Load a saved logits/labels file onto the CPU.

    Args:
        path: Path of the file to read with ``torch.load``.
        flatten: When True (used for EdgeBank files) the loaded sequence is
            collapsed into a single 1-D tensor; otherwise the loaded object
            is returned untouched.

    Returns:
        The loaded object, or a flat ``torch.Tensor`` when ``flatten`` is set.
    """
    # NOTE: depending on the torch version / `weights_only` setting,
    # torch.load can unpickle arbitrary objects - only read trusted files.
    data = torch.load(path, map_location=torch.device('cpu'))
    if not flatten:
        return data
    if isinstance(data[0], torch.Tensor):
        return torch.stack(data).flatten()
    # Not a sequence of tensors: go through numpy to build one flat tensor.
    return torch.Tensor(np.array(data).flatten())


for model_folder in os.listdir(folder_name):
    # Entries whose name contains '_' are not treated as model folders.
    if '_' in model_folder:
        continue

    model_folder_path = f'{folder_name}/{model_folder}'
    # Skip stray files; os.listdir below would raise NotADirectoryError.
    if not os.path.isdir(model_folder_path):
        continue

    for data_name in os.listdir(model_folder_path):
        if data_name != 'tgbl-flight':
            continue

        data_path = f'{model_folder_path}/{data_name}'
        for run_name in os.listdir(data_path):
            # Names containing a '.' are skipped (presumably plain files
            # sitting next to the run folders).
            if '.' in run_name:
                continue

            run_path = f'{data_path}/{run_name}'
            for file_name in os.listdir(run_path):
                file_path = f'{run_path}/{file_name}'
                # OBS exception for EdgeBank: its files are flattened on load.
                is_edgebank = 'EdgeBank' in file_name

                # Results are keyed by file name (MRRs by model folder), so a
                # later run with the same file names overwrites an earlier one.
                if 'logits' in file_name:
                    logits[file_name] = _load_saved_tensors(file_path, flatten=is_edgebank)
                elif 'labels' in file_name:
                    labels[file_name] = _load_saved_tensors(file_path, flatten=is_edgebank)
                elif 'all_val_metric' in file_name:
                    MRRs[model_folder] = np.load(file_path)
# Find best epoch for each model: the index of the largest entry in the
# array loaded from that model's 'all_val_metric' file.
model_names = ['EdgeBank'] + list(MRRs.keys())
best_epochs = {model: scores.argmax() for model, scores in MRRs.items()}

# Training + test data (train: logits from best epoch)
training_data = {}
test_data = {}

for key, values in logits.items():
    # The model name is taken to be the part of the file name before the
    # first '_'; it must match the folder name used as key in `best_epochs`.
    model_name = key.split('_')[0]
    is_edgebank = model_name == 'EdgeBank'
    if 'train' in key:
        # EdgeBank logits were flattened at load time and are used as-is;
        # the other models are indexed by their best epoch.
        training_data[model_name] = values if is_edgebank else values[best_epochs[model_name]].detach().numpy()
    elif 'test' in key:
        test_data[model_name] = values if is_edgebank else values.detach().numpy()

# Add labels (taken from the first matching label file of each split)
for key, values in labels.items():
    is_edgebank = key.split('_')[0] == 'EdgeBank'
    if 'labels' not in training_data and 'train' in key:
        # EdgeBank train labels are already a flat 1-D tensor, so indexing
        # them with [0] would yield a single scalar; use them whole. For the
        # other models the first entry along the leading axis is taken.
        training_data['labels'] = values if is_edgebank else values[0]
    elif 'labels' not in test_data and 'test' in key:
        test_data['labels'] = values

train_df = pd.DataFrame(training_data)
test_df = pd.DataFrame(test_data)

In [None]:
# for key in training_data:
#     print(key, training_data[key].shape)
#     print(key, test_data[key].shape)

In [None]:
# Sanity check (already run): are the labels identical across files of the same split? YES
# for key1 in labels.keys():
#     for key2 in labels.keys():
#         if key1 == key2 or key1.split('_')[-1] != key2.split('_')[-1]:
#             continue

#         if not np.array_equal(labels[key1][np.random.randint(len(labels[key1]))], labels[key2][np.random.randint(len(labels[key2]))]):
#             print(f'Labels {key1} and {key2} are not equal')

In [None]:
# # Class imbalance
# for file in ['TGN_tgbl-wiki_labels_train.pth', 'TGN_tgbl-wiki_labels_test.pth']:
#     data = labels[file] if not 'train' in file else labels[file].T
#     print(data.shape)

#     print(sum(data == 1))
#     print(sum(data == 0))

In [None]:
# Set seeds. `random.seed` only covers Python's own generator; torch and
# numpy keep separate generators, so seed them too for reproducible runs.
SEED = 2024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if not os.getcwd().endswith('BachelorProject'):
    os.chdir('../')

from utils.metrics import get_link_prediction_metrics
from tgb.linkproppred.evaluate import Evaluator

DATASET_NAME = 'tgbl-flight'   # same dataset the result files were loaded for
TRAIN_BATCH_SIZE = 2000        # approximate rows per training batch
CANDIDATES_PER_QUERY = 101     # rows per test query; row 0 is scored as the positive
NUM_EPOCHS = 1

model_performances = {}

# Shared by every ensemble, so they are created once instead of per model.
loss_func = nn.BCEWithLogitsLoss()
evaluator = Evaluator(name=DATASET_NAME)

for model_name in model_names:
    if model_name == 'EdgeBank':
        continue

    # Two-input combiner: one logit from `model_name`, one from EdgeBank.
    combiner = LogisticRegressionModel(input_dim=2, output_dim=1, manual_init=False)
    optimizer = torch.optim.Adam(combiner.parameters(), lr=0.01, weight_decay=0.0)
    models = [model_name, 'EdgeBank']

    # Named `train_labels` so the global `labels` dict is not overwritten.
    train_labels = train_df['labels'].values
    combined_logits = train_df[models].values

    # np.array_split needs a positive section count; max(1, ...) keeps small
    # data sets (fewer rows than one batch) from raising.
    num_train_batches = max(1, len(combined_logits) // TRAIN_BATCH_SIZE)
    logit_batches = np.array_split(combined_logits, num_train_batches)
    labels_batches = np.array_split(train_labels, num_train_batches)

    weights = [combiner.get_weights()[1]]   # weights before any update
    losses = []
    combiner.train()

    for _ in range(NUM_EPOCHS):
        for batch, batch_labels in zip(logit_batches, labels_batches):
            batch = torch.Tensor(batch)
            batch_labels = torch.Tensor(batch_labels)
            output = combiner.forward(batch, return_logits=True).squeeze(1)
            loss = loss_func(output, batch_labels)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Record after the step, so the initial weights are not logged
            # twice and the final weights are included.
            weights.append(combiner.get_weights()[1])

    # Evaluation: rows are split into groups of CANDIDATES_PER_QUERY. This
    # only yields equal groups when the row count is a multiple of it.
    test_combined_logits = test_df[models].values
    num_queries = max(1, len(test_combined_logits) // CANDIDATES_PER_QUERY)
    test_batches = np.array_split(test_combined_logits, num_queries)
    test_label_batches = np.array_split(test_df['labels'].values, num_queries)

    mrrs = []
    pr_aucs = []
    roc_aucs = []

    combiner.eval()

    # No gradients are needed while scoring the test set.
    with torch.no_grad():
        for batch, query_labels in zip(test_batches, test_label_batches):
            batch = torch.Tensor(batch)
            predicts = combiner.forward(batch, return_logits=False).squeeze(1)
            # First row of each group is the positive edge, the rest negatives.
            mrrs.append(evaluator.eval({'y_pred_pos': predicts[0], 'y_pred_neg': predicts[1:], 'eval_metric': ['mrr']})['mrr'])
            test_perf = get_link_prediction_metrics(predicts, torch.Tensor(query_labels))
            pr_aucs.append(test_perf['pr_auc'])
            roc_aucs.append(test_perf['roc_auc'])

    mrr = np.mean(np.array(mrrs))
    pr_auc = np.mean(np.array(pr_aucs))
    roc_auc = np.mean(np.array(roc_aucs))

    print('model:', models)
    print('mrr:', mrr)
    print('pr-auc:', pr_auc)
    print('roc-auc:', roc_auc)

    # Mean loss per epoch (one value per full pass over the batches).
    batches_per_epoch = len(logit_batches)
    mean_losses = [
        np.mean(losses[start:start + batches_per_epoch])
        for start in range(0, len(losses), batches_per_epoch)
    ]

    # print('mean loss:', mean_losses)

    xticks = np.arange(0, len(losses), batches_per_epoch)

    # Subplots of weights and losses
    plt.figure(figsize=(15, 5))

    # Plot for losses
    plt.subplot(1, 2, 1)
    plt.plot(np.array(losses), label='Losses')
    # Marker added so a single-epoch mean (one point) is still visible.
    plt.plot(xticks, np.array(mean_losses), marker='o', label='Mean loss per epoch')
    plt.xlabel('Batches')
    plt.ylabel('Loss')
    plt.title('Loss for ensemble of ' + model_name + ' and EdgeBank')
    plt.legend()

    # Plot for weights
    plt.subplot(1, 2, 2)
    plt.plot(np.array(weights), label='Weights')
    plt.legend(combiner.get_weights()[0])
    plt.xlabel('Batches')
    plt.ylabel('Weights')
    plt.title('Weights for ensemble of ' + model_name + ' and EdgeBank')

    plt.show()

    # Stored in the order MRR, PR-AUC, ROC-AUC.
    model_performances['EdgeBank_' + model_name] = [mrr, pr_auc, roc_auc]

In [None]:
# Summary table: one column per key in `model_performances`, rows labelled
# with the metric names.
metric_rows = ['MRR', 'PR-AUC', 'ROC-AUC']
pd.DataFrame(model_performances, index=metric_rows)