# Better

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import os
import torch
import matplotlib.pyplot as plt

# Step one directory up unless the working directory path already ends in
# 'BachelorProject' (presumably so the project-local packages and the relative
# data paths used below resolve -- confirm against the repository layout).
if not os.getcwd().endswith('BachelorProject'):
    os.chdir('../')

from utils.metrics import get_link_prediction_metrics
from tgb.linkproppred.evaluate import Evaluator


# Import data
folder_name = 'DTU_Test/Test_folder'
data_name = 'tgbl-flight'  # only runs stored under this dataset folder are loaded
logits = {}
labels = {}
MRRs = {}


def _load_saved_values(path, flatten):
    """Load a saved logits/labels file onto the CPU.

    Args:
        path: File to read with ``torch.load``.
        flatten: When true (used for EdgeBank files), collapse the loaded
            sequence into a single 1-D tensor; otherwise return the loaded
            object unchanged.

    Returns:
        The loaded object, or a flattened 1-D ``torch.Tensor``.
    """
    # NOTE(review): torch.load unpickles the file, so only point this at
    # result files produced by this project.
    data = torch.load(path, map_location=torch.device('cpu'))
    if not flatten:
        return data
    if isinstance(data[0], torch.Tensor):
        return torch.stack(data).flatten()
    return torch.Tensor(np.array(data).flatten())


for model_folder in os.listdir(folder_name):
    # Entries whose name contains an underscore are not model folders.
    if '_' in model_folder:
        continue

    data_dir = os.path.join(folder_name, model_folder, data_name)
    if not os.path.isdir(data_dir):
        # No results for this dataset (or the entry is not a directory).
        continue

    for run_name in os.listdir(data_dir):
        # Names containing a dot are skipped (presumably loose files).
        if '.' in run_name:
            continue

        run_dir = os.path.join(data_dir, run_name)
        for file_name in os.listdir(run_dir):
            file_path = os.path.join(run_dir, file_name)
            # OBS exception for EdgeBank: its files are flattened on load.
            is_edgebank = 'EdgeBank' in file_name

            if 'logits' in file_name:
                logits[file_name] = _load_saved_values(file_path, flatten=is_edgebank)
            elif 'labels' in file_name:
                labels[file_name] = _load_saved_values(file_path, flatten=is_edgebank)
            elif 'all_val_metric' in file_name:
                MRRs[model_folder] = np.load(file_path)

# Find best epoch for each model: the index of the largest entry in the
# array loaded from that model's 'all_val_metric' file.
best_epochs = {name: metric.argmax() for name, metric in MRRs.items()}

# EdgeBank has no entry in MRRs, so it is listed explicitly, first.
model_names = ['EdgeBank', *MRRs]

# Training + test data (train: logits from best epoch)
training_data = {}
test_data = {}

for key, scores in logits.items():
    # File names start with the model name, e.g. '<model>_..._train...'.
    model_name = key.split('_')[0]
    if 'train' in key:
        # EdgeBank logits were flattened on load; every other model stores
        # one entry per epoch, of which only the best epoch is kept.
        if model_name != 'EdgeBank':
            scores = scores[best_epochs[model_name]]
        training_data[model_name] = scores.detach().numpy()
    elif 'test' in key:
        test_data[model_name] = scores.detach().numpy()

# Add labels (taken from the first matching file for each split)
for key, values in labels.items():
    if 'labels' not in training_data and 'train' in key:
        # EdgeBank label files were flattened on load, so indexing [0] there
        # would pick a single scalar instead of the label vector. For the
        # other models [0] selects the first stored entry (presumably the
        # first epoch -- the commented-out check in this notebook reports the
        # labels as identical across files).
        training_data['labels'] = values if 'EdgeBank' in key else values[0]
    elif 'labels' not in test_data and 'test' in key:
        test_data['labels'] = values

train_df = pd.DataFrame(training_data)
test_df = pd.DataFrame(test_data)

In [None]:
# for key in training_data:
#     print(key, training_data[key].shape)
#     print(key, test_data[key].shape)

In [None]:
# Are labels the same? YES
# for key1 in labels.keys():
#     for key2 in labels.keys():
#         if key1 == key2 or key1.split('_')[-1] != key2.split('_')[-1]:
#             continue

#         if not np.array_equal(labels[key1][np.random.randint(len(labels[key1]))], labels[key2][np.random.randint(len(labels[key2]))]):
#             print(f'Labels {key1} and {key2} are not equal')

In [None]:
# # Class imbalance
# for file in ['TGN_tgbl-wiki_labels_train.pth', 'TGN_tgbl-wiki_labels_test.pth']:
#     data = labels[file] if not 'train' in file else labels[file].T
#     print(data.shape)

#     print(sum(data == 1))
#     print(sum(data == 0))

In [None]:
# Show the training table followed by the first rows of the test table.
print(train_df, test_df.head(), sep='\n')

In [None]:
# The scores were loaded from the 'tgbl-flight' runs, so the evaluator is
# created for that dataset as well.
evaluator = Evaluator(name='tgbl-flight')
model_performances = {}


def _evaluate_test_probs(probs, test_labels, evaluator, group_size=101):
    """Print and return ``[mrr, pr_auc, roc_auc]`` for test probabilities.

    Args:
        probs: Predicted probabilities for every test row.
        test_labels: Ground-truth labels aligned with ``probs``.
        evaluator: Object whose ``eval`` returns a dict with an ``'mrr'`` key.
        group_size: Number of consecutive rows forming one ranking query. The
            first row of each group is scored as the positive and the rest as
            negatives (assumes the test rows are stored in that order --
            TODO confirm against the code that wrote the logits).

    Returns:
        ``[mrr, pr_auc, roc_auc]`` with ``mrr`` averaged over all groups.

    Raises:
        ValueError: If ``probs`` is empty or its length is not a multiple of
            ``group_size``; grouping would otherwise be silently misaligned.
    """
    probs = np.asarray(probs, dtype=float)
    if probs.size == 0 or probs.size % group_size != 0:
        raise ValueError(
            f'Expected a non-empty multiple of {group_size} predictions, '
            f'got {probs.size}'
        )

    mrrs = []
    for group in probs.reshape(-1, group_size):
        # Eval metrics
        input_dict = {
            'y_pred_pos': np.array(group[0]),
            'y_pred_neg': np.array(group[1:]),
            'eval_metric': ['mrr'],
        }
        mrrs.append(evaluator.eval(input_dict)['mrr'])

    # PR-AUC / ROC-AUC are threshold-free, so they are computed from the
    # probabilities themselves; 0/1 predictions would reduce each curve to a
    # single operating point.
    # NOTE(review): assumes get_link_prediction_metrics accepts continuous
    # scores as its first argument -- confirm in utils.metrics.
    test_perf = get_link_prediction_metrics(
        torch.as_tensor(probs, dtype=torch.float32),
        torch.as_tensor(np.asarray(test_labels, dtype=np.float32)),
    )
    pr_auc = test_perf['pr_auc']
    roc_auc = test_perf['roc_auc']
    mrr = float(np.mean(np.array(mrrs)))

    print('mrr:', mrr)
    print('pr-auc:', pr_auc)
    print('roc-auc:', roc_auc)

    return [mrr, pr_auc, roc_auc]


for model_name in model_names:
    # Train: logistic regression on this model's score alone
    print(f'\n\n-------------------- Model {model_name} --------------------')
    print('Model alone')
    formula = "labels ~" + model_name
    result = sm.Logit.from_formula(formula, data=train_df).fit()
    print(result.conf_int())

    # Test
    probs = result.predict(test_df[[model_name]])
    model_performances[model_name] = _evaluate_test_probs(
        probs, test_df['labels'], evaluator
    )

    # Train together with EdgeBank (main effects plus their interaction)
    if model_name == 'EdgeBank':
        continue
    print(f'\n\n------------- {model_name} + EdgeBank -------------')
    formula = "labels ~ " + model_name + ' * EdgeBank'
    result = sm.Logit.from_formula(formula, data=train_df).fit()
    print(result.conf_int())

    # Test
    probs = result.predict(test_df[[model_name, 'EdgeBank']])
    model_performances['EdgeBank_' + model_name] = _evaluate_test_probs(
        probs, test_df['labels'], evaluator
    )

In [None]:
# One column per model; the rows follow the [mrr, pr_auc, roc_auc] order in
# which each performance list was built, so label them accordingly.
pd.DataFrame(model_performances, index=['mrr', 'pr_auc', 'roc_auc'])

In [None]:
# Save results: each entry of model_performances (a list of metric values,
# not a fitted model) is written to its own .pth file under 'frozen_models'.
output_dir = f'{folder_name}/frozen_models/'
os.makedirs(output_dir, exist_ok=True)
for name, metrics in model_performances.items():
    torch.save(metrics, f'{output_dir}{name}.pth')
    

In [None]:
# The scores were loaded from the 'tgbl-flight' runs, so the evaluator is
# created for that dataset as well.
evaluator = Evaluator(name='tgbl-flight')
model_performances = {}

# Drop TCL from model names
# model_names = list(train_df.columns.drop('labels'))
from sklearn.linear_model import LogisticRegressionCV

# Each ranking query spans this many consecutive test rows: the first row is
# scored as the positive, the remaining ones as negatives (assumes the test
# rows are stored in that order -- TODO confirm against the code that wrote
# the logits).
group_size = 101

for model_name in model_names:
    # Pair the model's score with the EdgeBank score. For EdgeBank itself the
    # column is used once: feeding the same column twice would only add a
    # perfectly collinear duplicate feature.
    if model_name == 'EdgeBank':
        feature_cols = ['EdgeBank']
    else:
        feature_cols = [model_name, 'EdgeBank']

    # Prepare the training data for the meta-model
    X_train_meta = train_df[feature_cols].values
    y_train_meta = train_df['labels'].values

    # Train the meta-model
    meta_model = LogisticRegressionCV(cv=10, max_iter=1000)
    meta_model.fit(X_train_meta, y_train_meta)

    # Prepare the testing data for the meta-model
    X_test_meta = test_df[feature_cols].values
    y_test_meta = test_df['labels'].values

    # Probability of the positive class for every test row
    probs = np.asarray(meta_model.predict_proba(X_test_meta)[:, 1], dtype=float)
    if probs.size == 0 or probs.size % group_size != 0:
        # Fail loudly: uneven groups would silently misalign positives and
        # negatives in the MRR computation below.
        raise ValueError(
            f'Expected a non-empty multiple of {group_size} predictions, '
            f'got {probs.size}'
        )

    # Score every group of `group_size` consecutive predictions
    mrrs = []
    for group in probs.reshape(-1, group_size):
        # Eval metrics
        input_dict = {
            'y_pred_pos': np.array(group[0]),
            'y_pred_neg': np.array(group[1:]),
            'eval_metric': ['mrr'],
        }
        mrrs.append(evaluator.eval(input_dict)['mrr'])

    # PR-AUC / ROC-AUC are threshold-free, so they are computed from the
    # probabilities themselves; 0/1 predictions would reduce each curve to a
    # single operating point.
    # NOTE(review): assumes get_link_prediction_metrics accepts continuous
    # scores as its first argument -- confirm in utils.metrics.
    test_perf = get_link_prediction_metrics(
        torch.as_tensor(probs, dtype=torch.float32),
        torch.as_tensor(np.asarray(y_test_meta, dtype=np.float32)),
    )
    pr_auc = test_perf['pr_auc']
    roc_auc = test_perf['roc_auc']
    mrr = float(np.mean(np.array(mrrs)))

    print('model:', model_name, 'EdgeBank')
    print('weights:', meta_model.coef_)
    print('mrr:', mrr)
    print('pr-auc:', pr_auc)
    print('roc-auc:', roc_auc)

    model_performances['EdgeBank_' + model_name] = [mrr, pr_auc, roc_auc]
    

In [None]:
# One column per model; the rows follow the [mrr, pr_auc, roc_auc] order in
# which each performance list was built, so label them accordingly.
pd.DataFrame(model_performances, index=['mrr', 'pr_auc', 'roc_auc'])