# Final Project - Evaluation Run and Reporting

## Installing packages (if running on colab)

**IMPORTANT**: Uncomment and run the cell below, restart the runtime (menu Runtime > Restart runtime, or Ctrl + M .), and then run the cell once more. If you skip the restart, you will get errors. You only need to repeat this when your Google Colab / Kaggle instance has been restarted or lost.

In [None]:
# Environment setup, disabled by being wrapped in a string literal: remove the
# surrounding triple quotes to actually run it. The lines starting with `!`
# are IPython shell escapes, so the content only works inside a notebook cell.
'''
! pip install --upgrade scipy
! pip install --upgrade pandas
! pip install ipywidgets
! pip uninstall -y pykeen
! pip install git+https://github.com/pykeen/pykeen.git@v1.5.0
! python -c "import pykeen" || pip install git+https://github.com/pykeen/pykeen.git@v1.5.0
from pkg_resources import require
require('pykeen')
'''

After you install the packages above, you can just run from this cell onwards (Ctrl + F10 when this is selected)

If `SAVE_TO_DRIVE` is set to `True`, the following cell mounts your Google Drive so that the evaluation result tables are also saved to your Drive account. Authentication is required, and this only works on Colab instances (as far as we know).

In [1]:
# Opt-in switch: set to True to mount Google Drive at /content/drive.
SAVE_TO_DRIVE = False

if SAVE_TO_DRIVE:
    # The import sits inside the branch so that google.colab is only needed
    # when the switch is turned on.
    from google.colab import drive

    drive.mount('/content/drive')


## Imports and parameters

In [2]:
import gc
import json
import os.path

import pandas as pd
import requests
import torch
from pykeen.pipeline import pipeline
from pykeen.pipeline import pipeline_from_config
from pykeen.triples import TriplesFactory

In [3]:
# --- Where the triple files live --------------------------------------------
# Default: a local checkout. The per-split paths stored in `dataset_urls` are
# appended to this prefix when the data is read.
BASE_DATA_URL = './data'

# Alternative: read the data straight from the GitHub repository.
#BASE_DATA_URL = 'https://raw.githubusercontent.com/hvags/effective-octo-eureka/main/data'


# --- Where the HPO results live ---------------------------------------------
# Default: a local folder.
HPO_URL_BASE = './hpo_results'

# Alternative: read the HPO results straight from the GitHub repository.
#HPO_URL_BASE = 'https://raw.githubusercontent.com/hvags/effective-octo-eureka/main/hpo_results'

# Path of the configuration file below each <dataset>/<model> folder.
HPO_URL_TAIL = 'best_pipeline/pipeline_config.json'

# Device passed to the PyKEEN pipeline.
# NOTE(review): despite its name, this constant currently requests the GPU.
CPU_DEV = 'gpu'

In [4]:
# Registry filled by the cells below: dataset name -> paths of its split files.
dataset_urls = {}

## wn18rr datasets

In [None]:
# Full WN18RR: split files, given relative to BASE_DATA_URL.
dataset_urls['wn18rr-full'] = dict(
    train='/wn18rr/train_wn18rr.txt',
    validate='/wn18rr/valid_wn18rr.txt',
    test='/wn18rr/test_wn18rr.txt',
)


In [None]:
# 'sym' variant of WN18RR (presumably the symmetric-relation subset; confirm
# against the data preparation step). Paths are relative to BASE_DATA_URL.
dataset_urls['wn18rr-sym'] = dict(
    train='/wn18rr/sym_train_wn18rr.txt',
    validate='/wn18rr/sym_valid_wn18rr.txt',
    test='/wn18rr/sym_test_wn18rr.txt',
)


In [None]:
# 'asym' variant of WN18RR (presumably the asymmetric-relation subset; confirm
# against the data preparation step). Paths are relative to BASE_DATA_URL.
dataset_urls['wn18rr-asym'] = dict(
    train='/wn18rr/asym_train_wn18rr.txt',
    validate='/wn18rr/asym_valid_wn18rr.txt',
    test='/wn18rr/asym_test_wn18rr.txt',
)

In [None]:
# Show the entries registered so far as the cell output.
dataset_urls

## fb15k-237 datasets

In [None]:
# Full FB15k-237: split files, given relative to BASE_DATA_URL.
dataset_urls['fb15k-237-full'] = dict(
    train='/fb15k-237/train_fb15k-237.txt',
    validate='/fb15k-237/valid_fb15k-237.txt',
    test='/fb15k-237/test_fb15k-237.txt',
)

In [None]:
# 'sym' variant of FB15k-237 (presumably the symmetric-relation subset; confirm
# against the data preparation step). Paths are relative to BASE_DATA_URL.
dataset_urls['fb15k-237-sym'] = dict(
    train='/fb15k-237/sym_train_fb15k-237.txt',
    validate='/fb15k-237/sym_valid_fb15k-237.txt',
    test='/fb15k-237/sym_test_fb15k-237.txt',
)

In [5]:
# 'asym' variant of FB15k-237 (presumably the asymmetric-relation subset;
# confirm against the data preparation step). Paths are relative to
# BASE_DATA_URL.
dataset_urls['fb15k-237-asym'] = dict(
    train='/fb15k-237/asym_train_fb15k-237.txt',
    validate='/fb15k-237/asym_valid_fb15k-237.txt',
    test='/fb15k-237/asym_test_fb15k-237.txt',
)

## Read data from files and create TriplesFactories

In [6]:
def _read_triples(location):
    """Read a header-less, tab-separated triple file into an array of labels.

    Parameters
    ----------
    location : str
        Local path or URL of the file; each row holds head, relation and tail.

    Returns
    -------
    numpy.ndarray
        One row per triple with the columns head, relation, tail, all strings.
    """
    frame = pd.read_csv(
        location,
        header=None,
        sep='\t',
        names=['head', 'relation', 'tail'],
        # Identifiers must stay verbatim: without an explicit dtype pandas
        # would turn numeric-looking labels into integers (dropping leading
        # zeros) and labels such as "NA" or "null" into missing values.
        dtype=str,
        keep_default_na=False,
    )
    return frame.to_numpy()


# dataset name -> {'train' | 'validate' | 'test' -> TriplesFactory}
datasets = dict()

for key, split_paths in dataset_urls.items():
    print(f'Processing: {key}')

    training_factory = TriplesFactory.from_labeled_triples(
        _read_triples(BASE_DATA_URL + split_paths['train'])
    )

    # Validation and test triples are mapped with the id mappings of the
    # training split, so all three factories share one id space. PyKEEN drops
    # triples whose entities or relations do not occur in the training split
    # and reports how many were filtered out.
    shared_mappings = dict(
        entity_to_id=training_factory.entity_to_id,
        relation_to_id=training_factory.relation_to_id,
    )

    datasets[key] = {
        'train': training_factory,
        'validate': TriplesFactory.from_labeled_triples(
            _read_triples(BASE_DATA_URL + split_paths['validate']),
            **shared_mappings,
        ),
        'test': TriplesFactory.from_labeled_triples(
            _read_triples(BASE_DATA_URL + split_paths['test']),
            **shared_mappings,
        ),
    }
    print('\n')

Processing: fb15k-237-asym


You're trying to map triples with 11 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 11 from 17011 triples were filtered out
You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 19856 triples were filtered out






## Models to include in the evaluation

If doing a partial run, comment out models that should not be included

In [None]:
# Interaction models to evaluate, in the order they appear in the result
# tables. Comment out individual entries for a partial run.
models = [
    'TransE',
    'TransH',
    'TransD',
    'TransR',
    'RESCAL',
    'ComplEx',
    'RotatE',
]

In [None]:
import datetime
from pathlib import Path

def results_to_table(results):
    """Print one metrics table per dataset and append it to a text file.

    For every dataset in the global ``dataset_urls`` a table with one row per
    entry of the global ``models`` is built, printed, and appended to
    ``results/<dataset>_<timestamp>.txt``. When ``SAVE_TO_DRIVE`` is true the
    same text is also appended to ``/content/drive/MyDrive/results``.

    Parameters
    ----------
    results : dict
        Maps ``(dataset_name, model_name)`` to an object offering
        ``get_metric(name)``. Combinations without an entry are reported
        as ``NaN``.

    Returns
    -------
    None
    """
    # make sure we have a results folder
    Path('./results').mkdir(parents=True, exist_ok=True)

    # (metric key passed to get_metric, column header used in the table)
    metrics = [('both.realistic.mean_reciprocal_rank', 'MRR'),
               ('both.realistic.hits@1', 'Hits@1'),
               ('both.realistic.hits@10', 'Hits@10')]
    columns = ['model'] + [label for _, label in metrics]

    for dataset in dataset_urls.keys():
        print(dataset)

        # Collect plain rows and build the frame once: DataFrame.append was
        # removed in pandas 2.0, and growing a frame row by row is quadratic.
        rows = []
        for model_label in models:
            if (dataset, model_label) in results:
                outcome = results[(dataset, model_label)]
                values = [round(outcome.get_metric(key), 5) for key, _ in metrics]
            else:
                # No run for this combination: keep the row, mark it as missing.
                values = ['NaN'] * len(metrics)
            rows.append([model_label] + values)

        table = pd.DataFrame(rows, columns=columns)

        table_string = table.to_string(index=False)
        print(table_string)
        print('\n')
        resultstring = dataset + '\n' + table_string + '\n'

        # Minute-resolution timestamp; files are opened in append mode, so
        # repeated runs within the same minute end up in the same file.
        datestring = datetime.datetime.now().strftime('%Y-%m-%dT%H%M')

        output_dirs = [Path('results')]
        if SAVE_TO_DRIVE:
            drive_dir = Path('/content/drive/MyDrive/results')
            drive_dir.mkdir(parents=True, exist_ok=True)
            output_dirs.append(drive_dir)

        for output_dir in output_dirs:
            with open(output_dir / f'{dataset}_{datestring}.txt', 'a') as file:
                file.write(resultstring)

In [None]:
def _load_hpo_config(location):
    """Load the pipeline configuration stored at ``location``.

    Parameters
    ----------
    location : str
        Local file path, or a URL when it starts with ``http``.

    Returns
    -------
    dict or None
        The parsed JSON document, or ``None`` (after printing a note) when
        the file does not exist locally or the server answers with 404.

    Raises
    ------
    requests.HTTPError
        For HTTP error responses other than 404.
    """
    if not location.startswith('http'):
        if not os.path.exists(location):
            print(location, ' - file not found')
            return None
        with open(location) as file:
            return json.load(file)

    # Explicit timeout so an unresponsive server cannot stall the whole run.
    response = requests.get(location, timeout=30)
    if response.status_code == 404:
        print(location, response.text)
        return None
    # Any other failure is unexpected: raise instead of trying to parse the
    # error page as JSON.
    response.raise_for_status()
    return json.loads(response.text)


# (dataset name, model name) -> result returned by pipeline_from_config
results = dict()

for dataset_name in dataset_urls:
    for model in models:
        # Release memory held by the previous run before starting the next.
        gc.collect()
        torch.cuda.empty_cache()

        url = f'{HPO_URL_BASE}/{dataset_name}/{model}/{HPO_URL_TAIL}'
        config = _load_hpo_config(url)
        if config is None:
            # No configuration for this combination: leave it out of the
            # results.
            continue

        pipeline_config = config['pipeline']

        # The triples are passed explicitly as keyword arguments below, so
        # the split entries stored in the configuration are removed first.
        for split_key in ('testing', 'training', 'validation'):
            pipeline_config.pop(split_key, None)

        # NOTE(review): 'margin_activation' is dropped for the margin ranking
        # loss, presumably because the pinned PyKEEN version does not accept
        # it; confirm before changing the PyKEEN version.
        if pipeline_config.get('loss') == 'marginranking':
            loss_kwargs = pipeline_config.get('loss_kwargs') or {}
            loss_kwargs.pop('margin_activation', None)

        dataset = datasets[dataset_name]
        print(f'{dataset_name}, {model} loaded')
        results[dataset_name, model] = pipeline_from_config(
            config,
            device=CPU_DEV,
            testing=dataset['test'],
            training=dataset['train'],
            validation=dataset['validate'],
        )

In [None]:
# Print one metrics table per dataset and append each to the results folder.
results_to_table(results)