For ease of running multiple experiments, the code has been extracted from the Python script and put into a Jupyter notebook.

Below are all settings for running the experiments. Please uncomment the four arguments (embedding, ranklib_output, score_output, embedding_type) that correspond to the experiment you want to run.

Make sure to run all cells. The last two cells print the command-line calls that run the RankLib scripts and, afterwards, the evaluation.


In [None]:
import gensim
import pandas as pd
from scipy import spatial
import numpy as np
import json
import os


class args:
    """Experiment configuration, read everywhere as class attributes.

    Exactly one group of the four settings ``embedding``, ``ranklib_output``,
    ``score_output`` and ``embedding_type`` should be active (uncommented) at
    a time; the remaining groups are kept as ready-to-use alternatives.
    """

    # --- Wikipedia2Vec 2019 ---
    # embedding = '../src/WKN-vectors/WKN-vectors.bin'
    # ranklib_output = '../w2vconcepts'
    # score_output = '../w2vconcepts.txt'
    # embedding_type = 'wikipedia2vec_old'

    # --- Wikipedia2Vec 2015 ---
    # (note: score_output has no '../' prefix here, unlike the other groups)
    # embedding = '../src/wikipedia2vec_trained_20151002/wikipedia-20151002_trained_500'
    # ranklib_output = '../w2v2015concepts'
    # score_output = 'w2v2015concepts.txt'
    # embedding_type = 'wikipedia2vec'

    # --- rdf2vec ---
    # embedding = '../src/walks_1306/model.kv'
    # ranklib_output = '../rdf2vec1306concepts'
    # score_output = '../rdf2vec1306concepts.txt'
    # embedding_type = "rdf2vec"

    # --- rdf2vec + pagelinks ---
    # embedding = '../src/walks_1708/model.kv'
    # ranklib_output = '../rdf2vec1708concepts'
    # score_output = '../rdf2vec1708concepts.txt'
    # embedding_type = "rdf2vec"

    # --- complex (active) ---
    embedding = '../src/20220511-231012-dbpedia_small-complex_gpu/checkpoint_best.pt'
    ranklib_output = '../complex2vecredconcepts'
    score_output = '../complex2vecredconcepts.txt'
    embedding_type = 'complex'

    # --- complex + pagelinks ---
    # embedding = '../src/20220818-151937-dbpedia_pagelinks-complex_gpu/checkpoint_best.pt'
    # ranklib_output ='../complex2vecpagelinksconcepts'
    # score_output = '../complex2vecpagelinksconcepts.txt'
    # embedding_type = 'complex'

    # Entity-linker output to use; pick one of these or another file from /Data.
    # linker = "../Data/geeer_ready.csv"
    # linker = "../Data/geeer_annotated_radboud.csv"
    linker = "../Data/geeer_annotated_ELQ.csv"
    # linker = "../Data/geeer_annotated_total.csv"

    # Run file whose entities are re-ranked.
    dbpedia_input = "../src/DBpedia-Entity/runs/v2/bm25f-ca_v2.run"

    def __init__(self, i_var):
        # Instances only carry `i_var`; the settings above live on the class.
        self.i_var = i_var

# Root of the DBpedia-Entity checkout; the qrels, queries and fold files are
# read from its collection/v2 subfolder further down.
path_to_dbpedia = "../src/DBpedia-Entity"

In [None]:
# Report, for each required input file, whether it is present on disk.
# Nothing is raised here; a missing file only produces a warning message.
if os.path.exists(args.embedding):
    print("Found Embeddings")
else:
    print("Embedding file does not exist! Make sure you have unzipped the file in /src")

if os.path.exists(args.dbpedia_input):
    print("Found Dbpedia")
else:
    print("Dbpedia file does not exist! Make sure you have unzipped the file in /src")

if os.path.exists(args.linker):
    print("Found Linker")
else:
    print("Linker file does not exist!")



In [None]:
def entity_lookup(tag, model, tagme = False):
    """Return the embedding of an entity, or ``[]`` when none is found.

    The identifier is first rewritten into the vocabulary format of the
    configured ``args.embedding_type`` and then looked up in ``model``.

    Args:
        tag: Entity identifier. With ``tagme=True`` it is a linker tag using
            the ``ENTITY/`` prefix; otherwise it is resolved through
            ``redirect_dict`` or rewritten by ``entity_converter``.
        model: The loaded embedding model matching ``args.embedding_type``.
        tagme: Whether ``tag`` comes from the entity linker.

    Returns:
        Whatever the model returns for the entity, or an empty list when the
        entity is unknown or the embedding type is not recognised.
    """
    # `backup` is read by the 'complex' branch below on every code path, but it
    # used to be assigned only in the final `else` of this chain, so a miss for
    # a linker tag or a redirected tag raised UnboundLocalError instead of
    # returning [].
    backup = ''

    if tagme and args.embedding_type == 'rdf2vec':
        tag = tag.replace('ENTITY/', 'http://dbpedia.org/resource/')
    elif tagme and args.embedding_type == 'complex':
        tag = tag.replace('ENTITY/', '<http://dbpedia.org/resource/') + '>'
    elif tagme and args.embedding_type == 'wikipedia2vec':
        tag = tag.replace('ENTITY/', '').replace('_', ' ')
    elif tagme and args.embedding_type == 'wikipedia2vec_old':
        pass  # linker tags are used unchanged for this embedding type
    elif tag in redirect_dict:
        tag = redirect_dict[tag]
    else:
        # NOTE(review): this branch is only reached when `tag` is not a key of
        # redirect_dict, so for a plain dict this lookup always misses and
        # `backup` stays '' — confirm what the fallback was meant to be.
        try:
            backup = entity_converter(redirect_dict[tag])
        except KeyError:
            backup = ''
        tag = entity_converter(tag)

    if args.embedding_type == 'wikipedia2vec':
        if model.get_entity(tag):
            return model.get_entity_vector(tag)
        return []

    if args.embedding_type in ('rdf2vec', 'wikipedia2vec_old'):
        if tag in model.key_to_index:
            return model[tag]
        return []

    if args.embedding_type == 'complex':
        # Try the primary identifier first, then the redirect-based fallback.
        for candidate in (tag, backup):
            if candidate in entity_ids_dict:
                ent_index = entity_ids_dict[candidate]
                return model.get_s_embedder().embed(torch.Tensor([ent_index]).long())
        return []

    return []


In [None]:
def entity_converter(word, reverse = False, nospace = True):
    """Rewrite a ``<dbpedia:Name>`` identifier for the active embedding type.

    Args:
        word: Entity string in DBpedia format, e.g. ``<dbpedia:Amsterdam>``.
        reverse: Only consulted when ``args.embedding_type`` is none of the
            four known types; if true, the bare entity name is returned.
        nospace: With ``reverse``, keep underscores (True) or turn them into
            spaces (False).

    Returns:
        The converted identifier string.
    """
    kind = args.embedding_type

    if kind == 'rdf2vec':
        return word.replace("<dbpedia:", "http://dbpedia.org/resource/").replace(">", "")

    if kind == 'complex':
        # Only the prefix is rewritten; the angle brackets are left in place.
        return word.replace("dbpedia:", "http://dbpedia.org/resource/")

    if kind == 'wikipedia2vec':
        return word.replace("<dbpedia:", "").replace(">", "").replace("_", " ")

    if kind == 'wikipedia2vec_old':
        return word.replace("<dbpedia:", "ENTITY/").replace(">", "")

    # Unknown embedding type: fall back to the generic conversions.
    if not reverse:
        return word.replace("<dbpedia:", "ENTITY/").replace(">", "")

    bare_name = word.replace("<dbpedia:", "").replace(">", "")
    if nospace:
        return bare_name
    return bare_name.replace("_", " ")

In [None]:
# For the 'complex' embedding type, similarity is computed with torch:
# `cos` is the cosine similarity along dim 1 that ranking_feature calls.
# For every other embedding type neither torch nor `cos` is set up here.
if args.embedding_type == 'complex':
    import torch
    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
def ranking_feature(query, entity, dist = "cosine", conf = True):
    """Score a candidate entity against the entities linked to a query.

    Args:
        query: Query id; its linked entities are read from ``confidence``.
        entity: Candidate entity identifier to score.
        dist: ``'fjaccard'`` selects ``1 - sum(min) / sum(max)``; any other
            value uses cosine similarity.
        conf: If true, each linked entity's contribution is multiplied by its
            linker confidence.

    Returns:
        The sum of the per-linked-entity scores; ``0`` when the query has no
        linked entities or no embedding is available.
    """
    if query not in confidence:
        # if no linked queries, return 0
        return 0

    # Decide the measure once. The original rebound `dist` to the computed
    # value inside the loop, so 'fjaccard' silently fell back to cosine for
    # every linked entity after the first one.
    use_fjaccard = dist == 'fjaccard'

    # The candidate's embedding does not depend on the linked entity, so it is
    # looked up once instead of once per row.
    ent1 = entity_lookup(entity, model)
    if len(ent1) == 0:
        return 0

    total = 0
    for row in confidence[query]:
        ent2 = entity_lookup(row['tag'], model, tagme = True)
        if len(ent2) == 0:
            continue

        if use_fjaccard:
            # NOTE(review): this value is a distance (1 - ratio) while the
            # other branches yield a similarity — confirm that is intended.
            pair_score = 1-np.minimum(ent1,ent2).sum()/np.maximum(ent1,ent2).sum()
        elif args.embedding_type == 'complex':
            pair_score = cos(ent1, ent2).detach().numpy()[0]
        else:
            pair_score = 1 - spatial.distance.cosine(ent1, ent2)

        if conf:
            total += row['confidence']*pair_score
        else:
            total += pair_score
    return total

In [None]:
def to_print_format(queries, filepath):
    """Write the rows of ``new_df`` for the given queries in RankLib format.

    One line is written per row:
    ``<rel> qid:<query_id> 1:<embedding_score> 2:<fsdm_score> # <tag>``.

    Args:
        queries: Iterable of query ids whose rows should be written.
        filepath: Destination file; it is overwritten.

    NOTE(review): ``new_df`` is not defined anywhere in the visible notebook;
    it must exist as a global DataFrame before this function is called with a
    non-empty ``queries``.
    """
    # The context manager closes the file even if a row fails to format;
    # the original leaked the handle in that case.
    with open(filepath, "w") as f:
        for q in queries:
            entities = new_df.loc[(new_df['query_id'] == q)]
            for _, row in entities.iterrows():
                printstring = (
                    f"{int(row['rel'])} qid:{row['query_id']}"
                    f" 1:{row['embedding_score']} 2:{row['fsdm_score']}"
                    f" # {row['tag']}"
                )
                print(printstring, file = f)

In [None]:
def to_print_format_dict(queries, filepath):
    """Write the ``rerank`` entries for the given queries in RankLib format.

    One line is written per (query, entity) pair:
    ``<rel> qid:<query_id> 1:<score> 2:<fsdm_score> # <entity>``.

    Args:
        queries: Iterable of query ids; each must be a key of the global
            ``rerank`` dict.
        filepath: Destination file; it is overwritten.
    """
    # The context manager closes the file even if a query id is missing from
    # `rerank` or an entry lacks a field; the original leaked the handle then.
    with open(filepath, "w") as f:
        for q in queries:
            for ent, info in rerank[q].items():
                printstring = (
                    f"{int(info['rel'])} qid:{q}"
                    f" 1:{info['score']} 2:{info['fsdm_score']}"
                    f" # {ent}"
                )
                print(printstring, file = f)

In [None]:
print("Loading auxilary files")
# Relevance judgments: qrels[query_id][entity] -> relevance label (kept as a string).
# Lines are split on arbitrary whitespace, so tab- and space-separated files both load;
# columns 0, 2 and 3 are used, column 1 is ignored.
qrels_path = f"{path_to_dbpedia}/collection/v2/qrels-v2.txt"
qrels = {}
with open(qrels_path) as f:
    for line in f:
        terms = line.split()
        qrels.setdefault(terms[0], {})[terms[2]] = terms[3]



# Run file to re-rank: rerank[query_id][entity] -> {'fsdm_score', 'rel'}.
# Columns used per whitespace-separated line: 0 = query id, 2 = entity,
# 4 = retrieval score of the original run.
rerank_path = args.dbpedia_input
rerank = {}
with open(rerank_path) as f:
    for line in f:
        terms = line.split()
        # Unjudged entities get relevance 0. The chained .get() also covers a
        # query that has no judgments at all, which previously raised KeyError.
        rel = qrels.get(terms[0], {}).get(terms[2], 0)
        rerank.setdefault(terms[0], {})[terms[2]] = {'fsdm_score' : terms[4], 'rel' : rel}


# Loading linked entities: confidence[query_id] -> list of {'tag', 'confidence'}.
# Each row is split on commas; field 1 is the query id, field 2 the tag and
# field 3 the confidence (field 0 is not used).
# NOTE(review): quoted CSV fields are not unquoted by this parser — confirm
# the linker files never quote their tags.
confidence = {}
with open(args.linker) as f:
    f.readline()  # discard the first line (presumably the header row)
    for line in f:
        terms = line.strip().split(',')
        try:
            confidence.setdefault(terms[1], []).append(
                {'tag' : terms[2], 'confidence' : float(terms[3])}
            )
        except ValueError:
            # Field 3 was not a number: the tag itself contains commas, so the
            # confidence is the last field and the tag is everything between.
            conf = terms[-1]
            tag = ','.join(terms[2:-1])
            confidence.setdefault(terms[1], []).append(
                {'tag' : tag, 'confidence' : float(conf)}
            )


# Query texts: tab-separated file read into the columns query_id and query.
queries_path = f"{path_to_dbpedia}/collection/v2/queries-v2.txt"
queries = pd.read_csv(queries_path, sep='\t', names=['query_id', 'query'])

# Previously computed redirects, turned into a lookup dict in a later cell.
df = pd.read_csv('../Data/wikipedia_redirect.csv')

# Fold definitions used when writing the RankLib train/test files.
folds_path = f"{path_to_dbpedia}/collection/v2/folds/all_queries.json"
with open(folds_path) as read_file:
    data = json.load(read_file)

In [None]:
print("Loading embeddings")
# Load the embedding model that matches args.embedding_type.
if args.embedding_type == 'rdf2vec' or args.embedding_type == 'wikipedia2vec_old':
    # Gensim KeyedVectors, memory-mapped read-only.
    model = gensim.models.KeyedVectors.load(args.embedding, mmap='r')

elif args.embedding_type == 'wikipedia2vec':
    from wikipedia2vec import Wikipedia2Vec
    model = Wikipedia2Vec.load(args.embedding)

elif args.embedding_type == 'complex':
    import torch
    from kge import Dataset, Config
    from kge.model import KgeModel
    from kge.util.io import load_checkpoint
    checkpoint = load_checkpoint(args.embedding)

    # NOTE(review): the original compared ranklib_output to the literal
    # 'complex2vecpagelinksred', which none of the configurations in this
    # notebook produce, so the pagelinks dataset could never be selected.
    # Matching on the substring covers that literal as well as the listed
    # '../complex2vecpagelinksconcepts' — confirm this is the intended switch.
    if 'pagelinks' in args.ranklib_output:
        kge_folder = '../src/dbpedia_kge_pagelinks'
    else:
        kge_folder = '../src/dbpedia_kge_small'

    # `Dataset` is the imported class; the original called `dataset.create`
    # on a name that did not exist yet, which raised NameError.
    dataset = Dataset.create(checkpoint['config'], folder = kge_folder)

    model = KgeModel.create_from(checkpoint, dataset)
    entity_ids = model.dataset.entity_ids()
    # Entity id -> row index, so lookups avoid a linear search through the list.
    entity_ids_dict = {e: i for i, e in enumerate(entity_ids)}

    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

# original -> redirect lookup; built column-wise instead of with iterrows().
redirect_dict = dict(zip(df['original'], df['redirect']))



In [None]:
# Sanity check: look up one known entity end to end. If this raises, the
# model, the redirect table or one of the helper cells above was not set up.
# An empty list as the result means the entity was not found in the model.
entity_lookup(entity_converter('<dbpedia:Amsterdam>'), model)

In [None]:
# Scoring everything (might take a while)
# Every (query, entity) pair of the run is scored; the score is stored in
# rerank[query_id][tag]['score'] and also written to args.score_output as
# "<query_id> <tag> <score>" lines.
test_x = []

# The context manager flushes and closes the score file when the loop ends or
# fails; the original never closed it, so buffered lines could be lost.
with open(args.score_output, 'w') as f:
    print("Ranking entities:")
    for query_id in rerank.keys():
        print(query_id)
        for tag in rerank[query_id].keys():
            query_based_score = ranking_feature(query_id, tag, conf = True)
            test_x.append(query_based_score)
            rerank[query_id][tag]['score'] = query_based_score
            print(" ".join([query_id, tag, str(query_based_score)] ), file=f)



In [None]:

print("Saving files for Ranklib:")
# One sub-folder per fold (Fold1..Fold5), each holding test.txt and train.txt.
# The fold file is keyed by "0".."4" while the folder names are 1-based.
for i in range(5):
    query_fold = data[str(i)]
    folder = f"{args.ranklib_output}/Fold{i + 1}"

    if not os.path.exists(folder):
        os.makedirs(folder)

    testpath = f"{folder}/test.txt"
    print(f"Now writing test for fold {i + 1}")
    to_print_format_dict(query_fold['testing'], testpath)

    trainpath = f"{folder}/train.txt"
    print(f"Now writing train for fold {i + 1}")
    test_print = to_print_format_dict(query_fold['training'], trainpath)


In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# RankLib output folder that the commands printed below refer to.
args.ranklib_output

In [None]:
# Print (not execute) the shell commands for the RankLib train and score
# scripts, each taking the output folder as its argument.
print(f"bash Code/train_ranklib.sh {args.ranklib_output}")
print(f"bash Code/score_ranklib.sh {args.ranklib_output}")

In [None]:

# Print (not execute) the evaluation command for the resulting embed.run file.
# NOTE(review): the configured ranklib_output values already start with '../',
# so the printed path begins with '../../' — confirm that matches the folder
# eval_to_tex.sh is run from.
print(f"sh eval_to_tex.sh ../{args.ranklib_output}/embed.run")