In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pykeen
import torch

import json

from pykeen.pipeline import pipeline

In [2]:
# nltk.download('wordnet')

In [3]:
# Name of the benchmark knowledge graph used throughout this notebook;
# it is handed to the PyKEEN dataset lookup in the next cell.
dataset = "FB15k"

In [4]:
# Resolve the benchmark by name through PyKEEN's dataset lookup.
ds = pykeen.datasets.get_dataset(dataset=dataset)

# Keep the ID-mapped triples of the training and testing splits at hand.
training, testing = ds.training.mapped_triples, ds.testing.mapped_triples

# Cell output: the relation ID -> relation label dictionary of the training split.
ds.training.relation_id_to_label

{0: '/american_football/football_coach/coaching_history./american_football/football_historical_coach_position/position',
 1: '/american_football/football_coach/coaching_history./american_football/football_historical_coach_position/team',
 2: '/american_football/football_coach_position/coaches_holding_this_position./american_football/football_historical_coach_position/coach',
 3: '/american_football/football_coach_position/coaches_holding_this_position./american_football/football_historical_coach_position/team',
 4: '/american_football/football_player/current_team./american_football/football_roster_position/position',
 5: '/american_football/football_player/current_team./american_football/football_roster_position/team',
 6: '/american_football/football_player/current_team./sports/sports_team_roster/team',
 7: '/american_football/football_player/former_teams./american_football/football_historical_roster_position/position_s',
 8: '/american_football/football_player/former_teams./american_

In [5]:
# Invert the training split's "entity index -> entity label" table so that an
# entity label (a Freebase ID such as '/m/010016') maps back to its integer index.
id2idx = {
    fb_id: entity_idx
    for entity_idx, fb_id in ds.training.entity_id_to_label.items()
}
id2idx

{'/m/010016': 0,
 '/m/0100mt': 1,
 '/m/0102t4': 2,
 '/m/0104lr': 3,
 '/m/0105y2': 4,
 '/m/0106dv': 5,
 '/m/0108xl': 6,
 '/m/0109vk': 7,
 '/m/010bnr': 8,
 '/m/010bxh': 9,
 '/m/010cw1': 10,
 '/m/010dft': 11,
 '/m/010h9y': 12,
 '/m/010hn': 13,
 '/m/010m55': 14,
 '/m/010nlt': 15,
 '/m/010p3': 16,
 '/m/010r6f': 17,
 '/m/010rvx': 18,
 '/m/010t4v': 19,
 '/m/010tkc': 20,
 '/m/010v8k': 21,
 '/m/010xjr': 22,
 '/m/010y34': 23,
 '/m/010z5n': 24,
 '/m/0113sg': 25,
 '/m/0114m0': 26,
 '/m/0118d3': 27,
 '/m/0118v': 28,
 '/m/011_3s': 29,
 '/m/011_6p': 30,
 '/m/011_vz': 31,
 '/m/011hdn': 32,
 '/m/011hq1': 33,
 '/m/011j5x': 34,
 '/m/011k11': 35,
 '/m/011k1h': 36,
 '/m/011k4g': 37,
 '/m/011k_j': 38,
 '/m/011kn2': 39,
 '/m/011lpr': 40,
 '/m/011lvx': 41,
 '/m/011pcj': 42,
 '/m/011s0': 43,
 '/m/011s9r': 44,
 '/m/011v3': 45,
 '/m/011vx3': 46,
 '/m/011w20': 47,
 '/m/011w4n': 48,
 '/m/011w54': 49,
 '/m/011wdm': 50,
 '/m/011wtv': 51,
 '/m/011x_4': 52,
 '/m/011xg5': 53,
 '/m/011xhx': 54,
 '/m/011xjd': 55,
 '/m/01

In [6]:
# Path of the JSON file whose keys are Freebase entity IDs and whose values
# carry (at least) a human-readable 'label' field, as used by the next cell.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
e2dta_loc = '/home/gebhart/projects/traversing_knowledge_graphs/data/freebase/entity2wikidata.json'

# JSON text is UTF-8 by specification; naming the encoding explicitly keeps the
# decoding independent of the locale's default (which differs across platforms).
with open(e2dta_loc, encoding='utf-8') as f_in:
    entity2wiki = json.load(f_in)

In [7]:
# Lookup table "normalised entity name -> Freebase ID".
# A name is normalised by lower-casing its label and replacing spaces with
# underscores.  When several entities share the same normalised name, the one
# that appears last in `entity2wiki` wins.
n2fbid = {
    record['label'].lower().replace(' ', '_'): fbid
    for fbid, record in entity2wiki.items()
}
n2fbid

{'denton': '/m/010016',
 'el_paso': '/m/0100mt',
 'marshall': '/m/0102t4',
 'beaumont': '/m/0104lr',
 'lubbock': '/m/0105y2',
 'waco': '/m/0106dv',
 'tyler': '/m/0108xl',
 'san_angelo': '/m/0109vk',
 'laredo': '/m/010bnr',
 'wichita_falls': '/m/010bxh',
 'hackensack,_new_jersey': '/m/010cw1',
 'logan': '/m/010dft',
 'ogden': '/m/010h9y',
 'amy_grant': '/m/010hn',
 'mclean': '/m/010m55',
 'monte_carlo': '/m/010nlt',
 'adam_carolla': '/m/010p3',
 'bellevue': '/m/010r6f',
 'bremerton': '/m/010rvx',
 'tacoma': '/m/010t4v',
 'everett': '/m/010tkc',
 'spokane': '/m/010v8k',
 'richard_harris': '/m/010xjr',
 'morgantown': '/m/010y34',
 'parkersburg': '/m/010z5n',
 'nikolai_gogol': '/m/0113sg',
 'la_crosse': '/m/0114m0',
 'racine': '/m/0118d3',
 'lily_tomlin': '/m/011_3s',
 'kazoo': '/m/011_6p',
 'faith_no_more': '/m/011_vz',
 'alice_cooper': '/m/011hdn',
 'sikkim': '/m/011hq1',
 'post-punk': '/m/011j5x',
 'decca_records': '/m/011k11',
 'emi': '/m/011k1h',
 'carl_stalling': '/m/011k4g',
 'timpa

In [8]:
# Lookup table "short relation name -> relation index".
# The short name is whatever follows the last '/' of the full relation label
# (the whole label if it contains no '/').  Several full labels can end in the
# same short name; in that case the relation visited last overwrites the others.
rname2idx = {
    full_label.rsplit('/', 1)[-1]: rel_id
    for rel_id, full_label in ds.training.relation_id_to_label.items()
}
rname2idx

{'position': 1325,
 'team': 1171,
 'coach': 25,
 'position_s': 1178,
 'season': 1194,
 'players': 1096,
 'player': 1326,
 'architectural_style': 44,
 'structures_designed': 32,
 'structure': 33,
 'architects': 34,
 'examples': 35,
 'building_function': 36,
 'buildings': 37,
 'citytown': 979,
 'state_province_region': 981,
 'type_of_museum': 40,
 'architect': 43,
 'currency': 1315,
 'owner': 765,
 'museums': 47,
 'category': 989,
 'objects': 49,
 'orbited_by': 50,
 'orbits': 51,
 'planetary_system': 52,
 'star_system': 53,
 'aircraft_models_made': 54,
 'aircraft_type': 55,
 'aircraft_owner': 56,
 'aircraft_model': 57,
 'aircraft_of_this_type': 58,
 'airport': 59,
 'cities_served': 67,
 'alliance': 61,
 'focus_cities': 62,
 'hubs': 63,
 'member_airlines': 64,
 'flight_destination': 65,
 'airline': 66,
 'focus_city_for': 68,
 'hub_for': 69,
 'serves': 70,
 'presented_by': 72,
 'category_of': 73,
 'disciplines_or_subjects': 74,
 'nominee': 75,
 'award_nominee': 90,
 'nominated_for': 91,
 '

In [9]:
# Parse the path-query file into three parallel lists.
# Each non-empty line is whitespace-separated:
#     <head entity name> <comma-separated relation path> <tail entity name> [label]
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
filename = '/home/gebhart/projects/traversing_knowledge_graphs/data/freebase/dev'
head_entities = []
tail_entities = []
relations = []
with open(filename, 'r') as f:
    num_examples = 0  # number of rows actually parsed
    for line in f:
        items = line.split()
        if len(items) < 3:
            # Blank or truncated line: nothing to unpack, skip it.
            continue
        # The optional 4th column (label) is not used anywhere in this
        # notebook, so rows are kept whether or not they carry one.
        # Previously the appends lived inside the "no label" branch, which
        # silently dropped every row that had an explicit label.
        s, path, t = items[:3]
        rels = tuple(path.split(','))
        head_entities.append(s)
        tail_entities.append(t)
        relations.append(rels)
        num_examples += 1

In [10]:
relations

[('religion',),
 ('gender',),
 ('nationality', '**nationality', 'profession', '**profession'),
 ('gender',),
 ('place_of_death', '**place_of_death', 'gender', '**gender', 'nationality'),
 ('**place_of_birth',
  'profession',
  '**profession',
  'nationality',
  '**place_of_death'),
 ('place_of_death', '**location'),
 ('gender', '**gender'),
 ('place_of_birth',
  '**place_of_birth',
  'profession',
  '**profession',
  'institution'),
 ('profession',),
 ('cause_of_death',),
 ('gender', '**gender', 'profession'),
 ('**place_of_death', 'place_of_birth', '**place_of_birth', 'location'),
 ('gender', '**gender', 'place_of_birth'),
 ('gender',),
 ('profession', '**children', '**parents'),
 ('nationality', '**place_of_death', 'gender', '**gender'),
 ('gender', '**gender', 'gender', '**gender'),
 ('gender', '**gender'),
 ('gender', '**gender'),
 ('**place_of_birth', 'institution', '**institution'),
 ('gender', '**gender'),
 ('location', '**place_of_death'),
 ('gender', '**gender', 'nationality',

In [11]:
head_entities

['ananda_maitreya',
 'enrique_de_villena',
 'simone_renant',
 'moses_harvey',
 'isaac_r_harrington',
 'ontario',
 'bridget_of_sweden',
 'saint_alban',
 'asa_p_blunt',
 'jr_mitchell',
 'donald_pleasence',
 'horst_bienek',
 'telavi',
 'norberto_romualdez',
 'werner_klemperer',
 'francisco_guerrero',
 'aleksander_wielopolski',
 'pablo_antonio',
 'ramon_sampedro',
 'ninette_de_valois',
 'vouziers',
 'jan_august_hendrik_leys',
 'john_rauch',
 'peter_iii_of_portugal',
 'david_r_brower',
 'mumbai',
 'totteridge',
 'catherine_of_siena',
 'angelique_pettyjohn',
 'unity_mitford',
 'teresa_de_la_parra',
 'george_holland',
 'henry_darcy',
 'robert_stirling',
 'francois_edouard_picot',
 'eduard_vogel',
 'francis_l_sullivan',
 'samuel_baldwin_marks_young',
 'hubert_harrison',
 'octave_chanute',
 'ernst_von_wildenbruch',
 'gussie_davis',
 'richard_yates_1860',
 'johann_gottlieb_heineccius',
 'masaharu_homma',
 'wladyslaw_grabski',
 'bess_flowers',
 'amman',
 'john_langdon',
 'richard_b_anderson',
 'o

In [12]:
# Translate every (head name, relation path, tail name) row into index form.
#
# Results (all index-aligned with each other):
#   mapped_dataset[j] = [head_idx, [rel_idx, ...], tail_idx]
#   inverse_info[j]   = [0/1, ...]  -- 1 where the relation name carried the
#                                      '**' marker, 0 otherwise
#   miss_cnt          = number of rows that could not be fully translated
mapped_dataset = []
inverse_info = []
miss_cnt = 0
for head_name, tail_name, relation_names in zip(head_entities, tail_entities, relations):
    # name -> Freebase ID -> entity index; either hop may fail.  `.get` keeps a
    # Freebase ID that is absent from `id2idx` from raising KeyError.
    head_idx = id2idx.get(n2fbid.get(head_name))
    tail_idx = id2idx.get(n2fbid.get(tail_name))
    if head_idx is None or tail_idx is None:
        miss_cnt += 1
        continue

    # Strip the '**' marker before the lookup; a plain name is left unchanged.
    rel_idxs = [rname2idx.get(rel_name.replace('**', '')) for rel_name in relation_names]
    if None in rel_idxs:
        # At least one relation name is unknown: count the row as a miss
        # instead of aborting the whole cell with a KeyError.
        miss_cnt += 1
        continue
    rel_inverses = [1 if '**' in rel_name else 0 for rel_name in relation_names]

    mapped_dataset.append([head_idx, rel_idxs, tail_idx])
    # Previously the flags were computed but never stored, leaving
    # `inverse_info` empty.
    inverse_info.append(rel_inverses)

In [13]:
# Report how many rows could not be mapped, followed by the total row count.
print(f"{miss_cnt} {len(head_entities)}")

26828 27163


- want: list of lists: [entity_idx0, [rel_idx0, rel_idx1, rel_idx2, ... ], entity_idx1]
- have: list of lists: [entity_name0, [rel_name0, rel_name1, rel_name2, ...], entity_name1]
- have: mapping entity indices to ids --> mapping entity ids to indices
- have: mapping entity ids to names --> mapping names to entity ids
- have: mapping relation idxs to names --> mapping names to relation idxs

- plan:
    - loop over json file, creating dictionary from (lowercased, space underscored) names to fb ids (n2fbid)
    - create mapping of relation names to shortened names (taking final /word) then invert this dictionary
    - for each row in traversal dataset:
        - map each entity name to its fb id via n2fbid, then map the result to its index via id2idx
        - for each relation:
            - map relation short name to relation long name, then map the result to the relation idx
            - keep track of which are inverses 
    