In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from Kelpie.dataset import Dataset
import numpy as np
import pandas as pd
from helpers.helpers import print_fact, extract_subgraph_of_kg, print_entity_id, print_sample, find_head_tail_rel
from helpers.kelpie_models_helpers import train_complex
from helpers.knowledge_graph_simulation_experiment import KnowledgeGraphMitigationExperiment
from helpers.candidate_selection_helpers import preview_samples_from_rel
import json

In [4]:

# Paths to the FB15k-237 train/test/valid split files inside the Kelpie package directory.
fb15k237_path = 'Kelpie_package/Kelpie/data/FB15k-237'
train_path = fb15k237_path + '/train.txt'
test_path = fb15k237_path + '/test.txt'
valid_path = fb15k237_path + '/valid.txt'
fb15k237_dataset = Dataset(name="FB15k-237", load=True, train_path=train_path, test_path=test_path, valid_path=valid_path)

train_test_valid_paths = [train_path, test_path, valid_path]

# JSON file that is passed as `label_map` to the printing helpers in later cells.
# Read it inside a context manager so the file handle is closed as soon as parsing
# finishes (the previous `json.load(open(...))` left the handle open).
label_map_path = 'entity2wikidata.json'
with open(label_map_path) as label_map_file:
    label_map = json.load(label_map_file)

## Facts with 1-to-N Relations

In [4]:
# Gather the ids of all relations whose type is '1-N', then print each one's name.
# Pattern of interest: (?, r, ?)
relation_types = fb15k237_dataset.relation_2_type
ids = [relation_id for relation_id in relation_types if relation_types[relation_id] == '1-N']
for relation_id in ids:
    print(fb15k237_dataset.get_name_for_relation_id(relation_id))

        

/film/special_film_performance_type/film_performance_type./film/performance/film
/organization/role/leaders./organization/leadership/organization
/base/locations/continents/countries_within
/location/country/second_level_divisions
/sports/sports_team_location/teams
/film/director/film
/sports/sport/pro_athletes./sports/pro_sports_played/athlete
/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee
/sports/sports_league/teams./sports/sports_league_participation/team
/medicine/disease/notable_people_with_this_condition
/people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony
/government/political_party/politicians_in_this_party./government/political_party_tenure/politician
/tv/non_character_role/tv_regular_personal_appearances./tv/tv_regular_personal_appearance/person
/base/americancomedy/celebrity_impressionist/celebrities_impersonated


### Politicians belonging to which party

In [15]:
# Resolve the party -> politician relation to its id and print the first five
# items of the result returned by find_head_tail_rel for that relation.
rel = "/government/political_party/politicians_in_this_party./government/political_party_tenure/politician"
rel_id = fb15k237_dataset.get_id_for_relation_name(rel)

res = find_head_tail_rel(fb15k237_dataset, rel_id=rel_id)

first_five = list(res)[:5]
for sample in first_five:
    print_sample(sample, fb15k237_dataset, label_map)



Communist Party of the Soviet Union /government/political_party/politicians_in_this_party./government/political_party_tenure/politician Vladimir Lenin
Democratic Party /government/political_party/politicians_in_this_party./government/political_party_tenure/politician James A. Michener
Communist Party of the Soviet Union /government/political_party/politicians_in_this_party./government/political_party_tenure/politician Georgy Zhukov
Democratic Party /government/political_party/politicians_in_this_party./government/political_party_tenure/politician Lewis Cass
Canadian Alliance /government/political_party/politicians_in_this_party./government/political_party_tenure/politician Stephen Harper


### Film Directors

In [7]:
# Name of the director -> film relation and the id the dataset assigns to it.
film_director = "/film/director/film"
film_director_id = fb15k237_dataset.get_id_for_relation_name(film_director)

In [3]:
# Load the (good fact, bad fact) pairs stored in
# experiment_inputs/input_1.txt ... input_7.txt.
# Each file must contain exactly two lines (good fact first, bad fact second);
# every line is a fact whose parts are joined with '-'.
experiment_inputs = []

for i in range(1, 8):
    # The handle is named `input_file` rather than `input` so the builtin
    # input() is not overwritten in the notebook's global namespace.
    with open(f'experiment_inputs/input_{i}.txt', 'r') as input_file:
        good_fact, bad_fact = input_file.readlines()
    # Parsing happens after the file is closed; only the read needs the handle.
    good_fact = good_fact.strip().split('-')
    bad_fact = bad_fact.strip().split('-')
    experiment_inputs.append((good_fact, bad_fact))


In [7]:
import json

# For every (good fact, bad fact) pair in `experiment_inputs`, find the first
# entry of candidate_1.json with the same two facts and print it.
with open('experiment_candidates/candidate_1.json', 'r') as f:
    data = json.load(f)

# The file is closed at this point; the search only needs the parsed data.
for good_fact, bad_fact in experiment_inputs:
    for candidate in data:
        # Compare the facts part by part. Concatenating the parts with ''.join
        # before comparing could equate different facts
        # (e.g. ['ab', 'c'] and ['a', 'bc'] both join to 'abc').
        if list(candidate[0]) == list(good_fact) and list(candidate[1]) == list(bad_fact):
            print_fact(candidate[0], label_map)
            print_fact(candidate[1], label_map)
            print("Num overlapping facts:", candidate[3])
            print("Ranking:", candidate[2])
            print()
            break




Ben Affleck /film/director/film The Town
Ben Affleck /film/director/film Argo
Num overlapping facts: 3
Ranking: [1, 2]

Steven Spielberg /film/director/film Amistad
Steven Spielberg /film/director/film Saving Private Ryan
Num overlapping facts: 4
Ranking: [1, 2]

Steven Spielberg /film/director/film The Adventures of Tintin
Steven Spielberg /film/director/film Saving Private Ryan
Num overlapping facts: 5
Ranking: [5, 1]



In [14]:
import json

# Pick the three entries of candidate_1.json with the smallest value at index 3,
# print them, and save each as experiment_inputs/input_<i>.txt (i = 1..3).
with open('experiment_candidates/candidate_1.json', 'r') as f:
    data = json.load(f)

# The JSON file is closed here; it is no longer held open while the
# input files are being written.
data.sort(key=lambda x: x[3])
finalist_candidates = data[:3]

# enumerate replaces the hand-maintained counter used for the file names.
for i, candidate in enumerate(finalist_candidates, start=1):
    print_fact(candidate[0], label_map)
    print_fact(candidate[1], label_map)
    print(candidate[3])
    print(candidate[2])
    print()
    # Two lines per file: good fact, then bad fact (no trailing newline),
    # each with its parts joined by '-'.
    with open(f'experiment_inputs/input_{i}.txt', 'w') as outF:
        outF.write('-'.join(candidate[0]) + '\n')
        outF.write('-'.join(candidate[1]))

Ben Affleck /film/director/film The Town
Ben Affleck /film/director/film Argo
3
[1, 2]

Steven Spielberg /film/director/film Amistad
Steven Spielberg /film/director/film Saving Private Ryan
4
[1, 2]

Steven Spielberg /film/director/film The Adventures of Tintin
Steven Spielberg /film/director/film Saving Private Ryan
5
[5, 1]



### Hall of Fame Inductees

In [11]:
from helpers.helpers import find_head_tail_rel, print_sample

# Hall-of-fame -> inductee relation: resolve its id and print the first five
# items of the result returned by find_head_tail_rel.
inductee = '/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee'
inductee_id = fb15k237_dataset.get_id_for_relation_name(inductee)
res = find_head_tail_rel(fb15k237_dataset, rel_id=inductee_id)

inductee_samples = list(res)[:5]
for sample in inductee_samples:
    print_sample(sample, fb15k237_dataset, label_map)



Rock and Roll Hall of Fame /award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee Pink Floyd
Rock and Roll Hall of Fame /award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee Santana
Rock and Roll Hall of Fame /award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee Aerosmith
Television Hall of Fame /award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee George Burns
Hollywood Walk of Fame /award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee Julie Andrews


### Sports Team

In [13]:
# League -> team relation: resolve its id and print five items of the lookup result.
sports = "/sports/sports_league/teams./sports/sports_league_participation/team"
sports_id = fb15k237_dataset.get_id_for_relation_name(sports)
res = find_head_tail_rel(fb15k237_dataset, rel_id=sports_id)

sports_samples = list(res)[:5]
for sample in sports_samples:
    print_sample(sample, fb15k237_dataset, label_map)

EFL Championship /sports/sports_league/teams./sports/sports_league_participation/team Coventry City F.C.
Serie A /sports/sports_league/teams./sports/sports_league_participation/team A.S. Roma
EFL Championship /sports/sports_league/teams./sports/sports_league_participation/team Ipswich Town F.C.
National Football League /sports/sports_league/teams./sports/sports_league_participation/team Houston Texans
National Football League /sports/sports_league/teams./sports/sports_league_participation/team Miami Dolphins


### Organization Leadership

In [14]:
# Role -> organization relation: resolve its id and print five items of the lookup result.
leadership = "/organization/role/leaders./organization/leadership/organization"
leadership_id = fb15k237_dataset.get_id_for_relation_name(leadership)
res = find_head_tail_rel(fb15k237_dataset, rel_id=leadership_id)

leadership_samples = list(res)[:5]
for sample in leadership_samples:
    print_sample(sample, fb15k237_dataset, label_map)


chief executive officer /organization/role/leaders./organization/leadership/organization General Dynamics
president /organization/role/leaders./organization/leadership/organization Duquesne University
chief executive officer /organization/role/leaders./organization/leadership/organization Cash Money Records
president /organization/role/leaders./organization/leadership/organization Brown University
dean /organization/role/leaders./organization/leadership/organization Northwestern University School of Law


## N-to-N Relations

In [5]:
# Gather the ids of all relations whose type is 'N-N', then print each one's name.
relation_types = fb15k237_dataset.relation_2_type
ids = [relation_id for relation_id in relation_types if relation_types[relation_id] == 'N-N']
for relation_id in ids:
    print(fb15k237_dataset.get_name_for_relation_id(relation_id))

/location/country/form_of_government
/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor
/media_common/netflix_genre/titles
/award/award_winner/awards_won./award/award_honor/award_winner
/soccer/football_team/current_roster./sports/sports_team_roster/position
/soccer/football_team/current_roster./soccer/football_roster_position/position
/film/actor/film./film/performance/film
/award/award_category/nominees./award/award_nomination/nominated_for
/award/award_nominee/award_nominations./award/award_nomination/award_nominee
/music/performance_role/regular_performances./music/group_membership/role
/award/award_category/winners./award/award_honor/ceremony
/award/award_winning_work/awards_won./award/award_honor/award_winner
/film/film/release_date_s./film/film_regional_release_date/film_release_region
/film/film/language
/location/location/contains
/people/person/profession
/location/statistical_region/religions./location/religion_percentage/religion
/award/award_nominee/award_nominati

### Business Industry

In [20]:
# Preview samples for the business -> industry relation.
business_industry = '/business/business_operation/industry'
preview_samples_from_rel(business_industry, fb15k237_dataset, label_map)

Morgan Stanley /business/business_operation/industry investment bank
Amazon /business/business_operation/industry retail
Happy Madison /business/business_operation/industry film
Natsume /business/business_operation/industry video game
KLM /business/business_operation/industry airline


We could use this, but I don't see how many businesses would have multiple industries.

### Nominated For

In [19]:
# Preview samples for the nominee -> nominated-for relation.
nominated_for = '/award/award_nominee/award_nominations./award/award_nomination/nominated_for'
preview_samples_from_rel(nominated_for, fb15k237_dataset, label_map)

James Wong Howe /award/award_nominee/award_nominations./award/award_nomination/nominated_for Hud
Nicolas Cage /award/award_nominee/award_nominations./award/award_nomination/nominated_for National Treasure: Book of Secrets
Harry Shearer /award/award_nominee/award_nominations./award/award_nomination/nominated_for This Is Spinal Tap
Antonio Banderas /award/award_nominee/award_nominations./award/award_nomination/nominated_for The Mask of Zorro
Michael Giacchino /award/award_nominee/award_nominations./award/award_nomination/nominated_for Star Trek


This could work too

### Film Country

In [18]:
# Preview samples for the film -> country relation.
film_country = '/film/film/country'
preview_samples_from_rel(film_country, fb15k237_dataset, label_map)

2046 /film/film/country Hong Kong
The Chronicles of Narnia: Prince Caspian /film/film/country Czech Republic
The Polar Express /film/film/country United States of America
Face/Off /film/film/country United States of America
A League of Their Own /film/film/country United States of America


We can try to use this as well

### Profession

In [17]:
# Preview samples for the person -> profession relation.
profession = '/people/person/profession'
preview_samples_from_rel(profession, fb15k237_dataset, label_map)

Julianna Margulies /people/person/profession television producer
Michael Powell /people/person/profession actor
Asia Argento /people/person/profession film director
Grace Jones /people/person/profession actor
Charles Mingus Jr. /people/person/profession bandleader


We can use profession!

### Award Nominee

In [16]:
# Preview samples for the nominee -> nominee relation.
award_nominee = '/award/award_nominee/award_nominations./award/award_nomination/award_nominee'
preview_samples_from_rel(award_nominee, fb15k237_dataset, label_map)

Marilyn Bergman /award/award_nominee/award_nominations./award/award_nomination/award_nominee Marvin Hamlisch
Dennis Cromwell /award/award_nominee/award_nominations./award/award_nomination/award_nominee Russell Crowe
Stephen Graham /award/award_nominee/award_nominations./award/award_nomination/award_nominee Shea Whigham
Demi Moore /award/award_nominee/award_nominations./award/award_nomination/award_nominee Joshua Jackson
Julie Bowen /award/award_nominee/award_nominations./award/award_nomination/award_nominee Candice Bergen


### Award winner

In [9]:
# Award winner -> award winner relation: resolve its id and print five items
# of the lookup result.
rel = "/award/award_winner/awards_won./award/award_honor/award_winner"
rel_id = fb15k237_dataset.get_id_for_relation_name(rel)
res = find_head_tail_rel(fb15k237_dataset, rel_id=rel_id)

award_winner_samples = list(res)[:5]
for sample in award_winner_samples:
    print_sample(sample, fb15k237_dataset, label_map)

Christina Hendricks /award/award_winner/awards_won./award/award_honor/award_winner Elisabeth Moss
Ariel Winter /award/award_winner/awards_won./award/award_honor/award_winner Ty Burrell
Betty Comden /award/award_winner/awards_won./award/award_honor/award_winner Jule Styne
Rory Cochrane /award/award_winner/awards_won./award/award_honor/award_winner Bryan Cranston
Kanye West /award/award_winner/awards_won./award/award_honor/award_winner T-Pain


Doesn't really make sense. All of these are actors/singers who collaborated. So does this relation mean they won an award together?

### Organization Founders

In [10]:

# Founder -> organization relation: resolve its id and print five items
# of the lookup result.
rel = "/organization/organization_founder/organizations_founded"
rel_id = fb15k237_dataset.get_id_for_relation_name(rel)
res = find_head_tail_rel(fb15k237_dataset, rel_id=rel_id)

founder_samples = list(res)[:5]
for sample in founder_samples:
    print_sample(sample, fb15k237_dataset, label_map)

Ronald Reagan /organization/organization_founder/organizations_founded Republican Party
George Bernard Shaw /organization/organization_founder/organizations_founded London School of Economics
Steven Spielberg /organization/organization_founder/organizations_founded DreamWorks Records
Richard Branson /organization/organization_founder/organizations_founded Virgin Records
Tom Rosenberg /organization/organization_founder/organizations_founded Lakeshore Entertainment


We can use this!

### Influenced by

In [11]:
# "Influenced by" relation: resolve its id and print five items of the lookup result.
rel = "/influence/influence_node/influenced_by"
rel_id = fb15k237_dataset.get_id_for_relation_name(rel)
res = find_head_tail_rel(fb15k237_dataset, rel_id=rel_id)

influence_samples = list(res)[:5]
for sample in influence_samples:
    print_sample(sample, fb15k237_dataset, label_map)

Iain Banks /influence/influence_node/influenced_by Ursula K. Le Guin
Victor Hugo /influence/influence_node/influenced_by Voltaire
Steve Miller Band /influence/influence_node/influenced_by The Beatles
Herman Melville /influence/influence_node/influenced_by William Shakespeare
Zach Galifianakis /influence/influence_node/influenced_by Bill Murray


This matches what you get from a [Google search](https://www.google.com/search?q=who+influenced+victor+hugo&oq=who+influenced+victor+hugo&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIHCAEQABiABDIHCAIQABiABDIHCAMQABiABDIHCAQQABiABDIHCAUQABiABDIHCAYQABiABDIHCAcQABiABDIHCAgQABiABDIHCAkQABiABNIBCDQzNTRqMGo0qAIAsAIA&sourceid=chrome&ie=UTF-8) as well! We could use this too.

### District Represented

In [14]:
# Preview samples for the legislative session -> district represented relation.
district_represented = '/government/legislative_session/members./government/government_position_held/district_represented'
preview_samples_from_rel(district_represented, fb15k237_dataset, label_map)

108th United States Congress /government/legislative_session/members./government/government_position_held/district_represented Oklahoma
32nd United States Congress /government/legislative_session/members./government/government_position_held/district_represented Missouri
32nd United States Congress /government/legislative_session/members./government/government_position_held/district_represented New Jersey
113th United States Congress /government/legislative_session/members./government/government_position_held/district_represented Maine
26th United States Congress /government/legislative_session/members./government/government_position_held/district_represented Maryland


It looks like this relation is just linking congressman -> congress year -> district represented.

### Acted In

In [4]:
# Name of the actor -> film relation and the id the dataset assigns to it.
acted_in = "/film/actor/film./film/performance/film"
acted_in_id = fb15k237_dataset.get_id_for_relation_name(acted_in)

In [8]:
import json

# For every (good fact, bad fact) pair in `experiment_inputs`, find the first
# entry of candidate_2.json with the same two facts and print it.
with open('experiment_candidates/candidate_2.json', 'r') as f:
    data = json.load(f)

# The file is closed at this point; the search only needs the parsed data.
for good_fact, bad_fact in experiment_inputs:
    for candidate in data:
        # Compare the facts part by part. Concatenating the parts with ''.join
        # before comparing could equate different facts
        # (e.g. ['ab', 'c'] and ['a', 'bc'] both join to 'abc').
        if list(candidate[0]) == list(good_fact) and list(candidate[1]) == list(bad_fact):
            print_fact(candidate[0], label_map)
            print_fact(candidate[1], label_map)
            print("Num overlapping facts:", candidate[3])
            print("Ranking:", candidate[2])
            print()
            break

Ben Affleck /film/actor/film./film/performance/film Shakespeare in Love
Ben Affleck /film/actor/film./film/performance/film Jersey Girl
Num overlapping facts: 0
Ranking: [4, 3]

Ben Affleck /film/actor/film./film/performance/film Shakespeare in Love
Ben Affleck /film/actor/film./film/performance/film Pearl Harbor
Num overlapping facts: 1
Ranking: [4, 5]

Ben Affleck /film/actor/film./film/performance/film Shakespeare in Love
Ben Affleck /film/actor/film./film/performance/film Gigli
Num overlapping facts: 1
Ranking: [3, 4]

George Clooney /film/actor/film./film/performance/film Ocean's Twelve
George Clooney /film/actor/film./film/performance/film Good Night, and Good Luck.
Num overlapping facts: 2
Ranking: [4, 1]



In [17]:
import json

# Choose four entries of candidate_2.json (after sorting by the value at
# index 3), print them, and save each as experiment_inputs/input_<i>.txt
# (i = 4..7).
with open('experiment_candidates/candidate_2.json', 'r') as f:
    data = json.load(f)

# The JSON file is closed here; it is no longer held open while the
# input files are being written.
data.sort(key=lambda x: x[3])

# NOTE(review): positions 0, 2, 3 and 5 of the sorted list look hand-picked
# (presumably after inspecting the full sorted listing) -- confirm the rationale.
finalist_candidates = [data[0], data[2], data[3], data[5]]

# enumerate replaces the hand-maintained counter; numbering starts at 4,
# as in the original cell.
for i, candidate in enumerate(finalist_candidates, start=4):
    print_fact(candidate[0], label_map)
    print_fact(candidate[1], label_map)
    print(candidate[3])
    print(candidate[2])
    print()
    # Two lines per file: good fact, then bad fact (no trailing newline),
    # each with its parts joined by '-'.
    with open(f'experiment_inputs/input_{i}.txt', 'w') as outF:
        outF.write('-'.join(candidate[0]) + '\n')
        outF.write('-'.join(candidate[1]))

Ben Affleck /film/actor/film./film/performance/film Shakespeare in Love
Ben Affleck /film/actor/film./film/performance/film Jersey Girl
0
[4, 3]

Ben Affleck /film/actor/film./film/performance/film Shakespeare in Love
Ben Affleck /film/actor/film./film/performance/film Pearl Harbor
1
[4, 5]

Ben Affleck /film/actor/film./film/performance/film Shakespeare in Love
Ben Affleck /film/actor/film./film/performance/film Gigli
1
[3, 4]

George Clooney /film/actor/film./film/performance/film Ocean's Twelve
George Clooney /film/actor/film./film/performance/film Good Night, and Good Luck.
2
[4, 1]



## Student of University

In [16]:
# Name of the institution -> student relation and the id the dataset assigns to it.
uni_student = "/education/educational_institution/students_graduates./education/education/student"
uni_student_id = fb15k237_dataset.get_id_for_relation_name(uni_student)

In [None]:
# from helpers.helpers import find_suitable_candidates

# train_test_valid_paths = [train_path, test_path, valid_path]

# find_suitable_candidates(uni_student, 
#                          fb15k237_dataset, 
#                          train_test_valid_paths,
#                          label_map, 
#                          'experiment_candidates/student_member_of_uni_candidates.json', 
#                          3,
#                          3, 
#                          num_random=10)


## Country is a member of which international organization?

In [18]:
# Name of the member -> organization membership relation.
org_mem = "/organization/organization_member/member_of./organization/organization_membership/organization"

In [19]:
from helpers.helpers import find_suitable_candidates

# Run candidate selection for the organization-membership relation.
# The call is the cell's last expression, so its return value is displayed.
train_test_valid_paths = [train_path, test_path, valid_path]
org_mem_candidates_path = 'experiment_candidates/organization_membership_candidates.json'

find_suitable_candidates(
    org_mem,
    fb15k237_dataset,
    train_test_valid_paths,
    label_map,
    org_mem_candidates_path,
    3,  # NOTE(review): meaning of this positional argument is defined by the helper -- confirm
    3,  # NOTE(review): meaning of this positional argument is defined by the helper -- confirm
    num_random=10,
)


['/m/0345h', '/organization/organization_member/member_of./organization/organization_membership/organization', '/m/0b6css']
['/m/0345h', '/organization/organization_member/member_of./organization/organization_membership/organization', '/m/01rz1']

Reducing the dataset size from 272115 to 13605
Finished generating dataset.
Saving in file test.txt ...
Saved dataset in file test.txt
Good fact: ['/m/0345h', '/organization/organization_member/member_of./organization/organization_membership/organization', '/m/0b6css']
Bad fact: ['/m/0345h', '/organization/organization_member/member_of./organization/organization_membership/organization', '/m/01rz1']
	 saving model...
	 done.
['/m/0345h', '/organization/organization_member/member_of./organization/organization_membership/organization', '/m/0b6css']
['/m/0345h', '/organization/organization_member/member_of./organization/organization_membership/organization', '/m/018cqq']

Reducing the dataset size from 272115 to 13605
Finished generating dataset

[[['/m/0345h',
   '/organization/organization_member/member_of./organization/organization_membership/organization',
   '/m/0b6css'],
  ['/m/0345h',
   '/organization/organization_member/member_of./organization/organization_membership/organization',
   '/m/018cqq'],
  48,
  50,
  (3, 1)]]

## Film genre

In [20]:
# Name of the film -> genre relation.
film_genre = "/film/film/genre"

In [21]:
from helpers.helpers import find_suitable_candidates

# Run candidate selection for the film-genre relation.
# The call is the cell's last expression, so its return value is displayed.
train_test_valid_paths = [train_path, test_path, valid_path]
film_genre_candidates_path = 'experiment_candidates/film_genre_candidates.json'

find_suitable_candidates(
    film_genre,
    fb15k237_dataset,
    train_test_valid_paths,
    label_map,
    film_genre_candidates_path,
    3,  # NOTE(review): meaning of this positional argument is defined by the helper -- confirm
    3,  # NOTE(review): meaning of this positional argument is defined by the helper -- confirm
    num_random=10,
)


['/m/0df2zx', '/film/film/genre', '/m/09q17']
['/m/0df2zx', '/film/film/genre', '/m/0jtdp']

Reducing the dataset size from 272115 to 13605
Finished generating dataset.
Saving in file test.txt ...
Saved dataset in file test.txt
Good fact: ['/m/0df2zx', '/film/film/genre', '/m/09q17']
Bad fact: ['/m/0df2zx', '/film/film/genre', '/m/0jtdp']
	 saving model...
	 done.
['/m/0bmc4cm', '/film/film/genre', '/m/06l3bl']
['/m/0bmc4cm', '/film/film/genre', '/m/03q4nz']

Reducing the dataset size from 272115 to 13605
Finished generating dataset.
Saving in file test.txt ...
Saved dataset in file test.txt
Good fact: ['/m/0bmc4cm', '/film/film/genre', '/m/06l3bl']
Bad fact: ['/m/0bmc4cm', '/film/film/genre', '/m/03q4nz']
	 saving model...
	 done.
['/m/07sp4l', '/film/film/genre', '/m/01hmnh']
['/m/07sp4l', '/film/film/genre', '/m/02kdv5l']

Reducing the dataset size from 272115 to 13605
Finished generating dataset.
Saving in file test.txt ...
Saved dataset in file test.txt
Good fact: ['/m/07sp4l', '/

[[['/m/0df2zx', '/film/film/genre', '/m/09q17'],
  ['/m/0df2zx', '/film/film/genre', '/m/0jtdp'],
  46,
  45,
  (1, 2)],
 [['/m/07sp4l', '/film/film/genre', '/m/01hmnh'],
  ['/m/07sp4l', '/film/film/genre', '/m/02kdv5l'],
  431,
  430,
  (1, 2)]]

## WN18RR select candidates

In [21]:

# Load the WN18RR dataset from its train/test/valid split files.
# NOTE(review): train_path, test_path, valid_path and train_test_valid_paths
# are rebound here; before this cell they held the FB15k-237 paths, so
# re-running an earlier FB15k-237 cell after this one would pick up the
# WN18RR files.
wn18rr_path = 'Kelpie_package/Kelpie/data/WN18RR_text'
train_path = f'{wn18rr_path}/train.txt'
test_path = f'{wn18rr_path}/test.txt'
valid_path = f'{wn18rr_path}/valid.txt'
wn18rr_dataset = Dataset(
    name="WN18RR",
    load=True,
    train_path=train_path,
    test_path=test_path,
    valid_path=valid_path,
)

train_test_valid_paths = [train_path, test_path, valid_path]
# label_map_path = 'entity2wikidata.json'
# label_map = json.load(open(label_map_path))

In [22]:
# Gather the ids of all WN18RR relations whose type is '1-N', then print each one's name.
relation_types = wn18rr_dataset.relation_2_type
ids = [relation_id for relation_id in relation_types if relation_types[relation_id] == '1-N']
for relation_id in ids:
    print(wn18rr_dataset.get_name_for_relation_id(relation_id))

_member_meronym
_member_of_domain_usage
_member_of_domain_region


In [23]:
# Preview samples for the _member_meronym relation; no label map is passed for WN18RR.
member_meronym = '_member_meronym'
preview_samples_from_rel(member_meronym, dataset=wn18rr_dataset, label_map=None)

('lycium.n.01', '_member_meronym', 'christmasberry.n.01')
('hamamelidae.n.01', '_member_meronym', 'hamamelidaceae.n.01')
('genus_luffa.n.01', '_member_meronym', 'luffa.n.02')
('cracticidae.n.01', '_member_meronym', 'strepera.n.01')
('chamaemelum.n.01', '_member_meronym', 'chamomile.n.01')


In [24]:
# Gather the ids of all WN18RR relations whose type is 'N-N', then print each one's name.
relation_types = wn18rr_dataset.relation_2_type
ids = [relation_id for relation_id in relation_types if relation_types[relation_id] == 'N-N']
for relation_id in ids:
    print(wn18rr_dataset.get_name_for_relation_id(relation_id))

_derivationally_related_form
_instance_hypernym
_also_see
_has_part
