# Task 2: KG Embedding Model

In [1]:
%%capture 
!pip install ampligraph; 

In [2]:
# Line magic that switches the runtime to TensorFlow 1.x (the cell output confirms
# "TensorFlow 1.x selected."). NOTE(review): this magic appears to be Colab-specific — confirm
# before running the notebook in another Jupyter environment.
%tensorflow_version 1.x 
import numpy as np
import pandas as pd
import ampligraph

# Display the installed AmpliGraph version so the environment is recorded with the results.
ampligraph.__version__

TensorFlow 1.x selected.


'1.3.2'

In [3]:
import requests
from ampligraph.datasets import load_from_csv

url = 'https://ampgraphenc.s3-eu-west-1.amazonaws.com/datasets/freebase-237-merged-and-remapped.csv'

# Download the CSV. Fail loudly on an HTTP error instead of silently saving an error page as
# the dataset, and bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=60)
response.raise_for_status()

# The context manager guarantees the file is flushed and closed before it is read back.
with open('freebase.csv', 'wb') as csv_file:
    csv_file.write(response.content)

# Load the triples from the file just written; each row of X is one (column 0, column 1, column 2) triple.
X = load_from_csv('.', 'freebase.csv', sep=',')
X[:5, ]

array([['queens college, city university of new york',
        '/education/educational_institution/students_graduates./education/education/student',
        'carol leifer'],
       ['digital equipment corporation',
        '/business/business_operation/industry', 'computer hardware'],
       ['/m/0drtv8',
        '/award/award_ceremony/awards_presented./award/award_honor/award_winner',
        'laurence mark'],
       ['the departed',
        '/award/award_winning_work/awards_won./award/award_honor/award_winner',
        'leonardo dicaprio'],
       ['marilyn manson', '/people/person/profession', 'actor']],
      dtype=object)

In [4]:
# Use the complete triple array (loaded above, before the train/test split) as the set of known
# positives; it is passed as `filter_triples` when evaluating. This is an alias of X, not a copy.
positives_filter = X

In [5]:
# Sorted, de-duplicated values that occur in either the first or the third column of X.
entities = np.union1d(X[:, 0], X[:, 2])
entities

array(['/m/011xg5', '/m/011yd2', '/m/011yxg', ..., 'zoology', 'zurich',
       'zz top'], dtype=object)

In [6]:
# Sorted, de-duplicated values of the middle column of X (the relation paths shown in the output).
relations = np.unique(X[:, 1])
relations

array(['/american_football/football_team/current_roster./sports/sports_team_roster/position',
       '/award/award_category/category_of',
       '/award/award_category/disciplines_or_subjects',
       '/award/award_category/nominees./award/award_nomination/nominated_for',
       '/award/award_category/winners./award/award_honor/award_winner',
       '/award/award_category/winners./award/award_honor/ceremony',
       '/award/award_ceremony/awards_presented./award/award_honor/award_winner',
       '/award/award_ceremony/awards_presented./award/award_honor/honored_for',
       '/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for',
       '/award/award_nominee/award_nominations./award/award_nomination/award',
       '/award/award_nominee/award_nominations./award/award_nomination/award_nominee',
       '/award/award_nominee/award_nominations./award/award_nomination/nominated_for',
       '/award/award_winner/awards_won./award/award_honor/award_winner',
      

In [7]:
from ampligraph.evaluation import train_test_split_no_unseen 

num_test = 10000

# Split once with a fixed seed and label the two returned arrays by role.
# NOTE(review): per the function name, the test split should contain no entities/relations
# unseen in training — confirm against the AmpliGraph documentation.
data = dict(zip(
    ('train', 'test'),
    train_test_split_no_unseen(X, test_size=num_test, seed=0, allow_duplication=False),
))

In [8]:
# Report the shape of each split, training set first.
for label, split in (('Train set size: ', 'train'), ('Test set size: ', 'test')):
    print(label, data[split].shape)

Train set size:  (298722, 3)
Test set size:  (10000, 3)


In [9]:
from ampligraph.latent_features import ComplEx

In [10]:
# All ComplEx hyperparameters gathered in one place so they are easy to read and tune.
# NOTE(review): parameter meanings below follow the AmpliGraph 1.x docs — confirm if upgrading.
complex_hyperparams = dict(
    batches_count=100,                           # number of batches the training set is split into
    seed=0,                                      # fixed seed for reproducibility
    epochs=100,
    k=150,                                       # embedding dimensionality
    eta=5,                                       # negatives generated per positive
    optimizer='adam',
    optimizer_params={'lr': 1e-3},
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 3, 'lambda': 1e-5},
    verbose=True,
)

model = ComplEx(**complex_hyperparams)

In [11]:
# Train on the training split only; early stopping is explicitly disabled.
model.fit(data['train'], early_stopping=False)

Average Loss:   0.028910: 100%|██████████| 100/100 [33:34<00:00, 20.14s/epoch]


In [12]:
from ampligraph.latent_features import save_model, restore_model

In [13]:
# Persist the trained model to disk so it can be reloaded without retraining.
save_model(model, './best_model.pkl')

In [14]:
# Drop the in-memory model; the following cell rebinds `model` from the saved file.
del model

In [15]:
# Reload the model from the file written by `save_model` above.
model = restore_model('./best_model.pkl')

In [16]:
# Sanity check: confirm the restored model reports itself as fitted before evaluating it.
print(
    'The model is fit!'
    if model.is_fitted
    else 'The model is not fit! Did you skip a step?'
)

The model is fit!


In [17]:
from ampligraph.evaluation import evaluate_performance

In [18]:
# Rank every held-out test triple against its corruptions.
ranks = evaluate_performance(
    data['test'],
    model=model,
    # Known positives (defined above) are filtered out of the generated corruptions.
    filter_triples=positives_filter,
    # Default protocol: subject and object are corrupted separately while evaluating.
    use_default_protocol=True,
    verbose=True,
)



100%|██████████| 10000/10000 [15:07<00:00, 11.01it/s]


In [19]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Compute all ranking metrics first, then print them through a single loop.
mrr = mrr_score(ranks)
hits_10, hits_3, hits_1 = (hits_at_n_score(ranks, n=cutoff) for cutoff in (10, 3, 1))

metric_report = (
    ("MRR: %.2f", mrr),
    ("Hits@10: %.2f", hits_10),
    ("Hits@3: %.2f", hits_3),
    ("Hits@1: %.2f", hits_1),
)
for template, value in metric_report:
    print(template % (value))

MRR: 0.23
Hits@10: 0.39
Hits@3: 0.25
Hits@1: 0.14


In [20]:
# Small demo sample: the first 10 triples of the held-out test split (not used for training),
# to be ranked and scored individually below.
X_unseen = data['test'][:10]

In [21]:
# Stack the known positives with the sample triples, then de-duplicate rows by passing them
# through a set of tuples (row order of the result is therefore not meaningful).
stacked_triples = np.vstack((positives_filter, X_unseen))
unseen_filter = np.array(list(set(map(tuple, stacked_triples))))

In [22]:
# Rank only the 10 sample triples, producing one rank per triple (used as the `rank` column
# of the summary table below).
ranks_unseen = evaluate_performance(
    X_unseen, 
    model=model, 
    filter_triples=unseen_filter,   # de-duplicated filter built in the previous cell (known positives + sample triples)
    corrupt_side = 's+o',
    use_default_protocol=False, # default protocol off so corrupt_side='s+o' is honoured: a single rank per triple rather than separate subject/object ranks (per AmpliGraph docs)
    verbose=True
)

100%|██████████| 10/10 [00:00<00:00, 10.39it/s]


In [23]:
# Raw model scores for the sample triples; they are unbounded (see the `score` column in the
# table below) and are squashed through a sigmoid in the next cell.
scores = model.predict(X_unseen)

In [24]:
from scipy.special import expit
# Map the raw scores into (0, 1) with the logistic sigmoid.
# NOTE(review): these values are not calibrated probabilities — most saturate at 1.0 in the
# table below; confirm whether a calibration step is needed before interpreting them.
probs = expit(scores)

In [25]:
# One row per sample triple: readable statement, its rank, the raw score and the sigmoid of the score.
statements = [' '.join(triple) for triple in X_unseen]
rows = list(zip(statements, ranks_unseen, np.squeeze(scores), np.squeeze(probs)))

summary = pd.DataFrame(rows, columns=['statement', 'rank', 'score', 'prob'])
summary.sort_values("score")

Unnamed: 0,statement,rank,score,prob
7,chris nurse /people/person/place_of_birth hamm...,4482,3.299764,0.964421
2,kymx /broadcast/content/artist richard marx,457,9.575715,0.999931
8,benjamin bratt /award/award_nominee/award_nomi...,58,10.927811,0.999982
5,david cross /influence/influence_node/influenc...,128,14.479103,1.0
3,morehouse college /business/business_operation...,13,14.723392,1.0
0,holly hunter /award/award_nominee/award_nomina...,48,16.066914,1.0
6,norman stiles /award/award_nominee/award_nomin...,4,16.827826,1.0
4,erika christensen /award/award_nominee/award_n...,46,17.34783,1.0
1,pop music /music/genre/artists alanis morissette,26,17.611797,1.0
9,tyler perry /award/award_nominee/award_nominat...,45,18.875919,1.0
