In [0]:
! pip install ampligraph

In [6]:
import numpy as np
import pandas as pd
import ampligraph

# Display the installed AmpliGraph version so the run can be traced to a release.
ampligraph.__version__

'1.3.1'

In [7]:
import requests
from ampligraph.datasets import load_from_csv

# Download the Game of Thrones triples and keep a local copy as GoT.csv.
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/GoT.csv'
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as the dataset.
response.raise_for_status()
# The context manager flushes and closes the file before it is read back.
with open('GoT.csv', 'wb') as csv_file:
    csv_file.write(response.content)

# Parse the CSV into an array with one three-column row per triple.
X = load_from_csv('.', 'GoT.csv', sep=',')
X[:5, ]

array([['Smithyton', 'SEAT_OF', 'House Shermer of Smithyton'],
       ['House Mormont of Bear Island', 'LED_BY', 'Maege Mormont'],
       ['Margaery Tyrell', 'SPOUSE', 'Joffrey Baratheon'],
       ['Maron Nymeros Martell', 'ALLIED_WITH',
        'House Nymeros Martell of Sunspear'],
       ['House Gargalen of Salt Shore', 'IN_REGION', 'Dorne']],
      dtype=object)

In [9]:
# Every distinct node of the graph: any name that occurs in the first or the
# third column of a triple (np.unique flattens the two selected columns).
entities = np.unique(X[:, [0, 2]])
entities

array(['Abelar Hightower', 'Acorn Hall', 'Addam Frey', ..., 'the Antlers',
       'the Paps', 'unnamed tower'], dtype=object)

In [8]:
# Every distinct relation type: the unique values of the middle column of the triples.
relations = np.unique(X[:, 1])
relations

array(['ALLIED_WITH', 'BRANCH_OF', 'FOUNDED_BY', 'HEIR_TO', 'IN_REGION',
       'LED_BY', 'PARENT_OF', 'SEAT_OF', 'SPOUSE', 'SWORN_TO'],
      dtype=object)

In [0]:
from ampligraph.evaluation import train_test_split_no_unseen 

# Hold out 100 triples for testing; the remaining triples form the training set.
# NOTE(review): going by its name, the helper keeps test triples restricted to
# entities/relations that also appear in training — confirm against the AmpliGraph docs.
X_train, X_test = train_test_split_no_unseen(X, test_size=100) 

In [11]:
# Report the shape of each split.
for _label, _split in (('Train set size: ', X_train), ('Test set size: ', X_test)):
    print(_label, _split.shape)

Train set size:  (3075, 3)
Test set size:  (100, 3)


In [0]:
# Pin TensorFlow to a 1.x release.
# NOTE(review): ampligraph has already been imported by this point, so TensorFlow
# may already be loaded in the kernel and a runtime restart may be needed for this
# install to take effect — confirm.
! pip install tensorflow==1.13.2

In [0]:
from ampligraph.latent_features import RandomBaseline
from ampligraph.latent_features import TransE
from ampligraph.latent_features import DistMult
from ampligraph.latent_features import ComplEx
from ampligraph.latent_features import HolE

Let's go through the parameters to understand what's going on:

- **`k`** : the dimensionality of the embedding space
- **`eta`** ($\eta$) : the number of negative, or false triples that must be generated at training runtime for each positive, or true triple
- **`batches_count`** : the number of batches in which the training set is split during the training loop. If you are running into low-memory issues, setting this to a higher number may help. (`getModel` below does not set it, so the library default applies.)
- **`epochs`** : the number of epochs to train the model for.
- **`optimizer`** : the Adam optimizer, with a learning rate of 1e-3 set via the *optimizer_params* kwarg.
- **`loss`** : the multiclass negative log-likelihood loss (`multiclass_nll`). No *loss_params* are passed, so the loss uses its default settings.
- **`regularizer`** : $L_p$ regularization with $p=3$, i.e. l3 regularization. $\lambda$ = 1e-5, set via the *regularizer_params* kwarg. 

Now we can instantiate the model:


In [0]:
def getModel(m, **overrides):
  """Instantiate a model class with the hyperparameters shared across this notebook.

  Args:
    m: Model class to instantiate (RandomBaseline, TransE, DistMult, ComplEx, HolE, ...).
    **overrides: Optional keyword arguments that replace or extend the shared
      defaults below, e.g. ``getModel(TransE, epochs=50)``. They are ignored for
      RandomBaseline, which is always built with its own defaults.

  Returns:
    A new instance of ``m``; the model is only constructed here, not fitted.
  """
  # A class is a unique object, so identity is the right comparison here.
  if m is RandomBaseline:
    return m()

  params = dict(
      seed=0,
      epochs=200,
      k=150,    # dimensionality of the embedding space
      eta=5,    # negatives generated per positive triple
      optimizer='adam',
      optimizer_params={'lr': 1e-3},
      loss='multiclass_nll',
      regularizer='LP',
      regularizer_params={'p': 3, 'lambda': 1e-5},
      verbose=True,
  )
  # Caller-supplied values win over the shared defaults.
  params.update(overrides)
  return m(**params)

In [15]:
# Build one fresh instance per model family through getModel.
(modelRandomBaseline,
 modelTransE,
 modelComplEx,
 modelDistMult,
 modelHolE) = (getModel(model_cls)
               for model_cls in (RandomBaseline, TransE, ComplEx, DistMult, HolE))



In [0]:
# Triples handed to the evaluation as its filter: the full dataset X, i.e. the
# training and test triples together.
positives_filter = X

In [17]:
# Colab magic requesting a TensorFlow 1.x runtime.
# NOTE(review): the recorded output shows TensorFlow was already loaded when this
# ran, so the magic had no effect here; it likely has to run before anything
# imports TensorFlow — confirm.
%tensorflow_version 1.x

TensorFlow is already loaded. Please restart the runtime to change versions.


In [0]:
import tensorflow as tf
# Raise the TensorFlow log threshold to ERROR so the training output stays readable.
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
from ampligraph.latent_features import save_model, restore_model

In [0]:
# Fit the random baseline on the training triples, with early stopping off.
modelRandomBaseline.fit(X_train, early_stopping=False)

In [0]:
# Persist the fitted baseline to disk so it can be restored without refitting.
save_model(modelRandomBaseline, './modelRandomBaseline.pkl')

In [23]:
# Fit TransE on the training triples, with early stopping off.
modelTransE.fit(X_train, early_stopping=False)

Average Loss:   0.019163: 100%|██████████| 200/200 [02:47<00:00,  1.20epoch/s]


In [0]:
# Persist the fitted TransE model to disk so it can be restored without refitting.
save_model(modelTransE, './modelTransE.pkl')

In [26]:
# Fit ComplEx on the training triples, with early stopping off.
modelComplEx.fit(X_train, early_stopping=False)

Average Loss:   0.017603: 100%|██████████| 200/200 [05:16<00:00,  1.58s/epoch]


In [0]:
# Persist the fitted ComplEx model to disk so it can be restored without refitting.
save_model(modelComplEx, './modelComplEx.pkl')

In [29]:
# Fit DistMult on the training triples, with early stopping off.
modelDistMult.fit(X_train, early_stopping=False)

Average Loss:   0.016550: 100%|██████████| 200/200 [02:51<00:00,  1.16epoch/s]


In [0]:
# Persist the fitted DistMult model to disk so it can be restored without refitting.
save_model(modelDistMult, './modelDistMult.pkl')

In [32]:
# Fit HolE on the training triples, with early stopping off.
modelHolE.fit(X_train, early_stopping=False)

Average Loss:   0.080941: 100%|██████████| 200/200 [05:15<00:00,  1.58s/epoch]


In [0]:
# Persist the fitted HolE model to disk so it can be restored without refitting.
save_model(modelHolE, './modelHolE.pkl')

In [0]:
# Optional: uncomment to reload the models saved by the save_model calls in this
# notebook instead of refitting them. The paths match the ones passed to save_model.
#modelRandomBaseline = restore_model('./modelRandomBaseline.pkl')
#modelTransE = restore_model('./modelTransE.pkl')
#modelComplEx = restore_model('./modelComplEx.pkl')
#modelDistMult = restore_model('./modelDistMult.pkl')
#modelHolE = restore_model('./modelHolE.pkl')

In [0]:
from ampligraph.evaluation import evaluate_performance

In [0]:
def getEvaluatePerformance(model):
  """Evaluate ``model`` on the held-out test triples.

  Reads the notebook-level ``X_test`` and ``positives_filter`` variables.

  Args:
    model: Model to evaluate.

  Returns:
    The result of ``evaluate_performance`` for ``X_test`` (the ranks that the
    metric helpers in this notebook consume).
  """
  eval_options = {
      'model': model,
      # Filter passed to the corruption step: the full set of known triples.
      'filter_triples': positives_filter,
      # Use the library's default evaluation protocol.
      'use_default_protocol': True,
      'verbose': True,
  }
  return evaluate_performance(X_test, **eval_options)

In [37]:
# Evaluate every model on the test set, in the same order as the metric
# summaries printed later.
(ranksRandomBaseline,
 ranksTransE,
 ranksDistMult,
 ranksComplEx,
 ranksHolE) = (getEvaluatePerformance(fitted)
               for fitted in (modelRandomBaseline, modelTransE, modelDistMult,
                              modelComplEx, modelHolE))



100%|██████████| 100/100 [00:00<00:00, 292.40it/s]




100%|██████████| 100/100 [00:00<00:00, 316.45it/s]




100%|██████████| 100/100 [00:00<00:00, 102.26it/s]




100%|██████████| 100/100 [00:00<00:00, 107.10it/s]


## Metrics

Let's compute some evaluation metrics and print them out.

We're going to use the mrr_score (mean reciprocal rank) and hits_at_n_score functions. 

- ***mrr_score***: computes the mean of the reciprocals of the elements of a vector of rankings `ranks`.
- ***hits_at_n_score***: computes the fraction of elements of a vector of rankings `ranks` that make it into the top `n` positions.


In [0]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

def viewMetrics(ranks, modelName):
    """Print MRR and Hits@10/3/1 for one model, followed by an empty line.

    Args:
        ranks: Ranks to summarise, passed unchanged to the metric functions.
        modelName: Label printed as the header line of the summary.
    """
    print(modelName)
    print("MRR: %.2f" % mrr_score(ranks))

    # Cut-offs are reported from the loosest (top 10) to the strictest (top 1).
    for template, top_n in (("Hits@10: %.2f", 10),
                            ("Hits@3: %.2f", 3),
                            ("Hits@1: %.2f", 1)):
        print(template % hits_at_n_score(ranks, n=top_n))
    print()

In [41]:
# Print the metric summary of every model, baseline first.
for _ranks, _name in ((ranksRandomBaseline, "RandomBaseline"),
                      (ranksTransE, "TransE"),
                      (ranksDistMult, "DistMult"),
                      (ranksComplEx, "ComplEx"),
                      (ranksHolE, "HolE")):
    viewMetrics(_ranks, _name)

RandomBaseline
MRR: 0.00
Hits@10: 0.00
Hits@3: 0.00
Hits@1: 0.00

TransE
MRR: 0.20
Hits@10: 0.34
Hits@3: 0.21
Hits@1: 0.12

DistMult
MRR: 0.39
Hits@10: 0.54
Hits@3: 0.43
Hits@1: 0.31

ComplEx
MRR: 0.41
Hits@10: 0.55
Hits@3: 0.45
Hits@1: 0.34

HolE
MRR: 0.53
Hits@10: 0.72
Hits@3: 0.58
Hits@1: 0.43



Now, how do we interpret those numbers? 

[Hits@N](http://docs.ampligraph.org/en/1.0.3/generated/ampligraph.evaluation.hits_at_n_score.html#ampligraph.evaluation.hits_at_n_score) indicates how often, on average, a true triple was ranked in the top-N. For example, with the best model above (HolE, Hits@3 = 0.58) the correct subject or object was ranked among the top 3 candidates 58% of the time. Which N makes the most sense depends on the application.

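The [Mean Reciprocal Rank (MRR)](http://docs.ampligraph.org/en/latest/generated/ampligraph.evaluation.mrr_score.html) is another popular metric for assessing the predictive power of a model: it is the mean of the reciprocal ranks, so it gets closer to 1 the more often true triples are ranked first.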

In [0]:
# Hand-written candidate statements, one (subject, relation, object) row each,
# that are ranked and scored by the models later in the notebook.
X_unseen = np.array([
    ("Jorah Mormont", "SPOUSE", "Daenerys Targaryen"),
    ("Tyrion Lannister", "SPOUSE", "Missandei"),
    ("King's Landing", "SEAT_OF", "House Lannister of Casterly Rock"),
    ("Sansa Stark", "SPOUSE", "Petyr Baelish"),
    ("Daenerys Targaryen", "SPOUSE", "Jon Snow"),
    ("Daenerys Targaryen", "SPOUSE", "Craster"),
    ("House Stark of Winterfell", "IN_REGION", "The North"),
    ("House Stark of Winterfell", "IN_REGION", "Dorne"),
    ("House Tyrell of Highgarden", "IN_REGION", "Beyond the Wall"),
    ("Brandon Stark", "ALLIED_WITH", "House Stark of Winterfell"),
    ("Brandon Stark", "ALLIED_WITH", "House Lannister of Casterly Rock"),
    ("Rhaegar Targaryen", "PARENT_OF", "Jon Snow"),
    ("House Hutcheson", "SWORN_TO", "House Tyrell of Highgarden"),
    ("Daenerys Targaryen", "ALLIED_WITH", "House Stark of Winterfell"),
    ("Daenerys Targaryen", "ALLIED_WITH", "House Lannister of Casterly Rock"),
    ("Jaime Lannister", "PARENT_OF", "Myrcella Baratheon"),
    ("Robert I Baratheon", "PARENT_OF", "Myrcella Baratheon"),
    ("Cersei Lannister", "PARENT_OF", "Myrcella Baratheon"),
    ("Cersei Lannister", "PARENT_OF", "Brandon Stark"),
    ("Tywin Lannister", "PARENT_OF", "Jaime Lannister"),
    ("Missandei", "SPOUSE", "Grey Worm"),
    ("Brienne of Tarth", "SPOUSE", "Jaime Lannister"),
])

In [0]:
# De-duplicated union of the known triples and the hand-written candidates; rows
# are turned into tuples so that a set can drop the duplicates.
unseen_filter = np.array(list(set(map(tuple, np.vstack((positives_filter, X_unseen))))))

In [0]:
def getEvaluatePerformanceUnseen(model):
  """Evaluate ``model`` on the hand-written candidate triples.

  Reads the notebook-level ``X_unseen`` and ``unseen_filter`` variables.

  Args:
    model: Model to evaluate.

  Returns:
    The result of ``evaluate_performance`` for ``X_unseen``.
  """
  eval_options = dict(
      model=model,
      # Filter passed to the corruption step: known triples plus the candidates.
      filter_triples=unseen_filter,
      # NOTE(review): with the default protocol switched off, 's+o' presumably
      # corrupts both sides and yields a single rank per triple (the result
      # tables zip one rank per statement) — confirm against the AmpliGraph docs.
      corrupt_side='s+o',
      use_default_protocol=False,
      verbose=True,
  )
  return evaluate_performance(X_unseen, **eval_options)

In [45]:
# Rank the candidate statements with every model.
(ranks_unseenRandomBaseline,
 ranks_unseenTransE,
 ranks_unseenDistMult,
 ranks_unseenComplEx,
 ranks_unseenHolE) = (getEvaluatePerformanceUnseen(fitted)
                      for fitted in (modelRandomBaseline, modelTransE, modelDistMult,
                                     modelComplEx, modelHolE))

100%|██████████| 22/22 [00:00<00:00, 200.61it/s]
100%|██████████| 22/22 [00:00<00:00, 222.03it/s]
100%|██████████| 22/22 [00:00<00:00, 86.55it/s]
100%|██████████| 22/22 [00:00<00:00, 91.50it/s]


In [0]:
# Raw model scores for the candidate statements, one array per model.
(scoresRandomBaseline,
 scoresTransE,
 scoresDistMult,
 scoresComplEx,
 scoresHolE) = (fitted.predict(X_unseen)
                for fitted in (modelRandomBaseline, modelTransE, modelDistMult,
                               modelComplEx, modelHolE))

In [0]:
from scipy.special import expit

# Squash each model's raw scores into (0, 1) with the logistic sigmoid (expit).
(probsRandomBaseline,
 probsTransE,
 probsDistMult,
 probsComplEx,
 probsHolE) = map(expit, (scoresRandomBaseline, scoresTransE, scoresDistMult,
                          scoresComplEx, scoresHolE))

# RandomBaseLine
Это фиктивная модель, которая выдаёт псевдослучайные оценки от 0 до 1 из равномерного распределения; она полезна, чтобы проверить, работает ли обученная модель лучше случайной. Соответственно, для RandomBaseLine получаем результат, в котором вероятность каждой из проверяемых троек находится примерно на одном уровне вне зависимости от того, истинно утверждение или нет. Поэтому данная модель имеет самые низкие показатели mrr_score и hits_at_n_score.

In [48]:
# One row per candidate statement with its rank, raw score and sigmoid-squashed
# score from the random baseline, ordered from lowest to highest score.
pd.DataFrame(
    list(zip(map(' '.join, X_unseen),
             ranks_unseenRandomBaseline,
             np.squeeze(scoresRandomBaseline),
             np.squeeze(probsRandomBaseline))),
    columns=['statement', 'rank', 'score', 'prob'],
).sort_values(by="score")

Unnamed: 0,statement,rank,score,prob
18,Cersei Lannister PARENT_OF Brandon Stark,1234,0.00101,0.500252
13,Daenerys Targaryen ALLIED_WITH House Stark of ...,3564,0.103816,0.525931
20,Missandei SPOUSE Grey Worm,3085,0.156636,0.539079
3,Sansa Stark SPOUSE Petyr Baelish,2314,0.229489,0.557122
7,House Stark of Winterfell IN_REGION Dorne,356,0.29024,0.572055
19,Tywin Lannister PARENT_OF Jaime Lannister,2755,0.386393,0.595414
8,House Tyrell of Highgarden IN_REGION Beyond th...,3832,0.491862,0.620545
5,Daenerys Targaryen SPOUSE Craster,1736,0.538885,0.631553
15,Jaime Lannister PARENT_OF Myrcella Baratheon,826,0.551878,0.634571
4,Daenerys Targaryen SPOUSE Jon Snow,539,0.577176,0.640417


# TransE
Скоринговая функция модели Translating Embeddings — это взятая со знаком минус L1- или L2-норма вектора, полученного сложением эмбеддинга субъекта с эмбеддингом предиката и вычитанием эмбеддинга объекта: $f = -\lVert \mathbf{e}_{s} + \mathbf{r}_{p} - \mathbf{e}_{o} \rVert_{n}$ (поэтому все оценки в таблице ниже отрицательны). Эта модель показывает самые низкие значения mrr_score и hits_at_n_score, за исключением RandomBaseLine.



In [49]:
# One row per candidate statement with its rank, raw score and sigmoid-squashed
# score from TransE, ordered from lowest to highest score.
pd.DataFrame(
    list(zip(map(' '.join, X_unseen),
             ranks_unseenTransE,
             np.squeeze(scoresTransE),
             np.squeeze(probsTransE))),
    columns=['statement', 'rank', 'score', 'prob'],
).sort_values(by="score")

Unnamed: 0,statement,rank,score,prob
5,Daenerys Targaryen SPOUSE Craster,3307,-34.936989,6.715194e-16
10,Brandon Stark ALLIED_WITH House Lannister of C...,1647,-32.715092,6.194642e-15
7,House Stark of Winterfell IN_REGION Dorne,1890,-31.759661,1.610477e-14
9,Brandon Stark ALLIED_WITH House Stark of Winte...,1214,-30.22847,7.446331e-14
1,Tyrion Lannister SPOUSE Missandei,2215,-29.230003,2.021021e-13
3,Sansa Stark SPOUSE Petyr Baelish,962,-28.080956,6.3767e-13
4,Daenerys Targaryen SPOUSE Jon Snow,1261,-27.729847,9.059005e-13
12,House Hutcheson SWORN_TO House Tyrell of Highg...,753,-27.111053,1.681973e-12
8,House Tyrell of Highgarden IN_REGION Beyond th...,371,-26.379459,3.495798e-12
14,Daenerys Targaryen ALLIED_WITH House Lannister...,822,-26.067791,4.774218e-12


# DistMult
Скоринговая функция — трилинейное скалярное произведение: $f = \langle \mathbf{r}_{p}, \mathbf{e}_{s}, \mathbf{e}_{o} \rangle$. Модель показывает неплохие результаты, что наглядно видно по таблице ниже, где утверждения получили более или менее похожее на правду упорядочивание по вероятности появления связи, однако некоторые верные утверждения получили низкую вероятность возникновения.

In [50]:
# One row per candidate statement with its rank, raw score and sigmoid-squashed
# score from DistMult, ordered from lowest to highest score.
pd.DataFrame(
    list(zip(map(' '.join, X_unseen),
             ranks_unseenDistMult,
             np.squeeze(scoresDistMult),
             np.squeeze(probsDistMult))),
    columns=['statement', 'rank', 'score', 'prob'],
).sort_values(by="score")

Unnamed: 0,statement,rank,score,prob
10,Brandon Stark ALLIED_WITH House Lannister of C...,3957,-3.151287,0.041041
7,House Stark of Winterfell IN_REGION Dorne,3896,-1.646955,0.161521
18,Cersei Lannister PARENT_OF Brandon Stark,3935,-1.372229,0.20226
9,Brandon Stark ALLIED_WITH House Stark of Winte...,3197,-1.072514,0.254925
2,King's Landing SEAT_OF House Lannister of Cast...,3577,-0.986115,0.27168
8,House Tyrell of Highgarden IN_REGION Beyond th...,3174,-0.568788,0.361517
15,Jaime Lannister PARENT_OF Myrcella Baratheon,2735,-0.166903,0.458371
1,Tyrion Lannister SPOUSE Missandei,1767,0.072192,0.51804
11,Rhaegar Targaryen PARENT_OF Jon Snow,1373,0.136841,0.534157
19,Tywin Lannister PARENT_OF Jaime Lannister,1226,0.21206,0.552817


# Complex embeddings
Модель имеет несколько лучшие показатели mrr_score и hits_at_n_score, чем DistMult, что логично, так как она является её расширением за счёт использования trilinear Hermitian dot product в пространстве комплексных чисел. То есть используется аналог скоринговой функции прошлой модели, но в комплексном пространстве. Как мы видим, результаты в таблице также лучше соответствуют действительности.

In [51]:
# One row per candidate statement with its rank, raw score and sigmoid-squashed
# score from ComplEx, ordered from lowest to highest score.
pd.DataFrame(
    list(zip(map(' '.join, X_unseen),
             ranks_unseenComplEx,
             np.squeeze(scoresComplEx),
             np.squeeze(probsComplEx))),
    columns=['statement', 'rank', 'score', 'prob'],
).sort_values(by="score")

Unnamed: 0,statement,rank,score,prob
10,Brandon Stark ALLIED_WITH House Lannister of C...,4017,-3.814219,0.021579
18,Cersei Lannister PARENT_OF Brandon Stark,4083,-1.994111,0.119823
9,Brandon Stark ALLIED_WITH House Stark of Winte...,2995,-0.74719,0.321434
1,Tyrion Lannister SPOUSE Missandei,3389,-0.740519,0.322891
21,Brienne of Tarth SPOUSE Jaime Lannister,3493,-0.702409,0.331278
5,Daenerys Targaryen SPOUSE Craster,3319,-0.702271,0.331309
15,Jaime Lannister PARENT_OF Myrcella Baratheon,2943,-0.206157,0.448642
0,Jorah Mormont SPOUSE Daenerys Targaryen,2450,-0.20187,0.449703
8,House Tyrell of Highgarden IN_REGION Beyond th...,2155,-0.131669,0.46713
2,King's Landing SEAT_OF House Lannister of Cast...,1724,-0.015005,0.496249


# Holographic Embeddings
Скоринговая функция данной модели выражается через функцию модели ComplEx следующим образом: $f_{HolE} = \frac{2}{n} \, f_{ComplEx}$. Скоринговые функции этих двух моделей прямо пропорциональны; для наших данных результаты, показываемые последними двумя моделями, довольно близки, однако в ряде случаев качество этих двух моделей может значительно отличаться, что описано в статье «Complex and Holographic Embeddings of Knowledge Graphs: A Comparison».

In [52]:
# One row per candidate statement with its rank, raw score and sigmoid-squashed
# score from HolE, ordered from lowest to highest score.
pd.DataFrame(
    list(zip(map(' '.join, X_unseen),
             ranks_unseenHolE,
             np.squeeze(scoresHolE),
             np.squeeze(probsHolE))),
    columns=['statement', 'rank', 'score', 'prob'],
).sort_values(by="score")

Unnamed: 0,statement,rank,score,prob
10,Brandon Stark ALLIED_WITH House Lannister of C...,4010,-3.461155,0.030438
18,Cersei Lannister PARENT_OF Brandon Stark,3935,-0.496292,0.378412
5,Daenerys Targaryen SPOUSE Craster,2913,-0.25538,0.4365
8,House Tyrell of Highgarden IN_REGION Beyond th...,2904,-0.148639,0.462908
1,Tyrion Lannister SPOUSE Missandei,2679,-0.133798,0.4666
11,Rhaegar Targaryen PARENT_OF Jon Snow,1939,0.002664,0.500666
4,Daenerys Targaryen SPOUSE Jon Snow,1214,0.034547,0.508636
21,Brienne of Tarth SPOUSE Jaime Lannister,1564,0.036938,0.509233
15,Jaime Lannister PARENT_OF Myrcella Baratheon,1346,0.045614,0.511402
2,King's Landing SEAT_OF House Lannister of Cast...,811,0.300612,0.574592


# Выводы
Как мы можем заметить, результаты работы моделей не слишком разнятся, однако мы всё же можем выделить модель, показавшую себя хуже всего (TransE), и модель, показавшую лучшие результаты на наших данных (Holographic Embeddings). Однако все они информативны, так как работают заметно лучше фиктивной модели, использующей случайный скоринг.