In [None]:
import numpy as np
import pandas as pd

In [None]:
# Line magic (presumably the Google Colab one — confirm) that selects the
# TensorFlow 1.x series; it runs before TensorFlow is imported in the next cell.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
import tensorflow as tf 

# Report the TensorFlow build that was actually loaded, to confirm the 1.x selection took effect.
print('TensorFlow  version: {}'.format(tf.__version__))

TensorFlow  version: 1.15.2


In [None]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

In [None]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    """Print the aggregate ranking metrics for a set of ranks.

    Prints, one per line and in this order: the mean rank, the mean
    reciprocal rank, and Hits@N for N = 1, 10 and 100.

    Parameters
    ----------
    ranks
        The ranks to summarise; passed unchanged to ``mr_score``,
        ``mrr_score`` and ``hits_at_n_score``.

    Returns
    -------
    None
    """
    # Each entry pairs the printed label with a deferred computation, so every
    # metric is evaluated immediately before its own line is printed.
    report = (
        ('Mean Rank:', lambda: mr_score(ranks)),
        ('Mean Reciprocal Rank:', lambda: mrr_score(ranks)),
        ('Hits@1:', lambda: hits_at_n_score(ranks, 1)),
        ('Hits@10:', lambda: hits_at_n_score(ranks, 10)),
        ('Hits@100:', lambda: hits_at_n_score(ranks, 100)),
    )
    for label, compute in report:
        print(label, compute())

# Record the installed AmpliGraph release next to the results.
print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


# Loading data and creating training, validation and test splits

Let's use the [`train_test_split_no_unseen`](https://docs.ampligraph.org/en/1.3.1/generated/ampligraph.evaluation.train_test_split_no_unseen.html?#train-test-split-no-unseen) function provided by Ampligraph to create the training, validation and test splits. 

This API ensures that the test and validation splits contain only triples whose entities are "seen" during training. 

In [None]:
# Read the raw triples from the .npy file.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so this file
# must come from a trusted source.
# NOTE(review): the directory name contains a space before the slash
# ('Knowledge_Graph /') — confirm it matches the folder on disk.
data = np.load('Knowledge_Graph /movielens_triplets.npy', allow_pickle=True)

# Label the three columns and convert every cell to str in one chained expression.
dataset = (
    pd.DataFrame(data, columns=['subject', 'predicate', 'object'])
    .applymap(str)
)

# Preview the first five triples.
dataset.head(5)

Unnamed: 0,subject,predicate,object
0,Toy Story,Released in,1995
1,Jumanji,Released in,1995
2,Grumpier Old Men,Released in,1995
3,Waiting to Exhale,Released in,1995
4,Father of the Bride Part II,Released in,1995


In [None]:
# (rows, columns) of the full triple table: one row per triple, three columns.
dataset.shape

(2028829, 3)

In [None]:
# create the train/valid/test splits using the train_test_split_no_unseen API
# (model training and evaluation happen in the later, per-model cells)
from ampligraph.evaluation import train_test_split_no_unseen
# hold out 5000 triples as the validation set
test_train, X_valid = train_test_split_no_unseen(dataset.values, 5000, seed=0)

# hold out 20000 triples as the test set from the remaining triples; the rest becomes the training set
X_train, X_test = train_test_split_no_unseen(test_train, 20000, seed=0)

In [None]:
# Sanity check: length of the object column of the training split, i.e. the
# number of training triples left after both hold-outs.
X_train[:,2].shape

(2003829,)

In [None]:
def _entities(triples):
    """Return the set of distinct values in the subject (0) and object (2) columns."""
    return set(np.unique(triples[:, 0])).union(np.unique(triples[:, 2]))


# For each split (train, valid, test): print the shape of the unique values in
# every column, then the size of the subject/object union. A '#########' line
# separates consecutive splits.
for index, triples in enumerate((X_train, X_valid, X_test)):
    if index > 0:
        print('#########')
    for column in (0, 1, 2):
        print(np.unique(triples[:, column]).shape)
    print(len(_entities(triples)))

# Distinct subject/object values appearing in the validation and test splits combined.
print(len(_entities(X_valid).union(_entities(X_test))))

(9869,)
(3663,)
(3782,)
9896
#########
(2713,)
(1293,)
(1271,)
3971
#########
(4750,)
(2241,)
(2344,)
6998
7450


In [None]:
# Number of distinct values across the predicate and subject columns combined.
# NOTE(review): this unions predicates with subjects; if the intent was to count
# distinct entities, the union should presumably be subject with object — confirm.
print(len(set(dataset.predicate).union(dataset.subject)))

13533


### TransE

In [None]:
# TransE: configure, train on the training split, then rank the test triples.
model = TransE(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.0001},
    seed=0,
    batches_count=100,
    verbose=True,
)
model.fit(X_train)

# Already imported in the setup cell; kept so this cell also runs on its own.
from ampligraph.utils import save_model, restore_model

# The filter handed to the evaluator is every triple from all three splits.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Rank the held-out test triples with the trained model.
ranks = evaluate_performance(X_test, model=model, filter_triples=X_filter)

# Ranks are computed per test triple; print the split sizes next to the ranks shape.
for label, triples in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
    ('Size of ranks:', ranks),
):
    print(label, triples.shape)

display_aggregate_metrics(ranks)


Average TransE Loss:   0.059722: 100%|██████████| 100/100 [07:12<00:00,  4.32s/epoch]
100%|██████████| 20000/20000 [03:27<00:00, 96.34it/s] 


Total triples: (2028829, 3)
Size of train: (2003829, 3)
Size of valid: (5000, 3)
Size of test: (20000, 3)
Size of ranks: (20000, 2)
Mean Rank: 1568.195425
Mean Reciprocal Rank: 0.1424676283072877
Hits@1: 0.07945
Hits@10: 0.245625
Hits@100: 0.36035


### DistMult

In [None]:
# DistMult: configure, train on the training split, then rank the test triples.
model = DistMult(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    batches_count=100,
    verbose=True,
)
model.fit(X_train)

# The filter handed to the evaluator is every triple from all three splits.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Corrupt both subject and object sides; ties are ranked with the 'worst' strategy.
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)

# Ranks are computed per test triple; print the split sizes next to the ranks shape.
for label, triples in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
    ('Size of ranks:', ranks),
):
    print(label, triples.shape)

display_aggregate_metrics(ranks)


Average DistMult Loss:   0.035177: 100%|██████████| 100/100 [08:05<00:00,  4.86s/epoch]
100%|██████████| 20000/20000 [03:12<00:00, 103.79it/s]


Total triples: (2028829, 3)
Size of train: (2003829, 3)
Size of valid: (5000, 3)
Size of test: (20000, 3)
Size of ranks: (20000, 2)
Mean Rank: 1021.1678
Mean Reciprocal Rank: 0.16830689257961962
Hits@1: 0.09715
Hits@10: 0.2794
Hits@100: 0.39375


In [None]:
# Shapes of the trained embedding matrices. The second line reports rel_emb,
# so it is labelled as relation embeddings rather than entity embeddings.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)

Size of entity embeddings: (9896, 300)
Size of entity embeddings: (3663, 300)


In [None]:
# Number of distinct values in the subject column of the training split.
len(np.unique(X_train[:,0]))

9869

### ComplEx

In [None]:
# ComplEx: configure, train on the training split, then rank the test triples.
model = ComplEx(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    batches_count=100,
    verbose=True,
)
model.fit(X_train)

# The filter handed to the evaluator is every triple from all three splits.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Corrupt both subject and object sides; ties are ranked with the 'worst' strategy.
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)

# Ranks are computed per test triple; print the split sizes next to the ranks shape.
for label, triples in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
    ('Size of ranks:', ranks),
):
    print(label, triples.shape)

display_aggregate_metrics(ranks)

Average ComplEx Loss:   0.027870: 100%|██████████| 100/100 [34:50<00:00, 20.90s/epoch]
100%|██████████| 20000/20000 [17:22<00:00, 19.18it/s]


Total triples: (2028829, 3)
Size of train: (2003829, 3)
Size of valid: (5000, 3)
Size of test: (20000, 3)
Size of ranks: (20000, 2)
Mean Rank: 1081.8704
Mean Reciprocal Rank: 0.16274188244918117
Hits@1: 0.09535
Hits@10: 0.27
Hits@100: 0.360525


In [None]:
# Shapes of the trained embedding matrices. The second line reports rel_emb,
# so it is labelled as relation embeddings rather than entity embeddings.
# NOTE(review): the recorded width is 600 although k=300 — presumably the real
# and imaginary parts are stored side by side; confirm against the ComplEx docs.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)

Size of entity embeddings: (9896, 600)
Size of entity embeddings: (3663, 600)


### HolE

In [None]:
# HolE: configure, train on the training split, then rank the test triples.
model = HolE(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    batches_count=100,
    verbose=True,
)
model.fit(X_train)

# The filter handed to the evaluator is every triple from all three splits.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Corrupt both subject and object sides; ties are ranked with the 'worst' strategy.
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)

# Ranks are computed per test triple; print the split sizes next to the ranks shape.
for label, triples in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
    ('Size of ranks:', ranks),
):
    print(label, triples.shape)

display_aggregate_metrics(ranks)

Average HolE Loss:   0.073980: 100%|██████████| 100/100 [35:12<00:00, 21.13s/epoch]
100%|██████████| 20000/20000 [11:51<00:00, 28.12it/s]


Total triples: (2028829, 3)
Size of train: (2003829, 3)
Size of valid: (5000, 3)
Size of test: (20000, 3)
Size of ranks: (20000, 2)
Mean Rank: 1001.13915
Mean Reciprocal Rank: 0.18144388934051395
Hits@1: 0.108625
Hits@10: 0.297
Hits@100: 0.416925


## Convolutional models



### ConvKB 

In [None]:


# ConvKB: configure, train on the training split, then rank the test triples.
model = ConvKB(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    # Goes OOM (ResourceExhaustedError) if batch count is 1
    batches_count=100,
    verbose=True,
)
model.fit(X_train)

# The filter handed to the evaluator is every triple from all three splits.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Corrupt both subject and object sides; ties are ranked with the 'worst' strategy.
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)

# Ranks are computed per test triple; print the split sizes next to the ranks shape.
for label, triples in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
    ('Size of ranks:', ranks),
):
    print(label, triples.shape)

display_aggregate_metrics(ranks)

In [None]:
# Re-run the evaluation of the current `model` on the test split and print the
# aggregate metrics.
# NOTE(review): this repeats the filter construction and evaluate_performance
# call that close the ConvKB training cell — confirm whether it is still needed.
X_filter = np.concatenate([X_train, X_valid, X_test], 0)
ranks = evaluate_performance(X_test, 
                             model=model,
                             filter_triples=X_filter,
                             corrupt_side='s,o',
                             ranking_strategy='worst')
display_aggregate_metrics(ranks)

In [None]:
# Shapes of the trained embedding matrices. The second line reports rel_emb,
# so it is labelled as relation embeddings rather than entity embeddings.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)

### ConvE

In [None]:
# NOTE(review): the split below repeats the earlier split cell with the same
# sizes and seed, so it presumably reproduces the same X_train/X_valid/X_test —
# confirm whether it is needed here.
from ampligraph.evaluation import train_test_split_no_unseen
# hold out 5000 triples as the validation set
test_train, X_valid = train_test_split_no_unseen(dataset.values, 5000, seed=0)

# hold out 20000 triples as the test set from the remaining triples
X_train, X_test = train_test_split_no_unseen(test_train, 20000, seed=0)
# ConvE is configured with k=150, the 'bce' loss and 20 batches; unlike the
# other models in this notebook, no eta is passed.
model = ConvE(k=150, epochs=100, loss='bce', 
                initializer='xavier', initializer_params={'uniform': False},
                regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
                optimizer= 'adam', optimizer_params= {'lr': 0.001}, 
                seed= 0, batches_count= 20, verbose=True)

model.fit(X_train)
# The filter handed to the evaluator is every triple from all three splits.
X_filter = np.concatenate([X_train, X_valid, X_test], 0)
# Only the object side is corrupted here ('o'); the other models in this
# notebook are evaluated with 's,o'.
ranks = evaluate_performance(X_test, 
                             model=model,
                             filter_triples=X_filter,
                             corrupt_side='o',
                             ranking_strategy='worst')
# ranks are computed per triple
print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)
print('Size of ranks:', ranks.shape)
display_aggregate_metrics(ranks)

In [None]:
# Shapes of the trained embedding matrices. The second line reports rel_emb,
# so it is labelled as relation embeddings rather than entity embeddings.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)