# Some notes and references

Ampligraph uses a skip gram w2v model for its embeddings
(good explanation of skip gram - https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-skip-gram.html and this - https://towardsdatascience.com/skip-gram-nlp-context-words-prediction-algorithm-5bbf34f84e0c and this - https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314)

w2v - https://medium.datadriveninvestor.com/word2vec-skip-gram-model-explained-383fa6ddc4ae

Ampligraph blog - https://medium.com/featurepreneur/ampligraph-what-is-it-8b243800818c

Tutorials from https://github.com/Accenture/AmpliGraph/blob/master/docs/tutorials/AmpliGraphBasicsTutorial.ipynb

# Import packages

In [None]:
!python --version #this script will try it with Ampligraph 2.0.0 so python version is 3.10.10

In [None]:
import tensorflow

In [None]:
print(tensorflow.__version__) # AmpliGraph 1.x needed TensorFlow 1.15 or lower; the AmpliGraph 2.x setup used here reports 2.9.0

In [None]:
import ampligraph
ampligraph.__version__
#should be 2.0.0 

In [None]:
import numpy as np
import pandas as pd

import re
import string

# Import triples 

These were generated from SNOMED_CT instance on CKG using terms from the pain lexicon

In [None]:
df_triples = pd.read_csv("all_parent_child_for_kge.csv")

In [None]:
df_triples.head()

Note: 'burn' may seem irrelevant, but it was picked up in order to capture 'burning pain'. It will be of use later on
because some of the gold standard annotations (sentences from CRIS) also pick up 'burn' in phrases such as burn injury or
burning things; their label in the classification will be 0 because they are not related to pain,
so leave it as is.

In [None]:
# Drop the index column that was written out with the CSV.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second run is a no-op instead of raising KeyError.
df_triples = df_triples.drop(columns='Unnamed: 0', errors='ignore')
df_triples.tail()

In [None]:
#Common text preprocessing
text = "   This is a message to be cleaned. 92. It may involve some things like: *+ {[<br>]}, ?, :, ''  adjacent spaces and tabs     .  "

#convert to lowercase and remove punctuations and characters and then strip
def preprocess(text):
    """Normalise a term string so that equal terms compare equal.

    Steps: lowercase, delete every character that is neither a word character
    (letter, digit, underscore) nor whitespace, collapse each whitespace run to a
    single space, and strip leading/trailing whitespace.

    The final strip matters: removing punctuation at either end of the string
    (e.g. "pain -") can expose a space that an earlier strip would not catch,
    and "pain " and "pain" would then be treated as different entities.

    Digits are kept on purpose, and markup such as "<br>" only loses its
    punctuation (it becomes "br").

    Parameters
    ----------
    text : str
        Raw term text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation / symbols
    text = re.sub(r'\s+', ' ', text)     # tabs, newlines and repeated spaces -> one space
    return text.strip()

text=preprocess(text)
print(text)  #text is a string

In [None]:
# Run the shared text cleaner over each component of every triple.
df_triples["subject"] = df_triples["subject"].map(preprocess)
df_triples["predicate"] = df_triples["predicate"].map(preprocess)
df_triples["object"] = df_triples["object"].map(preprocess)

In [None]:
df_triples.tail()

In [None]:
df_triples.describe() #top does not mean the top triple, it is the top for each category

# Variation 1

In [None]:
# Turn the DataFrame of triples into a plain Python list (one inner list per row).

triples = df_triples.to_numpy().tolist()

In [None]:
# Preview only the first few triples; echoing the whole list floods the notebook output.
triples[:5]

In [None]:
# Report the container type and how many triples it holds.

print(f"type is:  {type(triples)}")
print(f"length is:  {len(triples)}")

In [None]:
def removeDuplicates(triples):
    """Return the distinct triples as a list of tuples, in first-seen order.

    Each triple is converted to a tuple so it can be hashed. Deduplicating via
    dict keys (insertion-ordered) rather than a set keeps the result order
    deterministic; set iteration order for strings changes between interpreter
    sessions, which would make every later train/test split unrepeatable.

    Parameters
    ----------
    triples : iterable of sequences
        The triples to deduplicate.

    Returns
    -------
    list of tuple
        Unique triples, ordered by first occurrence.
    """
    return list(dict.fromkeys(tuple(triple) for triple in triples))
         

triples = removeDuplicates(triples)

Defining the train and test sets

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Hold out 20% of the triples as the test set.
n = round(0.20 * len(triples))

X_train, X_valid = train_test_split_no_unseen(np.asarray(triples), test_size=n)

In [None]:
# Shapes of the two splits (X_valid is the held-out test set).
print(f"Train set size:  {X_train.shape}")
print(f"Test set size:  {X_valid.shape}")

Train the model

In [None]:
#from ampligraph.latent_features import ComplEx # this is for ampligraph 1.4
from ampligraph.latent_features import ScoringBasedEmbeddingModel #this is for ampligraph 2.0

In [None]:
# Instantiate and compile the ComplEx model (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 equivalent is kept at the bottom as comments instead of bare
# string literals, so the cell no longer echoes a stray string as its output.

from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from tensorflow.keras.optimizers import Adam

model = ScoringBasedEmbeddingModel(k=150,  # embedding size
                                   eta=10,
                                   scoring_type='ComplEx',  # alternative: 'TransE'
                                   seed=555)

optimizer = Adam(learning_rate=1e-3)
loss = get_loss('multiclass_nll')  # passing {'margin': 5} reduced performance
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})

model.compile(loss=loss,
              optimizer=optimizer,
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')  # including or excluding this did not make a difference on performance

# AmpliGraph 1.4 equivalent:
# model = ComplEx(batches_count=100,
#                 seed=555,
#                 epochs=10,
#                 k=150,
#                 eta=10,
#                 optimizer='adam',
#                 optimizer_params={'lr':1e-3},
#                 loss='multiclass_nll',
#                 regularizer='LP',
#                 regularizer_params={'p':3, 'lambda':1e-5},
#                 verbose=True)

In [None]:
# Fit the model (AmpliGraph 2.0 API).

import tensorflow as tf
# tf.logging.set_verbosity(tf.logging.ERROR)

# The result of fit() is kept in a variable so that it is not echoed as the cell output.
history = model.fit(X_train,
                    batch_size=5000,  # this improved performance
                    epochs=200,  # this improved performance
                    verbose=True)

# AmpliGraph 1.4 equivalent:
# model.fit(X_train, early_stopping = False)


In [None]:
# Confirm that training actually ran.

print('The model is fit!' if model.is_fitted else 'The model is not fit! Did you skip a step?')

Evaluate the model

In [None]:
# These triples are used as a filter because they are the known true triples: negatives are the
# corrupted triples generated by the algorithm (false combinations of triples), and a corruption
# that happens to be a true triple must not be counted as a negative.

positives_filter = {'test' : np.concatenate([X_train, X_valid])} # this is for ampligraph 2.0

#filter_triples = np.concatenate((X_train, X_valid)) # this is for ampligraph 1.4



arguments: 

X - the data to evaluate on. We're going to use our test set to evaluate.

model - the model we previously trained.

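filter_triples (AmpliGraph 1.4; the AmpliGraph 2.0 call below passes `use_filter` instead) - filters out the false negatives generated by the corruption strategy, i.e. corrupted triples that are actually true.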

use_default_protocol (AmpliGraph 1.4; the AmpliGraph 2.0 call below uses `corrupt_side='s,o'` instead) - specifies whether to use the default corruption protocol. If True, then subj and obj are corrupted separately during evaluation.

verbose - will give some nice log statements. Let's leave it on for now.

In [None]:
# Evaluate on the held-out split (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 call is kept as comments rather than a bare string literal,
# which used to be echoed as this cell's output.
ranks = model.evaluate(X_valid,
                       use_filter=positives_filter,  # Corruption strategy filter defined above
                       corrupt_side='s,o',  # corrupt subj and obj separately while evaluating
                       verbose=True)

# AmpliGraph 1.4 equivalent:
# from ampligraph.evaluation import evaluate_performance
#
# ranks = evaluate_performance(X_valid,
#                              model=model,
#                              filter_triples=filter_triples,
#                              use_default_protocol=True,
#                              verbose=True)

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Mean reciprocal rank and Hits@N computed from the evaluation ranks.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print(f"MRR: {mrr:.2f}")
print(f"Hits@10: {hits_10:.5f}")
print(f"Hits@3: {hits_3:.5f}")
print(f"Hits@1: {hits_1:.5f}")



K Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

# Define the number of folds for k-fold cross-validation
knum = 10  # You can change this value to any desired number of folds

In [None]:
# Initialize an empty list to store the evaluation results
evaluation_results = []

In [None]:
# Create a k-fold cross-validator
kf = KFold(n_splits=knum, shuffle=True, random_state=42)

In [None]:
# K-fold cross-validation of the ComplEx model on the training split.

# Start from an empty list every time this cell runs. The list is shared with the other
# cross-validation cells of this notebook, so without the reset, folds from earlier runs or
# from other models would be mixed into the summary statistics computed afterwards.
evaluation_results = []

for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}/{knum}...")

    # Split the data into training and validation sets for this fold
    # NOTE(review): a plain KFold split does not guarantee that every entity in the validation
    # fold also occurs in the training fold - confirm evaluate() copes with unseen entities.
    X_train_fold = X_train[train_index]
    X_valid_fold = X_train[valid_index]

    # Fresh ComplEx model for this fold (here k is embedding size)
    model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type='ComplEx', seed=555)

    # Give every fold its own optimizer, rebuilt from the configuration of the notebook-level
    # `optimizer`, so that optimizer state accumulated in a previous fold is not carried over.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Compile the model before training
    model.compile(optimizer=fold_optimizer, loss=loss, entity_relation_regularizer=regularizer, entity_relation_initializer='glorot_uniform')

    # Fit the model to the training data
    model.fit(X_train_fold, batch_size=5000, epochs=200, verbose=True)

    # Known true triples of this fold, passed as the corruption filter
    positives_filter = {'test' : np.concatenate([X_train_fold, X_valid_fold])}

    # Evaluate the model on the validation data
    ranks = model.evaluate(X_valid_fold, use_filter=positives_filter, corrupt_side='s,o', verbose=True)

    mrr = np.mean(mrr_score(ranks))
    hits_10 = np.mean(hits_at_n_score(ranks, n=10))
    hits_3 = np.mean(hits_at_n_score(ranks, n=3))
    hits_1 = np.mean(hits_at_n_score(ranks, n=1))

    # Store the evaluation results for this fold
    evaluation_results.append({'Hits@10': hits_10, 'Hits@3': hits_3, 'Hits@1': hits_1, 'mrr': mrr})


In [None]:
# Aggregate the per-fold scores: mean and standard deviation of every metric.
hits_1_mean = np.mean([fold_scores['Hits@1'] for fold_scores in evaluation_results])
hits_1_std = np.std([fold_scores['Hits@1'] for fold_scores in evaluation_results])

hits_3_mean = np.mean([fold_scores['Hits@3'] for fold_scores in evaluation_results])
hits_3_std = np.std([fold_scores['Hits@3'] for fold_scores in evaluation_results])

hits_10_mean = np.mean([fold_scores['Hits@10'] for fold_scores in evaluation_results])
hits_10_std = np.std([fold_scores['Hits@10'] for fold_scores in evaluation_results])

mrr_mean = np.mean([fold_scores['mrr'] for fold_scores in evaluation_results])
mrr_std = np.std([fold_scores['mrr'] for fold_scores in evaluation_results])

# Report each metric as "mean ± std".
print("\nMean Evaluation Metrics:")
print(f"Hits@1: {hits_1_mean} ± {hits_1_std}")
print(f"Hits@3: {hits_3_mean} ± {hits_3_std}")
print(f"Hits@10: {hits_10_mean} ± {hits_10_std}")
print(f"MRR: {mrr_mean} ± {mrr_std}")

Saving and restoring the model

In [None]:
#from ampligraph.latent_features import save_model, restore_model #this is for ampligraph 1.4

from ampligraph.utils import save_model, restore_model #this is for ampligraph 2.0

In [None]:
# NOTE(review): `model` is whichever model was fitted last; if the cross-validation cell above
# was run, that is the final fold's model - confirm this is the one intended for saving.
save_model(model, 'models/complex_model_mar23_variation1_kfold.pkl') 

In [None]:
#model = restore_model('models/complex_model_mar23_variation1_kfold.pkl')

Repeat for TransE

In [None]:
# Instantiate and compile the TransE model (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 equivalent is kept at the bottom as comments instead of bare
# string literals, so the cell no longer echoes a stray string as its output.

from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from tensorflow.keras.optimizers import Adam

model = ScoringBasedEmbeddingModel(k=150,  # embedding size
                                   eta=10,
                                   scoring_type='TransE',  # alternative: 'ComplEx'
                                   seed=555)

optimizer = Adam(learning_rate=1e-3)
loss = get_loss('pairwise' , {'margin': 5})
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})

model.compile(loss=loss,
              optimizer=optimizer,
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')  # including or excluding this did not make a difference on performance

# AmpliGraph 1.4 equivalent:
# model = TransE(batches_count=100,
#                seed=555,
#                epochs=10,
#                k=150,
#                loss='pairwise',
#                loss_params={'margin':5},
#                verbose=True)

In [None]:
# Fit the model (AmpliGraph 2.0 API).

#import tensorflow as tf
#tf.logging.set_verbosity(tf.logging.ERROR)

# The result of fit() is kept in a variable so that it is not echoed as the cell output.
history = model.fit(X_train,
                    batch_size=5000,  # this improved performance
                    epochs=200,  # this improved performance
                    verbose=True)

# AmpliGraph 1.4 equivalent:
# model.fit(X_train, early_stopping = False)

In [None]:
# Confirm that training actually ran.

print('The model is fit!' if model.is_fitted else 'The model is not fit! Did you skip a step?')

In [None]:
# Train and held-out triples together form the set of known true triples used as the corruption filter.
positives_filter = {'test' : np.concatenate([X_train, X_valid])} # this is for ampligraph 2.0

#filter_triples = np.concatenate((X_train, X_valid)) # this is for ampligraph 1.4

In [None]:


# Evaluate on the held-out split (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 call is kept as comments rather than a bare string literal,
# which used to be echoed as this cell's output.
ranks = model.evaluate(X_valid,
                       use_filter=positives_filter,  # Corruption strategy filter defined above
                       corrupt_side='s,o',  # corrupt subj and obj separately while evaluating
                       verbose=True)

# AmpliGraph 1.4 equivalent:
# from ampligraph.evaluation import evaluate_performance
#
# ranks = evaluate_performance(X_valid,
#                              model=model,
#                              filter_triples=filter_triples,
#                              use_default_protocol=True,
#                              verbose=True)



In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Mean reciprocal rank and Hits@N computed from the evaluation ranks.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print(f"MRR: {mrr:.2f}")
print(f"Hits@10: {hits_10:.5f}")
print(f"Hits@3: {hits_3:.5f}")
print(f"Hits@1: {hits_1:.5f}")

K fold Cross Validation

In [None]:
# K-fold cross-validation of the TransE model on the training split.
# `kf` and `knum` come from the earlier cross-validation setup cells; `loss`, `regularizer`
# and `optimizer` are whatever the most recently run model cell assigned.

# Start from an empty list every time this cell runs. The list is shared with the other
# cross-validation cells of this notebook, so without the reset, folds from earlier runs or
# from other models would be mixed into the summary statistics computed afterwards.
evaluation_results = []

for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}/{knum}...")

    # Split the data into training and validation sets for this fold
    # NOTE(review): a plain KFold split does not guarantee that every entity in the validation
    # fold also occurs in the training fold - confirm evaluate() copes with unseen entities.
    X_train_fold = X_train[train_index]
    X_valid_fold = X_train[valid_index]

    # Fresh TransE model for this fold (here k is embedding size)
    model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type='TransE', seed=555)

    # Give every fold its own optimizer, rebuilt from the configuration of the notebook-level
    # `optimizer`, so that optimizer state accumulated in a previous fold is not carried over.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Compile the model before training
    model.compile(optimizer=fold_optimizer, loss=loss, entity_relation_regularizer=regularizer, entity_relation_initializer='glorot_uniform')

    # Fit the model to the training data
    model.fit(X_train_fold, batch_size=5000, epochs=200, verbose=True)

    # Known true triples of this fold, passed as the corruption filter
    positives_filter = {'test' : np.concatenate([X_train_fold, X_valid_fold])}

    # Evaluate the model on the validation data
    ranks = model.evaluate(X_valid_fold, use_filter=positives_filter, corrupt_side='s,o', verbose=True)

    mrr = np.mean(mrr_score(ranks))
    hits_10 = np.mean(hits_at_n_score(ranks, n=10))
    hits_3 = np.mean(hits_at_n_score(ranks, n=3))
    hits_1 = np.mean(hits_at_n_score(ranks, n=1))

    # Store the evaluation results for this fold
    evaluation_results.append({'Hits@10': hits_10, 'Hits@3': hits_3, 'Hits@1': hits_1, 'mrr': mrr})


In [None]:
# Aggregate the per-fold scores: mean and standard deviation of every metric.
hits_1_mean = np.mean([fold_scores['Hits@1'] for fold_scores in evaluation_results])
hits_1_std = np.std([fold_scores['Hits@1'] for fold_scores in evaluation_results])

hits_3_mean = np.mean([fold_scores['Hits@3'] for fold_scores in evaluation_results])
hits_3_std = np.std([fold_scores['Hits@3'] for fold_scores in evaluation_results])

hits_10_mean = np.mean([fold_scores['Hits@10'] for fold_scores in evaluation_results])
hits_10_std = np.std([fold_scores['Hits@10'] for fold_scores in evaluation_results])

mrr_mean = np.mean([fold_scores['mrr'] for fold_scores in evaluation_results])
mrr_std = np.std([fold_scores['mrr'] for fold_scores in evaluation_results])

# Report each metric as "mean ± std".
print("\nMean Evaluation Metrics:")
print(f"Hits@1: {hits_1_mean} ± {hits_1_std}")
print(f"Hits@3: {hits_3_mean} ± {hits_3_std}")
print(f"Hits@10: {hits_10_mean} ± {hits_10_std}")
print(f"MRR: {mrr_mean} ± {mrr_std}")

In [None]:
# NOTE(review): `model` is whichever model was fitted last; if the cross-validation cell above
# was run, that is the final fold's model - confirm this is the one intended for saving.
save_model(model, 'models/transe_model_mar23_variation1_kfold.pkl')

In [None]:
# model = restore_model('models/transe_model_mar23_variation1_kfold.pkl')

# Variation 2

Import the gold standard data (CRIS)

This is the manually annotated data. I have added the strings for the SCTIDs that were found in the sentences of this dataset. (The column names call them CUIs, but they are actually SCTIDs.)

In [None]:
df2 = pd.read_csv("gold_std_for_kge_cleaned.csv")

df2.head()

In [None]:
# Clean the gold-standard term strings with the same text cleaner used for the lexicon triples.
df2["str_for_cui"] = df2["str_for_cui"].map(preprocess)
df2.head()

In [None]:
df2 = df2.rename(columns={"str_for_cui": "subject"})
df2.head()

In [None]:
df2.describe()

In [None]:
df2['subject'].nunique()

Merge triples from lexicon with pain terms in gold std

Ref - https://stackoverflow.com/questions/44842458/merging-pandas-columns-one-to-many

In [None]:
# Left join on the cleaned term: every gold-standard row is kept, a subject with several lexicon
# triples yields one row per triple, and a subject with no lexicon triple gets NaN predicate/object.
df = pd.merge(df2, df_triples, on='subject', how='left')

In [None]:
df.head()

In [None]:
df.shape[0]

In [None]:
df = df.drop_duplicates()

In [None]:
df.shape[0]

In [None]:
df_triples = df[['subject','predicate','object']]
df_triples.head()

In [None]:
df_triples = df_triples.dropna()

In [None]:
# Count the missing values left in each triple column.
print(f"subject has nan:  {df_triples['subject'].isna().sum()}")
print(f"predicate has nan:  {df_triples['predicate'].isna().sum()}")
print(f"object has nan:  {df_triples['object'].isna().sum()}")

In [None]:
# Turn the DataFrame of triples into a plain Python list (one inner list per row).

triples = df_triples.to_numpy().tolist()

In [None]:
triples = removeDuplicates(triples)

In [None]:
# Report the container type and how many triples it holds.

print(f"type is:  {type(triples)}")
print(f"length is:  {len(triples)}")

Define the training and test sets

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Hold out 20% of the triples as the test set.
n = round(0.20 * len(triples))

X_train, X_valid = train_test_split_no_unseen(np.asarray(triples), test_size=n)

In [None]:
# Shapes of the two splits (X_valid is the held-out test set).
print(f"Train set size:  {X_train.shape}")
print(f"Test set size:  {X_valid.shape}")

 Training the model

In [None]:
#from ampligraph.latent_features import ComplEx # this is for ampligraph 1.4
from ampligraph.latent_features import ScoringBasedEmbeddingModel #this is for ampligraph 2.0

In [None]:
# Instantiate and compile the ComplEx model (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 equivalent is kept at the bottom as comments instead of bare
# string literals, so the cell no longer echoes a stray string as its output.

from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from tensorflow.keras.optimizers import Adam

model = ScoringBasedEmbeddingModel(k=150,  # embedding size
                                   eta=10,
                                   scoring_type='ComplEx',  # alternative: 'TransE'
                                   seed=555)

optimizer = Adam(learning_rate=1e-3)
loss = get_loss('multiclass_nll')  # optional loss params, e.g. {'margin': 0.5}, are not passed
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})

model.compile(loss=loss,
              optimizer=optimizer,
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')  # including or excluding this did not make a difference on performance

# AmpliGraph 1.4 equivalent:
# model = ComplEx(batches_count=100,
#                 seed=555,
#                 epochs=10,
#                 k=150,
#                 eta=10,
#                 optimizer='adam',
#                 optimizer_params={'lr':1e-3},
#                 loss='multiclass_nll',
#                 regularizer='LP',
#                 regularizer_params={'p':3, 'lambda':1e-5},
#                 verbose=True)

In [None]:
# Fit the model (AmpliGraph 2.0 API).

import tensorflow as tf
# tf.logging.set_verbosity(tf.logging.ERROR)

# The result of fit() is kept in a variable so that it is not echoed as the cell output.
history = model.fit(X_train,
                    batch_size=5000,  # this improved performance
                    epochs=200,  # this improved performance
                    verbose=True)

# AmpliGraph 1.4 equivalent:
# model.fit(X_train, early_stopping = False)


In [None]:
# Confirm that training actually ran.

print('The model is fit!' if model.is_fitted else 'The model is not fit! Did you skip a step?')

Evaluate the model

In [None]:
# These triples are used as a filter because they are the known true triples: negatives are the
# corrupted triples generated by the algorithm (false combinations of triples), and a corruption
# that happens to be a true triple must not be counted as a negative.

positives_filter = {'test' : np.concatenate([X_train, X_valid])} # this is for ampligraph 2.0

#filter_triples = np.concatenate((X_train, X_valid)) # this is for ampligraph 1.4



arguments: 

X - the data to evaluate on. We're going to use our test set to evaluate.

model - the model we previously trained.

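filter_triples (AmpliGraph 1.4; the AmpliGraph 2.0 call below passes `use_filter` instead) - filters out the false negatives generated by the corruption strategy, i.e. corrupted triples that are actually true.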

use_default_protocol (AmpliGraph 1.4; the AmpliGraph 2.0 call below uses `corrupt_side='s,o'` instead) - specifies whether to use the default corruption protocol. If True, then subj and obj are corrupted separately during evaluation.

verbose - will give some nice log statements. Let's leave it on for now.

In [None]:
# Evaluate on the held-out split (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 call is kept as comments rather than a bare string literal,
# which used to be echoed as this cell's output.
ranks = model.evaluate(X_valid,
                       use_filter=positives_filter,  # Corruption strategy filter defined above
                       corrupt_side='s,o',  # corrupt subj and obj separately while evaluating
                       verbose=True)

# AmpliGraph 1.4 equivalent:
# from ampligraph.evaluation import evaluate_performance
#
# ranks = evaluate_performance(X_valid,
#                              model=model,
#                              filter_triples=filter_triples,
#                              use_default_protocol=True,
#                              verbose=True)

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Mean reciprocal rank and Hits@N computed from the evaluation ranks.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print(f"MRR: {mrr:.2f}")
print(f"Hits@10: {hits_10:.5f}")
print(f"Hits@3: {hits_3:.5f}")
print(f"Hits@1: {hits_1:.5f}")



K fold Cross Validation

In [None]:
# K-fold cross-validation of the ComplEx model on the Variation 2 training split.
# `kf` and `knum` come from the earlier cross-validation setup cells; `loss`, `regularizer`
# and `optimizer` are whatever the most recently run model cell assigned.

# Start from an empty list every time this cell runs. The list is shared with the other
# cross-validation cells of this notebook, so without the reset, folds from earlier runs or
# from other models would be mixed into the summary statistics computed afterwards.
evaluation_results = []

for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}/{knum}...")

    # Split the data into training and validation sets for this fold
    # NOTE(review): a plain KFold split does not guarantee that every entity in the validation
    # fold also occurs in the training fold - confirm evaluate() copes with unseen entities.
    X_train_fold = X_train[train_index]
    X_valid_fold = X_train[valid_index]

    # Fresh ComplEx model for this fold (here k is embedding size)
    model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type='ComplEx', seed=555)

    # Give every fold its own optimizer, rebuilt from the configuration of the notebook-level
    # `optimizer`, so that optimizer state accumulated in a previous fold is not carried over.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Compile the model before training
    model.compile(optimizer=fold_optimizer, loss=loss, entity_relation_regularizer=regularizer, entity_relation_initializer='glorot_uniform')

    # Fit the model to the training data
    model.fit(X_train_fold, batch_size=5000, epochs=200, verbose=True)

    # Known true triples of this fold, passed as the corruption filter
    positives_filter = {'test' : np.concatenate([X_train_fold, X_valid_fold])}

    # Evaluate the model on the validation data
    ranks = model.evaluate(X_valid_fold, use_filter=positives_filter, corrupt_side='s,o', verbose=True)

    mrr = np.mean(mrr_score(ranks))
    hits_10 = np.mean(hits_at_n_score(ranks, n=10))
    hits_3 = np.mean(hits_at_n_score(ranks, n=3))
    hits_1 = np.mean(hits_at_n_score(ranks, n=1))

    # Store the evaluation results for this fold
    evaluation_results.append({'Hits@10': hits_10, 'Hits@3': hits_3, 'Hits@1': hits_1, 'mrr': mrr})


In [None]:
# Aggregate the per-fold scores: mean and standard deviation of every metric.
hits_1_mean = np.mean([fold_scores['Hits@1'] for fold_scores in evaluation_results])
hits_1_std = np.std([fold_scores['Hits@1'] for fold_scores in evaluation_results])

hits_3_mean = np.mean([fold_scores['Hits@3'] for fold_scores in evaluation_results])
hits_3_std = np.std([fold_scores['Hits@3'] for fold_scores in evaluation_results])

hits_10_mean = np.mean([fold_scores['Hits@10'] for fold_scores in evaluation_results])
hits_10_std = np.std([fold_scores['Hits@10'] for fold_scores in evaluation_results])

mrr_mean = np.mean([fold_scores['mrr'] for fold_scores in evaluation_results])
mrr_std = np.std([fold_scores['mrr'] for fold_scores in evaluation_results])

# Report each metric as "mean ± std".
print("\nMean Evaluation Metrics:")
print(f"Hits@1: {hits_1_mean} ± {hits_1_std}")
print(f"Hits@3: {hits_3_mean} ± {hits_3_std}")
print(f"Hits@10: {hits_10_mean} ± {hits_10_std}")
print(f"MRR: {mrr_mean} ± {mrr_std}")

In [None]:
# NOTE(review): `model` is whichever model was fitted last; if the cross-validation cell above
# was run, that is the final fold's model - confirm this is the one intended for saving.
save_model(model, 'models/complex_model_mar23_variation2_kfold.pkl') 

In [None]:
#model = restore_model('models/complex_model_mar23_variation2_kfold.pkl')

Repeat for TransE

In [None]:
# Instantiate and compile the TransE model (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 equivalent is kept at the bottom as comments instead of bare
# string literals, so the cell no longer echoes a stray string as its output.

from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from tensorflow.keras.optimizers import Adam

model = ScoringBasedEmbeddingModel(k=150,  # embedding size
                                   eta=10,
                                   scoring_type='TransE',  # alternative: 'ComplEx'
                                   seed=555)

optimizer = Adam(learning_rate=1e-3)
loss = get_loss('pairwise' , {'margin': 5})
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})

model.compile(loss=loss,
              optimizer=optimizer,
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')  # including or excluding this did not make a difference on performance

# AmpliGraph 1.4 equivalent:
# model = TransE(batches_count=100,
#                seed=555,
#                epochs=10,
#                k=150,
#                loss='pairwise',
#                loss_params={'margin':5},
#                verbose=True)

In [None]:
# Fit the model (AmpliGraph 2.0 API).

#import tensorflow as tf
#tf.logging.set_verbosity(tf.logging.ERROR)

# The result of fit() is kept in a variable so that it is not echoed as the cell output.
history = model.fit(X_train,
                    batch_size=5000,  # this improved performance
                    epochs=200,  # this improved performance
                    verbose=True)

# AmpliGraph 1.4 equivalent:
# model.fit(X_train, early_stopping = False)

In [None]:
# Confirm that training actually ran.

print('The model is fit!' if model.is_fitted else 'The model is not fit! Did you skip a step?')

In [None]:
# Train and held-out triples together form the set of known true triples used as the corruption filter.
positives_filter = {'test' : np.concatenate([X_train, X_valid])} # this is for ampligraph 2.0

#filter_triples = np.concatenate((X_train, X_valid)) # this is for ampligraph 1.4

In [None]:


# Evaluate on the held-out split (AmpliGraph 2.0 API).
# The AmpliGraph 1.4 call is kept as comments rather than a bare string literal,
# which used to be echoed as this cell's output.
ranks = model.evaluate(X_valid,
                       use_filter=positives_filter,  # Corruption strategy filter defined above
                       corrupt_side='s,o',  # corrupt subj and obj separately while evaluating
                       verbose=True)

# AmpliGraph 1.4 equivalent:
# from ampligraph.evaluation import evaluate_performance
#
# ranks = evaluate_performance(X_valid,
#                              model=model,
#                              filter_triples=filter_triples,
#                              use_default_protocol=True,
#                              verbose=True)



In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Mean reciprocal rank and Hits@N computed from the evaluation ranks.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print(f"MRR: {mrr:.2f}")
print(f"Hits@10: {hits_10:.5f}")
print(f"Hits@3: {hits_3:.5f}")
print(f"Hits@1: {hits_1:.5f}")

K fold Cross Validation

In [None]:
# K-fold cross-validation of the TransE model on the Variation 2 training split.
# `kf` and `knum` come from the earlier cross-validation setup cells; `loss`, `regularizer`
# and `optimizer` are whatever the most recently run model cell assigned.

# Start from an empty list every time this cell runs. The list is shared with the other
# cross-validation cells of this notebook, so without the reset, folds from earlier runs or
# from other models would be mixed into the summary statistics computed afterwards.
evaluation_results = []

for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}/{knum}...")

    # Split the data into training and validation sets for this fold
    # NOTE(review): a plain KFold split does not guarantee that every entity in the validation
    # fold also occurs in the training fold - confirm evaluate() copes with unseen entities.
    X_train_fold = X_train[train_index]
    X_valid_fold = X_train[valid_index]

    # Fresh TransE model for this fold (here k is embedding size)
    model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type='TransE', seed=555)

    # Give every fold its own optimizer, rebuilt from the configuration of the notebook-level
    # `optimizer`, so that optimizer state accumulated in a previous fold is not carried over.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Compile the model before training
    model.compile(optimizer=fold_optimizer, loss=loss, entity_relation_regularizer=regularizer, entity_relation_initializer='glorot_uniform')

    # Fit the model to the training data
    model.fit(X_train_fold, batch_size=5000, epochs=200, verbose=True)

    # Known true triples of this fold, passed as the corruption filter
    positives_filter = {'test' : np.concatenate([X_train_fold, X_valid_fold])}

    # Evaluate the model on the validation data
    ranks = model.evaluate(X_valid_fold, use_filter=positives_filter, corrupt_side='s,o', verbose=True)

    mrr = np.mean(mrr_score(ranks))
    hits_10 = np.mean(hits_at_n_score(ranks, n=10))
    hits_3 = np.mean(hits_at_n_score(ranks, n=3))
    hits_1 = np.mean(hits_at_n_score(ranks, n=1))

    # Store the evaluation results for this fold
    evaluation_results.append({'Hits@10': hits_10, 'Hits@3': hits_3, 'Hits@1': hits_1, 'mrr': mrr})


In [None]:
# Aggregate the per-fold scores: mean and standard deviation of every metric.
hits_1_mean = np.mean([fold_scores['Hits@1'] for fold_scores in evaluation_results])
hits_1_std = np.std([fold_scores['Hits@1'] for fold_scores in evaluation_results])

hits_3_mean = np.mean([fold_scores['Hits@3'] for fold_scores in evaluation_results])
hits_3_std = np.std([fold_scores['Hits@3'] for fold_scores in evaluation_results])

hits_10_mean = np.mean([fold_scores['Hits@10'] for fold_scores in evaluation_results])
hits_10_std = np.std([fold_scores['Hits@10'] for fold_scores in evaluation_results])

mrr_mean = np.mean([fold_scores['mrr'] for fold_scores in evaluation_results])
mrr_std = np.std([fold_scores['mrr'] for fold_scores in evaluation_results])

# Report each metric as "mean ± std".
print("\nMean Evaluation Metrics:")
print(f"Hits@1: {hits_1_mean} ± {hits_1_std}")
print(f"Hits@3: {hits_3_mean} ± {hits_3_std}")
print(f"Hits@10: {hits_10_mean} ± {hits_10_std}")
print(f"MRR: {mrr_mean} ± {mrr_std}")