# Some notes and references

Ampligraph uses a skip gram w2v model for its embeddings
(good explanation of skip gram - https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-skip-gram.html and this - https://towardsdatascience.com/skip-gram-nlp-context-words-prediction-algorithm-5bbf34f84e0c and this - https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314)

w2v - https://medium.datadriveninvestor.com/word2vec-skip-gram-model-explained-383fa6ddc4ae

Ampligraph blog - https://medium.com/featurepreneur/ampligraph-what-is-it-8b243800818c

Tutorials from https://github.com/Accenture/AmpliGraph/blob/master/docs/tutorials/AmpliGraphBasicsTutorial.ipynb

# Import packages

In [None]:
!python --version #should be 3.7 or lower

In [None]:
#!pip install "tensorflow>=1.15.2,<2.0" #no need to run this every time

In [None]:
import tensorflow

In [None]:
print(tensorflow.__version__) #should be 1.15 or lower

In [None]:
#!pip install ampligraph #no need to run this every time

In [None]:
import ampligraph
ampligraph.__version__
#should be '1.4.0'

In [None]:
import numpy as np
import pandas as pd

import re
import string

# Import triples 

These were generated from SNOMED_CT instance on CKG using terms from the pain lexicon

In [None]:
df_triples = pd.read_csv("all_parent_child_for_kge.csv")

In [None]:
df_triples.head()

Note: 'burn' looks irrelevant, but it was picked up in order to capture 'burning pain'. It will be of use later on:
some of the gold standard annotations (sentences from CRIS) also mention burns that are not related to pain
(e.g. a burn injury or burning things), and their label in the classification will be 0.
So leave it as is.

In [None]:
# 'Unnamed: 0' is presumably the row index that an earlier export wrote into
# the CSV; it is not part of the triples.
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because the column has already been dropped.
df_triples = df_triples.drop(columns='Unnamed: 0', errors='ignore')
df_triples.tail()

In [None]:
#Common text preprocessing
text = "   This is a message to be cleaned. 92. It may involve some things like: *+ {[<br>]}, ?, :, ''  adjacent spaces and tabs     .  "

#convert to lowercase and remove punctuations and characters and then strip
def preprocess(text):
    """Normalise a string so that equal terms end up as equal strings.

    Steps: lowercase the text, delete every character that is neither a
    word character (letters, digits, underscore) nor whitespace, collapse
    each run of whitespace into a single space, and strip both ends.

    Digits are kept, and punctuation is deleted rather than replaced by a
    space (so "burn's" becomes "burns").

    Parameters
    ----------
    text : str
        Raw text, e.g. a subject, predicate or object label.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    text = text.lower()
    # Raw-string patterns: a plain '\s' is an invalid escape sequence that
    # newer Python versions warn about.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Strip last: deleting punctuation can expose new leading/trailing
    # spaces ("pain ." -> "pain "), which would otherwise make the same term
    # appear as two different strings.
    return text.strip()

text=preprocess(text)
print(text)  #text is a string

In [None]:
# Run every element of the three triple columns through preprocess().
# The function is passed directly; wrapping it in a lambda adds nothing.
df_triples["subject"] = df_triples["subject"].apply(preprocess)
df_triples["predicate"] = df_triples["predicate"].apply(preprocess)
df_triples["object"] = df_triples["object"].apply(preprocess)

In [None]:
df_triples.tail()

In [None]:
df_triples.describe() #top does not mean the top triple, it is the top for each category

# Variation 1

In [None]:
#convert the dataframe of triples to a list

triples = df_triples.values.tolist()

In [None]:
triples

In [None]:
#what is type and length

print('type is: ', type(triples))
print('length is: ', len(triples))

In [None]:
def removeDuplicates(triples):
    """Return the distinct triples as a list of tuples, in first-seen order.

    Keeping the input order (instead of iterating over a set) makes the
    result identical from one session to the next; set iteration order over
    strings changes with Python's hash randomization, which would reshuffle
    the data fed to the train/test split.

    Parameters
    ----------
    triples : iterable of sequences
        Each item is converted to a tuple, so its elements must be hashable.

    Returns
    -------
    list of tuple
        One tuple per distinct triple.
    """
    seen = set()
    unique = []
    for triple in triples:
        key = tuple(triple)
        if key not in seen:
            seen.add(key)
            unique.append(key)
    return unique
         

triples = removeDuplicates(triples)

Defining the train and test sets

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Hold out 20% of the triples, expressed as an absolute count.
n = round(0.20 * len(triples))

# X_valid is the only held-out split in this notebook: it is the set that is
# reported as "Test set" and passed to evaluate_performance further down.
X_train, X_valid = train_test_split_no_unseen(np.array(triples), test_size=n)

In [None]:
print('Train set size: ', X_train.shape)
print('Test set size: ', X_valid.shape)

Train the model

In [None]:
from ampligraph.latent_features import ComplEx

In [None]:
#Now we can instantiate the model:

model = ComplEx(batches_count=100, 
                seed=555, 
                epochs=10, 
                k=150, 
                eta=10,
                optimizer='adam', 
                optimizer_params={'lr':1e-3},
                loss='multiclass_nll', 
                regularizer='LP', 
                regularizer_params={'p':3, 'lambda':1e-5}, 
                verbose=True)

In [None]:
#Fitting the model

import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

model.fit(X_train, early_stopping = False)

In [None]:
#Check if the model is fit

if model.is_fitted:
    print('The model is fit!')
else:
    print('The model is not fit! Did you skip a step?')

Evaluate the model

In [None]:
#these are filtered because they are the true triples. negative ones are the corrupt ones generated by the algorithm and are false combinations of triples

filter_triples = np.concatenate((X_train, X_valid))

arguments: 

X - the data to evaluate on. We're going to use our test set to evaluate.

model - the model we previously trained.

filter_triples - will filter out the false negatives generated by the corruption strategy.

use_default_protocol - specifies whether to use the default corruption protocol. If True, then subj and obj are corrupted separately during evaluation.

verbose - will give some nice log statements. Let's leave it on for now.

In [None]:
from ampligraph.evaluation import evaluate_performance

ranks = evaluate_performance(X_valid,
                             model=model, 
                             filter_triples=filter_triples,
                             use_default_protocol=True,
                             verbose=True)

The ranks returned by the evaluate_performance function indicate the rank at which the test set triple was found when performing link prediction using the model.

For example, given the triple:

<House Stark of Winterfell, IN_REGION, The North>

The model returns a rank of 7. This tells us that while it's not the highest likelihood true statement (which would be given a rank 1), it's pretty likely.


Metrics

Let's compute some evaluation metrics and print them out.

We're going to use the mrr_score (mean reciprocal rank) and hits_at_n_score functions.

mrr_score: The function computes the mean of the reciprocals of the elements of a vector of rankings `ranks`.

hits_at_n_score: The function computes how many elements of a vector of rankings `ranks` make it to the top n positions.

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

mrr = mrr_score(ranks)
print("MRR: %.2f" % (mrr))

hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.5f" % (hits_10))
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.5f" % (hits_3))
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.5f" % (hits_1))

Saving and restoring the model

In [None]:
from ampligraph.latent_features import save_model, restore_model

In [None]:
import os

# Make sure the target folder exists before saving: save_model is not
# expected to create missing directories (confirm against the AmpliGraph
# docs), and failing here would mean retraining the model.
os.makedirs('models', exist_ok=True)
save_model(model, 'models/complex_model_mar23_variation1.pkl')

In [None]:
model = restore_model('models/complex_model_mar23_variation1.pkl')

Repeat for TransE

In [None]:
from ampligraph.latent_features import TransE

In [None]:
model = TransE(batches_count=100, 
               seed=555, 
               epochs=10, 
               k=150, 
               loss='pairwise',
               loss_params={'margin':5},
               verbose=True)

In [None]:
#Fitting the model

import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

model.fit(X_train, early_stopping = False)

In [None]:
#Check if the model is fit

if model.is_fitted:
    print('The model is fit!')
else:
    print('The model is not fit! Did you skip a step?')

In [None]:
from ampligraph.evaluation import evaluate_performance

ranks = evaluate_performance(X_valid,
                             model=model, 
                             filter_triples=filter_triples,
                             use_default_protocol=True,
                             verbose=True)

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

mrr = mrr_score(ranks)
print("MRR: %.2f" % (mrr))

hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.5f" % (hits_10))
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.5f" % (hits_3))
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.5f" % (hits_1))

In [None]:
import os

# Make sure the target folder exists before saving: save_model is not
# expected to create missing directories (confirm against the AmpliGraph
# docs), and failing here would mean retraining the model.
os.makedirs('models', exist_ok=True)
save_model(model, 'models/transe_model_mar23_variation1.pkl')

In [None]:
model = restore_model('models/transe_model_mar23_variation1.pkl')

# Variation 2

Import the gold standard data (CRIS)

This is the manually annotated data. I have added the strings for the SCTIDs that were found in the sentences of this dataset. (The column is called `str_for_cui`, but the identifiers are SCTIDs, not CUIs.)

In [None]:
df2 = pd.read_csv("gold_std_for_kge_cleaned.csv")

df2.head()

In [None]:
# Clean the gold-standard strings with the same preprocess() used for the
# triples. The function is passed directly; no lambda wrapper is needed.
df2["str_for_cui"] = df2["str_for_cui"].apply(preprocess)
df2.head()

In [None]:
df2 = df2.rename(columns={"str_for_cui": "subject"})
df2.head()

In [None]:
df2.describe()

In [None]:
df2['subject'].nunique()

Merge triples from lexicon with pain terms in gold std

Ref - https://stackoverflow.com/questions/44842458/merging-pandas-columns-one-to-many

In [None]:
df = pd.merge(df2, df_triples, on='subject', how='left')

In [None]:
df.head()

In [None]:
df.shape[0]

In [None]:
df = df.drop_duplicates()

In [None]:
df.shape[0]

In [None]:
df_triples = df[['subject','predicate','object']]
df_triples.head()

In [None]:
df_triples = df_triples.dropna()

In [None]:
print('subject has nan: ', df_triples['subject'].isnull().sum())
print('predicate has nan: ', df_triples['predicate'].isnull().sum())
print('object has nan: ', df_triples['object'].isnull().sum())

In [None]:
#convert dataframe to list

triples = df_triples.values.tolist()

In [None]:
triples = removeDuplicates(triples)

In [None]:
#what is type and length

print('type is: ', type(triples))
print('length is: ', len(triples))

Define the training and test sets

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Hold out 20% of the triples, expressed as an absolute count.
n = round(0.20 * len(triples))

# X_valid is the only held-out split for this variation: it is the set that
# is reported as "Test set" and passed to evaluate_performance further down.
X_train, X_valid = train_test_split_no_unseen(np.array(triples), test_size=n)

In [None]:
print('Train set size: ', X_train.shape)
print('Test set size: ', X_valid.shape)

Training the model

In [None]:
from ampligraph.latent_features import ComplEx

In [None]:
#Now we can instantiate the model:

model = ComplEx(batches_count=100, 
                seed=555, 
                epochs=10, 
                k=150, 
                eta=10,
                optimizer='adam', 
                optimizer_params={'lr':1e-3},
                loss='multiclass_nll', 
                regularizer='LP', 
                regularizer_params={'p':3, 'lambda':1e-5}, 
                verbose=True)

Fitting the model

In [None]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

model.fit(X_train, early_stopping = False)

In [None]:
#Check if the model is fit

if model.is_fitted:
    print('The model is fit!')
else:
    print('The model is not fit! Did you skip a step?')

In [None]:
filter_triples = np.concatenate((X_train, X_valid))

Evaluate the model

In [None]:
from ampligraph.evaluation import evaluate_performance

ranks = evaluate_performance(X_valid,
                             model=model, 
                             filter_triples=filter_triples,
                             use_default_protocol=True,
                             verbose=True)

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

mrr = mrr_score(ranks)
print("MRR: %.2f" % (mrr))

hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.5f" % (hits_10))
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.5f" % (hits_3))
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.5f" % (hits_1))

Saving and restoring a model

In [None]:
from ampligraph.latent_features import save_model, restore_model

In [None]:
import os

# Make sure the target folder exists before saving: save_model is not
# expected to create missing directories (confirm against the AmpliGraph
# docs), and failing here would mean retraining the model.
os.makedirs('models', exist_ok=True)
save_model(model, 'models/complex_model_mar23_variation2.pkl')

In [None]:
model = restore_model('models/complex_model_mar23_variation2.pkl')

Repeat for TransE