In [None]:
!nvidia-smi

### Installing missing packages & Loading

In [None]:
!pip install rdflib

In [None]:
#!pip install kglab
!pip install textdistance
!pip install textdistance[extras]
!pip install deep_translator

In [None]:
import sys

In [None]:
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDF
import random 
import numpy as np
import pandas as pd
import textdistance as td
import itertools    
import collections

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
# Project folder inside the Google Drive mount point
my_local_drive='/content/gdrive/My Drive/Colab Notebooks/WebSemantique'
# Add the folder to the import path (libraries, functions and data live there)
sys.path.append(my_local_drive)
# Make that folder the working directory
%cd $my_local_drive

%pwd

In [None]:
# Parse the target Turtle file and hold its triples as an array of strings.
# Later cells read column 0 as the subject, 1 as the predicate and 2 as the object.
target_graph = Graph()
target_graph.parse('target.ttl', format='ttl')
np_arr_target = np.array(target_graph).astype("str")

In [None]:
# Parse the source Turtle file and hold its triples as an array of strings.
# Later cells read column 0 as the subject, 1 as the predicate and 2 as the object.
source_graph = Graph()
source_graph.parse('source.ttl', format='ttl')
np_arr_source = np.array(source_graph).astype("str")

In [None]:
# all_subject_source = np.unique(np_arr_source[:,0])
# all_subject_target = np.unique(np_arr_target[:,0])
# all_pred_source = np.unique(np_arr_source[:,1])
# all_pred_target = np.unique(np_arr_target[:,1])
# all_object_source = np.unique(np_arr_source[:,2])
# all_object_target = np.unique(np_arr_target[:,2])


### Preprocessing

In [None]:
import re
import nltk
import copy as cp
from deep_translator import GoogleTranslator
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import RegexpParser
# Fetch the NLTK resources needed by the preprocessing below
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# English stop words, kept as a NumPy array (used by preprocess())
stopwords = np.array(nltk.corpus.stopwords.words('english'))

In [None]:
def find_index(np_source=None, np_target=None):
  """
  Return the row indexes of the triples worth comparing, for each graph.

  A row is dropped when its subject or its object matches the node pattern,
  or when its object starts with "http://".

  np_source, np_target: arrays of string triples (column 0 = subject,
      column 2 = object). When omitted, the notebook-level `np_arr_source`
      and `np_arr_target` are used, so `find_index()` behaves as before.

  Returns (idx_to_keep_source, idx_to_keep_target): two sorted integer
  arrays. They stay integer-typed even when empty, so they can always be
  used as indexes.
  """
  if np_source is None:
    np_source = np_arr_source
  if np_target is None:
    np_target = np_arr_target

  # getting rid of all nodes objects
  # NOTE(review): because the digits are optional, this pattern matches every
  # string starting with "n", including plain literals - confirm this is wanted.
  rNode = re.compile('^n[0-9]*.*')

  # getting rid of all url objects ("https://" values are not matched)
  rURL = re.compile("http://.*")

  def _rows_to_keep(triples):
    # One pass per graph: keep a row only if its subject and object are both informative.
    kept = [i for i, row in enumerate(triples)
            if not (rNode.match(row[0]) or rNode.match(row[2]) or rURL.match(row[2]))]
    return np.array(kept, dtype=np.intp)

  return _rows_to_keep(np_source), _rows_to_keep(np_target)

# Row indexes of the triples kept in each graph (see find_index)
idx_to_keep_source, idx_to_keep_target = find_index()

In [None]:
print(len(idx_to_keep_source), len(idx_to_keep_target))

In [None]:
# def translate(np_array):
#   for i,s in zip(range(len(np_array)),np_array):
#     try:
#       np_array[i] = GoogleTranslator(source='auto', target='fr').translate(s)
#     except:
#       pass
#     finally:
#       pass
#   return np_array


# objet_source = translate(objet_target)
# objet_target = translate(objet_target)


In [None]:
# # Saving to a csv file to avoid long time running
# df_source = pd.DataFrame(data = objet_source)
# df_source.to_csv("objet_value_translated.csv",sep=',',index=False)

# df_target = pd.DataFrame(data = objet_target)
# df_target.to_csv("objet_target_translated.csv",sep=',',index=False)

# #### Loading 
# df_source = pd.read_csv("train.csv")
# objet_sourcebis = df_source.values
# objet_targetbis = df_target.values

In [None]:
def preprocess(np_array):
  """
  Normalise every sentence of `np_array` in place and return the array.

  For each entry: strip punctuation, tokenize, lowercase, remove English stop
  words (the notebook-level `stopwords`), lemmatize, drop the words whose
  part-of-speech tag is in `tags_to_drop`, then join the remaining words with
  single spaces.

  np_array: 1-D array of strings; it is modified and also returned.
  """
  # Built once: neither object depends on the sentence being processed.
  tokenizer = nltk.RegexpTokenizer(r"\w+")
  lemmatizer = WordNetLemmatizer()

  # Tags of the words to discard (pronouns, infinitive verbs, determiners, ...).
  # NOTE(review): 'PP' may have been meant as 'PRP' - confirm against the tagger's tag set.
  tags_to_drop = ('IN', 'PP', 'DT', 'PRP$', 'VB', 'MD')

  for i, sentence in enumerate(np_array):
    # Removing punctuation
    words = tokenizer.tokenize(sentence)
    words = word_tokenize(" ".join(words))

    # converting each word to lowercase
    words = [word.lower() for word in words]

    # Removing stopwords
    words = [word for word in words if word not in stopwords]

    # lemmatizer words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Removing pronouns infinitive verbs dt...
    # (the original rebuilt an empty list on every iteration and never used it,
    # so no word was ever removed)
    words = [word for word, tag in nltk.pos_tag(words) if tag not in tags_to_drop]

    np_array[i] = " ".join(words)
  return np_array

# Object values of the triples kept by find_index(), for each graph
objet_source = np_arr_source[idx_to_keep_source,2]
objet_target = np_arr_target[idx_to_keep_target,2]

# Normalise the text of those values
objet_source = preprocess(objet_source)
objet_target = preprocess(objet_target)

### <Predicate, Value> Comparison; Matching Strategies

In [None]:
def find_prop_with_occurence(np_graph):
  """
  Count how many times each property occurs in an RDF graph array.

  np_graph: 2-D array of triples; the property is read from column 1.
  Returns a collections.Counter mapping each property to its number of occurrences.
  """
  occurrences = collections.Counter()
  for prop_name in np_graph[:,1]:
    occurrences[prop_name] += 1
  return occurrences


# Distinct predicates carried by the kept triples of either graph
idx_to_keep_source, idx_to_keep_target = find_index()
prop_source = np.unique(np_arr_source[idx_to_keep_source,1])
prop_target = np.unique(np_arr_target[idx_to_keep_target,1])
interesting_prop = np.unique(np.concatenate((prop_source,prop_target),axis=0))

def main_prop(prop = interesting_prop):
  """
  Rank properties from most to least frequent.

  Occurrences are counted on the property column of the source and target
  arrays stacked together (`np_arr_source` and `np_arr_target`).

  prop: iterable of property names to rank; defaults to `interesting_prop`
        as it was when this function was defined.
  Returns a NumPy array holding the distinct names of `prop`, the most
  frequent first; names with equal counts keep their order in `prop`.
  """
  # occurence of property in both target and source
  dico_union = find_prop_with_occurence(np.concatenate((np_arr_source,np_arr_target),axis = 0))
  # dict.fromkeys drops duplicates while keeping first-seen order; the sort is stable.
  ranked = sorted(dict.fromkeys(prop), key=lambda key: dico_union[key], reverse=True)
  return np.array(ranked)


In [None]:
# # retourne toutes les propriétés associès à un sujet
# f = lambda subject : np_arr_source[np_arr_source[:,0] == subject][:,1:]

# source_prop_subject = dict((k, []) for k in np.unique(np_arr_source[:,0]))
# for subject in np.unique(np_arr_source[:,0]):
#   source_prop_subject[subject].append(f(subject))

# g = lambda subject : np_arr_target[np_arr_target[:,0] == subject][:,1:]

# target_prop_subject = dict((k, []) for k in np.unique(np_arr_target[:,0]))
# for subject in np.unique(np_arr_target[:,0]):
#   target_prop_subject[subject].append(g(subject))

In [None]:
def input_prop():
  """
  Ask the user which properties to use and return their indexes.

  Prints the current `main_prop`, reads a property count (re-asked until it
  is an integer between 0 and 4), then reads the indexes as whitespace
  separated integers on one line.

  Returns the list of indexes. The count is only validated; it does not
  limit how many indexes may be entered.
  Raises ValueError if one of the indexes is not an integer.
  """
  print("Select properties from the following list")
  print(main_prop)
  print("WARNING Due to long time running and RAM space avalaible")
  print("We recommend that you do not use all properties on the same time but only feuw of them")
  print("Select number of property you want to choose: ")
  while True:
    try:
      n = int(input())
    except ValueError:
      # not a number at all: handled like an out-of-range count
      n = -1
    if 0 <= n <= 4:
      break
    print("Not correct number of property")
  print("Enter the desired property indexes: ")
  arr = input()
  # split() without argument tolerates repeated, leading and trailing spaces
  l = list(map(int,arr.split()))
  return l

In [None]:
# `main_prop` is first the ranking function, then gets rebound to the array it
# returns. The guard keeps this cell re-runnable once the rebinding has happened.
if callable(main_prop):
  main_prop = main_prop()
def choose_property(selected_prop):
  """
  Keep only the filtered triples whose property is one of the selected ones.

  selected_prop: indexes into `main_prop` of the properties to keep.
  Returns (source_rows, target_rows): two lists of row indexes, taken from
  the indexes produced by find_index().
  """
  # start from the rows that survive the node / URL filtering
  kept_source, kept_target = find_index()

  props = main_prop[selected_prop]
  print(props)

  # retain the rows whose property is among the selected ones
  source_rows = []
  for row in kept_source:
    if np_arr_source[row,1] in props:
      source_rows.append(row)

  target_rows = []
  for row in kept_target:
    if np_arr_target[row,1] in props:
      target_rows.append(row)

  print("Nombre index source conservés: ", len(source_rows),"Nombres index target conservés: ",len(target_rows))
  return source_rows,target_rows

# Ask for property indexes (positions in the `main_prop` ranking), then keep only the matching triples
selected_prop = input_prop() 
idx_to_keep_source,idx_to_keep_target = choose_property(selected_prop)

In [None]:
# Every (source, target) pairing of the selected triples, by object value ...
prod_obj = np.array(list(itertools.product(np_arr_source[idx_to_keep_source,2], np_arr_target[idx_to_keep_target,2])))
# ... and the same pairings by subject, in the same order
prod_subj = np.array(list(itertools.product(np_arr_source[idx_to_keep_source,0], np_arr_target[idx_to_keep_target,0])))

In [None]:
# Number of candidate pairs
size = len(prod_obj)

### Similarity Measure


In [None]:
!pip install strsimpy
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.ngram import NGram
from strsimpy.cosine import Cosine
from strsimpy.metric_lcs import MetricLCS
from strsimpy.jaccard import Jaccard
from strsimpy.overlap_coefficient import OverlapCoefficient
from strsimpy.sorensen_dice import SorensenDice
from strsimpy import SIFT4

In [None]:
# Exact-match similarity: 1.0 when both values are equal, 0.0 otherwise
identity = lambda x,y : float(bool(x == y))

In [None]:
def normalized_levenshtein(x,y):
  """Similarity of `x` and `y` as computed by strsimpy's NormalizedLevenshtein."""
  return NormalizedLevenshtein().similarity(x, y)

In [None]:
def jaroWinkler(x,y):
  """Similarity of `x` and `y` as computed by strsimpy's JaroWinkler."""
  return JaroWinkler().similarity(x,y)

In [None]:
def jaccard_similarity(x, y,k=2):
  """Similarity of `x` and `y` as computed by strsimpy's Jaccard built with parameter `k`."""
  return Jaccard(k).similarity(x,y)

In [None]:
def cosine_similarity(x,y,value=2):
  """
  Similarity of `x` and `y` as computed by strsimpy's Cosine built with `value`.

  When both strings are shorter than 2 characters the exact-match `identity`
  score is returned instead (guard against a zero division).
  """
  scorer = Cosine(value)
  both_too_short = len(x)<2 and len(y)<2
  return identity(x,y) if both_too_short else scorer.similarity(x, y)

In [None]:
def overlap_coefficient_similarity(x,y):
  """
  Similarity of `x` and `y` as computed by strsimpy's OverlapCoefficient.

  When either string is shorter than 3 characters the exact-match `identity`
  score is returned instead (guard against a zero division).
  """
  scorer = OverlapCoefficient()
  either_too_short = len(x)<3 or len(y)<3
  return identity(x,y) if either_too_short else scorer.similarity(x,y)

In [None]:
def sorensen_dice_similarity(x,y):
  """
  Similarity of `x` and `y` as computed by strsimpy's SorensenDice.

  When either string is shorter than 3 characters the exact-match `identity`
  score is returned instead (guard against a zero division).
  """
  scorer = SorensenDice()
  either_too_short = len(x)<3 or len(y)<3
  return identity(x,y) if either_too_short else scorer.similarity(x,y)

SIFT4 is a general purpose string distance algorithm inspired by JaroWinkler and Longest Common Subsequence. It was developed to produce a distance measure that matches as close as possible to the human perception of string distance. Hence it takes into account elements like character substitution, character distance, longest common subsequence etc. It was developed using experimental testing, and without theoretical background.

source: https://pypi.org/project/strsimpy/

In [None]:
def sift4_similarity(x,y):
  """
  Turn the SIFT4 distance between `x` and `y` into a similarity.

  The distance is divided by the length of the longer string and the ratio
  is subtracted from 1. Two empty strings are reported as identical (1.0)
  instead of dividing by zero.
  """
  longest = max(len(x),len(y))
  # nothing to compare on either side: identical, and the ratio is undefined
  if longest == 0:
    return 1.0
  s = SIFT4()
  return 1 - s.distance(x,y)/longest

In [None]:
def apply_distance_evaluation(np_cartesian_product,similarity_function):
  """
  Score every pair of `np_cartesian_product` with `similarity_function`.

  np_cartesian_product: 2-D array whose rows are (left, right) pairs.
  similarity_function: callable taking the two values of a pair and
      returning a number.

  Returns an array of shape (number of pairs, 1) holding the scores.
  A progress percentage is printed every 20000 pairs.
  """
  # Work on the argument itself; the original read the globals `prod_obj` and `size`.
  total = np.shape(np_cartesian_product)[0]
  mapp = np.zeros((total,1))
  for i in range(total):
    if i%20000 == 0:
      print((i/total)*100, "%")
    mapp[i] = similarity_function(np_cartesian_product[i,0],np_cartesian_product[i,1])
  return mapp

In [None]:
def find_mask(mapp,seuil):
  """
  Build the boolean mask of the pairs whose score is strictly above `seuil`.

  mapp: array of scores, one per pair.
  seuil: threshold, between 0.0 and 1.0 inclusive.

  Returns the flattened boolean mask and prints how many entries it selects.
  Raises ValueError when `seuil` is outside [0, 1]. The original returned the
  message as a string, which callers would then have used as a mask.
  """
  if seuil > 1 or seuil < 0:
    raise ValueError("Please choose value between 0.0 and 1.0")
  mask = (mapp>seuil).flatten()
  # counted from the mask itself rather than by walking the global `prod_obj`
  print("Nb element: ", int(np.count_nonzero(mask)))
  return mask

### Loading True Alignment

In [None]:
from xml.dom.minidom import parse
DOMTree = parse('veriteTerrain.xml')

In [None]:
collection = DOMTree.documentElement

# The reference alignment pairs each 'entity1' element with the 'entity2' element at the same position
uriSource = collection.getElementsByTagName('entity1')
uriTarget = collection.getElementsByTagName('entity2')

# One row per pair: [source URI, target URI], both read from the 'rdf:resource' attribute
verite_terrain = np.array([
    [source_node.getAttribute('rdf:resource'), target_node.getAttribute('rdf:resource')]
    for source_node, target_node in zip(uriSource, uriTarget)
]).astype("str")

In [None]:
# Object values attached to each ground-truth subject, kept for manual inspection
# (see the commented-out printing loop below)
obj_1 = []
obj_2 = []

# source side: column 0 of the ground truth holds the source subjects
for subj1 in verite_terrain[:,0]:
  obj_1.append(np_arr_source[np_arr_source[:,0] == subj1][:,2])


# target side: column 1 of the ground truth holds the target subjects
for subj2 in verite_terrain[:,1]:
  obj_2.append(np_arr_target[np_arr_target[:,0] == subj2][:,2])

# for i in range(5):
#   print(obj_1[i])
#   print("\n")
#   print(obj_2[i])
#   print("\n\n\n\n\n")


# Rebind the ground truth as a {source subject: target subject} dict.
# After this line the array slicing used above no longer works on `verite_terrain`,
# so the cell that builds the array must be re-run before re-running this one.
verite_terrain = dict(verite_terrain)

### Evaluation Methods


In [None]:
# mapp = apply_distance_evaluation(prod_obj,jaroWinkler)
# mask = find_mask(mapp,0.9)


def helper_dico(mask):
  """
  Turn the subject pairs selected by `mask` into an alignment.

  Returns (alignement_dic, alignement_source, alignement_target): the dict
  built from the selected rows of `prod_subj`, the list of its keys and the
  list of its values. When a source subject is selected several times, the
  last pair wins (dict construction).
  """
  selected_pairs = prod_subj[mask]
  alignement_dic = dict(selected_pairs)
  return alignement_dic, list(alignement_dic), list(alignement_dic.values())

# alignement_dic,alignement_source,alignement_target = helper_dico(mask)

def precision(mask):
  """
  Among all the alignments found, how many are true.

  Walks the source subjects of the pairs selected by `mask` and counts those
  whose entry in `alignement_dic` equals their entry in the ground truth
  `verite_terrain`; the count is divided by the number of selected pairs.

  Returns 0.0 when `mask` selects no pair (the original raised
  ZeroDivisionError in that case).

  NOTE(review): a subject selected in several pairs is counted once per pair
  while `alignement_dic` holds a single target for it - confirm this is the
  intended definition.
  """
  found_subjects = prod_subj[mask][:,0]
  if len(found_subjects) == 0:
    return 0.0
  count = 0.0
  for subj in found_subjects:
    # dict membership replaces the list scan and the bare `except` that hid
    # subjects missing from the ground truth
    if subj in alignement_dic and subj in verite_terrain:
      if alignement_dic[subj] == verite_terrain[subj]:
        count += 1.0
  return count/len(found_subjects)

In [None]:
def recall(mask):
  """
  Among all the true alignments, how many were correctly identified.

  Counts the ground-truth subjects whose entry in `alignement_dic` equals
  their entry in `verite_terrain`, divided by the size of the ground truth.
  Returns 0.0 when the ground truth is empty.

  mask: not used here; the result depends on the current `alignement_dic`.
  """
  if len(verite_terrain) == 0:
    return 0.0
  count = 0.0
  for subj, expected in verite_terrain.items():
    # dict membership instead of scanning the `alignement_source` list for every key
    if subj in alignement_dic and alignement_dic[subj] == expected:
      count += 1.0
  return count/len(verite_terrain)

In [None]:
def f1Score(mask):
  """
  Harmonic mean of precision(mask) and recall(mask).

  Returns 0.0 when both are zero. Each metric is evaluated once; the original
  evaluated them up to three times each.
  """
  p = precision(mask)
  r = recall(mask)
  if p+r == 0:
    return 0.0
  return 2*p*r/(p+r)

### Some visualization

In [None]:
# Thresholds applied to every similarity score map
seuils = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
# Similarity functions and their display names, in matching order
similaritys = [identity,normalized_levenshtein,jaroWinkler,jaccard_similarity,cosine_similarity,
               overlap_coefficient_similarity,sorensen_dice_similarity,sift4_similarity]
similaritys_names = ["identity","normalized_levenshtein","jaroWinkler","jaccard_similarity","cosine_similarity",
               "overlap_coefficient_similarity","sorensen_dice_similarity","sift4_similarity"]


# One list of scores per similarity, filled in the order of `seuils`
data_precision = {sim: [] for sim in similaritys_names}
data_recall = {sim: [] for sim in similaritys_names}
dataf1 = {sim: [] for sim in similaritys_names}

for similarity_name,similarity in zip(similaritys_names,similaritys):
  # score all pairs once per similarity, then reuse the scores for every threshold
  mapp = apply_distance_evaluation(prod_obj,similarity)
  for seuil in seuils:
    mask = find_mask(mapp,seuil)
    # precision() and recall() read these module-level names, so they must be
    # assigned before the metrics are computed
    alignement_dic,alignement_source,alignement_target = helper_dico(mask)
    dataf1[similarity_name].append(f1Score(mask))
    data_precision[similarity_name].append(precision(mask))    
    data_recall[similarity_name].append(recall(mask))

In [None]:
# plotting similarities: one panel per metric, one curve per similarity function
import matplotlib.pyplot as plt
fig,ax = plt.subplots(3,1,figsize=(12,25))

# (axis, scores per similarity, title template, y label) for each panel
panels = [
    (ax[0], data_precision, "Precision for similarities with properties {}", "Precision"),
    (ax[1], data_recall, "Recall for similarities with properties {}", "Recall"),
    (ax[2], dataf1, "F1 score for similarities with properties {}", "f1Score"),
]

for axis, scores_by_similarity, title_template, y_label in panels:
  # Iterate the dict directly: the original unpacked it with zip(*...) inside a
  # redundant loop, which also overwrote the global `similaritys_names`.
  for similarity_label, scores in scores_by_similarity.items():
    axis.plot(seuils, scores,'-s',label=similarity_label)
  axis.set_title(title_template.format(main_prop[selected_prop]))
  axis.legend(bbox_to_anchor=(1, 1),prop={'size': 16})
  axis.set_ylabel(y_label)

# only the bottom panel carries the shared x label
ax[2].set_xlabel("Filter's Threashold")
