# Installing missing packages & Loading

In [None]:
!pip install rdflib

In [None]:
#!pip install kglab
!pip install textdistance
!pip install textdistance[extras]
!pip install deep_translator

In [None]:
import sys

In [None]:
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDF
import random 
import numpy as np
import textdistance as td
import itertools    
import collections

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
# Project folder inside the mounted Google Drive
my_local_drive='/content/gdrive/My Drive/Colab Notebooks/WebSemantique'
# Add the project folder to the import path (libraries, functions and data live there)
sys.path.append(my_local_drive)
# Move into the project folder so the relative file names used below resolve against it
%cd $my_local_drive

%pwd

In [None]:
# Parse the target Turtle file and turn its triples into a string array
# (one row per triple; the notebook indexes columns as 0=subject, 1=predicate, 2=object).
target_graph = Graph()
target_graph.parse('target.ttl', format='ttl')
np_arr_target = np.array(target_graph).astype("str")

In [None]:
# Parse the source Turtle file and turn its triples into a string array
# (one row per triple; the notebook indexes columns as 0=subject, 1=predicate, 2=object).
source_graph = Graph()
source_graph.parse('source.ttl', format='ttl')
np_arr_source = np.array(source_graph).astype("str")

In [None]:
# all_subject_source = np.unique(np_arr_source[:,0])
# all_subject_target = np.unique(np_arr_target[:,0])
# all_pred_source = np.unique(np_arr_source[:,1])
# all_pred_target = np.unique(np_arr_target[:,1])
# all_object_source = np.unique(np_arr_source[:,2])
# all_object_target = np.unique(np_arr_target[:,2])

# Preprocessing

In [None]:
import re
import nltk
import copy as cp
from deep_translator import GoogleTranslator
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import RegexpParser
# Fetch the NLTK resources used by the tokenizer, lemmatizer, stopword list and POS tagger.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab', 'omw-1.4' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# English stopwords as a NumPy array (used for membership tests in preprocess)
stopwords = np.array(nltk.corpus.stopwords.words('english'))

In [None]:
# Pattern meant to recognise blank-node identifiers among the object values.
# NOTE(review): the digits are optional, so this matches ANY string starting with a
# lowercase "n" (e.g. "nocturne") — confirm the blank-node id format before tightening it.
rNode = re.compile('^n[0-9]*.*')

# Pattern recognising URL objects (only plain "http://" is matched, not "https://")
rURL = re.compile("http://.*")


def _is_literal_object(value):
  """Return True when `value` matches neither the blank-node nor the URL pattern."""
  return not (rNode.match(value) or rURL.match(value))


# Apply the SAME rule to both graphs. The target filter previously used
# `not node or not url`, which is only False for a value matching both patterns,
# so it kept practically every row, URLs and blank nodes included.
idx_to_keep_source = [i for i, obj in enumerate(np_arr_source[:, 2]) if _is_literal_object(obj)]
idx_to_keep_target = [i for i, obj in enumerate(np_arr_target[:, 2]) if _is_literal_object(obj)]

In [None]:
# Work on deep copies so the reference arrays are never modified, then pull out
# the object column (index 2) of the rows selected by the idx_to_keep_* lists.
np_source_cp, np_target_cp = cp.deepcopy(np_arr_source), cp.deepcopy(np_arr_target)
objet_source, objet_target = np_source_cp[idx_to_keep_source, 2], np_target_cp[idx_to_keep_target, 2]

In [None]:
def translate(np_array, source='auto', target='fr'):
  """Translate every string of `np_array` in place with Google Translate.

  Args:
    np_array: mutable sequence of strings (the notebook passes a 1-D NumPy
      string array). Each successfully translated entry is overwritten.
    source: source language code given to GoogleTranslator ('auto' by default).
    target: target language code given to GoogleTranslator ('fr' by default,
      which is the value that used to be hard-coded).

  Returns:
    The same `np_array` object that was passed in.

  Entries whose translation raises are left untouched (best effort), but the
  number of failures is reported through a warning instead of being silently
  swallowed.
  """
  import warnings  # local import: not loaded by the notebook's import cells

  # NOTE(review): preprocess() removes English stopwords and uses an English
  # lemmatizer, while the default target here is French — confirm this is intended.
  # NOTE(review): if `np_array` is a fixed-width NumPy string array, a translation
  # longer than the array's item size is truncated on assignment.
  translator = GoogleTranslator(source=source, target=target)  # built once, reused for every entry
  failures = 0
  last_error = None
  for i, text in enumerate(np_array):
    try:
      translated = translator.translate(text)
    except Exception as err:  # no bare except: Ctrl-C must still interrupt the loop
      failures += 1
      last_error = err
      continue
    if translated is not None:  # never overwrite an entry with a missing result
      np_array[i] = translated
  if failures:
    warnings.warn(
        f"translate: {failures} of {len(np_array)} entries could not be translated "
        f"and were left unchanged (last error: {last_error!r})")
  return np_array

def preprocess(np_array):
  """Normalise every sentence of `np_array` in place and return the same object.

  Each sentence goes through these steps, in order:
    1. punctuation removal (only runs of word characters are kept),
    2. tokenisation with nltk's word_tokenize,
    3. removal of one-letter words and lowercasing,
    4. removal of the words listed in the module-level `stopwords`,
    5. WordNet lemmatisation,
    6. removal of words whose POS tag is in `dropped_tags`,
  and is then stored back as a single space-separated string.

  Args:
    np_array: mutable sequence of strings (the notebook passes a 1-D NumPy
      string array).

  Returns:
    The same `np_array` object, with every entry replaced by its cleaned form.
  """
  # Built once: none of these depend on the sentence being processed.
  tokenizer = nltk.RegexpTokenizer(r"\w+")
  lemmatizer = WordNetLemmatizer()
  stopword_set = set(stopwords)
  # Prepositions, personal/possessive pronouns, determiners, base-form verbs, modals.
  # 'PRP' replaces the former 'PP', which is not a tag of the Penn Treebank set.
  dropped_tags = {'IN', 'PRP', 'PRP$', 'DT', 'VB', 'MD'}

  for i, sentence in enumerate(np_array):
    # Removing punctuation, then tokenising the cleaned text
    words = tokenizer.tokenize(sentence)
    words = word_tokenize(" ".join(words))

    # Removing one-letter words and converting to lowercase
    words = [word.lower() for word in words if len(word) != 1]

    # Removing stopwords
    words = [word for word in words if word not in stopword_set]

    # Lemmatising
    words = [lemmatizer.lemmatize(word) for word in words]

    # Actually apply the POS filter: the former loop reset its result list on
    # every iteration and the unfiltered sentence was stored instead.
    words = [word for word, tag in nltk.pos_tag(words) if tag not in dropped_tags]

    np_array[i] = " ".join(words)
  return np_array

# Translate, then normalise, the literals of each graph
objet_source = preprocess(translate(objet_source))
objet_target = preprocess(translate(objet_target))


In [None]:
# Inspect a slice of the cleaned source literals (positions 100 to 999)
objet_source[100:1000]

# Comparaison des URI

In [None]:
# Subject column of every triple (one entry per triple, duplicates are kept)
all_subject_source = np_arr_source[:,0]
all_subject_target = np_arr_target[:,0]

In [None]:
# Subject URIs that occur in both the source and the target RDF graph.
# np.intersect1d returns them sorted and without duplicates.
same_uri_list = np.intersect1d(all_subject_source,all_subject_target)


In [None]:
# Objects of the target triples whose subject URI also exists in the source graph.
# np.isin builds the row mask in one vectorised pass instead of scanning
# same_uri_list once per triple from Python.
shared_subject_rows = np.isin(np_arr_target[:, 0], same_uri_list)
objects = list(np_arr_target[shared_subject_rows, 2])
np.unique(objects)
# Observation: the shared URIs appear to identify the same entity in both graphs

# Comparaison `<Predicate, Value>`

In [None]:
# Example subject URI used to inspect its predicates in the next cell
subject = "http://data.doremus.org/event/25096a99-08f3-33e3-a441-031623040855"

In [None]:
# Predicates of every source triple whose subject equals `subject`
np_arr_source[np_arr_source[:,0] == subject][:,1]

In [None]:
def find_prop_with_occurence(np_graph):
  """
  Count how often each property (column 1 of `np_graph`) occurs in an RDF graph.

  Returns a dict mapping property -> number of occurrences, ordered from the
  most to the least frequent property (ties keep first-seen order).
  """
  occurrences = collections.Counter(np_graph[:, 1])
  return dict(occurrences.most_common())

# Properties of each graph, ranked from the most to the least frequent
prop_source = list(find_prop_with_occurence(np_arr_source).keys())
prop_target = list(find_prop_with_occurence(np_arr_target).keys())
# Properties present in BOTH graphs (this used to intersect prop_source with itself,
# which simply returned the source properties).
# Despite its name this is an intersection, and np.intersect1d returns it sorted,
# not ranked by occurrence.
prop_union = np.intersect1d(prop_source, prop_target)

In [None]:
def props_of_subject(np_graph, subject_uri):
  """Return the (predicate, object) columns of the rows of `np_graph` whose subject is `subject_uri`."""
  return np_graph[np_graph[:, 0] == subject_uri][:, 1:]


# One entry per triple of each graph (subjects are not de-duplicated, as before).
# The target list used to be built from the source array and the source subjects,
# so it was a copy of source_prop_subject. Comprehensions also avoid overwriting
# the global `subject` defined earlier.
# WARNING: each lookup scans the whole graph, so this is quadratic in the number of triples.
source_prop_subject = [props_of_subject(np_arr_source, uri) for uri in all_subject_source]
target_prop_subject = [props_of_subject(np_arr_target, uri) for uri in all_subject_target]

In [None]:
# Pick properties by their position in prop_union.
# NOTE(review): prop_union is produced by np.intersect1d, which returns sorted values,
# so these positions do not follow the occurrence ranking.
prop_ids = [1,4]
# Properties found at those positions
prop_union[prop_ids]

In [None]:
# Keep only the triples that carry the title property, in each graph
title_property = "http://erlangen-crm.org/current/P102_has_title"
triplet_from_source = np_arr_source[np_arr_source[:, 1] == title_property]
triplet_from_target = np_arr_target[np_arr_target[:, 1] == title_property]
# TODO: select all subjects that have MULTIPLE properties in common


In [None]:
# Every (source title, target title) combination, one pair per row
prod_obj = np.array(list(itertools.product(triplet_from_source[:, 2], triplet_from_target[:, 2])))
# Matching (source subject, target subject) pairs, generated in the same order as prod_obj
prod_subj = np.array(list(itertools.product(triplet_from_source[:, 0], triplet_from_target[:, 0])))


# Comparaison de Levenshtein : méthode gloutonne


In [None]:
def g(x, y):
  """Levenshtein edit distance between the two strings, as computed by textdistance."""
  return td.levenshtein(x, y)

In [None]:
# WARNING: long execution (can take several minutes) — g is evaluated on every title pair.
# Result: one row per pair of prod_obj, a single column holding g(source, target).
mapp = np.array([g(left, right) for left, right in prod_obj], dtype=float).reshape(-1, 1)

In [None]:
# Threshold: a pair is kept when its score in `mapp` is strictly below this value
seuil = 2

In [None]:
# Boolean selector (1-D) of the pairs scoring strictly below the threshold
mask = (mapp < seuil).flatten()
# Row positions of the selected pairs
mask_idx = [row for row, selected in enumerate(mask) if selected]
print("Nb element: ", len(mask_idx))

In [None]:
# (source subject, target subject) pairs selected by the mask
prod_subj[mask]

In [None]:
def g(x, y):
  """Normalised Levenshtein similarity in [0, 1]: 1 - distance / length of the longer string.

  Two empty strings are identical, so they score 1.0; the previous lambda
  raised ZeroDivisionError in that case.
  NOTE: this rebinds `g`, which earlier in the notebook returned a raw distance.
  """
  longest = max(len(x), len(y))
  if longest == 0:
    return 1.0
  return 1 - td.levenshtein(x, y) / longest

In [None]:
# Score of every title pair with the current `g` (normalised Levenshtein similarity).
# Note: this overwrites the `mapp` computed earlier in the notebook.
mapp = np.array([g(left, right) for left, right in prod_obj], dtype=float).reshape(-1, 1)

In [None]:
def ngram(x, y, n=2):
  """Jaccard similarity between the sets of character n-grams of `x` and `y`.

  The original cell was an unfinished lambda (a SyntaxError that stopped
  "Run All"); this is a standard-library completion of the apparent intent.

  Args:
    x, y: the two strings to compare (converted with str()).
    n: n-gram length, 2 (bigrams) by default.

  Returns:
    A float in [0, 1]: 1.0 for identical n-gram sets (including two empty
    strings), 0.0 when no n-gram is shared.
  """
  def grams(text):
    text = str(text)
    if len(text) < n:
      # A string shorter than n is treated as a single gram; "" has none
      return {text} if text else set()
    return {text[i:i + n] for i in range(len(text) - n + 1)}

  grams_x, grams_y = grams(x), grams(y)
  if not grams_x and not grams_y:
    return 1.0
  return len(grams_x & grams_y) / len(grams_x | grams_y)

### Partie Vérité terrain

In [None]:
# Load the ground-truth XML file into a DOM tree
from xml.dom.minidom import parse
DOMTree = parse('veriteTerrain.xml')


In [None]:
collection = DOMTree.documentElement

# entity1 / entity2 elements carry the two URIs of each reference pair
uriSource = collection.getElementsByTagName('entity1')
uriTarget = collection.getElementsByTagName('entity2')

# Two-column string array: column 0 = entity1 URI, column 1 = entity2 URI
verite_terrain = np.array(
    [[entity1.getAttribute('rdf:resource'), entity2.getAttribute('rdf:resource')]
     for entity1, entity2 in zip(uriSource, uriTarget)]
).astype("str")

In [None]:
# Second column of the ground truth (the entity2 URIs)
verite_terrain[:,1]

In [None]:
# For every ground-truth pair, collect the objects attached to each URI:
# obj_1 from the source graph (column 0 URIs), obj_2 from the target graph (column 1 URIs).
obj_1 = [np_arr_source[np_arr_source[:, 0] == subj1][:, 2] for subj1 in verite_terrain[:, 0]]
obj_2 = [np_arr_target[np_arr_target[:, 0] == subj2][:, 2] for subj2 in verite_terrain[:, 1]]

# Show the first pairs side by side. Slicing means a ground truth with fewer than
# five pairs no longer raises IndexError.
for source_objects, target_objects in zip(obj_1[:5], obj_2[:5]):
  print(source_objects)
  print("\n")
  print(target_objects)
  print("\n\n\n\n\n")


### Évaluation **Résultats**

In [None]:
# Total number of pairs found by the matching step
len(prod_subj[mask][:,0])

In [None]:
# Source URIs of the found pairs that also appear as a source URI in the ground truth.
# np.intersect1d returns unique values, so this counts distinct source URIs, not pairs.
common = np.intersect1d(prod_subj[mask][:,0],verite_terrain[:,0])
len(common)

In [None]:
# Only the last expression of a cell is rendered: verite_terrain is shown here,
# while prod_subj[mask] is evaluated but not displayed.
prod_subj[mask]
verite_terrain


In [None]:
# Example source URI to look up in the ground truth.
# Note: this rebinds `uriSource`, which previously held the list of entity1 elements.
uriSource = 'http://data.doremus.org/expression/ecc9da64-422b-3a47-8b07-4bb3c2ff4f1e'

In [None]:
# Ground-truth rows whose first column equals the example URI
verite_terrain[verite_terrain[:,0] == uriSource]

In [None]:
# Ground-truth rows whose source URI was also found by the matching step
matched_pairs = prod_subj[mask]
truth_sources = verite_terrain[:, 0]
idx = [row for row, source_uri in enumerate(truth_sources) if source_uri in common]

# Count the rows for which the first target found for a source URI equals the
# first target recorded for that URI in the ground truth.
count = 0
for uriSource, uriTarget in verite_terrain[idx, :]:
  found_target = matched_pairs[matched_pairs[:, 0] == uriSource].flatten()[1]
  expected_target = verite_terrain[truth_sources == uriSource].flatten()[1]
  if found_target == expected_target:
    count += 1
count

Il y a 238 couples d'URI à relier. Parmi les 210 couples trouvés dans cet exemple, 152 sont véritablement à relier. Notre algorithme a réussi à relier correctement 151 d'entre eux et s'est trompé pour un seul.

In [None]:
# Precision = correctly linked pairs / all pairs proposed by the algorithm.
# The previous hard-coded 151/152 divided by the proposals whose source appears in
# the ground truth (152) instead of by every proposal (210 in the reported run),
# and the value was never displayed.
nb_found = len(prod_subj[mask])
precision = count / nb_found if nb_found else 0.0
print("precision")
precision

In [None]:
# Recall = correctly linked pairs / all pairs of the ground truth.
# The previous hard-coded 152/238 counted the one wrongly linked pair as a hit,
# and the value was never displayed.
nb_reference = len(verite_terrain)
recall = count / nb_reference if nb_reference else 0.0
print("recall")
recall

In [None]:
print("F measure")
# Harmonic mean of precision and recall; defined as 0 when both are 0 so the
# cell cannot fail with a division by zero.
f_measure = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
f_measure