# **AI & Cognition Project**
# Phase II - Ontology Extraction
 
 5 DS 1 - 
 Group 1:

*   Med Anas FATTOUM
*   Iskander REGAIEG
*   Youssef Aziz ZGHAL
*   Haroun ELLEUCH
*   Saida MAJBOUR
*   Nadhir BOUHAOUALA


# Setting up the environment

In [1]:
#Libraries imports

import time

import matplotlib.pyplot as plt
import glob
import re #regex
import numpy as np
import pandas as pd
#import PyPDF2
import html
import unidecode

from IPython.display import Image, display

import spacy
from spacy.matcher import Matcher 
import visualise_spacy_tree
from spacy import displacy 
from spacy.tokens import DocBin
import textacy


import nltk
from nltk.corpus import wordnet
from nltk.corpus import verbnet
from nltk.metrics import *
#from nltk.corpus import stopwords

from collections import Counter
import seaborn as sns; sns.set_theme()
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# importing custom libraries
from utils import *
from ontology_utils import *

# Concept Hierarchy Extraction

In [3]:
# Extract the annotations from "PMBOK5.pdf" with the helper `extract_annotations`
# (not defined in this notebook; presumably provided by the star imports above).
df_hc = extract_annotations("PMBOK5.pdf")

## Pre-processing annotations

The following cell removes special characters, lowercases the text and strips useless text.

In [4]:
# Clean every raw annotation (column 0) in a single pass.
# `Series.map` replaces the chained indexing `df_hc[0][i] = ...`, which pandas may
# apply to a temporary copy and which relies on positional fallback for integer keys.
df_hc[0] = df_hc[0].map(clean_annotation)

## Pre-processing titles

In [5]:
# Flattening the index: move the index levels into columns, name the annotation
# column and discard the unused 'level_3' column
df_hc = (
    df_hc.reset_index()
    .rename(columns={0: 'annotation'})
    .drop(columns=['level_3'])
)

In [6]:
# Normalise the three title levels with whole-column assignments.
# The previous `df_hc.title[i] = ...` form is chained assignment: pandas may write
# to a temporary copy, leaving `df_hc` unchanged.
df_hc['title'] = df_hc['title'].map(clean_title)

# A sub-title repeats its parent title; strip that prefix and the ':_' separator.
df_hc['sub_title'] = [
    clean_title(sub_title).replace(title, '').replace(':_', '')
    for sub_title, title in zip(df_hc['sub_title'], df_hc['title'])
]

df_hc['sub_sub_title'] = df_hc['sub_sub_title'].map(clean_title)

# Creating the graph

In [7]:
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, OWL, RDF, RDFS, SKOS, VOID, XMLNS, XSD
from rdflib import URIRef, BNode, Literal, Namespace, Graph, ConjunctiveGraph
from rdflib.extras import describer

Generating the graph to which the data will be appended.

In [8]:
# Start from an empty in-memory graph. Nothing is loaded from a file or URL, so
# there is no `g.parse()` call: invoked without a source it has nothing to read
# and raises instead of returning.
g = Graph()
g.bind("owl", OWL)
ns_url = "http://5ds1.g1.com/h#"
g.bind("pai", ns_url)  # Bind prefix to namespace
# Declare the ontology resource itself
g.add((URIRef('http://5ds1.g1.com/h'), RDF.type, OWL.Ontology))

<Graph identifier=Nf0c8b9f33b2849ff83a4af159f1bd327 (<class 'rdflib.graph.Graph'>)>

## Adding Classes and Sub-Classes

In [9]:
# Generic categories shared by the processes: tools & techniques, inputs, outputs
tools_class, inputs_class, outputs_class = (
    URIRef(ns_url + local_name)
    for local_name in ('tools_techniques', 'input', 'outputs')
)

# Declare each generic category as an OWL class
g.add((tools_class, RDF.type, OWL.Class))
g.add((inputs_class, RDF.type, OWL.Class))
g.add((outputs_class, RDF.type, OWL.Class))

<Graph identifier=Nf0c8b9f33b2849ff83a4af159f1bd327 (<class 'rdflib.graph.Graph'>)>

In [10]:
# Distinct (class, sub-class) pairs, leaving out rows that carry no sub-title
df_classes_sub = (
    df_hc[['title', 'sub_title']]
    .drop_duplicates()
    .loc[lambda pairs: pairs.sub_title != '']
)

# df_classes_sub

In [11]:
# Show the distinct (title, sub_title) pairs
df_classes_sub

Unnamed: 0,title,sub_title
1,plan_risk_management,inputs
6,plan_risk_management,tools_and_techniques
9,plan_risk_management,outputs
11,identify_risks,inputs
24,identify_risks,tools_and_techniques
31,identify_risks,risk_register
33,perform_qualitative_risk_analysis,inputs
38,perform_qualitative_risk_analysis,tools_and_techniques
44,perform_qualitative_risk_analysis,outputs
46,perform_quantitative_risk_analysis,inputs


In [12]:
# Sub-titles that also belong to one of the generic classes
generic_parent_by_sub_title = {
    'inputs': inputs_class,
    'outputs': outputs_class,
    'tools_and_techniques': tools_class,
}

for title, sub_title in df_classes_sub.itertuples(index=False, name=None):
    parent_class = URIRef(ns_url + title)
    sub_class = URIRef(ns_url + title + "#" + sub_title)

    # Declare the class, then hang the sub-class under it
    g.add((parent_class, RDF.type, OWL.Class))
    g.add((sub_class, RDFS.subClassOf, parent_class))

    # Attach the sub-class to its generic category, when it has one
    generic_parent = generic_parent_by_sub_title.get(sub_title)
    if generic_parent is not None:
        g.add((sub_class, RDFS.subClassOf, generic_parent))

## Adding individuals

In [13]:
# Only rows that name a sub-sub-title describe an individual. Rows whose
# sub_sub_title is empty hold the annotation of a class or sub-class; turning them
# into individuals would create a node with an empty local name.
for i in range(len(df_hc)):
    pc = df_hc.iloc[i, 0]        # title (parent class)
    sc = df_hc.iloc[i, 1]        # sub_title (sub-class)
    ind_name = df_hc.iloc[i, 2]  # sub_sub_title (individual)

    if ind_name == '':
        continue

    sub_class = URIRef(ns_url + pc + "#" + sc)
    ind = URIRef(ns_url + ind_name)

    # Adding the individual to the graph
    g.add((ind, RDF.type, sub_class))

## Adding annotations

### Adding annotations for Classes

In [14]:
# Class-level annotations: rows without a sub-title describe the class itself
class_level_rows = df_hc.sub_title == ''
df_classes_ann = df_hc.loc[class_level_rows, ['title', 'annotation']]

In [15]:
# Preview the first two class-level annotations
df_classes_ann.head(2)

Unnamed: 0,title,annotation
0,plan_risk_management,plan risk management is the process of definin...
10,identify_risks,identify risks is the process of determining w...


In [16]:
# Attach each class-level annotation to its class through rdfs:isDefinedBy
for title, annotation in df_classes_ann.itertuples(index=False, name=None):
    class_uri = URIRef(ns_url + title)
    g.set((class_uri, RDFS.isDefinedBy, Literal(annotation)))

### Adding annotations for Sub-Classes

In [17]:
# Sub-class annotations: rows that name a sub-title but no sub-sub-title
is_sub_class_row = (df_hc.sub_sub_title == '') & (df_hc.sub_title != '')
df_sub_classes_ann = df_hc.loc[is_sub_class_row, ['title', 'sub_title', 'annotation']]

In [18]:
# Show the sub-class annotations
df_sub_classes_ann

Unnamed: 0,title,sub_title,annotation
31,identify_risks,risk_register,the primary output from identify risks is the ...
59,plan_risk_responses,tools_and_techniques,several risk response strategies are available...


In [19]:
# Attach each sub-class annotation as an rdfs:comment on the sub-class URI
# (built as <namespace><title>#<sub_title>)
for title, sub_title, annotation in df_sub_classes_ann.itertuples(index=False, name=None):
    sub_class_uri = URIRef(ns_url + title + "#" + sub_title)
    g.set((sub_class_uri, RDFS.comment, Literal(annotation)))

### Adding annotations for individuals ( = Sub-Sub-classes = instances)

In [20]:
# Individual-level annotations: rows that name a sub-sub-title
has_individual = df_hc.sub_sub_title != ''
df_indiv_ann = df_hc.loc[has_individual, ['sub_sub_title', 'annotation']]
df_indiv_ann.head(2)

Unnamed: 0,sub_sub_title,annotation
1,project_management_plan,"in planning risk management, all approved subs..."
2,project_charter,described in section 4.1.3.1. the project char...


In [21]:
# Attach each individual's annotation as an rdfs:comment
for individual_name, annotation in df_indiv_ann.itertuples(index=False, name=None):
    individual_uri = URIRef(ns_url + individual_name)
    g.set((individual_uri, RDFS.comment, Literal(annotation)))

# Enriching the ontology with N-Triples from PMI and PMBOK 5th Edition

## Loading SpaCy Docs

In [22]:
# Loading the SpaCy NLP engine ('en_core_web_md') with the 'ner' and 'textcat'
# components disabled
nlp = spacy.load('en_core_web_md',disable=['ner','textcat'])

In [23]:
# Read the serialized docs back from disk ...
doc_bin = DocBin().from_disk("./Data/data.spacy")
# ... and rebuild them against the vocabulary of the loaded pipeline
docs = [doc for doc in doc_bin.get_docs(nlp.vocab)]

### Pre-processing docs

In [24]:
# Every cleaned text produced by `stopwords()` is also collected here.
corpus = []

# Alias the NLTK corpus reader: the helper defined below is itself called
# `stopwords` and would otherwise shadow the module it depends on.
from nltk.corpus import stopwords as nltk_stopwords

# NLTK's English stop words, minus the words that must survive the clean-up
# (auxiliaries, negations, "a", "such", "as", "other", ...), several of which the
# relation patterns match on.
sw = set(nltk_stopwords.words('english')) - set([
    "you're", "you've", "she's", "it's", 'who', 'whom', 'am',  'is', 'are', 'was', 'were', 'be','been',
    'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',"a",
    'other', 'some',  'no', 'nor', 'not', 'own', 'same',  's', 't', 'can', 'don', "don't", 'should',
    "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", "haven't", 'isn', "isn't",
    'will','such','as', 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",'can'])


def stopwords(text):
    """Strip numeric tokens, URLs and stop words from `text`.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    The cleaned text is also appended to the module-level list `corpus`.
    The stop-word comparison is case-sensitive.
    """
    # Raw strings keep the regex escapes (\S, \d, \.) out of Python's own
    # string-escape processing.
    # Removing words that contain numbers
    text = re.sub(r"\S*\d\S*", " ", text).strip()
    # Removing URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Removing any digits that are left
    text = re.sub(r"\d+", " ", text)

    kept_words = [word for word in text.split() if word not in sw]
    cleaned = " ".join(kept_words)
    corpus.append(cleaned)

    return cleaned

In [25]:
# Pre-processed corpora: one cleaned text per loaded doc
corpora = [stopwords(doc.text) for doc in docs]

# Re-parse the cleaned texts. The iteration variable must not be named `corpus`:
# that name is the module-level list `stopwords()` appends to, and rebinding it to
# a string makes every later call to `stopwords()` fail (e.g. when this cell is re-run).
docs_processed = [nlp(cleaned_text) for cleaned_text in corpora]

# No further use for unprocessed docs
docs = docs_processed

## Relations Extraction

The relations that we might be interested in are as follows: 

    is a
    such as
    kind of
    including 
    have
    part
    member
    instance
    and or other
    especially
    like
    in

In [26]:
# Defining pertinent and insightful patterns to add to the graph.
# Each pattern is a list of spaCy Matcher token specifications.


def _noun_phrase():
    """Return a fresh spec: a noun optionally preceded by an adjectival modifier."""
    return [{'DEP': 'amod', 'OP': "?"}, {'POS': 'NOUN'}]


def _keyword_pattern(keyword):
    """Return the pattern `[amod] NOUN <keyword> [amod] NOUN`."""
    return _noun_phrase() + [{'LOWER': keyword}] + _noun_phrase()


def _compound_pattern(keyword):
    """Return the pattern `compound* NOUN <keyword> compound* NOUN`."""
    return [{'DEP': 'compound', 'OP': "*"},
            {'POS': 'NOUN'},
            {'LOWER': keyword},
            {'DEP': 'compound', 'OP': "*"},
            {'POS': 'NOUN'}]


# Pattern is a
pattern1 = _noun_phrase() + [{'LOWER': 'is'}, {'LOWER': 'a'}, {'POS': 'NOUN'}]
# Pattern such as
pattern2 = _noun_phrase() + [{'LOWER': 'such'}, {'LOWER': 'as'}, {'POS': 'PROPN'}]
# Pattern kind of
pattern3 = _noun_phrase() + [{'LOWER': 'kind'}, {'LOWER': 'of'}, {'POS': 'PROPN'}]
# Pattern include
pattern4 = _keyword_pattern('include')
# Pattern have
pattern5 = _keyword_pattern('have')
# Pattern part
pattern6 = _keyword_pattern('part')
# Pattern member
pattern7 = _keyword_pattern('member')
# Pattern instance
pattern8 = _keyword_pattern('instance')
# Pattern and|or other
pattern9 = _noun_phrase() + [{'LOWER': 'and', 'OP': "?"},
                             {'LOWER': 'or', 'OP': "?"},
                             {'LOWER': 'other'},
                             {'POS': 'NOUN'}]
# Pattern especially
pattern10 = _keyword_pattern('especially')
# Pattern like
pattern11 = _compound_pattern('like')
# Pattern in
# This used to be a second assignment to `pattern11`, which silently discarded
# the "like" pattern; it now has its own name. The keyword is written in lower
# case because `LOWER` is compared with the lower-cased token text, so 'In'
# could never match. `pattern12` only takes effect once it is passed to
# `matcher.add(...)`.
pattern12 = _compound_pattern('in')

Let’s extract those patterns from the text:

In [27]:
# Initializing the matcher with every relation pattern under one rule name.
# NOTE(review): pattern9 ("and/or other") is defined but not registered here -
# confirm whether that omission is intentional.
relation_patterns = [
    pattern1, pattern2, pattern3, pattern4, pattern5,
    pattern6, pattern7, pattern8, pattern10, pattern11,
]
matcher = Matcher(nlp.vocab)
matcher.add("Found matches:", relation_patterns)

In [28]:
# Run the matcher over every doc, keeping the raw matches ...
matches = [matcher(doc) for doc in docs]

# ... and the matched token spans, one list per doc
spans = [
    [doc[start:end] for _match_id, start, end in doc_matches]
    for doc, doc_matches in zip(docs, matches)
]

In [29]:
# Collapse the per-doc lists into one flat list of spans
flattened = []
for doc_spans in spans:
    flattened.extend(doc_spans)
spans = flattened

In [30]:
# Build the relations table from the matched spans with the helper `get_relations`
# (not defined in this notebook), then preview its first rows
df_relations =  get_relations(spans)
df_relations.head(3)

Unnamed: 0,Y,Relation,X
3,design,is a,trademark
12,risks,include,threats
24,plan,include,roles


Before adding these results to the graph, it is best to lemmatise them.

In [31]:
def lemmatize_str(text, nlp):
    """Return `text` with every token replaced by its lemma.

    `nlp` is called on `text` and must yield tokens exposing a `lemma_`
    attribute. The lemmas are joined with single spaces and surrounding
    whitespace is stripped.
    """
    lemmas = (str(token.lemma_) for token in nlp(text))
    return ' '.join(lemmas).strip()

In [32]:
# Lemmatise the three text columns of the relations table, cell by cell
for row in range(len(df_relations)):
    for col in range(3):
        df_relations.iloc[row, col] = lemmatize_str(df_relations.iloc[row, col], nlp)

In [33]:
# We can now observe the lemmatised relations (first 15 rows)
df_relations.head(15)

Unnamed: 0,Y,Relation,X
3,design,be a,trademark
12,risk,include,threat
24,plan,include,role
39,list,be a,list
48,process,include,role
63,context,be a,combination
72,process,include,information
84,process,include,information
99,avoidance,be a,risk
111,transference,be a,risk


Let's add those newly found relations to the graph as object properties first, then link the objects and individuals together:

In [34]:
# Adding the new individuals to the graph first

for row in range(len(df_relations)):
    # Column 2 holds X and column 0 holds Y; spaces become underscores in the URI
    for col in (2, 0):
        individual = URIRef(ns_url + df_relations.iloc[row, col].replace(" ", "_"))
        g.add((individual, RDF.type, OWL.NamedIndividual))

In [35]:
# Adding the relations

for i in range(len(df_relations)):
    # Property URI built from the relation text, domain URI from the Y column
    c = URIRef(ns_url + df_relations.iloc[i, 1].replace(" ", "_"))
    domain = URIRef(ns_url + df_relations.iloc[i, 0].replace(" ", "_"))
    g.add((c, RDF.type, OWL.ObjectProperty))
    g.add((c, RDFS.domain, domain))
    # The property is declared as an object property and is later asserted between
    # two individuals, so its range cannot be the datatype xsd:string (that belongs
    # to a DatatypeProperty). owl:Thing leaves the range open to any individual.
    g.add((c, RDFS.range, OWL.Thing))

In [36]:
# Combining the object properties as Subject-Property-Subject patterns

for row in range(len(df_relations)):
    y_label, relation_label, x_label = (
        df_relations.iloc[row, col].replace(" ", "_") for col in (0, 1, 2)
    )
    # Assert the triple: Y --relation--> X
    g.add((URIRef(ns_url + y_label), URIRef(ns_url + relation_label), URIRef(ns_url + x_label)))

## Extracting Subject - Verb - Object Relations

In [37]:
# Initializing the TextaCy Extractor
extractor = textacy.extract

# Extracting all subject-verb-object triples.
# NOTE(review): only docs[0] is processed here - confirm the other docs are meant to be skipped.
svo_triples = extractor.subject_verb_object_triples(docs[0])


def _lemma_phrase(tokens):
    """Join the lemmas of `tokens` with single spaces."""
    return " ".join(str(token.lemma_) for token in tokens)


# Storing the results column by column before building the dataframe
d = {'Subject': [], 'Verb': [], 'Object': []}

for triple in svo_triples:
    d['Subject'].append(_lemma_phrase(triple.subject))
    d['Verb'].append(_lemma_phrase(triple.verb))
    d['Object'].append(_lemma_phrase(triple.object))

df_verb_ph = pd.DataFrame(data=d)

We notice that some fields are far too long to be added to the graph as is. Reducing the size of those fields is thus required. To do so, we simply keep only the results whose subject, verb and object each contain at most 4 words (an arbitrary threshold).

In [38]:
# Keep only the triples whose subject, verb and object each have at most MAX_WORDS words.
MAX_WORDS = 4

# Flag rows by boolean mask rather than collecting positions and dropping them as
# labels: positions and labels only coincide while the index is a gap-free
# 0..n-1 range, which is no longer true once this cell has run.
too_long = pd.Series(
    [
        any(len(field.split()) > MAX_WORDS for field in row)
        for row in df_verb_ph.iloc[:, :3].itertuples(index=False, name=None)
    ],
    index=df_verb_ph.index,
    dtype=bool,
)

# Index labels of the rejected rows
drop_idc = list(too_long[too_long].index)

df_verb_ph = df_verb_ph.loc[~too_long].copy()

In [39]:
# Show the subject-verb-object triples
df_verb_ph

Unnamed: 0,Subject,Verb,Object
0,institute project management isbn,publish,institute fax org internet
1,list pmi mark,send,comment
2,discount,resale,purpose
3,part work,contact,o box
4,order pmi org,print,states america
...,...,...,...
1254,objective risk,link,objective result
1255,barrier template,tool,technique
1257,risk interrelationship,risk,response interaction
1258,management plan trigger condition,plan,risk response


In [40]:
# Adding domains for the verbs as a list of their synonyms
# (column 1 holds the verb; `synonym` is a helper not defined in this notebook)
syns = [synonym(verb) for verb in df_verb_ph.iloc[:, 1]]

# `assign` appends 'verb_syns' as the last column, or replaces it when the cell is
# re-run; `DataFrame.insert` raises once the column already exists.
df_verb_ph = df_verb_ph.assign(verb_syns=syns)

In [41]:
# Adding the new individuals to the graph first

for row in range(len(df_verb_ph)):
    # Column 2 holds the object and column 0 the subject; spaces become underscores
    for col in (2, 0):
        individual = URIRef(ns_url + df_verb_ph.iloc[row, col].replace(" ", "_"))
        g.add((individual, RDF.type, OWL.NamedIndividual))

In [42]:
# Adding the relations

for i in range(len(df_verb_ph)):
    verb_label = df_verb_ph.iloc[i, 1].replace(" ", "_")
    c = URIRef(ns_url + verb_label)
    # The domain is a per-verb placeholder node, <verb>#domain. A list built from
    # the synonyms column used to be assigned to `domain` first, but it was
    # overwritten before use and has been removed.
    domain = URIRef(ns_url + verb_label + '#domain')
    g.add((c, RDF.type, OWL.ObjectProperty))
    g.add((c, RDFS.domain, domain))
    # The property is declared as an object property and is later asserted between
    # two individuals, so its range cannot be the datatype xsd:string (that belongs
    # to a DatatypeProperty). owl:Thing leaves the range open to any individual.
    g.add((c, RDFS.range, OWL.Thing))

In [43]:
# Combining the object properties as Subject-Property-Subject patterns

for row in range(len(df_verb_ph)):
    subject_label, verb_label, object_label = (
        df_verb_ph.iloc[row, col].replace(" ", "_") for col in (0, 1, 2)
    )
    # Assert the triple: subject --verb--> object
    g.add((URIRef(ns_url + subject_label), URIRef(ns_url + verb_label), URIRef(ns_url + object_label)))

# Exporting the OWL graph

In [72]:
import os

# Export the graph as N-Triples to a time-stamped file under ./owl/
timestr = time.strftime("%Y%m%d-%H%M%S")
output_dir = "./owl/"
# Create the target folder when it is missing so the export does not fail
os.makedirs(output_dir, exist_ok=True)

# Let rdflib write the file itself: depending on the rdflib version,
# `serialize()` without a destination returns bytes or str, and writing that
# through a text-mode handle either fails or depends on the platform's default encoding.
g.serialize(destination=output_dir + timestr + '.owl', format="nt")