# Concept graph

### Creating concept graph from DBpedia

#### Rappel des principales relations entre concepts:

- Hypernyme: A est un hypernyme de B **si** B est un A (animal est hypernyme de chien)
- Hyponyme: A est un hypernyme de B **si** A est un B (insuline est hypernyme de hormone)
- Méronyme: A est méronyme de B **si** A est une partie de B (bras est méronyme de corps)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import codecs

concepts_path= "/home/ismael/Documents/semantic_deep_neuralIR/data/annotated_collection_tagme_score/relation"

### Là j'extrais les concepts du bail 

- https://wiki.dbpedia.org/Downloads2014
- https://wiki.dbpedia.org/downloads-2016-10
- https://wiki.dbpedia.org/data-set-39

In [92]:
from bs4 import BeautifulSoup

ontology_concepts = set({"Thing"})
ontology_relations = []

with open("data/dbpedia_3.4.owl", "r") as file:
    filecontent = file.read()
    harira = BeautifulSoup(filecontent, "html.parser")
    docs = harira.find_all("owl:class")

for ontology_class in docs:
    if ontology_class.find("rdfs:label"):
        nom = ontology_class.contents[1].string.replace(" ", "")
        parent = ontology_class.contents[3]["rdf:resource"].replace("http://dbpedia.org/ontology/", "")
        ontology_concepts.add(nom)
        if parent != "http://www.w3.org/2002/07/owl#Thing":
            ontology_relations.append((parent, nom))
        else:
            ontology_relations.append(("Thing", nom))

In [86]:
print("il y a {} concepts et {} relations.".format(len(ontology_concepts), len(ontology_relations)))

il y a 205 concepts et 204 relations.


In [90]:
"Entomologist" in ontology_concepts

False

In [97]:
with open("data/dbpedia_2016-10.nt", "r") as file:
    for line in file:
        lol = line.split(" ")
        if lol[1] == "<http://www.w3.org/2000/01/rdf-schema#subClassOf>":
            nom = lol[0].replace('<http://dbpedia.org/ontology/', "")[:-1]
            if '<http://dbpedia.org/ontology/' in lol[2]:
                parent = lol[2].replace('<http://dbpedia.org/ontology/', "")[:-1]
            elif "<http://www.w3.org/2002/07/owl#Thing>" in lol[2]:
                parent = "Thing"
            ontology_concepts.add(parent)
            ontology_concepts.add(nom)
            ontology_relations.append((parent, nom))
print("il y a {} concepts et {} relations.".format(len(ontology_concepts), len(ontology_relations)))

il y a 766 concepts et 973 relations.


In [130]:
"Mammal" in ontology_concepts

True

### Là j'ajoute les instances (entités nommées)

Il y en a beaucoup trop, donc on sélectionne seulement:
- Tous les concepts du corpus robust 2004
- Tous les pays
- Toutes les "organisations"

### Lecture des concepts du corpus robust:

In [137]:
#lecture du corpus annoté de Robust 2004
corpus_concepts = set()

for root, dirs, files in os.walk(concepts_path):
    for file in files:
        with open(root+os.sep+file, "r") as f:
            for line in f:
                for concept in line.strip().split("|"):
                    corpus_concepts.add(concept.replace("_", "").replace("$#!", ""))

In [139]:
print("il y a {} concepts.".format(len(corpus_concepts)))

il y a 82178 concepts.


In [180]:
types_to_keep = ["Organisation", "Country"]

instances_concepts = set()
instances_relations = []

with open("data/instance_types_en.nt", "r") as file:
    i = 0
    bailencours = ""
    relationsencours = []
    for line in file:
        if i > 0:
            lol = line.split()
            if "<http://dbpedia.org/ontology/" in lol[2]:
                nom = lol[0].replace('<http://dbpedia.org/resource/', "")[:-1]
                parent = lol[2].replace('<http://dbpedia.org/ontology/', "")[:-1]
                if nom != bailencours:
                    if (nom in corpus_concepts) or ("Country" in relationsencours):
                        instances_concepts.add(bailencours)
                        instances_relations.append((relationsencours[0], bailencours))
                    bailencours = nom
                    relationsencours = []
                    
                bailencours = nom
                relationsencours.append(parent)
        i+=1

In [193]:
len(instances_concepts) + len(ontology_concepts)

11634

In [199]:
import pickle

pickle.dump(ontology_concepts.union(instances_concepts), open('data/concepts.pkl', 'wb'))

### Là je construis le graphe

In [183]:
import networkx as nx

G = nx.Graph()

G.add_nodes_from(ontology_concepts)
G.add_edges_from(ontology_relations)

G.add_nodes_from(instances_concepts)
G.add_edges_from(instances_relations)

In [198]:
list(nx.shortest_path(G, 'France', 'Bethesda_Softworks'))
#list(G.neighbors('Bethesda_Softworks'))

['France',
 'Country',
 'PopulatedPlace',
 'Place',
 'Thing',
 'Organisation',
 'Company',
 'Bethesda_Softworks']

In [None]:
plt.figure(figsize=(100,75))
plt.axis("off")
nx.draw_networkx(G)
plt.savefig("images/ontology.png")
plt.show()