# Concept graph

### Creating concept graph from DBpedia

#### Rappel des principales relations entre concepts:

- Hypernyme: A est un hypernyme de B **si** B est un A (animal est hypernyme de chien)
- Hyponyme: A est un hypernyme de B **si** A est un B (insuline est hypernyme de hormone)
- Méronyme: A est méronyme de B **si** A est une partie de B (bras est méronyme de corps)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import codecs

concepts_path= "/home/ismael/Documents/semantic_deep_neuralIR/data/annotated_collection_tagme_score/relation"

#### Code utile pour le pré traitement des textes dans la suite:

In [67]:
import re
from gensim.parsing.preprocessing import preprocess_string,remove_stopwords,strip_tags,strip_punctuation,strip_numeric,strip_multiple_whitespaces,strip_short
import string
table = str.maketrans('', '', '!"#$%\'()*+,-./:;<=>?@[\\]^_`{|}~')
printable = set(string.printable)

# Krovetz stemmer is a stemmer much less "destructive" than porter.
from krovetzstemmer import Stemmer # good stemmer for IR
ks = Stemmer()

CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_multiple_whitespaces, strip_punctuation, strip_numeric, lambda x:strip_short(x, minsize=3)] #, lambda x: ks.stem(x)

### Là j'extrais les concepts du bail 

- https://wiki.dbpedia.org/Downloads2014
- https://wiki.dbpedia.org/downloads-2016-10
- https://wiki.dbpedia.org/data-set-39

In [2]:
from bs4 import BeautifulSoup

ontology_concepts = set({"Thing"})
ontology_relations = []

with open("data/dbpedia_3.4.owl", "r") as file:
    filecontent = file.read()
    harira = BeautifulSoup(filecontent, "html.parser")
    docs = harira.find_all("owl:class")

for ontology_class in docs:
    if ontology_class.find("rdfs:label"):
        nom = ontology_class.contents[1].string.replace(" ", "")
        parent = ontology_class.contents[3]["rdf:resource"].replace("http://dbpedia.org/ontology/", "")
        ontology_concepts.add(nom)
        if parent != "http://www.w3.org/2002/07/owl#Thing":
            ontology_relations.append((parent, nom))
        else:
            ontology_relations.append(("Thing", nom))

In [3]:
print("il y a {} concepts et {} relations.".format(len(ontology_concepts), len(ontology_relations)))

il y a 205 concepts et 204 relations.


In [4]:
"Entomologist" in ontology_concepts

False

In [5]:
with open("data/dbpedia_2016-10.nt", "r") as file:
    for line in file:
        lol = line.split(" ")
        if lol[1] == "<http://www.w3.org/2000/01/rdf-schema#subClassOf>":
            nom = lol[0].replace('<http://dbpedia.org/ontology/', "")[:-1]
            if '<http://dbpedia.org/ontology/' in lol[2]:
                parent = lol[2].replace('<http://dbpedia.org/ontology/', "")[:-1]
            elif "<http://www.w3.org/2002/07/owl#Thing>" in lol[2]:
                parent = "Thing"
            ontology_concepts.add(parent)
            ontology_concepts.add(nom)
            ontology_relations.append((parent, nom))
print("il y a {} concepts et {} relations.".format(len(ontology_concepts), len(ontology_relations)))

il y a 766 concepts et 973 relations.


In [6]:
"Mammal" in ontology_concepts

True

### Là j'ajoute les instances (entités nommées)

Il y en a beaucoup trop, donc on sélectionne seulement:
- Tous les concepts du corpus robust 2004
- Tous les pays
- Toutes les "organisations"

### Lecture des concepts du corpus robust:

In [7]:
import json

#lecture du corpus annoté de Robust 2004
corpus_concepts = set()

docs = {}
collections = ["FR94", "FT", "FBIS", "LATIMES"]
for collection in collections:
    with open("data/annotatedrobust2004"+collection+".json", "r") as f:
        docs.update(json.load(f))
print("docs chargés")

for d in docs:
    for w in docs[d]['text'].strip().split(" "):
        if "$#!" in w:
            corpus_concepts.add(w.replace("$#!", ""))

print("il y a {} concepts.".format(len(corpus_concepts)))

docs chargés
il y a 242761 concepts.


In [8]:
"Mafia" in corpus_concepts

True

### Lecture des concepts des requêtes:

In [9]:
concepts_des_requetes = set()
correspondance = {}

with open("data/topics-title.annotated.csv", 'r') as f:
    for line in f:
        numerorequete = line.split("\t")[0]
        for w in line.split("\t")[1].strip().split(" "):
            if "$#!" in w:
                tkt = w.replace("$#!", "").split(".")[0].capitalize()
                concepts_des_requetes.add(tkt)
                correspondance[tkt] = tkt
                
print("il y a {} concepts.".format(len(concepts_des_requetes)))

il y a 402 concepts.


In [13]:
#concepts_des_requetes

In [10]:
#concepts_des_requetes.remove(None)
def p(x):
    if "_" in x:
        s = x.split("_")
        correspondance[x] = "_".join([e.capitalize() for e in s])
        return "_".join([e.capitalize() for e in s])

lol = {p(x) for x in concepts_des_requetes}
concepts_des_requetes = lol

In [12]:
types_to_keep = ["Organisation", "Country"]

instances_concepts = set()
instances_relations = []

for f in ["data/instance_types_en.nt"]:
    with open(f, "r") as file:
        i = 0
        bailencours = ""
        relationsencours = []
        for line in file:
            if i > 0:
                lol = line.split()
                if "<http://dbpedia.org/ontology/" in lol[2]:
                    nom = lol[0].replace('<http://dbpedia.org/resource/', "")[:-1]
                    parent = lol[2].replace('<http://dbpedia.org/ontology/', "")[:-1]
                    if nom != bailencours:
                        if (bailencours in corpus_concepts) or ("Country" in relationsencours) or (bailencours in concepts_des_requetes):
                            if bailencours not in instances_concepts:
                                instances_concepts.add(bailencours)
                                instances_relations.append((relationsencours[0], bailencours))
                        bailencours = nom
                        relationsencours = []

                    bailencours = nom
                    relationsencours.append(parent)
            i+=1

In [14]:
"Mafia" in instances_concepts

False

In [16]:
len(concepts_des_requetes - instances_concepts)

37

In [17]:
len(corpus_concepts - instances_concepts) / len(corpus_concepts)

0.5244376155972335

#### Attention!! 52% des concepts du corpus ne sont PAS dans le graphe pour l'instant

### On va scraper dbpedia pour récupérer ces 52% restants


C'est parti mon ratpi

In [61]:
import requests
from bs4 import BeautifulSoup


def scrap(concept):
    requete = requests.get("http://dbpedia.org/page/"+concept)
    if requete.status_code != 200:
        print("oops! concept {} not found".format(concept))
        return None
    page = requete.content
    soup = BeautifulSoup(page)
    
    result = {}
    
    types = soup.find("div", {"class": "page-resource-uri"})
    if len(types) > 2 and "href" in types.contents[1].attrs:
        result['parent'] = types.contents[1]["href"].split("/")[-1]
        
        #cas spécial chakal
        if result["parent"] == "javascript:void()":
            result["parent"] = 'Thing'
    else:
        print("oops! an issue occured while parsing entity type of concept {}".format(concept))
        return None
    
    abstract = soup.find("span", {"property": "dbo:abstract", "xml:lang": "en"})
    if abstract:
        result["abstract"] = soup.find("span", {"property": "dbo:abstract", "xml:lang": "en"}).contents[0]
    else:
        print("oops! No abstract for concept {}".format(concept))
        return None
        
    return result

### Exemple de résultat:

In [59]:
scrap("President_of_France")

{'parent': 'Person',
 'abstract': "The President of the French Republic (French: Président de la République française, French pronunciation: \u200b[pʁezidɑ̃ də la ʁepyblik fʁɑ̃sɛz]), is the executive head of state of the French Fifth Republic. The powers, functions and duties of prior presidential offices, and their relation with the first minister and cabinets has over time differed with the various French constitutions. The President of France is also the ex officio Co-Prince of Andorra, Grand Master of the Légion d'honneur and the Ordre national du Mérite and honorary proto-canon of the Basilica of St. John Lateran in Rome. The current President of France is François Hollande, who took office on 15 May 2012."}

## Il est temps de scraper!

<img src="images/boyyy.jpg" width="700">

In [63]:
import json 

abstracts = json.load(open("data/concepts_text.json", "r"))

In [84]:
import time

cpt = 0
calls = 0

for concept in (corpus_concepts - instances_concepts):
    #attendre un peu pour ne pas risquer de se faire bannir
    if (calls % 1000) == 0:
        #print("waiting 10s...")
        #time.sleep(10)
        print("{} found".format(cpt))
        
    res = scrap(concept)
    calls += 1
    if res:
        cpt += 1 #comme en L1 khoya
        instances_concepts.add(concept)
        instances_relations.append((res["parent"], concept))
        abstracts[concept] = " ".join(preprocess_string(res["abstract"].strip(), CUSTOM_FILTERS))
        
print("{} found.".format(cpt))

save = json.dumps(abstracts)
with open("data/concepts_text.json", "w") as f:
    f.write(save)
print("abstracts saved.")

import pickle

pickle.dump(ontology_concepts.union(instances_concepts), open('data/concepts.pkl', 'wb'))
pickle.dump(instances_relations+ontology_relations, open('data/relations.pkl', 'wb'))
print("concepts and relations saved.")

oops! concept Shotgun_(Junior_Walker_&_the_All_Stars_song) not found
oops! concept Monk_(Dungeons_&_Dragons) not found
oops! No abstract for concept Jim_Healy
oops! No abstract for concept Kalács
624 cpt found
oops! No abstract for concept Patriarchate_of_Peć
oops! concept Scandal_(Kangta_&_Vanness_album) not found
oops! No abstract for concept Ebby_Thust
oops! No abstract for concept Sōmon
oops! No abstract for concept Jennifer_Schooler
oops! No abstract for concept Liveryman
oops! concept Sorcerer_(Dungeons_&_Dragons) not found
oops! No abstract for concept Kevin_Sundher
oops! No abstract for concept Totemism
1615 cpt found
oops! No abstract for concept Administrative_divisions_of_Vladimir_Oblast
oops! No abstract for concept Awayday
oops! No abstract for concept Association_(statistics)
oops! concept Plane_(Dungeons_&_Dragons) not found
oops! No abstract for concept Guo
oops! No abstract for concept Chicago_"L"
oops! No abstract for concept List_of_Graeco-Roman_geographers
oops! No 

oops! No abstract for concept Headmaster_(Transformers)
oops! concept Macklemore_&_Ryan_Lewis not found
oops! No abstract for concept Iori_Kogawa
oops! No abstract for concept Donald_"Buz"_Lukens
oops! concept Mövenpick_Hotels_&_Resorts not found
oops! No abstract for concept Stwo
oops! concept Ministry_of_Highways,_Ports_&_Shipping not found
oops! concept Dungeons_&_Dragons_gameplay not found
oops! concept Black_&_Decker_DustBuster not found
oops! concept Promise_(Delirious?_song) not found
14453 cpt found
oops! No abstract for concept Tim_"Ripper"_Owens
oops! No abstract for concept Eduardo_González
oops! No abstract for concept Enactus
oops! concept Texas_A&M_Aggies_men's_basketball not found
oops! No abstract for concept Chocolatey
oops! concept Construct_(Dungeons_&_Dragons) not found
oops! concept First_Berkshire_&_The_Thames_Valley not found
oops! No abstract for concept FETP
oops! concept Blood_&_Oil not found
oops! No abstract for concept Han_(surname)
oops! concept Braddock_D

oops! No abstract for concept Atsuko_Kohashi
oops! No abstract for concept Constituent_country
oops! No abstract for concept Expobank
oops! No abstract for concept Farpoint
oops! No abstract for concept Chiropractor
oops! No abstract for concept Intrusion_prevention_system
oops! No abstract for concept Teflon_(nickname)
27289 cpt found
oops! No abstract for concept Sim_(Korean_surname)
oops! No abstract for concept Mustad
oops! No abstract for concept Shvetsov
oops! No abstract for concept Sukhdeep_Sukh
oops! No abstract for concept BM&F_Bovespa
oops! No abstract for concept Enactment
oops! No abstract for concept Dopant_(Kamen_Rider)
oops! No abstract for concept Loubna_Berrada
oops! No abstract for concept Kamejiro_Senaga
oops! No abstract for concept For_Example_(Kay_Switch_song)
oops! No abstract for concept Cogency
oops! No abstract for concept Who_Wants_to_Be_a_Millionaire?
oops! No abstract for concept Administrative_divisions_of_the_Komi_Republic
28276 cpt found
oops! concept P

oops! concept Touch_(Delirious?_album) not found
oops! No abstract for concept Mouthfeel
oops! No abstract for concept Leka,_Crown_Prince_of_Albania
oops! concept Dyckerhoff_&_Widmann not found
oops! No abstract for concept KC_(internet_service_provider)
oops! No abstract for concept Falderal
oops! No abstract for concept Bunkum
oops! No abstract for concept Anodyne
oops! No abstract for concept Student_Radio_Network
41121 cpt found
oops! No abstract for concept Ceramic_materials
oops! No abstract for concept Haze_(Cannabis)
oops! No abstract for concept Richard_Salsman
oops! No abstract for concept Tomara_clan
oops! No abstract for concept Lost_&_Found_(1961–62)
oops! concept Anyone_for_Doomsday? not found
oops! concept "Heroes"_(David_Bowie_album) not found
oops! No abstract for concept Schuyler_Marvin
oops! No abstract for concept György_Gilyán
oops! No abstract for concept Atlantis_(Marvel_Comics)
42111 cpt found
oops! No abstract for concept Adad
oops! No abstract for concept Reve

oops! No abstract for concept Citibank_(Hong_Kong)
oops! concept List_of_character_races_in_Dungeons_&_Dragons not found
oops! No abstract for concept Northport,_Prince_Edward_Island
oops! concept Henschel_&_Son not found
oops! No abstract for concept Ko_(Korean_surname)
oops! No abstract for concept Rafael_Pallais
oops! No abstract for concept Park_(film)
oops! No abstract for concept Deshwal
oops! No abstract for concept LifeFlight
oops! No abstract for concept Kenny_"Dope"_Gonzalez
oops! concept The_Maybes? not found
oops! No abstract for concept Ascriptin
55952 cpt found
oops! No abstract for concept Periphery_(country_subdivision)
oops! No abstract for concept Distensibility
oops! No abstract for concept Artana
oops! No abstract for concept Hans_"Assi"_Hahn
oops! No abstract for concept CMAC
oops! concept Britney_&_Kevin:_Chaotic_(EP) not found
oops! No abstract for concept David_Magang
oops! No abstract for concept Malhotra
oops! concept Frick_Art_&_Historical_Center not found
oo

In [90]:
print("on a {} concepts".format(len(instances_concepts)))
print("{}% des concepts manquant.".format(len(corpus_concepts - instances_concepts) / len(corpus_concepts)))

on a 243967 concepts
0.003245990912873155% des concepts manquant.


In [93]:
instances_relations[1997]

('Scientist', 'Marie_Curie')

### corriger quelques trucs à la main

In [81]:
instances_concepts.update(["Vitamin", "Vitamin_E", "Vitamin_D", "Olympic_Games", "Pulmonary_Tuberculosis",
                            "Doctor_Of_The_Church", "Motor_Hotel", "Transportation_System", "Wind_Instrument",
                          "Heroic_Verse"])

instances_relations.append(("Biomolecule", "Vitamin"))
instances_relations.append(("Vitamin", "Vitamin_E"))
instances_relations.append(("Vitamin", "Vitamin_D"))
instances_relations.append(("SportsEvent", "Olympic_Games"))
instances_relations.append(("Olympic_Games", "Olympics"))
instances_relations.append(("Olympics", "Olympic_Games"))
instances_relations.append(("Disease", "Pulmonary_Tuberculosis"))
instances_relations.append(("Tuberculosis", "Pulmonary_Tuberculosis"))
instances_relations.append(("Pulmonary_Tuberculosis", "Tuberculosis"))
instances_relations.append(("Cleric", "Doctor_Of_The_Church"))
instances_relations.append(("Hotel", "Motor_Hotel"))
instances_relations.append(("MeanOfTransportation", "Transportation_System"))
instances_relations.append(("Transportation_System", "MeanOfTransportation"))
instances_relations.append(("Musical", "Wind_Instrument"))
instances_relations.append(("Poem", "Heroic_Verse"))

In [95]:
concepts_des_requetes - instances_concepts

{'Acts_Of_The_Apostles',
 'Attention_Deficit_Disorder',
 'Basque_Homeland_And_Freedom',
 'Carbon_Paper',
 'Church_Service',
 'Control_Condition',
 'Credit_Card',
 'Film_Editing',
 'Gas_Constant',
 'Home_Plate',
 'Life_Sentence',
 'Life_Style',
 'Magnetic_Levitation',
 'Natural_Gas',
 None,
 'Personnel_Casualty',
 'Prostate_Gland',
 'Protective_Covering',
 'Savings_Bank',
 'Security_System',
 'Sexual_Activity',
 'Sexual_Intercourse',
 'Signal_Detection',
 'Star_Topology',
 'Terminus_Ad_Quem',
 'Voltaic_Pile',
 'Water_System',
 'Worldly_Concern'}

In [94]:
len(instances_concepts) + len(ontology_concepts)

244733

In [83]:
import pickle

pickle.dump(ontology_concepts.union(instances_concepts), open('data/concepts.pkl', 'wb'))
pickle.dump(instances_relations+ontology_relations, open('data/relations.pkl', 'wb'))

### Là je construis le graphe

In [96]:
import networkx as nx

G = nx.Graph()

G.add_nodes_from(ontology_concepts)
G.add_edges_from(ontology_relations)

G.add_nodes_from(instances_concepts)
G.add_edges_from(instances_relations)

In [97]:
list(nx.shortest_path(G, 'France', 'Africa'))
#list(G.neighbors('Bethesda_Softworks'))

['France', 'Country', 'PopulatedPlace', 'Continent', 'Africa']

In [None]:
plt.figure(figsize=(100,75))
plt.axis("off")
nx.draw_networkx(G)
plt.savefig("images/ontology.png")
plt.show()

In [120]:
"Death" in ontology_concepts

True

In [143]:
res = []

with open("data/topics-title.annotated.csv", 'r') as f:
    for line in f:
        numerorequete = line.split("\t")[0]
        kelbay = [numerorequete]
        for w in line.split("\t")[1].strip().split(" "):
            if "$#!" in w:
                tkt = w.replace("$#!", "").split(".")[0].capitalize()
                kelbay.append("$#!"+correspondance[tkt])
            else:
                kelbay.append(w)
        res.append(kelbay)

In [144]:
with open("data/topics-title.annotated_rectified.csv", "w") as f:
    for line in res:
        f.write(line[0]+"\t"+" ".join(line[1:])+"\n")