In [1]:
import re
import pickle

# Load the concept identifiers and the relations between them from disk.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
# `with` guarantees the file handles are closed (the bare open(...) calls leaked them).
with open("data/concepts.pkl", 'rb') as concepts_file:
    concepts = pickle.load(concepts_file)
with open("data/relations.pkl", "rb") as relations_file:
    relations = pickle.load(relations_file)

print("{} concepts, {} relations".format(len(concepts), len(relations)))

118194 concepts, 118422 relations


### Création du graphe

In [2]:
import networkx as nx

# Build an undirected graph of the concept hierarchy.
G = nx.Graph()

# Every concept becomes a node, including concepts that appear in no relation.
G.add_nodes_from(concepts)
# Every relation becomes an edge (presumably a pair of concept names -- confirm
# against the content of relations.pkl).
G.add_edges_from(relations)

In [3]:
# Sanity check of the graph: shortest chain of concepts linking "Thing" to "Pixar".
nx.shortest_path(G, "Thing", "Pixar")

['Thing', 'Organisation', 'Company', 'Pixar']

### Récupération des embeddings

In [4]:
from gensim.models.doc2vec import Doc2Vec

# Concept embeddings, stored as a Doc2Vec model.
conceptsEmbeddings = Doc2Vec.load("embeddings/concepts")
# NOTE(review): the positional True is presumably `replace=True` (normalise the
# vectors in place and discard the raw ones) -- confirm against the gensim version in use.
conceptsEmbeddings.init_sims(True)

from gensim.models import Word2Vec

# Word embeddings, stored as a Word2Vec model.
wordEmbeddings = Word2Vec.load('embeddings/model_1')
# Same in-place treatment as for the concept embeddings above.
wordEmbeddings.init_sims(True)

In [5]:
queries = {}

from gensim.parsing.preprocessing import preprocess_string,remove_stopwords,strip_numeric, strip_tags, strip_punctuation, strip_short, strip_multiple_whitespaces
from krovetzstemmer import Stemmer
ks = Stemmer()

# Prefix that marks a token as a concept annotation rather than a query word.
CONCEPT_MARKER = "$#!"

# Normalisation applied to each query word before it is stored.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags,
                  strip_multiple_whitespaces, strip_punctuation, remove_stopwords, lambda x: ks.stem(x)]


def _pair_tokens_with_concepts(tokens):
    """Pair every query word with the concept annotation that follows it.

    Args:
        tokens: the query split on single spaces; annotation tokens contain
            ``CONCEPT_MARKER``.

    Returns:
        A list of ``(processed_word, annotation)`` tuples where ``annotation``
        is the raw annotation token, or ``""`` when the word has none. Words
        that are emptied by preprocessing (e.g. stopwords) are skipped.
    """
    pairs = []
    for i, w in enumerate(tokens):
        # The last token has no successor. Treat it as "not annotated" instead
        # of never visiting it, otherwise a trailing plain word is lost.
        following = tokens[i + 1] if i + 1 < len(tokens) else ""
        if CONCEPT_MARKER in following:
            concept = following
        elif CONCEPT_MARKER not in w:
            concept = ""
        else:
            # `w` is itself an annotation that was already attached to the
            # preceding word.
            continue
        processed = preprocess_string(w, CUSTOM_FILTERS)
        if processed:
            pairs.append((processed[0], concept))
    return pairs


with open("data/topics-title.annotated_rectified.csv", "r") as f:
    for line in f:
        fields = line.split("\t")
        if len(fields) < 2:
            # Blank or malformed line: no query text to parse.
            continue
        q_id = fields[0]
        query = fields[1].strip()
        # Only queries carrying at least one annotation are kept.
        if CONCEPT_MARKER in query:
            queries[q_id] = _pair_tokens_with_concepts(query.split(" "))
                
                    

In [6]:
# Inspect the (word, annotation) pairs parsed for topic 303.
queries['303']

[('hubble', ''),
 ('telescope', '$#!Telescope'),
 ('achievement', '$#!Accomplishment')]

In [7]:
import json

# Merge the annotated documents of every sub-collection into a single dict.
docs = {}
collections = ["FR94", "FT", "FBIS", "LATIMES"]
for collection in collections:
    collection_path = "data/annotatedrobust2004" + collection + ".json"
    with open(collection_path, "r") as f:
        collection_docs = json.load(f)
    docs.update(collection_docs)
print("docs chargés")

docs chargés


In [8]:
# Relevance judgments: topic id -> {'relevant': [doc ids], 'irrelevant': [doc ids]}.
paires = {}
with open("data/qrels.robust2004.txt","r") as f:
    for line in f:
        # split() with no argument tolerates tabs and repeated whitespace.
        l = line.split()
        if not l:
            # Skip blank lines instead of registering an empty topic id.
            continue
        judged = paires.setdefault(l[0], {'relevant': [], 'irrelevant': []})
        # The last column is a graded judgment. Any positive grade counts as
        # relevant, so grade 2 ("highly relevant") is no longer filed under
        # 'irrelevant' as the former `== '1'` test did.
        label = 'relevant' if int(l[-1]) > 0 else 'irrelevant'
        judged[label].append(l[2])
print("relevance chargé")

relevance chargé


In [14]:
import numpy as np
def hist(query, document):
    """Build the log-scaled matching histogram of each query row against a document.

    Args:
        query: 2-D array with one embedding per row.
        document: 2-D array with one embedding per row.

    Returns:
        An array with one row per query row and 30 columns: the dot products of
        that query row with every document row are binned into 30 equal-width
        bins over [-1, 1], and each count ``c`` is stored as ``log10(1 + c)``.
    """
    def _log_counts(similarities):
        # Histogram of one query row's similarities, log-scaled.
        counts, _ = np.histogram(similarities, bins=30, range=(-1, 1))
        return np.log10(1 + counts)

    similarity_matrix = np.dot(query, document.T)
    return np.apply_along_axis(_log_counts, 1, similarity_matrix)

In [None]:
# NOTE(review): this cell still references `self` and `custom_tokenizer`, which are
# not defined anywhere in the visible notebook; it cannot run on a fresh kernel
# until those names are rebound.

def _embed_text(text, word_vectors, dim=300):
    """Stack the embeddings of the in-vocabulary words of ``text``.

    Returns an array of shape (n_known_words, dim). The reshape keeps it 2-D
    even when no word is known, so the histogram step gets a (0, dim) matrix
    instead of failing on a shape mismatch.
    """
    vectors = [word_vectors[word] for word in text.split() if word in word_vectors]
    return np.array(vectors).reshape(-1, dim)


# Queries for which relevance judgments exist.
lol = [q for q in self.d_query.keys() if q in self.paires]
query_idf = {}
for id_requete in lol:
    query_idf[id_requete] = self.get_idf_vec(custom_tokenizer(self.d_query[id_requete]))
# `with` closes the file handle, which the bare open(...) call leaked.
with open("saved_data/query_idf.pkl", "wb") as idf_file:
    pickle.dump(query_idf, idf_file)
del query_idf

print("nombre de requetes: %d." % len(lol)) 
# Not used by the rest of this cell; kept in case a later cell relies on it.
tiascompris = list(self.docs.keys())
# Query 634 is excluded from generation (reason not visible here); guarded so a
# missing id does not raise ValueError.
if "634" in lol:
    lol.remove("634")

for id_requete in lol:
    # Collect the embeddings of the query words that are in the vocabulary.
    # Row i belongs to the i-th query word; unknown words keep an all-zero row.
    query_embeddings = np.zeros((self.max_length_query, 300))
    for i, word in enumerate(self.d_query[id_requete].split()):
        if word in self.model_wv:
            query_embeddings[i] = self.model_wv[word]

    interractions = []

    for pos in self.paires[id_requete]["relevant"]:
        # Read the document and build its interaction histogram with the query.
        pos_embeddings = _embed_text(self.docs[pos]['text'], self.model_wv)
        interractions.append(self.hist(query_embeddings, pos_embeddings)) # positive document

        # Draw one irrelevant document at random to pair with the positive one.
        neg = np.random.choice(self.paires[id_requete]["irrelevant"], 1, replace=False)[0]
        # The array conversion now happens once, after all words are collected.
        # It used to sit inside the word loop, which turned the list into an
        # ndarray after the first hit and made the next .append() fail.
        neg_embeddings = _embed_text(self.docs[neg]['text'], self.model_wv)
        interractions.append(self.hist(query_embeddings, neg_embeddings)) # negative document

    # Histograms are stored interleaved: positive, negative, positive, negative, ...
    np.save("saved_data2/"+id_requete+"_interractions.npy", np.array(interractions))
    # Reported once per query, not once per relevant document.
    print("requete %s complete." % id_requete)
print("data completed")

In [45]:
#paires['301']['relevant']

In [9]:
# Inspect the (word, annotation) pairs parsed for topic 301.
queries["301"]

[('international', '$#!International'),
 ('organized', ''),
 ('crime', '$#!Crime')]

In [10]:
# List the concept annotations found in one document. Annotations that have a
# concept embedding are printed indented by tabs; the others are printed flush left.
# The known-concept names are collected into a set once, so each lookup is
# constant-time instead of scanning the whole `index2entity` sequence per token.
known_concepts = set(conceptsEmbeddings.docvecs.index2entity)
for word in docs['FBIS3-10082']['text'].strip().split(" "):
    if "$#!" in word:
        if word.replace("$#!", "") in known_concepts:
            print("\t\t\t\t\t", word)
        else:
            print(word)

					 $#!Language
$#!British_Forces_Broadcasting_Service
$#!Government
$#!Evidence_(law)
$#!Mafia
$#!Caporegime
$#!Justice
$#!Trade
$#!Mafia
					 $#!Colombia
$#!Justice
$#!Government
					 $#!The_Doors
					 $#!Gustavo_de_Greiff
$#!In_Touch_(radio_series)
$#!News_media
$#!Junius_P._Rodriguez
$#!President_of_France
$#!Terrorism
$#!Authority
$#!Process_philosophy
$#!Evidence
$#!Mafia
$#!Indictment
$#!Warrant_(law)
$#!Arrest
