In [87]:
import numpy as np
from collections import Counter
import porter as p
import re
from Weighter import Weighter1
from IRModel import Vectoriel
from Parser import Parser
from IndexerSimple import IndexerSimple

- Données test (Docs / Requêtes)

In [88]:
docs=["the new home has been saled on top forecasts",
     "the home sales rise in july",
     "there is an increase in home sales in july",
     "july encounter a new home sales rise"]

reqs = ["top sales",
       "sales increase july",
       "new home"]

stopWords=["the","a","an","on","behind","under","there","in"]

index = {}
for i in range(len(docs)):
    index[i] = dict(Counter(map(p.stem, [word for word in (str.lower(docs[i])).split() if word not in stopWords])))           
            
indexInverse = {}
for numDoc, dico in index.items():
    for word, tf in dico.items():
        if(word not in indexInverse):
            indexInverse[word]= {}
        indexInverse[word][numDoc] = tf

- Pertinence des requêtes

In [89]:
pertiReq = {0:{0},      #doc1 pertinent pour req1
           1:{1,2},     #doc2 et 3 pertinent pour req2 (doc2 plus pertinent que doc3)
           2:{}}        #aucuns docs pertinents pour req3

In [90]:
if 1 in pertiReq[0]:
    print("hello")

## Exercice 1.

- modèle booléen

In [91]:
def booleen(query):
    req = list(np.unique(list(map(p.stem, query.split())))) 
    #représentation du résultat comme unensemble de documents
    res=set(index)
    for stem in req:
        res=res.intersection(indexInverse[stem])#On récupère l'intersection des documents contenant un mot de la requete
    return res

In [92]:
booleen(reqs[1])

{2}

- modele vectoriel

In [93]:
#modele vectoriel
def vect(query):
    nbDoc = 4
    req = list(np.unique(list(map(p.stem,query.split()))))
    res = [indexInverse[i] for i in req]
    doc = []
    doc_res = np.zeros(nbDoc)
    for w in res:
        doc.append(set(w.keys()))
    for w in doc:
        for d in w:
            doc_res[d]+=1
    return doc_res

#### Precision (capacité à ne retourner que des docs pertinents)

In [94]:
def preci(idQuery, modele = 0):
    if modele == 0:
        docs_retournés = booleen(reqs[idQuery])
    elif modele == 1:
        docs_retournés = vect(reqs[idQuery])
    pert = set(pertiReq[idQuery])     #docs jugés pertinents
    tp = docs_retournés.intersection(pert)  #True positifs (R & P)
    fp = docs_retournés.difference(preci)   #False positifs (R-P)
    if len(tp) == 0 & len(fp) == 0:
        return 0
    return len(tp)/(len(tp)+len(fp))     #mesure de précision

In [95]:
def preciRangk(rang,idQuery,score,jugement):
    """ score: dict() -> {idDoc: scoreDoc} dépend du modèle de poids adopté
        (trié par score décroissant)
        jugement : pertinence des requêtes"""
    k = rang
    res = 0
    for i in k:
        if score[i] in jugement[idQuery]:
            res+=1
    return res/k

#### Rappel (capacité à retourner tous les docs pertinents)

In [96]:
def rappel(idQuery, modele = 0):
    if modele == 0:
        docs_retournés = booleen(reqs[idQuery])
    elif modele == 1:
        docs_retournés = set(vect(reqs[idQuery]))
    pert = set(pertiReq[idQuery])      #docs jugés pertinents
    tp = docs_retournés.intersection(pert)    #True positifs (R & P)
    fn = pert.difference(docs_retournés)      #False negatifs (P-R)
    if len(tp) == 0 & len(fn) == 0:
        return 0
    return len(tp)/(len(tp)+len(fn))      #mesure de rappel

In [97]:
def rappelRangk(rang,idQuery,score,jugement):
    """ score: dict() -> {idDoc: scoreDoc} dépend du modèle de poids adopté
        (trié par score décroissant)
        jugement : pertinence des requêtes"""
    k = rang
    res = 0
    for i in k:
        if score[i] in jugement[idQuery]:
            res+=1
    return res/len(jugement[idQuery])

In [98]:
rappel(2,1)

0

In [99]:
def fMesure(idQuery, rang):
    r = rappel(idQuery)
    p = preci(idQuery)
    fact1 = 1+rang**2
    fact2 = (p*r)/(p*(rang**2)+r)
    return fact1*fact2

## Exercice 2.

## 1. Chargement des requêtes et de leur docs pertinents

In [100]:
class Query():
    def __init__(self, idQuery):
        self.identifiant = int(idQuery)
        self.texte = ""
        self.docsPertinents = []
     
    #GETTERS
    def getIdentifiant(self):
        return self.identifiant
        
    def getTexte(self):
        return self.texte
        
    def getDocspertinents(self):
        return self.docsPertinents
    
    # SETTERS
    def setTexte(self, texte):
        self.texte = texte
        
    def setDocspertinents(self, docs):
        self.docsPertinents = docs
        
        
    # METHODES
    def addTexte(self, texte):
        self.texte += texte
        
    def addDocspertinents(self,doc):
        self.docsPertinents.append(doc)

In [101]:
class ParserQuery():
    
    def parseQRY(chemin):
        """ 
        Fonction permettant de parser les fichers QRY (requêtes avec leurs identifiants et leur texte)
        """
        file = open(chemin, 'r') 

        res = {}
        currentI = None
        currentBalise = None 
        currentQuery = None

        while True:
            #lis une seule ligne
            line = file.readline()

            #si ligne vide, fin du fichier
            if not line:
                break

            #récupère la ligne sous forme de mots
            words=line.split()

            #Si la ligne n'est pas vide
            if(len(words)>0):
                #Test si on est sur une balise et laquelle
                if(words[0]==".I"):

                    if(currentQuery != None):
                        #J'enregistre la requete courante avant d'en créer une autre
                        res[currentQuery.getIdentifiant()] = currentQuery 

                    del currentQuery
                    currentQuery = Query(words[1])# Création d'une requete avec son identifiant
                    currentI = words[1]
                    currentBalise = 'I' 

                elif(words[0]==".W"):
                    currentBalise='W' #J'indique que je suis danc une balise W
                elif(words[0][0]=='.'): 
                    currentBalise='unknown' #J'indique que je suis dans une balsie inconnue
                else: 
                    #On est dans le contenu d'une balise
                    if(currentBalise=='W'):
                        currentQuery.addTexte(line)#J'ajoute la ligne au texte de la requête

        #J'enregistre la requête courante avant de quitter
        res[currentQuery.getIdentifiant()] = currentQuery

        file.close()
        return res
    
    
    def parseREL(chemin, reqs):
        
        file = open(chemin, 'r') 
        
        while True:
            #lis une seule ligne
            line = file.readline()

            #si ligne vide, fin du fichier
            if not line:
                break
            words=line.split()
            
            if int(words[0][0]) == 0:
                docPertinent = int(re.split('^0*',words[1])[1])
                reqs[int(words[0][1])].addDocspertinents(docPertinent)
            else:
                docPertinent = int(re.split('^0*',words[1])[1])
                reqs[int(words[0])].addDocspertinents(docPertinent)
                     
        file.close()
        return reqs

In [102]:
req1 = ParserQuery.parseQRY('cacm.qry')
req2 = ParserQuery.parseREL('cacm.rel',req1)

In [103]:
req2[1].getTexte()

' What articles exist which deal with TSS (Time Sharing System), an\noperating system for IBM computers?\n'

## 2. Métriques

In [104]:
class EvalMesure():
    
    def __init__(self):
        self.query = None
        
    def evalQuery(self,liste):
        pass
    
    #GETTERS
    def getQuery(self):
        return self.query
    
    #SETTERS
    def setQuery(self, query):
        self.query = query

In [105]:
class Precision(EvalMesure):
    
    def __init__(self,k):
        
        super().__init__()
        self.rang = k #rang de précision
        
    def evalQuery(self, scores):
        """ scores: liste() -> [idDoc] dépend du modèle de poids adopté
        (trié par score décroissant)"""
        res = 0
        for i in range(self.rang):
            if scores[i] in self.query.getDocspertinents():
                res+=1
        return res/self.rang

In [106]:
class Rappel(EvalMesure):
    
    def __init__(self,k):
        
        super().__init__()
        self.rang = k #rang de rappel
        
    def evalQuery(self, scores):
        """ scores: liste() -> [idDoc] dépend du modèle de poids adopté
        (trié par score décroissant)"""
        res = 0
        for i in range(self.rang):
            if scores[i] in self.query.getDocspertinents():
                res+=1
        return res/len(self.query.getDocspertinents())

In [241]:
class NGCD(EvalMesure):
    
    def __init__(self,k):
        
        super().__init__()
        self.rang = k #nombre de résultats à considérer
        
    def evalQuery(self, scores):
        """ scores: liste() -> [idDoc] dépend du modèle de poids adopté
        (trié par score décroissant)"""
        
        #relevance of returned docs above
        rel = [1 if d in self.query.getDocspertinents() else 0 for d in scores]
        nbRel = len(self.query.getDocspertinents()) #Nombre de dcos pertinents pour la requete
        import ipdb; ipdb.set_trace()
        if nbRel < self.rang:
            r = nbRel
        else:
            r = self.rang
        
        #sorted relevance rank
        sorted_rel = np.sort(rel, axis=0)[::-1]
        metric = np.log([2+i for i in range(r)])
        dcg = np.sum(rel[:r]/metric)
        dcg_max = np.sum(sorted_rel[:r]/metric)
        
        if not dcg_max:
            return 0
        else:
            return dcg/dcg_max

In [108]:
class AP(EvalMesure):
    """Avg precision"""
    
    def __init__(self,k):
        
        super().__init__()
        self.rang = k
        
    def evalQuery(self, scores):
        """ calcul de la preci moyenne"""
        return np.mean([self.preci(scores,k) for k in range(self.rang)])
     
    def preci(self, scores, k):
        """ scores: liste() -> [idDoc] dépend du modèle de poids adopté
        (trié par score décroissant)"""
        res = 0
        for i in range(k):
            if scores[i] in self.query.getDocspertinents():
                res+=1
        return res/k

## Test

In [145]:
print(reqs)
print(docs)
print(pertiReq)

['top sales', 'sales increase july', 'new home']
['the new home has been saled on top forecasts', 'the home sales rise in july', 'there is an increase in home sales in july', 'july encounter a new home sales rise']
{0: {0}, 1: {1, 2}, 2: {}}


In [146]:
#Test query
reqs_test = [Query(idQuery, ) for idQuery, docsPerti in pertiReq.items()]
for query in reqs_test:
    queryId = query.getIdentifiant()
    query.setTexte(reqs[queryId])
    query.setDocspertinents(list(pertiReq[queryId]))

In [147]:
# IR Model
weighter = Weighter1(index, indexInverse)
vect = Vectoriel(weighter, True)
q = reqs_test[1]
q_text = q.getTexte()
scores = vect.getScores(q_text)
ranking = vect.getRanking(scores) #Liste des docs pertinent dans l'ordre décroissant de pertinence
print(ranking)

[54, 2508, 1969, 1427, 3006, 2394, 2317, 1214, 3108, 2177, 1052, 3017, 1643, 2516, 1014, 1117, 46, 2839, 1465, 2421, 2395, 103, 2493, 1030, 1638, 1713, 1884, 3045, 2913, 1325, 2652, 1154, 1516, 2937, 3111, 796, 2302, 243, 1403, 3030, 2004, 2207, 678, 2019, 3090, 1915, 1433, 1997, 2187, 2374, 2140, 2816, 2246, 1265, 1165, 3565, 2751, 3284, 1408, 1769, 2739, 2820, 4036, 2929, 2938, 3513, 2455, 1497, 1740, 1845, 1886, 2826, 2856, 2859, 1933, 1155, 3459, 2846, 2828, 1454, 2985, 4164, 1719, 1811, 2297, 3033, 1879, 1641, 3832, 2798, 3541, 4070, 2911, 4117, 3248, 2033, 4019, 40, 2669, 3091, 3714, 3707, 3394, 3552, 3931, 4062, 4133, 3288, 3940, 4115, 4096, 3601, 3496, 3396, 3455, 4119, 4069, 3410, 3937, 3981, 3337, 3415, 3850, 4016, 3307, 3933, 4030, 3301, 4194, 3624, 3427, 3550, 3346, 3789, 3530, 3580, 4152, 3679, 3686, 3848, 4168, 3329, 3590, 3509, 3704, 3289, 3461, 3304, 4135, 3358, 3426, 3412, 3591, 3474, 3436, 3668, 3980, 3235, 3767, 3953, 3982, 3725, 4090, 3742, 3499, 4151, 4011, 3684, 3

In [112]:
#Eval mesure : NGCD
k=3 #rang
evalMesure = NGCD(k)
evalMesure.setQuery(reqs_test[1]) #Query to consider
evalMesure.evalQuery(ranking)

1.0

## 3. Plateforme d'évaluation

In [242]:
class EvalIRModel():
    
    def __init__(self, path):
        
        self.irModels = [] #(Liste)Objets IRModel (Vectoriel, Okapi, modèle de langue ...)
        self.evalMesures = [] #(Liste)Objets EvalMesure
        self.index = None #index des docs
        self.indexInverse = None #index inverse des docs
        self.querys = None #liste d'objets requêtes parsées (avec docs pertinents)
        self.path = path #cacm ou cisi
        
    def evaluationSimple(self,idIRModel, idEvalMesure):
        model, mesure = self.irModels[idIRModel], self.evalMesures[idEvalMesure]
        evals = []
        for idq,query in self.querys.items():
            mesure.setQuery(query)
            query_text = query.getTexte()
            scores = model.getScores(query_text)
            ranking = model.getRanking(scores)
            evals.append(mesure.evalQuery(ranking))
        return evals
    
    def evaluationQuery(self,idIRModel, idEvalMesure, idQuery):
        model, mesure, query = self.irModels[idIRModel], self.evalMesures[idEvalMesure], self.querys[idQuery]
        mesure.setQuery(query)
        query_text = query.getTexte()
        scores = model.getScores(query_text)
        ranking = model.getRanking(scores)
        evaluation = mesure.evalQuery(ranking)
        return evaluation
    
    def parseQuery(self):
        """Parse querys"""
        reqPath = self.path+'.qry'
        relPath = self.path+'.rel'
        r = ParserQuery.parseQRY('cacm.qry')
        querys = ParserQuery.parseREL('cacm.rel',r)
        self.querys = querys
        
    def parseDoc(self):
        """ Parse and index docs collection"""
        docsPath = self.path+'.txt'
        collecCacm = Parser.parseCacmCisi('cacm.txt')
        indexer=IndexerSimple(collecCacm)
        indexer.indexation()
        self.index, self.indexInverse = indexer.getIndex(), indexer.getIndexInv()
        
    def addModels(self, models):
        if type(models) == list:
            for m in models:
                self.irModels.append(m) 
        else:
            self.irModels.append(models)
            
    def addMesures(self, mesures):
        if type(mesures) == list:
            for m in mesures:
                self.evalMesures.append(m)
        else:
            self.evalMesures.append(mesures)
            
    def getQuerys(self):
        return self.querys
    
    def getIRModels(self):
        return self.irModels
    
    def getEvalMesures(self):
        return self.evalMesures
    
    def getIndex(self):
        return self.index
    
    def getIndexinverse(self):
        return self.indexInverse
    

## Test EvalIRModel (CACM)

In [243]:
path = 'cacm'

#Init evaluation
evalIRModel = EvalIRModel(path)

#Query/ Docs parsing (index creation)
evalIRModel.parseQuery()
evalIRModel.parseDoc()

#IRModels
#Index, IndexeInv
indexCacm, indexInvCacm = evalIRModel.getIndex(), evalIRModel.getIndexinverse()
weighter = Weighter1(indexCacm, indexInvCacm)
irModel1 = Vectoriel(weighter, True)
evalIRModel.addModels(irModel1)

# Eval Mesure
k=3 #rang
evalMesure1 = NGCD(k)
evalIRModel.addMesures(evalMesure1)

In [215]:
# Evaluation 
evalIRModel.evaluationSimple(0,0)

In [244]:
# eval Query
evalIRModel.evaluationQuery(0,0,1)

> [1;32m<ipython-input-241-c5caa26b91df>[0m(16)[0;36mevalQuery[1;34m()[0m
[1;32m     15 [1;33m        [1;32mimport[0m [0mipdb[0m[1;33m;[0m [0mipdb[0m[1;33m.[0m[0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 16 [1;33m        [1;32mif[0m [0mnbRel[0m [1;33m<[0m [0mself[0m[1;33m.[0m[0mrang[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     17 [1;33m            [0mr[0m [1;33m=[0m [0mnbRel[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> nbRel
5
ipdb> rel
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

BdbQuit: 

In [232]:
evalIRModel.getQuerys()[1].getDocspertinents()

[1410, 1572, 1605, 2020, 2358]

In [None]:
sorted_rel = np.sort(rel, axis=0)[::-1]
metric = np.log([2+i for i in range(r)])
dcg = np.sum(rel[:r]/metric)
dcg_max = np.sum(sorted_rel[:r]/metric)

# autre

In [177]:
#Init irModel
weighter = Weighter1(indexTest, indexInverseTest)
irModel1 = Vectoriel(weighter, True)

In [178]:
type(irModel1)

IRModel.Vectoriel

In [148]:
#Parse + indexation Docs
collecCacm = Parser.parseCacmCisi('cacm.txt')
indexer=IndexerSimple(collecCacm)
indexer.indexation()
indexTest, indexInverseTest = indexer.getIndex(), indexer.getIndexInv()

In [149]:
#Parse querys
req1 = ParserQuery.parseQRY('cacm.qry')
querys = ParserQuery.parseREL('cacm.rel',req1)

In [150]:
#Init irModel
weighter = Weighter1(indexTest, indexInverseTest)
irModel1 = Vectoriel(weighter, True)

In [151]:
#Init evaluation modes
k=3 #rang
evalMesure1 = NGCD(k)

In [152]:
#EvalIRModel init

evalIRModel = EvalIRModel([irModel1],[evalMesure1],indexer,querys)
len(evalIRModel.getQuerys())
evalIRModel.evaluationSimple(0,0)

[0.0,
 0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.23463936301137822,
 0,
 0.0,
 0.0,
 0.23463936301137822,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.23463936301137822,
 0.0,
 0.0,
 0.0,
 0.7039180890341346,
 0.46927872602275644,
 0.0,
 0.29608191096586517,
 0.0,
 0,
 0,
 0.0,
 0.0,
 0.29608191096586517,
 0.23463936301137822,
 0.0,
 0,
 0.7039180890341346,
 0.0,
 0.0,
 0.46927872602275644,
 0,
 0,
 0.46927872602275644,
 0.0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1.0,
 0.5307212739772434,
 0.29608191096586517,
 0.0,
 0.7653606369886217,
 0.0,
 0.46927872602275644,
 0]

In [182]:
if type(0) == list:
    print("coucou")

In [176]:
type(0)

int

In [183]:
print(i) for i in range(6)

SyntaxError: invalid syntax (<ipython-input-183-f076fbdbe794>, line 1)

In [184]:
a = [0,1]

In [185]:
a.append([0,0])

In [186]:
a

[0, 1, [0, 0]]