<a href="https://colab.research.google.com/github/ivynasantino/mineracao-de-dados/blob/master/avaliacao_de_sistemas_RI/reports/avaliacao_de_sistemas_RI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Avaliação de sistemas de recuperação da informação

In [6]:
# @title Imports
import pandas as pd
import math

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Antes de começar a codificar, precisamos importar os dados. Dessa vez, iremos manipular dois tipos de arquivos, o csv com as informações coletadas pelo crawler no jornal El país e um json com resultados de pesquisa de relevância do Google.

In [0]:
# @title Dataframe: El país
elpais = pd.read_csv("https://raw.githubusercontent.com/ivynasantino/mineracao-de-dados/master/02-processamento_de_texto/data/results.csv")

In [0]:
# @title Dataframe: Gabarito
gabarito = pd.read_json("https://raw.githubusercontent.com/ivynasantino/mineracao-de-dados/master/avaliacao_de_sistemas_RI/results_final.json")

### Funções auxiliares

Para darmos início as implementações, separei funções auxiliares para facilitar a manipulação das estruturas e deixar o código modularido.

In [0]:
def num_of_docs():
  return len(elpais['text'])

In [0]:
def top10(r):
  return sorted(r, key = lambda x: x[1], reverse = True)[:10]

In [0]:
toker = RegexpTokenizer(r'\w+')
stopwords = stopwords.words("portuguese")

Após isso, iremos repetir algoritmos desenvolvidos em notebooks anteriores para construção de índice invertido e as versões de modelos vetoriais.

### Índice invertido

In [0]:
def buildIndex():
  index = {}
  n = 0
  for doc in elpais.text:
    n += 1
    tokens = [token for token in toker.tokenize(doc.lower()) 
              if token not in stopwords 
              and len(token) > 3
              and not bool(re.search(r'\d', token))]
 
    for t in tokens:
      freq = tokens.count(t)
      if t not in index:
        index[t] = {}
      if n not in index[t]:
        index[t][n] = freq
  return index

In [0]:
index = buildIndex()

In [0]:
for post in index:
  k = len(index[post])
  idf = math.log(num_of_docs() + 1) / k
  index[post]['idf'] = round(idf, 3)

In [0]:
vocabulary = index.keys()

### Modelos vetoriais

In [0]:
def binary_rep(query):
  terms = query.split()
  tokens = [token for token in toker.tokenize(doc.lower()) 
              if token not in stopwords 
              and len(token) > 3
              and not bool(re.search(r'\d', token))]

  q = {}
  d = {}
  
  for term in terms:
    q[term], d[term] = 0,0
    if term in vocabulary:
      q[term] = 1
    if term in tokens:
      d[term] = 1
  
  m = 0
  for term in terms:
    if q[term] != 0 and d[term] != 0:
      m += q[term] * d[term]
  return m

In [0]:
def tf_rep(query):
  terms = query.split()
  tokens = [token for token in toker.tokenize(doc.lower()) 
              if token not in stopwords 
              and len(token) > 3
              and not bool(re.search(r'\d', token))]
  
  q = {}
  d = {}
  
  for term in terms:
    q[term], d[term] = 0,0
    if term in vocabulary:
      q[term] = terms.count(term)
      
    if term in tokens:
      d[term] = tokens.count(term)
  
  m = 0
  for term in terms:
    if q[term] != 0 and d[term] != 0:
      m += q[term] * d[term]
      
  return m

In [0]:
def tf_idf_rep(query):
  terms = query.split()
  tokens = [token for token in toker.tokenize(doc.lower()) 
              if token not in stopwords 
              and len(token) > 3
              and not bool(re.search(r'\d', token))]
  
  q = {}
  d = {}
  
  for term in terms:
    q[term], d[term] = 0,0
    if term in vocabulary:
      q[term] = terms.count(term)
      
    if term in tokens:
      d[term] = tokens.count(term)
  
  m = 0
  for term in terms:
    idf = index[term]['idf']
    if q[term] != 0 and d[term] != 0:
      m += q[term] * d[term] * idf
      
  return round(m , 2)

In [0]:
def bm25_rep(query, k):
  terms = query.split()
  tokens = [token for token in toker.tokenize(doc.lower()) 
              if token not in stopwords 
              and len(token) > 3
              and not bool(re.search(r'\d', token))]
  mtd = []
  for term in terms:
    if term in tokens:
      mtd.append(term)
  
  m = 0
  for mt in mtd:
    cq = terms.count(mt)
    cd = tokens.count(mt)
    m = num_of_docs()
    dw = len(index[mt].keys()) - 1
    m += cq * (((k + 1) * cd) / (cd + k)) * math.log((m + 1) / dw)
    
  return round(m, 2)

Após a construção das versões, iremos plotar a tabela com os resultados de cada modelo.

In [0]:
document = "jair bolsonaro"

In [0]:
query = ['ditadura militar', 'política', 'golpe militar']

In [0]:
data = {
    'query': [],
    'binary': [],
    'tf': [],
    'tf_idf': [],
    'bm25': []
}

for q in query:
  binary, tf, tf_idf, bm25 = [], [], [], []
  n = 0

  for doc in elpais.text:
    binary.append((n ,binary_rep(q)))
    tf.append((n, tf_rep(q)))
    tf_idf.append((n, tf_idf_rep(q)))
    bm25.append((n, bm25_rep(q, 10)))
    n += 1

  data['query'].append(q)
  data['binary'].append(top10(binary))
  data['tf'].append(top10(tf))
  data['tf_idf'].append(top10(tf_idf))
  data['bm25'].append(top10(bm25))

In [22]:
pd.options.display.max_colwidth = 100
pd.DataFrame(data)

Unnamed: 0,query,binary,tf,tf_idf,bm25
0,ditadura militar,"[(0, 2), (2, 2), (5, 2), (6, 2), (24, 2), (62, 2), (94, 2), (103, 2), (113, 2), (114, 2)]","[(6, 25), (114, 18), (215, 16), (222, 16), (237, 16), (229, 15), (2, 13), (24, 13), (94, 12), (1...","[(6, 4.06), (237, 2.81), (215, 2.65), (229, 2.54), (114, 2.5), (222, 2.19), (94, 2.05), (164, 2....","[(114, 258.78), (222, 258.39), (24, 257.96), (6, 257.49), (2, 256.97), (207, 256.97), (103, 255...."
1,política,"[(0, 1), (4, 1), (6, 1), (7, 1), (14, 1), (18, 1), (19, 1), (22, 1), (23, 1), (24, 1)]","[(68, 9), (165, 7), (234, 7), (203, 6), (62, 5), (137, 5), (151, 5), (164, 5), (172, 5), (6, 4)]","[(68, 0.57), (165, 0.44), (234, 0.44), (203, 0.38), (62, 0.32), (137, 0.32), (151, 0.32), (164, ...","[(68, 254.5), (165, 253.78), (234, 253.78), (203, 253.35), (62, 252.87), (137, 252.87), (151, 25..."
2,golpe militar,"[(0, 2), (1, 2), (2, 2), (24, 2), (82, 2), (98, 2), (113, 2), (119, 2), (150, 2), (164, 2)]","[(24, 23), (2, 15), (207, 14), (222, 14), (114, 12), (6, 9), (164, 8), (215, 8), (229, 8), (0, 7)]","[(24, 4.39), (2, 2.66), (207, 2.41), (222, 2.0), (165, 1.62), (164, 1.59), (229, 1.46), (114, 1....","[(114, 258.78), (222, 258.39), (24, 257.96), (6, 257.49), (2, 256.97), (207, 256.97), (103, 255...."
