In [None]:
import math
import numpy as np
import pandas as pd
import nltk 
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw

# Download the NLTK 'punkt' and 'stopwords' resources; the stopword corpus is
# read on the next statement.
nltk.download('punkt')
nltk.download('stopwords')
# Portuguese stopword list; parse() discards any token found in it.
stopwords = sw.words('portuguese')

# Load the corpus CSV from the URL and replace missing values (NaN) with ''.
data_url="https://raw.githubusercontent.com/liraop/recinfo_lab2/master/data/results.csv"
data = pd.read_csv(data_url).replace(np.nan, '', regex=True)
# Number of non-NA entries in the `text` column; used as the corpus size by
# get_idf(), bin_document_vector() and tf_document_vector().
documents = data.text.count()
# Alias of `documents`; not referenced by any code visible in this section.
N = documents 

def parse(text):
    """Tokenize ``text`` and return the tokens kept for indexing.

    Two tokenizers are applied one after the other: one for runs of word
    characters and one for four-digit sequences. Their surviving tokens are
    concatenated in that order. A token survives when it is longer than
    three characters and is not in the module-level ``stopwords`` list.

    Note: a standalone four-digit number matches both patterns, so it appears
    twice in the returned list.

    Args:
        text: the string to tokenize.

    Returns:
        list of kept tokens (word-pattern tokens first, then year-pattern tokens).
    """
    tokenizers = (rpt(r'\w+'), rpt(r'\d{4}'))

    kept = []
    for tokenizer in tokenizers:
        kept += [
            candidate
            for candidate in tokenizer.tokenize(text)
            if len(candidate) > 3 and candidate not in stopwords
        ]
    return kept


def get_idf(index, n_docs=None):
    """Store an inverse-document-frequency weight on every term of ``index``.

    For each term the value ``log((n_docs + 1) / df)`` is written in place
    under the ``'idf'`` key of that term's postings dict, where ``df`` is the
    number of documents listed in the postings.

    Args:
        index: mapping ``term -> {doc_id: term_frequency, ...}``; mutated in place.
        n_docs: corpus size. When omitted, the module-level ``documents`` is used.

    Returns:
        None.
    """
    if n_docs is None:
        n_docs = documents
    m = n_docs + 1

    for postings in index.values():
        # A previously stored 'idf' entry lives in the same dict as the doc ids;
        # leave it out of the document count so calling this twice gives the
        # same weights instead of inflating df by one.
        k = len(postings) - ('idf' in postings)
        postings['idf'] = math.log(m / k)
         

def build_index(dataset):
    """Build the inverted index for the documents in ``dataset.text``.

    Documents are numbered from 1 in iteration order. Each entry is tokenized
    with ``parse`` and every token occurrence increments that token's count
    for the current document. After all documents are processed ``get_idf``
    is called on the index.

    Args:
        dataset: object whose ``text`` attribute iterates over document strings.

    Returns:
        dict mapping ``term -> {doc_id: occurrences, ...}`` (plus whatever
        ``get_idf`` adds to each postings dict).
    """
    inverted = {}

    for doc_id, body in enumerate(dataset.text, start=1):
        for term in parse(body):
            # First sighting of a term creates its postings dict; first sighting
            # within this document starts the count at 1.
            postings = inverted.setdefault(term, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1

    get_idf(inverted)
    return inverted
                        
# Build the inverted index from the loaded dataset when this cell runs.
index = build_index(data)

# Sample single-term queries; not consumed by any code visible in this section.
queries = ["juíza","federal","governo","Brasil","presidente"]


In [None]:
def bin_query_vector(index, query):
    """Build the binary (presence/absence) query vector over the index vocabulary.

    Args:
        index: iterable of vocabulary terms (iterating the index dict yields them);
            its iteration order defines the vector positions.
        query: whitespace-separated query string. Terms are compared verbatim;
            they are not passed through ``parse``.

    Returns:
        list with one entry per term of ``index``: 1 if the term occurs in the
        query, 0 otherwise.
    """
    # Split the query once, outside the loop, and use a set so each vocabulary
    # term is checked in constant time instead of re-splitting and scanning
    # the query for every term.
    query_terms = set(query.split())

    return [1 if word in query_terms else 0 for word in index]

def bin_document_vector(index):
    """Build one binary term-presence vector per document.

    Document ids run from 1 to the module-level ``documents`` inclusive. Each
    row has one entry per term of ``index`` (in index iteration order): 1 when
    the document id is a key of that term's postings, 0 otherwise.

    Args:
        index: mapping ``term -> postings dict`` keyed by document id.

    Returns:
        list of rows, one per document id, each a list of 0/1 values.
    """
    return [
        [1 if doc_id in index[ngram] else 0 for ngram in index]
        for doc_id in range(1, documents + 1)
    ]
                

def similarity(query_vector, doc_vector, per_document=False):
    """Dot-product similarity between a query vector and document vectors.

    By default the dot products of the query with every document are folded
    into a single total, which cannot be used to rank documents. Pass
    ``per_document=True`` to get one score per document instead.

    Args:
        query_vector: sequence of query weights, indexed by term position.
        doc_vector: iterable of document vectors (one sequence per document).
            Each document vector must not be longer than ``query_vector``.
        per_document: when True, return the list of per-document dot products
            in the order of ``doc_vector``; when False (default), return their
            grand total.

    Returns:
        A single number (default) or a list of numbers, one per document.

    Raises:
        IndexError: if a document vector is longer than ``query_vector``.
    """
    sim = 0
    scores = []

    for d in doc_vector:
        score = 0
        for i in range(len(d)):
            product = query_vector[i] * d[i]
            score += product
            # The grand total is accumulated product by product so that the
            # default (scalar) result keeps the same floating-point summation
            # order regardless of the per-document bookkeeping.
            sim += product
        scores.append(score)

    if per_document:
        return scores
    return sim

def binary_vsm(index, query):
    """Score ``query`` against the indexed documents using binary vectors.

    Builds the binary query vector, then the binary document vectors, and
    returns whatever ``similarity`` computes from the two.

    Args:
        index: inverted index passed to both vector builders.
        query: whitespace-separated query string.

    Returns:
        The result of ``similarity(query_vector, document_vectors)``.
    """
    return similarity(
        bin_query_vector(index, query),
        bin_document_vector(index),
    )

In [None]:
def tf_document_vector(index):
    """Build one TF-IDF weighted vector per document.

    Document ids run from 1 to the module-level ``documents`` inclusive. For
    each term of ``index`` (in index iteration order) the row holds the term's
    count in that document multiplied by the term's stored ``'idf'`` value, or
    0 when the document id is not in the term's postings.

    Args:
        index: mapping ``term -> postings dict`` with document-id keys and an
            ``'idf'`` entry.

    Returns:
        list of rows, one per document id, each a list of weights.
    """
    rows = []

    for doc_id in range(1, documents + 1):
        row = [
            postings[doc_id] * postings['idf'] if doc_id in postings else 0
            for postings in index.values()
        ]
        rows.append(row)

    return rows

def tf_query_vector(index, query):
    """Build the term-frequency query vector over the index vocabulary.

    Args:
        index: iterable of vocabulary terms; its iteration order defines the
            vector positions.
        query: whitespace-separated query string, compared verbatim.

    Returns:
        list with one entry per term of ``index``: how many times that term
        occurs among the query's whitespace-separated words.
    """
    terms = query.split()
    return [terms.count(ngram) for ngram in index]

def tf_vsm(index, query):
    """Score ``query`` against the indexed documents using TF / TF-IDF vectors.

    Builds the term-frequency query vector, then the TF-IDF document vectors,
    and returns whatever ``similarity`` computes from the two.

    Args:
        index: inverted index passed to both vector builders.
        query: whitespace-separated query string.

    Returns:
        The result of ``similarity(query_vector, document_vectors)``.
    """
    return similarity(
        tf_query_vector(index, query),
        tf_document_vector(index),
    )