## Continuación del proyecto - Parte 2 <p> Implementar una búsqueda TF-IDF

In [11]:
from collections import Counter
import itertools
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

### Construimos el vocabulario

In [12]:
# Collect the token list of every document in the corpus
tokenized_texts = [doc["tokenized_text"] for doc in data]

# Flatten, drop duplicates and sort.
# The position of a token in `vocab` is the position of its component in every
# TF-IDF vector built from it, so the order must be reproducible. A bare set of
# strings iterates in an order that can change between interpreter runs (string
# hash randomization); sorting pins it down.
vocab = sorted(set(itertools.chain.from_iterable(tokenized_texts)))

In [4]:
# TFIDF SE CALCULA COMO:

# TF = frecuencia del término en el documento (B) / longitud del documento
# IDF = log ( nº documentos / nº docs en los que aparece el término (C) )

# Entonces TFIDF = TF*IDF

In [5]:
# Necesitamos:
# Calcular cuántas veces aparece cada palabra en cada documento - B
# Calcular en cuántos documentos aparece cada palabra - C

In [17]:
# Per-document term counts - B.
# One Counter per document, in the same order as `data`; each maps
# token -> number of occurrences of that token in the document.
docs_token_counter = [Counter(entry["tokenized_text"]) for entry in data]


#for count, value in enumerate(docs_token_counter):
#    print(count, value)


In [18]:
# For each token in the corpus vocabulary, count in how many documents it occurs - C
# Every document adds 1 for each of its distinct tokens, so one pass over the
# documents replaces scanning all documents once per vocabulary token.
docs_per_token = Counter()
for token_counts in docs_token_counter:
    docs_per_token.update(token_counts.keys())

# Plain dict keyed in vocab order; a vocab token found in no document gets 0
number_docs_with_token = {token: docs_per_token[token] for token in vocab}

### Calcular TFIDFS

In [19]:
# IDF = log(number of documents / number of documents containing the token).
# It depends only on the corpus, so compute it once per token instead of once
# per (document, token) pair.
num_docs = len(data)
idf_by_token = {token: np.log(num_docs / number_docs_with_token[token]) for token in vocab}

# Build one TF-IDF vector per document, with components in vocab order.
# `docs_token_counter` holds one Counter per entry of `data`, in the same order.
for doc, token_counts in zip(data, docs_token_counter):
    num_tokens = len(doc["tokenized_text"])

    if num_tokens == 0:
        # A document without tokens has no term frequencies; store an all-zero
        # vector instead of dividing by zero.
        doc['tf_idf'] = [0.0] * len(vocab)
        continue

    # TF = occurrences of the token in the document / document length.
    # Counter returns 0 for tokens the document does not contain.
    # float() stores plain Python floats rather than NumPy scalars.
    doc['tf_idf'] = [
        float(token_counts[token] / num_tokens * idf_by_token[token])
        for token in vocab
    ]

# Save the documents, now carrying their computed TF-IDF vectors
with open('tfidf.json', 'w') as json_file:
    json.dump(data, json_file)

In [28]:
# NOTE: discarded first attempt at the TF-IDF step, kept for reference only.
# The whole cell is a single string literal, so none of the code inside it runs.
# It refers to `vocabulary` and `vocabulario`, which are not defined in the cells shown here.
'''
# Mi intento

def tfidf(vocab, tokenized_text): # diccionario, lista
    # calculamos el vector tfidf de cada documento
    tfidf = dict.fromkeys(vocabulary.word2count, 0)
    for token in tokenized_text:
        tf = tokenized_text.count(token) # fecuencia del término en el documento dado
        idf = 1/(1 + vocabulary.word2count[token]) # 1/(1 + frecuencia del término en la colección)
        tfidf[token] = tf*idf # los tokens no presentes en el documento actual se quedan con valor tfidf 0
    return tfidf
    
with open('tokenized.json', 'r') as jsonFile:
    data = json.load(jsonFile)
    for doc in data:
        doc['tfidf'] = tfidf(vocabulario, doc['tokenized_text']) # Nuevo campo con los vectores tfidf
        
with open('tfidf.json', 'w') as outfile:
    json.dump(data, outfile)
'''

### compute cosine similarities between the document Tf-Idf vectors and the query Tf-Idf vector.

In [20]:
# Example free-text query; passed to search_tfidf together with `data` further down
query = "which is the worst illness?"

In [25]:
# We suggest you use scikit-learn library and its cosine_similarity function

# reutilizamos la función process para tokenizar la consulta

# Reimplementamos el calculo de vector TFIDF para la consulta

def vectorize(query, vocab = vocab):
    """Build the TF-IDF vector of a free-text query over the corpus vocabulary.

    The query is tokenized with `process` (defined elsewhere in the notebook),
    then weighted with the same formula used for the documents: term frequency
    within the query times log(number of documents / documents containing the
    token). Document frequencies come from the module-level
    `number_docs_with_token` and the corpus size from `data`.

    Parameters
    ----------
    query : str
        Raw query text.
    vocab : list
        Tokens defining the vector components, in order. Defaults to the
        corpus vocabulary as it was when this function was defined.

    Returns
    -------
    list of float
        One weight per entry of `vocab`. Query tokens that are not in `vocab`
        contribute nothing. If the query yields no tokens at all, every
        component is 0.0.

    Raises
    ------
    KeyError
        If `vocab` contains a token missing from `number_docs_with_token`.
    """
    query_tokenized = process(query)
    num_tokens = len(query_tokenized)

    # A query that tokenizes to nothing has no term frequencies; return the
    # zero vector instead of dividing by zero.
    if num_tokens == 0:
        return [0.0] * len(vocab)

    query_token_counter = Counter(query_tokenized)
    num_docs = len(data)

    # Counter returns 0 for vocab tokens absent from the query
    return [
        query_token_counter[token] / num_tokens * np.log(num_docs / number_docs_with_token[token])
        for token in vocab
    ]

In [27]:
#función de búsqueda

def search_tfidf(query, docs):
    """Rank documents by cosine similarity between their TF-IDF vector and the query's.

    Parameters
    ----------
    query : str
        Raw query text; vectorized with `vectorize`.
    docs : iterable of dict
        Documents to search. Each must have a 'tf_idf' entry (a vector with the
        same length as the query vector); 'title' is read for documents that match.

    Returns
    -------
    list of dict
        One {'title': ..., 'rank': ...} entry per document whose similarity is
        greater than 0, sorted from most to least similar. 'rank' is a plain
        Python float.
    """
    # vectorize query as a single-row matrix, the shape cosine_similarity expects
    query_arr = np.array(vectorize(query)).reshape(1, -1)

    # Materialize so `docs` can be walked twice even if it is a one-shot iterable
    docs = list(docs)
    if not docs:
        return []

    # Score every document in one call: one row per document in, one score per document out
    doc_matrix = np.array([doc['tf_idf'] for doc in docs])
    scores = cosine_similarity(query_arr, doc_matrix)[0]

    # Keep only documents that share at least some weight with the query.
    # float() avoids leaking NumPy scalar types into the results.
    rankings = [
        {'title': doc['title'], 'rank': float(score)}
        for doc, score in zip(docs, scores)
        if score > 0
    ]

    # return sorted results, best match first
    return sorted(rankings, key=lambda hit: hit['rank'], reverse=True)

ranking = search_tfidf(query, data)


# NOTE: discarded earlier attempt at the search step, kept for reference only.
# The code is wrapped in a string literal, so none of it runs. Per the author's
# own note inside, it had problems with query words that are not in the vocabulary.
'''
mi intento: da problemas con las palabras de la consulta que NO están en el vocabulario
def search(document_tfidf, query): # document_tfidf es el vector tfidf del documento, query es una consulta de tipo cadena
    query_tfidf = list(tfidf(vocabulario, list(query.split())).values())
    document_ = list(document_tfidf.values())
    sklearn.metrics.pairwise.cosine_similarity(query_tfidf.reshape(-1, 1), document_.reshape(-1, 1))
    
similarities = []
with open('tfidf.json', 'r') as jsonFile:
    data = json.load(jsonFile)
    for doc in data:
        similarities.append(search(doc['tfidf'], 'illness'))
print(similarities)
'''

'\nmi intento:\ndef search(document_tfidf, query): # document_tfidf es el vector tfidf del documento, query es una consulta de tipo cadena\n    query_tfidf = list(tfidf(vocabulario, list(query.split())).values())\n    document_ = list(document_tfidf.values())\n    sklearn.metrics.pairwise.cosine_similarity(query_tfidf.reshape(-1, 1), document_.reshape(-1, 1))\n'

In [31]:
# Run the search over `data` with the query text given as a literal (no trailing "?")
search_tfidf("which is the worst illness", data)

[{'title': 'Spanish flu', 'rank': 0.09304508055751907},
 {'title': 'HIV/AIDS', 'rank': 0.08528298155751908},
 {'title': 'Swine influenza', 'rank': 0.079167190098216},
 {'title': 'Superspreader', 'rank': 0.07005576161414198}]