## TODO:
- Create functions
- Refactor code with functions
- Apply to Python file
- Create command-line tool

In [139]:
import string, math
import numpy as np
from lxml import etree
from stemming.porter2 import stem
from collections import Counter


from tqdm.notebook import tqdm

In [140]:
def pre_process(txt:str) -> list:
    """Tokenise a raw text string.

    Every character listed in ``string.punctuation`` is deleted, the text is
    lower-cased and split on whitespace, and each resulting token is passed
    through ``stem``. The stemmed tokens are returned as a list, in the order
    in which they appear in ``txt``.
    """
    # Translation table that maps each punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    return [stem(token) for token in lowered.split()]

def tf_func(text_list: list) -> dict:
    """Return the normalised term frequency of every token in ``text_list``.

    The term frequency of a token is its number of occurrences divided by the
    number of occurrences of the most frequent token, so the most frequent
    token always gets 1.0.

    Args:
        text_list: The tokens of one document (or query).

    Returns:
        A dict mapping each distinct token to its term frequency, with keys
        in sorted order. An empty ``text_list`` yields an empty dict.
    """
    # Guard: with no tokens there is no "most common" entry to normalise by.
    if not text_list:
        return {}

    counts = Counter(text_list)
    max_occurence = max(counts.values())
    return {word: counts[word] / max_occurence for word in sorted(counts)}

def idf_func(vocab:dict, corpus_len:int) -> dict:
    """Compute the inverse document frequency (IDF) of every term in ``vocab``.

    Args:
        vocab: Maps each term to a count. The caller in this file passes the
            number of documents that contain the term (its document
            frequency).
        corpus_len: Total number of documents in the corpus.

    Returns:
        A dict mapping each term to ``ln(corpus_len / count)``, in the same
        key order as ``vocab``.

    Raises:
        ZeroDivisionError: If a count in ``vocab`` is 0.
    """
    return {term: math.log(corpus_len / count) for term, count in vocab.items()}


In [141]:
# Base name of the collection: the XML input file and the .idf/.tf output
# files all derive their names from it.
collectionName = "nytsmall"
#collectionName = "nyt199501"

# Parse "<collectionName>.xml" and keep a handle on the root element.
tree = etree.parse(f"{collectionName}.xml")
root = tree.getroot()

In [142]:
# vocab:  term   -> number of documents that contain the term
# corpus: doc id -> list of pre-processed tokens of that document
vocab = {}
corpus = {}

for doc in tqdm(root.iter('DOC')):
    # Collect the direct text of every HEADLINE, P and TEXT element of the
    # document. An element with no leading text has .text == None, which
    # cannot be concatenated to a string, so it is treated as empty.
    parts = [element.text or "" for element in doc.iter('HEADLINE', "P", "TEXT")]

    # Join with a space so the last word of one element and the first word of
    # the next are never fused into a single token, then pre-process.
    doc_txt = pre_process(" ".join(parts))

    # Document frequency: count each term at most once per document.
    for word in set(doc_txt):
        vocab[word] = vocab.get(word, 0) + 1

    corpus[doc.attrib['id']] = doc_txt

# Keep the vocabulary sorted by term.
vocab = dict(sorted(vocab.items()))

    

0it [00:00, ?it/s]

In [143]:
# Re-key the corpus so that documents are ordered by id.
corpus = dict(sorted(corpus.items()))

# Inverse document frequency of every vocabulary term.
idf_dict = idf_func(vocab, len(corpus))

# Dump the IDF table to "<collectionName>.idf", one "<term>\t<idf>" line
# per term.
with open(collectionName + '.idf', 'w') as file:
    file.writelines(f'{term}\t{idf}\n' for term, idf in idf_dict.items())


In [144]:
# Term frequencies per document: doc id -> {term: tf}.
tf_dict = {doc_id: tf_func(text) for doc_id, text in corpus.items()}

# Dump the TF table to "<collectionName>.tf", one "<doc id>\t<term>\t<tf>"
# line per (document, term) pair.
with open(collectionName+".tf","w") as file:
    file.writelines(
        f'{doc_id}\t{word}\t{tf}\n'
        for doc_id, w_tf_pair in tf_dict.items()
        for word, tf in w_tf_pair.items()
    )

In [145]:
# Column index of every vocabulary term inside the dense TF-IDF vectors.
word_to_ix = {term: ix for ix, term in enumerate(vocab)}

# tf_idf: doc id -> vector of length len(vocab). The entry at word_to_ix[t]
# holds tf(t, doc) * idf(t); entries of terms absent from the document stay 0.
tf_idf = {}
for doc_id, term_tfs in tf_dict.items():
    doc_vector = np.zeros(len(vocab))
    # tf_dict lists every distinct term of the document exactly once, so each
    # vector entry is written once instead of once per token occurrence.
    for term, tf in term_tfs.items():
        doc_vector[word_to_ix[term]] = tf * idf_dict[term]
    tf_idf[doc_id] = doc_vector

In [146]:
query = ['hurricane', 'philadelphia']

# Run the query through the same pre-processing as the documents so that its
# tokens are comparable with the vocabulary terms.
query_str = " ".join(query)
query = pre_process(query_str)

# Query TF
query_tf = tf_func(query)

# Query TF-IDF: a vector over the vocabulary, laid out like the document
# vectors.
query_tf_idf = np.zeros(len(vocab))
for term, tf in query_tf.items():
    # A query term that never occurs in the corpus has neither an IDF value
    # nor a vector column. It cannot match any document, so it is skipped
    # instead of raising KeyError.
    if term in word_to_ix:
        query_tf_idf[word_to_ix[term]] = tf * idf_dict[term]


In [147]:
# TF-IDF similarity search: rank documents by the cosine similarity between
# the query vector and each document vector.

# The query norm does not depend on the document; compute it once.
query_norm = np.linalg.norm(query_tf_idf)

sim_dict = {}
for doc_id, value in tf_idf.items():
    bottom = query_norm * np.linalg.norm(value)
    if bottom == 0:
        # Cosine similarity is undefined for a zero vector. Dividing anyway
        # yields nan, which is not equal to 0 and would therefore be kept and
        # break the ordering of the ranking below.
        continue
    top = np.dot(query_tf_idf, value)
    sim = top / bottom
    # Only documents that share at least one weighted term with the query
    # are kept.
    if sim != 0:
        sim_dict[doc_id] = sim

# From here on sim_dict is a list of (doc id, similarity) pairs, best first.
sim_dict = sorted(sim_dict.items(),reverse=True, key=lambda item: item[1])
print(type(sim_dict))

<class 'list'>


In [148]:
sim_dict[:10]

[('NYT_ENG_19950101.0001', 0.1523105228405513),
 ('NYT_ENG_19950101.0056', 0.029142469066059024),
 ('NYT_ENG_19950101.0022', 0.028561929187225343),
 ('NYT_ENG_19950101.0017', 0.016747860911235552)]