## TODO:
- Create functions
- Refactor code with functions
- Apply to Python file
- Create command line tool

In [5]:
import string, math
import numpy as np
from lxml import etree
from stemming.porter2 import stem
from collections import Counter


from tqdm.notebook import tqdm

In [6]:
def pre_process(txt:str) -> list:
    """Tokenize a raw text string.

    Every character listed in ``string.punctuation`` is deleted, the result
    is lower-cased and split on whitespace, and ``stem`` is applied to each
    token. The stemmed tokens are returned as a list in their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = txt.translate(strip_punctuation).lower()
    return [stem(token) for token in cleaned.split()]

def tf_func(doc_id: str, text_list: list) -> dict:
    """Compute max-normalised term frequencies for a single document.

    Each term's raw count is divided by the count of the most frequent term
    in the document, so the most frequent term always gets a TF of 1.0.

    Args:
        doc_id: Identifier of the document; used as the outer key of the
            result.
        text_list: The document's tokens.

    Returns:
        ``{doc_id: {term: tf}}`` with the terms in sorted order. An empty
        token list yields ``{doc_id: {}}``.
    """
    counts = Counter(text_list)
    # ``default`` keeps an empty document from raising; the value is never
    # used as a divisor in that case because there are no terms to score.
    max_occurence = max(counts.values(), default=1)
    return {
        doc_id: {word: counts[word] / max_occurence for word in sorted(counts)}
    }

def idf_func(corpus:dict) -> dict:
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        corpus: Maps each document id to that document's list of tokens.

    Returns:
        ``{term: idf}`` in sorted term order, where ``idf`` is the natural
        log of (number of documents / number of documents containing the
        term). An empty corpus yields an empty dict.
    """
    n_docs = len(corpus)

    # Document frequency: each term is counted at most once per document,
    # no matter how many times it occurs inside that document.
    doc_freq = Counter()
    for tokens in corpus.values():
        doc_freq.update(set(tokens))

    return {term: math.log(n_docs / doc_freq[term]) for term in sorted(doc_freq)}


In [7]:
# Base name of the collection; the input file is "<collectionName>.xml".
collectionName = "nytsmall"
#collectionName = "nyt199501"

# Parse the collection file and keep a handle on its root element.
xml_path = f"{collectionName}.xml"
tree = etree.parse(xml_path)
root = tree.getroot()

In [8]:
# vocab: term -> number of documents that contain the term.
vocab = {}
# corpus: document id -> list of pre-processed tokens.
corpus = {}


for doc in tqdm(root.iter('DOC')):
    # Collect the text of every HEADLINE, P and TEXT element of the document.
    # lxml reports ``.text`` as None for an element without leading text, so
    # those are skipped rather than concatenated. Joining with a space keeps
    # the last word of one element from fusing with the first word of the
    # next one.
    doc_txt = " ".join(
        element.text
        for element in doc.iter('HEADLINE', "P", "TEXT")
        if element.text
    )

    # Pre Process
    doc_txt = pre_process(doc_txt)

    # Creating vocab dict: each term counts once per document.
    for word in set(doc_txt):
        vocab[word] = vocab.get(word, 0) + 1

    # Creating corpus dict
    corpus[doc.attrib['id']] = doc_txt

# Keep the vocabulary in sorted term order.
vocab = dict(sorted(vocab.items()))

    

0it [00:00, ?it/s]

In [9]:
# Order the corpus by document id and record how many documents it holds.
corpus = {doc_id: corpus[doc_id] for doc_id in sorted(corpus)}
n_docs = len(corpus)

# IDF: natural log of (number of documents / document frequency). Each value
# is stored in idf_dict and written as one "term<TAB>idf" line.
idf_dict = {}
idf_path = collectionName + '.idf'

with open(idf_path, 'w') as file:
    for term, count in tqdm(vocab.items()):
        idf = math.log(n_docs / count)
        idf_dict[term] = idf
        file.write(f'{term}\t{idf}\n')


  0%|          | 0/7736 [00:00<?, ?it/s]

In [10]:
# TF: per-document term frequency, normalised by the count of the most
# frequent term in that document. Every value is stored in tf_dict and
# written as one "doc<TAB>term<TAB>tf" line.
tf_dict = {}
with open(collectionName+".tf","w") as file:
    for doc, text in tqdm(corpus.items()):
        tf_dict[doc] = {}
        x = Counter(text)
        if not x:
            # A document without tokens has no most-frequent term to
            # normalise by; leave its TF mapping empty instead of raising.
            continue
        max_occurence = max(x.values())

        # Terms are emitted in sorted order.
        for word in sorted(x):
            tf = x[word]/max_occurence
            tf_dict[doc][word] = tf
            file.write('{}\t{}\t{}\n'.format(doc,word,tf))

  0%|          | 0/102 [00:00<?, ?it/s]

In [11]:
# tf_idf: document id -> dense vector with one slot per vocabulary term.
tf_idf = {}

# Position of every vocabulary term inside the TF-IDF vectors.
word_to_ix = {word: ix for ix, word in enumerate(vocab)}
n_terms = len(vocab)

for doc_id in corpus:
    vector = np.zeros(n_terms)

    # tf_dict[doc_id] holds each distinct term of the document exactly once,
    # so every weight is computed a single time instead of once per token
    # occurrence.
    for term, tf in tf_dict[doc_id].items():
        vector[word_to_ix[term]] = tf * idf_dict[term]

    tf_idf[doc_id] = vector

In [12]:
query = ['hurricane', 'philadelphia']


# pre process query: run it through the same pipeline as the documents so
# that query tokens match the stemmed vocabulary.
query_str = " ".join(query)
query = pre_process(query_str)

# Query TF:
# A dict of word:tf, normalised by the count of the most frequent query term.
query_tf = {}
y = Counter(query)
# ``default`` keeps an empty query from raising; there is nothing to divide
# in that case.
max_occurence = max(y.values(), default=1)

for word in sorted(y):
    query_tf[word] = y[word] / max_occurence

# Query TF-IDF
query_tf_idf = np.zeros(len(vocab))

for term, tf in query_tf.items():
    # A query term that never occurs in the collection has neither an IDF
    # nor a vector slot; it cannot match any document, so it is skipped
    # instead of raising KeyError.
    if term in word_to_ix:
        query_tf_idf[word_to_ix[term]] = tf * idf_dict[term]


In [13]:
# TF-IDF similarity search: cosine similarity between the query vector and
# every document vector, keeping only the non-zero scores.

# The query norm does not change between documents, so compute it once.
query_norm = np.linalg.norm(query_tf_idf)

sim_dict = {}
for doc_id, value in tf_idf.items():
    bottom = query_norm * np.linalg.norm(value)
    if bottom == 0:
        # An all-zero query or document vector makes the cosine undefined
        # (0/0 evaluates to nan, which would slip past the zero check below
        # and end up in the ranking); treat it as "no match".
        continue

    sim = np.dot(query_tf_idf, value) / bottom
    if sim != 0:
        sim_dict[doc_id] = sim

# From here on sim_dict is a list of (doc_id, similarity) pairs, best first.
sim_dict = sorted(sim_dict.items(),reverse=True, key=lambda item: item[1])
print(type(sim_dict))

<class 'list'>


In [14]:
# Show the (at most) ten highest-scoring (doc_id, similarity) pairs.
sim_dict[:10]

[('NYT_ENG_19950101.0001', 0.1523105228405513),
 ('NYT_ENG_19950101.0056', 0.029142469066059024),
 ('NYT_ENG_19950101.0022', 0.028561929187225343),
 ('NYT_ENG_19950101.0017', 0.016747860911235552)]