In [138]:
# IR PS3
import re
import string

In [139]:
f_time_all = open("/home/karthikmsd/sem8/20XW86-IR-LAB/ws3/dataset-info/TIME.ALL", 'r')
f_query = open("/home/karthikmsd/sem8/20XW86-IR-LAB/ws3/dataset-info/TIME.QUE", 'r')
f_stp = open("/home/karthikmsd/sem8/20XW86-IR-LAB/ws3/dataset-info/TIME.STP", 'r')

In [140]:
def parse_documents(f):
    documents = []
    fstr = f.read()
    regex = r"\*TEXT\s+\d{3}\s+\d{2}/\d{2}/\d{2}\s+PAGE\s+\d{3}\n\n"
    
    iters = []
    result = re.finditer(regex, fstr, re.DOTALL)
    for key in result:
        iters.append(key)

    for i in range(len(iters)-1):
        i1 = iters[i].span()[1]
        i2 = iters[i+1].span()[0]
        
        documents.append(fstr[i1:i2])
    
    return documents

In [141]:
def parse_queries(f):
    text = f.read()
    pattern = r'\*FIND\s+\d+\s+(.*?)(?=\s*\*FIND\s+\d+|$)'
    result = []
    matches = re.findall(pattern, text, re.DOTALL)

    for match in matches:
        result.append(match.strip())
    
    return result

In [142]:
def parse_stopwords(f):
    text = f.read()
    text = text.lower()
    
    return set(nltk.word_tokenize(text))

In [143]:
docs = parse_documents(f_time_all)

In [144]:
len(docs)

422

In [145]:
queries = parse_queries(f_query)

In [146]:
samp_doc = docs[0]

In [147]:
stopwords = parse_stopwords(f_stp)

In [148]:
# Preprocessing
# 1. conv. to lowercase
# 2. tokenization
# 3. stopword, spl char removal
# 4. stemming

In [149]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [150]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [151]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords and i not in string.punctuation:
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [152]:
transform_text(samp_doc)[:251]

'alli nassau decemb 1960 propos help nato develop nuclear strike forc europ attempt devis plan studi nassau accord presid kennedi prime minist macmillan european saw emerg outlin nuclear nato want support sprang crisi cancel skybolt missil offer suppli'

In [153]:
processed_docs = []

for doc in docs:
    processed_docs.append(transform_text(doc))

In [154]:
# 1. Cosine Similarity

In [155]:
# Wik = (TFik/LENi)*log(N+1)/0.5+DFk
# TFik - term frequency within doc.
# LENi - len of current doc.
# N - total docs
# DFk - document frequency of the word (no. of documents for which the word exists)

In [156]:
import math

In [157]:
def TF(word, mymap):
    return mymap[word]

def DF(word, corpus_maps):
    count = 0
    
    for hm in corpus_maps:
        if word in hm:
            count += 1
    
    return count

def calculate_weights(doc, doc_map, corpus_maps):
    vector = [0]*len(doc_map)
    
    mymap = {}
    
    for i in doc.split():
        if i in mymap:
            mymap[i] +=1
        else:
            mymap[i] = 1
    
    for word in doc.split():
        TFik = TF(word, mymap)
        LENi = len(doc)
        N = len(corpus_maps)
        DFk = DF(word, corpus_maps)
        
        try:
            index = list(doc_map.keys()).index(word)
            vector[index] = (TFik/LENi)*(math.log(N+1)/(0.5+DFk))
        except ValueError:
            pass
    
    return vector

In [158]:
doc_map = {}

for doc in processed_docs:
    for word in doc.split():
        if word in doc_map:
            doc_map[word] += 1
        else:
            doc_map[word] = 1

doc_matrix = []

list_of_maps = []

for doc in processed_docs:
    hm = {}
    
    for word in doc.split():
        if word in hm:
            hm[word] += 1
        else:
            hm[word] = 1
    
    list_of_maps.append(hm)

for doc in processed_docs:
    vector = calculate_weights(doc, doc_map, list_of_maps)
    doc_matrix.append(vector)

In [159]:
import numpy as np

In [160]:
np.array(doc_matrix).shape

(422, 13817)

In [161]:
# Converting queries to TD matrix

In [162]:
queries[:5]

['KENNEDY ADMINISTRATION PRESSURE ON NGO DINH DIEM TO STOP\n\nSUPPRESSING THE BUDDHISTS .',
 "EFFORTS OF AMBASSADOR HENRY CABOT LODGE TO GET VIET NAM'S\n\nPRESIDENT DIEM TO CHANGE HIS POLICIES OF POLITICAL REPRESSION .",
 'NUMBER OF TROOPS THE UNITED STATES HAS STATIONED IN SOUTH\n\nVIET NAM AS COMPARED WITH THE NUMBER OF TROOPS IT HAS STATIONED\n\nIN WEST GERMANY .',
 'U.S . POLICY TOWARD THE NEW REGIME IN SOUTH VIET NAM WHICH OVERTHREW\n\nPRESIDENT DIEM .',
 'PERSONS INVOLVED IN THE VIET NAM COUP .']

In [163]:
processed_queries = []

for query in queries:
    processed_queries.append(transform_text(query))

In [164]:
processed_queries[:5]

['kennedi administr pressur ngo dinh diem stop suppress buddhist',
 'effort ambassador henri cabot lodg viet presid diem chang polici polit repress',
 'number troop unit state station south viet nam compar number troop station west germani',
 'polici new regim south viet nam overthrew presid diem',
 'person involv viet nam coup']

In [165]:
from sklearn.metrics.pairwise import cosine_similarity

In [166]:
def fetch_docs(query):
    processed_query = transform_text(query)
    query_td = calculate_weights(processed_query, doc_map, list_of_maps)
    
    ret = []
    for i in range(len(doc_matrix)):
        vec1 = np.array(doc_matrix[i])
        vec2 = np.array(query_td)
        
        ret.append((i, cosine_similarity([vec1], [vec2])[0][0]))
    
    ret.sort(key= lambda x:-x[1])
    
    return ret

In [167]:
search_res = fetch_docs(queries[0])[:12]
print(f"Query: \n{queries[0]}")

Query: 
KENNEDY ADMINISTRATION PRESSURE ON NGO DINH DIEM TO STOP

SUPPRESSING THE BUDDHISTS .


In [168]:
counter = 0
for doc_no, score in search_res:
    counter += 1
    print(f"Retrieved Document: {counter}, Calculated cosine similarity: {score}\n")
    print(docs[doc_no][:251].strip().lower())
    print()

Retrieved Document: 1, Calculated cosine similarity: 0.18742347658203506

south viet nam the buddhist crisis in saigon's huge xa

loi pagoda, buddhist monks and nuns were holding a 48-hour hunger

strike against the regime of south viet nam's president ngo dinh diem .

expecting trouble, police sealed off nearby streets wit

Retrieved Document: 2, Calculated cosine similarity: 0.15482544638791104

south viet nam 2-12 the religious crisis a dusk-to-dawn curfew emptied

the streets of the ancient vietnamese capital of hue, 400 miles north

of saigon . riot police and armored personnel carriers patrolled the

dark and deserted city . roadblocks we

Retrieved Document: 3, Calculated cosine similarity: 0.12006237606411654

south viet nam : the new regime for a while, saigon looked

like a city liberated . vietnamese g.i.s guarding public buildings

munched oranges, bananas and candy, showered on them by civilians

grateful for the overthrow of the regime . pretty girls

Retrieved Document: 