In [4]:
# IR PS3
import re
import string

In [5]:
f_time_all = open("Z:/20XW86-IR-LAB/ws3/dataset-info/TIME.ALL", 'r')
f_query = open("Z:/20XW86-IR-LAB/ws3/dataset-info/TIME.QUE", 'r')
f_stp = open("Z:/20XW86-IR-LAB/ws3/dataset-info/TIME.STP", 'r')

In [6]:
def parse_documents(f):
    documents = []
    fstr = f.read()
    regex = r"\*TEXT\s+\d{3}\s+\d{2}/\d{2}/\d{2}\s+PAGE\s+\d{3}\n\n"
    
    iters = []
    result = re.finditer(regex, fstr, re.DOTALL)
    for key in result:
        iters.append(key)

    for i in range(len(iters)-1):
        i1 = iters[i].span()[1]
        i2 = iters[i+1].span()[0]
        
        documents.append(fstr[i1:i2])
    
    return documents

In [7]:
def parse_queries(f):
    text = f.read()
    pattern = r'\*FIND\s+\d+\s+(.*?)(?=\s*\*FIND\s+\d+|$)'
    result = []
    matches = re.findall(pattern, text, re.DOTALL)

    for match in matches:
        result.append(match.strip())
    
    return result

In [8]:
def parse_stopwords(f):
    text = f.read()
    text = text.lower()
    
    return set(nltk.word_tokenize(text))

In [9]:
docs = parse_documents(f_time_all)

In [10]:
len(docs)

422

In [11]:
queries = parse_queries(f_query)

In [12]:
samp_doc = docs[0]

In [13]:
stopwords = parse_stopwords(f_stp)

In [14]:
# Preprocessing
# 1. conv. to lowercase
# 2. tokenization
# 3. stopword, spl char removal
# 4. stemming

In [15]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')                                      

In [16]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [17]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords and i not in string.punctuation:
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [18]:
transform_text(samp_doc)[:251]

'alli nassau decemb 1960 propos help nato develop nuclear strike forc europ attempt devis plan studi nassau accord presid kennedi prime minist macmillan european saw emerg outlin nuclear nato want support sprang crisi cancel skybolt missil offer suppli'

In [19]:
processed_docs = []

for doc in docs:
    processed_docs.append(transform_text(doc))

In [20]:
# 1. Cosine Similarity

In [21]:
# Wik = (TFik/LENi)*log(N+1)/0.5+DFk
# TFik - term frequency within doc.
# LENi - len of current doc.
# N - total docs
# DFk - document frequency of the word (no. of documents for which the word exists)

In [22]:
import math

In [23]:
def TF(word, mymap):
    return mymap[word]

def DF(word, corpus_maps):
    count = 0
    
    for hm in corpus_maps:
        if word in hm:
            count += 1
    
    return count

def calculate_weights(doc, doc_map, corpus_maps):
    vector = [0]*len(doc_map)
    
    mymap = {}
    
    for i in doc.split():
        if i in mymap:
            mymap[i] +=1
        else:
            mymap[i] = 1
    
    for word in doc.split():
        TFik = TF(word, mymap)
        LENi = len(doc)
        N = len(corpus_maps)
        DFk = DF(word, corpus_maps)
        
        try:
            index = list(doc_map.keys()).index(word)
            vector[index] = (TFik/LENi)*(math.log(N+1)/(0.5+DFk))
        except ValueError:
            pass
    
    return vector

In [24]:
doc_map = {}

for doc in processed_docs:
    for word in doc.split():
        if word in doc_map:
            doc_map[word] += 1
        else:
            doc_map[word] = 1

doc_matrix = []

list_of_maps = []

for doc in processed_docs:
    hm = {}
    
    for word in doc.split():
        if word in hm:
            hm[word] += 1
        else:
            hm[word] = 1
    
    list_of_maps.append(hm)

for doc in processed_docs:
    vector = calculate_weights(doc, doc_map, list_of_maps)
    doc_matrix.append(vector)

In [25]:
import numpy as np

In [26]:
np.array(doc_matrix).shape

(422, 13817)

In [27]:
# Converting queries to TD matrix

In [28]:
queries[:5]

['KENNEDY ADMINISTRATION PRESSURE ON NGO DINH DIEM TO STOP\n\nSUPPRESSING THE BUDDHISTS .',
 "EFFORTS OF AMBASSADOR HENRY CABOT LODGE TO GET VIET NAM'S\n\nPRESIDENT DIEM TO CHANGE HIS POLICIES OF POLITICAL REPRESSION .",
 'NUMBER OF TROOPS THE UNITED STATES HAS STATIONED IN SOUTH\n\nVIET NAM AS COMPARED WITH THE NUMBER OF TROOPS IT HAS STATIONED\n\nIN WEST GERMANY .',
 'U.S . POLICY TOWARD THE NEW REGIME IN SOUTH VIET NAM WHICH OVERTHREW\n\nPRESIDENT DIEM .',
 'PERSONS INVOLVED IN THE VIET NAM COUP .']

In [29]:
processed_queries = []

for query in queries:
    processed_queries.append(transform_text(query))

In [30]:
processed_queries[:5]

['kennedi administr pressur ngo dinh diem stop suppress buddhist',
 'effort ambassador henri cabot lodg viet presid diem chang polici polit repress',
 'number troop unit state station south viet nam compar number troop station west germani',
 'polici new regim south viet nam overthrew presid diem',
 'person involv viet nam coup']

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
def fetch_docs(query):
    processed_query = transform_text(query)
    query_td = calculate_weights(processed_query, doc_map, list_of_maps)
    
    ret = []
    for i in range(len(doc_matrix)):
        vec1 = np.array(doc_matrix[i])
        vec2 = np.array(query_td)
        
        ret.append((i, cosine_similarity([vec1], [vec2])[0][0]))
    
    ret.sort(key= lambda x:-x[1])
    
    return ret

In [46]:
search_res = fetch_docs(queries[69])[:12]
print(f"Query: \n{queries[69]}")

Query: 
KING WHO SIGNED AWAY HIS POWER OF STATE GIVING FREE REIN TO HIS

HALF-BROTHER FEISAL'S REFORM RULE .


In [47]:
counter = 0
for doc_no, score in search_res:
    counter += 1
    print(f"Retrieved Document: {counter}, Calculated cosine similarity: {score}\n")
    print(docs[doc_no][:251].strip().lower())
    print()

Retrieved Document: 1, Calculated cosine similarity: 0.09799607796275651

iran no longer for the corrupt " corruption is the lubricant of

the iranian economy, " a diplomat in teheran once observed . depending

on the size of the pishkash (bribe), justice was bought and sold, tax

rights were purchased, government jobs auct

Retrieved Document: 2, Calculated cosine similarity: 0.07604042081483217

of firs, flies & fears thaw was in the moscow air last week,

melting the first thin layers of snow after the long months of winter .

but to the 500 writers, musicians, painters and poets gathered in the

kremlin's sverdlov hall last week, the changi

Retrieved Document: 3, Calculated cosine similarity: 0.05310225840266894

saudi arabia the ailing, failing king in recent months

62-year-old kingsaud of saudi arabia has suffered a succession of

intestinal, stomach, chest, circulatory and heart ailments . often they

seem to be aggravated by the swirling political events

Retrieved Document: 

In [39]:
# BIM
doc_map

{'alli': 85,
 'nassau': 11,
 'decemb': 15,
 '1960': 42,
 'propos': 70,
 'help': 136,
 'nato': 108,
 'develop': 51,
 'nuclear': 115,
 'strike': 76,
 'forc': 332,
 'europ': 207,
 'attempt': 55,
 'devis': 5,
 'plan': 154,
 'studi': 31,
 'accord': 50,
 'presid': 267,
 'kennedi': 87,
 'prime': 143,
 'minist': 382,
 'macmillan': 119,
 'european': 114,
 'saw': 23,
 'emerg': 58,
 'outlin': 7,
 'want': 156,
 'support': 160,
 'sprang': 1,
 'crisi': 61,
 'cancel': 24,
 'skybolt': 9,
 'missil': 42,
 'offer': 77,
 'suppli': 71,
 'britain': 315,
 'franc': 240,
 'polari': 35,
 'dec': 10,
 '28': 16,
 'leader': 265,
 'unreservedli': 1,
 'welcom': 40,
 'harold': 65,
 'keep': 26,
 'separ': 28,
 'deterr': 22,
 'save': 38,
 'neck': 10,
 'beam': 12,
 'weapon': 60,
 'gener': 268,
 'term': 50,
 'briton': 26,
 'sure': 37,
 'govern': 658,
 'shoulder': 17,
 'none': 31,
 '800': 4,
 'cost': 52,
 'pour': 36,
 'spend': 24,
 '1': 53,
 'billion': 36,
 'fleet': 40,
 'submarin': 22,
 'british': 247,
 'abl': 46,
 'design