In [167]:
# IR PS3
import re
import string

In [168]:
f_time_all = open("Z:/20XW86-IR-LAB/ws3/dataset-info/TIME.ALL", 'r')
f_query = open("Z:/20XW86-IR-LAB/ws3/dataset-info/TIME.QUE", 'r')
f_stp = open("Z:/20XW86-IR-LAB/ws3/dataset-info/TIME.STP", 'r')

In [169]:
def parse_documents(f):
    documents = []
    fstr = f.read()
    regex = r"\*TEXT\s+\d{3}\s+\d{2}/\d{2}/\d{2}\s+PAGE\s+\d{3}\n\n"
    
    iters = []
    result = re.finditer(regex, fstr, re.DOTALL)
    for key in result:
        iters.append(key)

    for i in range(len(iters)-1):
        i1 = iters[i].span()[1]
        i2 = iters[i+1].span()[0]
        
        documents.append(fstr[i1:i2])
    
    return documents

In [170]:
def parse_queries(f):
    text = f.read()
    result = re.findall(r'FIND\s+\d+\s+(.*)', text)
    
    return result

In [171]:
def parse_stopwords(f):
    text = f.read()
    text = text.lower()
    
    return set(nltk.word_tokenize(text))

In [172]:
docs = parse_documents(f_time_all)

In [173]:
len(docs)

422

In [174]:
queries = parse_queries(f_query)

In [175]:
samp_doc = docs[0]

In [176]:
stopwords = parse_stopwords(f_stp)

In [177]:
# Preprocessing
# 1. conv. to lowercase
# 2. tokenization
# 3. stopword, spl char removal
# 4. stemming

In [178]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [179]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [180]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords and i not in string.punctuation:
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [181]:
transform_text(samp_doc)[:251]

'alli nassau decemb 1960 propos help nato develop nuclear strike forc europ attempt devis plan studi nassau accord presid kennedi prime minist macmillan european saw emerg outlin nuclear nato want support sprang crisi cancel skybolt missil offer suppli'

In [182]:
processed_docs = []

for doc in docs:
    processed_docs.append(transform_text(doc))

In [183]:
# 1. Cosine Similarity

In [184]:
# Wik = (TFik/LENi)*log(N+1)/0.5+DFk
# TFik - term frequency within doc.
# LENi - len of current doc.
# N - total docs
# DFk - document frequency of the word (no. of documents for which the word exists)

In [185]:
import math

In [186]:
def TF(word, mymap):
    return mymap[word]

def DF(word, corpus_maps):
    count = 0
    
    for hm in corpus_maps:
        if word in hm:
            count += 1
    
    return count

def calculate_weights(doc, doc_map, corpus_maps):
    vector = [0]*len(doc_map)
    
    mymap = {}
    
    for i in doc.split():
        if i in mymap:
            mymap[i] +=1
        else:
            mymap[i] = 1
    
    for word in doc.split():
        TFik = TF(word, mymap)
        LENi = len(doc)
        N = len(corpus_maps)
        DFk = DF(word, corpus_maps)
        
        try:
            index = list(doc_map.keys()).index(word)
            vector[index] = (TFik/LENi)*(math.log(N+1)/(0.5+DFk))
        except ValueError:
            pass
    
    return vector

In [187]:
doc_map = {}

for doc in processed_docs:
    for word in doc.split():
        if word in doc_map:
            doc_map[word] += 1
        else:
            doc_map[word] = 1

doc_matrix = []

list_of_maps = []

for doc in processed_docs:
    hm = {}
    
    for word in doc.split():
        if word in hm:
            hm[word] += 1
        else:
            hm[word] = 1
    
    list_of_maps.append(hm)

for doc in processed_docs:
    vector = calculate_weights(doc, doc_map, list_of_maps)
    doc_matrix.append(vector)

In [188]:
import numpy as np

In [189]:
np.array(doc_matrix).shape

(422, 13817)

In [190]:
# Converting queries to TD matrix

In [191]:
queries[:5]

['KENNEDY ADMINISTRATION PRESSURE ON NGO DINH DIEM TO STOP',
 "EFFORTS OF AMBASSADOR HENRY CABOT LODGE TO GET VIET NAM'S",
 'NUMBER OF TROOPS THE UNITED STATES HAS STATIONED IN SOUTH',
 'U.S . POLICY TOWARD THE NEW REGIME IN SOUTH VIET NAM WHICH OVERTHREW',
 'PERSONS INVOLVED IN THE VIET NAM COUP .']

In [192]:
processed_queries = []

for query in queries:
    processed_queries.append(transform_text(query))

In [193]:
processed_queries[:5]

['kennedi administr pressur ngo dinh diem stop',
 'effort ambassador henri cabot lodg viet nam',
 'number troop unit state station south',
 'polici new regim south viet nam overthrew',
 'person involv viet nam coup']

In [194]:
from sklearn.metrics.pairwise import cosine_similarity

In [200]:
def fetch_docs(query):
    processed_query = transform_text(query)
    query_td = calculate_weights(processed_query, doc_map, list_of_maps)
    
    ret = []
    for i in range(len(doc_matrix)):
        vec1 = np.array(doc_matrix[i])
        vec2 = np.array(query_td)
        
        ret.append((i, cosine_similarity([vec1], [vec2])[0][0]))
    
    ret.sort(key= lambda x:-x[1])
    
    return ret

In [219]:
fetch_docs(queries[0])

[(369, 0.1926245789654022),
 (333, 0.10963270489335762),
 (307, 0.10118166781065008),
 (256, 0.09281892129485787),
 (375, 0.08984621901891973),
 (170, 0.08782881252144237),
 (325, 0.08390640540546243),
 (358, 0.08259871712365713),
 (418, 0.06472228032596036),
 (267, 0.06294314066570232),
 (348, 0.06113510469232037),
 (210, 0.058346781859782204),
 (303, 0.05520149097941651),
 (322, 0.05498111763554032),
 (349, 0.04512860845746698),
 (394, 0.04337583373529462),
 (405, 0.041804327597002956),
 (383, 0.03484849298639136),
 (382, 0.030193493959815795),
 (287, 0.028031363746807474),
 (381, 0.01699208157301416),
 (169, 0.0140539768898993),
 (21, 0.01193979035511877),
 (134, 0.011422590655682859),
 (147, 0.010007979853153201),
 (29, 0.009532040934772829),
 (363, 0.0070869440997464),
 (253, 0.0070594356716257405),
 (37, 0.0070316352129795555),
 (278, 0.006924538321859695),
 (19, 0.006847234393622855),
 (163, 0.006356551586136572),
 (0, 0.006018516925990319),
 (184, 0.005909339745204609),
 (46, 0

In [212]:
docs[369]



In [213]:
docs[325]

'SOUTH VIET NAM THE CRACKDOWN OVER AND\n\nOVER, THE DESPERATE VOICE SHOUTED INTO THE TELEPHONE : " THEY ARE\n\nBREAKING INTO XA LOI PAGODA . THEY ARE BREAKING INTO XA LOI PAGODA . "\n\nIN THE BACKGROUND, GUNFIRE MINGLED WITH THE CONFUSED SCREAMS OF\n\nBUDDHIST MONKS AND NUNS AND THE CLANGING ALARM OF THE HUGE BRASS GONG\n\nTHAT HANGS IN THE BELL TOWER OF SAIGON\'S LARGEST PAGODA . SUDDENLY THE\n\nPHONE CONNECTION FROM THE TEMPLE WENT DEAD . IT WAS 12 : 20 A.M .\n\nUSING THEIR RIFLE BUTTS AS CLUBS, SQUADS OF TOUGH, RIOTTRAINED "\n\nSPECIAL FORCES " SMASHED INTO THE PAGODA, BATTERING A PATH THROUGH A\n\nSMALL GUARD OF YOUNG BUDDHIST MONKS . THE TROOPERS HAD A LIST, AND EACH\n\nMONK ON THE LIST WAS CONSIDERED TO BE A " COMMUNIST IN DISGUISE . " ON\n\nTHE TEMPLE\'S SECOND FLOOR, ONE MONK TRIED TO RESIST AND WAS THROWN\n\nBODILY FROM A BALCONY TO THE COURT-YARD 20 FT . BELOW . OTHER MONKS AND\n\nNUNS WERE ROUTED FROM BEHIND A FLIMSY BARRICADE OF WOODEN BENCHES AND\n\nFORCED OUTSIDE BY TEAR 

In [214]:
docs[382]

'SOUTH VIET NAM OPTIMISM AT HONOLULU, PROBLEMS IN SAIGON IN ADMIRAL\n\nHARRY FELT\'S REINFORCED CONCRETE COMMAND POST HIGH ABOVE PEARL HARBOR,\n\nTOP U.S . OFFICIALS LAST WEEK GATHERED FOR THE FIRST EXHAUSTIVE POLICY\n\nSTUDY OF SOUTH VIET NAM SINCE THE COUP THAT TOPPLED THE DIEM REGIME THE\n\nNINE-HOUR CONFERENCE SECRETARY OF STATE DEAN RUSK, DEFENSE SECRETARY\n\nROBERT MCNAMARA AND JOINT CHIEFS OF STAFF CHAIRMAN MAXWELL TAYLOR HAD\n\nFLOWN IN FROM WASHINGTON ; FROM SAIGON CAME AMBASSADOR HENRY CABOT\n\nLODGE AND GENERAL PAUL HARKINS . THE HONOLULU MEETING EXUDED ALMOST\n\nRELENTLESS OPTIMISM ABOUT THE WAR, AND THE POLICYMAKERS CLUNG BRAVELY\n\nTO THE LINE THAT THINGS SHOULD BE SUFFICIENTLY IN HAND BY 1965 TO\n\nPERMIT COMPLETE WITHDRAWAL OF THE 16,500 AMERICAN TROOPS . JUST IN TIME\n\n. ONE OF THE FEW CONCRETE DECISIONS LEAKED FROM THE CONFERENCE WAS A\n\nHARDLY SURPRISING AGREEMENT TO INTENSIFY ANTI-GUERRILLA OPERATIONS IN\n\nSOUTH VIET NAM\'S RICE BOWL, A WEDGE-SHAPED SECTION OF TH

In [215]:
docs[369]

'SOUTH VIET NAM : THE NEW REGIME FOR A WHILE, SAIGON LOOKED\n\nLIKE A CITY LIBERATED . VIETNAMESE G.I.S GUARDING PUBLIC BUILDINGS\n\nMUNCHED ORANGES, BANANAS AND CANDY, SHOWERED ON THEM BY CIVILIANS\n\nGRATEFUL FOR THE OVERTHROW OF THE REGIME . PRETTY GIRLS EMBRACED\n\nSOLDIERS, DRAPED TANK TURRETS WITH GARLANDS, SCRAMBLED SQUEALING ABOARD\n\nARMY JEEPS . WITH THE LIFTING OF A TEMPORARY CURFEW AND MME . NHU\'S BAN\n\nON DANCING, SAIGON\'S LONG-REPRESSED NIGHT LIFE FLOWERED AS NEVER BEFORE\n\n. IN BARS AND CABARETS, THE B-GIRLS SHUCKED THE WHITE, HOSPITAL-LIKE\n\nSMOCKS THEY HAD BEEN FORCED TO WEAR UNDER THE MORALITY LAWS, WRIGGLED\n\nBACK INTO THEIR TRADITIONAL SLIT SKIRTS, OR INTO U.S.-STYLE SLACKS, TO\n\nTWIST AND TANGO WITH VIET AND AMERICAN SOLDIERS INTO THE SMALL HOURS .\n\nSHOPS REOPENED, REPAIRMEN RESTRUNG POWER LINES BLOWN DOWN BY BATTLE,\n\nAND SAFFRON-ROBED BUDDHIST MONKS EMERGED FROM JAIL OR HIDING (AMONG\n\nTHEM : TOP BUDDHIST THICH TRI QUANG, WHO HAD SOUGHT ASYLUM TEN WEEK

In [216]:
docs[322]

'SOUTH VIET NAM SUICIDE SERIES IT WAS THE MOST MACABRE WEEK IN\n\nSOUTH VIET NAM\'S THREE-MONTH-OLD RELIGIOUS AND POLITICAL CRISIS . IN\n\nSAIGON, AN 18-YEAR-OLD GIRL TRIED UNSUCCESSFULLY TO CUT OFF HER LEFT\n\nHAND " AS A HUMBLE OFFERING TO BUDDHA WHILE OUR RELIGION IS IN DANGER .\n\n/ OUTSIDE THE COASTAL CITY OF HUE, A 17-YEAR-OLD NOVICE BUDDHIST MONK\n\nWRAPPED HIMSELF IN A KEROSENE-SOAKED, SIX-COLOR BUDDHIST FLAG, THEN\n\nSTRUCK A MATCH . IN THE VILLAGE OF NINHHOA, 200 MILES NORTH OF SAIGON,\n\nA YOUNG BUDDHIST NUN SAT DOWN IN A CATHOLIC SCHOOL PLAYGROUND AND SET\n\nHERSELF ON FIRE . LESS THAN 24 HOURS LATER, BACK IN HUE, A 71-YEAR-OLD\n\nMONK ANNOUNCED OVER THE TUDAM PAGODA LOUDSPEAKER THAT HE WAS GOING TO\n\nKILL HIMSELF, THEN BURNED HIMSELF TO DEATH IN THE PAGODA\'S COURTYARD .\n\nTHE QUARREL SPREADS . THE\n\nTHREE RITUALISTIC SUICIDES BROUGHT TO FIVE THE NUMBER OF BUDDHISTS WHO\n\nHAVE TURNED THEMSELVES INTO HUMAN TORCHES IN FURTHER PROTEST AGAINST\n\nTHE REGIME OF SOUTH VIET N

In [217]:
docs[348]

'SOUTH VIET NAM REPORT ON THE WAR OVERSHADOWED\n\nBY THE POLITICAL AND DIPLOMATIC TURMOIL IN SAIGON, THE ALL BUT\n\nFORGOTTEN WAR AGAINST THE VIET CONG CONTINUES ON ITS UGLY, BLOODY AND\n\nWEARISOME COURSE . THE DRIVE AGAINST THE COMMUNISTS HAS NOT DIMINISHED\n\nIN RECENT WEEKS ; IN FACT, IT HAS INTENSIFIED . FEARS THAT THE\n\nBUDDHIST CONTROVERSY MIGHT DAMAGE MORALE AMONG VIETNAMESE TROOPS HAVE\n\nSO FAR BEEN GROUNDLESS . IF LAST WEEK\'S BATTLES WERE ANY CRITERION, THE\n\nGOVERNMENT SOLDIERS ARE FIGHTING BETTER THAN EVER AGAINST A COMMUNIST\n\nFOE THAT IS EXACTING A HIDEOUS PRICE IN BLOOD IN THE FLOODED PADDIES OF\n\nTHE SOUTH . THE BIGGEST GOVERNMENT VICTORY IN MONTHS CAME LAST WEEK\n\nNEAR THE TOWN OF GOCONG, 45 MILES SOUTH OF SAIGON . IN THE DEAD OF\n\nNIGHT, 500 VIET CONG REGULARS SWOOPED DOWN ON A STRATEGIC HAMLET UNDER\n\nA SCREEN OF SUPPORTING FIRE FROM HEAVY MACHINE GUNS AND RECOILLESS\n\nRIFLES . DESPERATELY CALLING FOR HELP OVER THEIR RADIO, THE DEFENDERS\n\nFOUGHT BACK DOGG

In [218]:
docs[303]

