In [1]:
import numpy as np
import pandas as pd
import re
import collections

In [2]:
# Read the documents
docDB = []
docs_names = ['doc1.txt', 'doc2.txt', 'doc3.txt']
for name in docs_names:
    with open(f'./docs/{name}', 'r') as f:
        docDB.append(f.read())
docDB

['THE ALLIES AFTER NASSAU IN DECEMBER 1960, THE U.S . FIRST\n\nPROPOSED TO HELP NATO DEVELOP ITS OWN NUCLEAR STRIKE FORCE . BUT EUROPE\n\nMADE NO ATTEMPT TO DEVISE A PLAN . LAST WEEK, AS THEY STUDIED THE\n\nNASSAU ACCORD BETWEEN PRESIDENT KENNEDY AND PRIME MINISTER MACMILLAN,\n\nEUROPEANS SAW EMERGING THE FIRST OUTLINES OF THE NUCLEAR NATO THAT THE\n\nU.S . WANTS AND WILL SUPPORT . IT ALL SPRANG FROM THE ANGLO-U.S .\n\nCRISIS OVER CANCELLATION OF THE BUG-RIDDEN SKYBOLT MISSILE, AND THE\n\nU.S . OFFER TO SUPPLY BRITAIN AND FRANCE WITH THE PROVED POLARIS (TIME,\n\nDEC . 28) . THE ONE ALLIED LEADER WHO UNRESERVEDLY WELCOMED THE POLARIS\n\nOFFER WAS HAROLD MACMILLAN, WHO BY THUS KEEPING A SEPARATE NUCLEAR\n\nDETERRENT FOR BRITAIN HAD SAVED HIS OWN NECK . BACK FROM NASSAU, THE\n\nPRIME MINISTER BEAMED THAT BRITAIN NOW HAD A WEAPON THAT " WILL LAST A\n\nGENERATION . THE TERMS ARE VERY GOOD . " MANY OTHER BRITONS WERE NOT SO\n\nSURE . THOUGH THE GOVERNMENT WILL SHOULDER NONE OF THE $800 MILLI

In [3]:
# Read the queries
with open('./docs/queries.txt', 'r') as f:
    content = f.read()
    matches = re.findall(r'\*QUERY\n\n(.+?)(\n\n)?(?=\*QUERY|$)', content, re.DOTALL)
queriesDB = [query[0] for query in matches]
queriesDB

['THE ALLIES IN NASSAU',
 'KHRUSHCHEV AND\n\nRUSSIA',
 'BERLIN ONE LAST RUN HANS WEIDNER HAD BEEN HOPING FOR MONTHS TO\n\nESCAPE DRAB EAST GERMANY AND MAKE HIS WAY TO THE WEST']

In [4]:
# Read the stop words
with open('./docs/stop.txt', 'r') as f:
    content = f.read()
    matches = re.findall(r'^[A-Z0-9]+$', content, re.MULTILINE)
stopWordsDB = [word.lower() for word in matches]
stopWordsDB

['a',
 'about',
 'above',
 'across',
 'actually',
 'add',
 'added',
 'after',
 'again',
 'against',
 'ago',
 'all',
 'almost',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'an',
 'and',
 'another',
 'any',
 'anyone',
 'are',
 'around',
 'as',
 'asked',
 'at',
 'b',
 'back',
 'bad',
 'be',
 'became',
 'because',
 'become',
 'been',
 'before',
 'began',
 'behind',
 'being',
 'best',
 'better',
 'between',
 'big',
 'biggest',
 'both',
 'brought',
 'but',
 'by',
 'c',
 'called',
 'came',
 'can',
 'cannot',
 'cent',
 'come',
 'complete',
 'continued',
 'could',
 'd',
 'day',
 'decided',
 'declared',
 'despite',
 'did',
 'do',
 'does',
 'down',
 'during',
 'e',
 'each',
 'early',
 'eight',
 'enough',
 'entire',
 'ep',
 'etc',
 'even',
 'ever',
 'every',
 'everything',
 'f',
 'face',
 'faced',
 'fact',
 'failed',
 'far',
 'fell',
 'few',
 'finally',
 'find',
 'first',
 'five',
 'for',
 'found',
 'four',
 'from',
 'g',
 'gave',
 'get',
 'give',
 'given',
 'go',
 'go

In [5]:
# Tokenize + Normalize with Stopping
def process_document(document_text):
    return ' '.join([token.lower() for token in document_text.split() if token.lower().isalnum() and token.lower() not in stopWordsDB])

In [6]:
processed_docs = [process_document(doc) for doc in docDB]
processed_docs

['allies nassau december proposed help nato develop nuclear strike force europe attempt devise plan studied nassau accord president kennedy prime minister europeans saw emerging outlines nuclear nato wants support sprang crisis cancellation skybolt offer supply britain france polaris dec allied leader unreservedly welcomed polaris offer harold keeping separate nuclear deterrent britain saved neck prime minister beamed britain weapon generation terms britons sure government shoulder none development cost poured skybolt spend billion fleet submarines british able build prove nuclear fleet bomber force presumably obsolete tory backbenchers loudly skeptical call type nassau stipulates polaris submarine except supreme national interests committed truly multilateral nato force mean britain eventually strike force decide national interests justify withdrawal submarines particularly national interests conflict policy inclusion offer deliberate ploy jack kennedy end downgrade prized special rel

In [7]:
# weight calc
def df(term, documentDB):
    return len([1 for document in documentDB if term in document.split()])

def idf(term, documentDB):
    return np.log10((len(documentDB) + 1) / (df(term, documentDB) + 0.5))

In [8]:
all_terms = sorted(set([term for doc in processed_docs for term in doc.split()]))
all_terms

['100',
 '115',
 '1954',
 '20',
 '32',
 '4',
 '40',
 '50',
 'able',
 'abruptly',
 'accept',
 'accord',
 'achieve',
 'acres',
 'adenauer',
 'advance',
 'advanced',
 'affixed',
 'ahead',
 'aimed',
 'alliance',
 'allied',
 'allies',
 'ancient',
 'answer',
 'apprehensive',
 'argued',
 'army',
 'arrived',
 'arsenal',
 'atlantic',
 'atomic',
 'attached',
 'attack',
 'attacked',
 'attempt',
 'autobahn',
 'awaited',
 'away',
 'backbenchers',
 'backed',
 'bail',
 'barbed',
 'barrier',
 'barriers',
 'beamed',
 'becomes',
 'belongings',
 'berlin',
 'billion',
 'bits',
 'bitter',
 'blades',
 'blame',
 'bleak',
 'blew',
 'bolted',
 'bomber',
 'bombs',
 'boost',
 'border',
 'boss',
 'bounced',
 'bragged',
 'britain',
 'british',
 'britons',
 'bruises',
 'build',
 'bulletin',
 'bullets',
 'bus',
 'call',
 'cancellation',
 'cannon',
 'capacity',
 'cargo',
 'cartoonists',
 'case',
 'central',
 'chancellor',
 'charge',
 'charles',
 'chassis',
 'checkpoint',
 'children',
 'chop',
 'christmas',
 'chugged'

In [9]:
idfCache = {term:idf(term, processed_docs) for term in all_terms}
idfCache

{'100': 0.4259687322722811,
 '115': 0.4259687322722811,
 '1954': 0.20411998265592482,
 '20': 0.4259687322722811,
 '32': 0.4259687322722811,
 '4': 0.4259687322722811,
 '40': 0.4259687322722811,
 '50': 0.4259687322722811,
 'able': 0.4259687322722811,
 'abruptly': 0.4259687322722811,
 'accept': 0.4259687322722811,
 'accord': 0.4259687322722811,
 'achieve': 0.4259687322722811,
 'acres': 0.4259687322722811,
 'adenauer': 0.4259687322722811,
 'advance': 0.4259687322722811,
 'advanced': 0.4259687322722811,
 'affixed': 0.4259687322722811,
 'ahead': 0.4259687322722811,
 'aimed': 0.4259687322722811,
 'alliance': 0.4259687322722811,
 'allied': 0.4259687322722811,
 'allies': 0.4259687322722811,
 'ancient': 0.4259687322722811,
 'answer': 0.4259687322722811,
 'apprehensive': 0.4259687322722811,
 'argued': 0.4259687322722811,
 'army': 0.4259687322722811,
 'arrived': 0.4259687322722811,
 'arsenal': 0.4259687322722811,
 'atlantic': 0.4259687322722811,
 'atomic': 0.4259687322722811,
 'attached': 0.425968

In [10]:
def calculateWeightVector(document, idfCache):
    counter = collections.Counter(document.split())
    return [counter.get(term, 0) * idfCache[term] for term in all_terms]

In [11]:
tdMatrix = pd.DataFrame([calculateWeightVector(document, idfCache) for docID, document in enumerate(processed_docs)], columns=all_terms)
tdMatrix

Unnamed: 0,100,115,1954,20,32,4,40,50,able,abruptly,...,wheel,whirling,windshield,wire,withdrawal,within,wives,work,worst,yards
0,0.0,0.0,0.20412,0.0,0.0,0.425969,0.0,0.425969,0.425969,0.0,...,0.0,0.0,0.0,0.0,0.425969,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.20412,0.0,0.425969,0.0,0.0,0.0,0.0,0.425969,...,0.0,0.0,0.0,0.0,0.0,0.425969,0.0,0.425969,0.425969,0.0
2,0.425969,0.425969,0.0,0.425969,0.0,0.0,0.425969,0.0,0.0,0.0,...,0.851937,0.425969,0.425969,0.425969,0.0,0.0,0.851937,0.0,0.0,0.851937


In [12]:
processed_queries = [process_document(query) for query in queriesDB]
processed_queries

['allies nassau',
 'khrushchev russia',
 'berlin run hans weidner hoping escape drab east germany west']

In [13]:
def cosine_similarity(queryWeights, tdMatrix):
    return np.dot(tdMatrix, queryWeights) / (np.linalg.norm(queryWeights) * np.linalg.norm(tdMatrix, axis=1))

def find_similarity_order(query, tdMatrix):
    cosine_similarity_scores = cosine_similarity(calculateWeightVector(query, idfCache), tdMatrix.values)

    df = pd.DataFrame({'docID': tdMatrix.index, 'cosine_similarity_scores': cosine_similarity_scores})
    sorted_df = df.sort_values(by='cosine_similarity_scores', ascending=False)

    return sorted_df['docID'].values.tolist()

In [14]:
{str(query_no):find_similarity_order(query_text, tdMatrix) for query_no, query_text in enumerate(processed_queries)}

{'0': [0, 1, 2], '1': [1, 0, 2], '2': [2, 0, 1]}

BIM

In [15]:
def trk(term, rel_docs, non_rel_docs):
    rk = df(term, rel_docs)
    nrk = df(term, non_rel_docs)
    
    pk = (rk+0.5)/(len(rel_docs)-rk+0.5)
    qk = (nrk+0.5)/(len(non_rel_docs)-nrk+0.5)
    
    return np.log10(pk/qk)

# BIM Phase 1 with all non-rel docs.
trk_cache = {term:trk(term, {}, processed_docs) for term in all_terms}
trk_cache

{'100': 0.22184874961635637,
 '115': 0.22184874961635637,
 '1954': -0.22184874961635637,
 '20': 0.22184874961635637,
 '32': 0.22184874961635637,
 '4': 0.22184874961635637,
 '40': 0.22184874961635637,
 '50': 0.22184874961635637,
 'able': 0.22184874961635637,
 'abruptly': 0.22184874961635637,
 'accept': 0.22184874961635637,
 'accord': 0.22184874961635637,
 'achieve': 0.22184874961635637,
 'acres': 0.22184874961635637,
 'adenauer': 0.22184874961635637,
 'advance': 0.22184874961635637,
 'advanced': 0.22184874961635637,
 'affixed': 0.22184874961635637,
 'ahead': 0.22184874961635637,
 'aimed': 0.22184874961635637,
 'alliance': 0.22184874961635637,
 'allied': 0.22184874961635637,
 'allies': 0.22184874961635637,
 'ancient': 0.22184874961635637,
 'answer': 0.22184874961635637,
 'apprehensive': 0.22184874961635637,
 'argued': 0.22184874961635637,
 'army': 0.22184874961635637,
 'arrived': 0.22184874961635637,
 'arsenal': 0.22184874961635637,
 'atlantic': 0.22184874961635637,
 'atomic': 0.22184874

In [18]:
def calculateWeightVectorBIM(document, trk):
    token_stream = set(document.split())
    return [trk[term] if term in token_stream else 0 for term in all_terms]

def generate_td_matrix(processed_docs, trk_cache):
    return pd.DataFrame([calculateWeightVectorBIM(document, trk_cache) for document in processed_docs], columns=all_terms)

tdMatrixBIM = generate_td_matrix(processed_docs, trk_cache)
tdMatrixBIM

Unnamed: 0,100,115,1954,20,32,4,40,50,able,abruptly,...,wheel,whirling,windshield,wire,withdrawal,within,wives,work,worst,yards
0,0.0,0.0,-0.221849,0.0,0.0,0.221849,0.0,0.221849,0.221849,0.0,...,0.0,0.0,0.0,0.0,0.221849,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,-0.221849,0.0,0.221849,0.0,0.0,0.0,0.0,0.221849,...,0.0,0.0,0.0,0.0,0.0,0.221849,0.0,0.221849,0.221849,0.0
2,0.221849,0.221849,0.0,0.221849,0.0,0.0,0.221849,0.0,0.0,0.0,...,0.221849,0.221849,0.221849,0.221849,0.0,0.0,0.221849,0.0,0.0,0.221849


In [21]:
def simple_similarity(tdMatrix, queryWeights):
    return np.dot(tdMatrix, queryWeights)

def find_similarity_order_BIM(query, tdMatrix):
    similarity_scores = simple_similarity(tdMatrix.values, calculateWeightVectorBIM(query, trk_cache))

    df = pd.DataFrame({"docID": tdMatrix.index, "similarity_scores": similarity_scores})
    sorted_df = df.sort_values(by="similarity_scores", ascending=False)

    return sorted_df['docID'].values.tolist()

In [22]:
BIM_phase_1_results = {str(query_no):find_similarity_order_BIM(query_text, tdMatrix) for query_no, query_text in enumerate(processed_queries)}
BIM_phase_1_results

{'0': [0, 1, 2], '1': [1, 2, 0], '2': [2, 1, 0]}

In [27]:
def recompute_trk(rel_docs, documentDB):
    relevant_docs, non_relevant_docs = {}, {}
    for docID, docText in enumerate(documentDB):
        if docID in rel_docs:
            relevant_docs[str(docID)] = docText
        else:
            non_relevant_docs[str(docID)] = docText
    return {term:trk(term, relevant_docs, non_relevant_docs) for term in all_terms}

def BIMPhase2(BIM_Phase_1_results, queries, docs):
    result = {}
    for qNo, qText in enumerate(queries):
        newTrk = recompute_trk(BIM_Phase_1_results[str(qNo)], docs)
        newTDMatrix = generate_td_matrix(processed_docs, newTrk)
        result[str(qNo)] = find_similarity_order_BIM(qText, newTDMatrix)
    return result

In [28]:
BIMPhase2(BIM_phase_1_results, processed_queries, processed_docs)

{'0': [1, 2, 0], '1': [0, 1, 2], '2': [0, 1, 2]}