In [None]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK resources requested by this notebook: the stopword corpus
# and the 'punkt' tokenizer data (presumably required by nltk.word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')
# NOTE: this rebinds the imported `stopwords` name from the corpus reader to a
# plain list of English stopwords; remove_stopwords() reads this global list.
stopwords = stopwords.words('english')
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import os
import numpy as np
import math

# Directory holding the preprocessed documents (Google Drive mount in Colab).
PREPROCESSED_DIR = '/content/drive/MyDrive/IR Assignments/A2/preprocessed'

# Every line of every file in PREPROCESSED_DIR becomes one corpus entry.
corpus = []
files = os.listdir(PREPROCESSED_DIR)

for file_name in files:
    path = os.path.join(PREPROCESSED_DIR, file_name)
    # The context manager closes each file handle as soon as it has been read,
    # so handles do not pile up while the directory is scanned.
    with open(path, 'r') as f:
        corpus.extend(f)

In [None]:
# One whitespace-split token list per corpus entry, in corpus order.
doc_tokens = [document.split() for document in corpus]


In [None]:
# Sanity check: number of token lists built (one per corpus entry).
len(doc_tokens)

1399

In [None]:
def extraction(f):
    """Collect the text found between <TITLE>/<TEXT> tags and their closers.

    Args:
        f: iterable of lines (e.g. an open text file). Each line is stripped
            before being compared against the tag markers.

    Returns:
        A single string made of every captured line followed by one space.
        Lines that are themselves a closing tag are never included.
    """
    opening_tags = ('<TITLE>', '<TEXT>')
    closing_tags = ('</TITLE>', '</TEXT>')

    pieces = []
    inside = False
    for raw_line in f:
        stripped = raw_line.strip()
        if stripped in closing_tags:
            # A closing tag ends the captured region and is not captured itself.
            inside = False
        elif inside:
            pieces.append(stripped + ' ')
        # Checked last so the opening tag line itself is not captured when
        # capture starts here.
        if stripped in opening_tags:
            inside = True

    return ''.join(pieces)

In [None]:
def lowercase(x):
    """Return `x` converted to lower case via its `lower()` method."""
    lowered = x.lower()
    return lowered

In [None]:
def tokenize(x):
    """Tokenize the text `x` with `nltk.word_tokenize` and return the result."""
    tokens = nltk.word_tokenize(x)
    return tokens

In [None]:
def remove_stopwords(x):
    """Drop stopwords from a token sequence and join the rest into a string.

    Args:
        x: iterable of string tokens.

    Returns:
        The tokens not found in the module-level `stopwords` collection, each
        followed by a single space (so a non-empty result ends with a space).
    """
    kept = [token for token in x if token not in stopwords]
    return ''.join(token + ' ' for token in kept)

In [None]:
def remove_punctuations(s):
    """Delete every character that is neither a word character nor whitespace.

    Underscores count as word characters for the `\\w` class and are kept.
    """
    return re.sub(r'[^\w\s]', '', s)

In [None]:
def remove_spaces(s):
    """Trim leading and trailing whitespace; inner whitespace is untouched."""
    trimmed = s.strip()
    return trimmed

In [None]:
def preprocess(s):
    """Run the text-cleaning pipeline on a raw string.

    Steps, in order: lower-casing, tokenization, stopword removal (which also
    re-joins the tokens into a string), punctuation removal, and trimming of
    surrounding whitespace.

    Returns:
        The cleaned string.
    """
    lowered = lowercase(s)
    tokens = tokenize(lowered)
    without_stopwords = remove_stopwords(tokens)
    without_punctuation = remove_punctuations(without_stopwords)
    return remove_spaces(without_punctuation)

In [None]:
def raw_tf(doc_tokens):
    """Count how often each token occurs in each document.

    Args:
        doc_tokens: sequence of token lists, one list per document.

    Returns:
        Dict mapping the document's position in `doc_tokens` to a dict of
        {token: occurrence count}, with tokens and counts as produced by
        `np.unique(..., return_counts=True)`.
    """
    counts_by_doc = {}
    for doc_id, tokens in enumerate(doc_tokens):
        terms, frequencies = np.unique(tokens, return_counts=True)
        counts_by_doc[doc_id] = dict(zip(terms, frequencies))
    return counts_by_doc


In [None]:
def no_of_docs_containing_word(corpus, word):
    """Return how many documents contain `word` as a whitespace-separated token.

    Args:
        corpus: iterable of document strings.
        word: the token to look for (exact match against `str.split()` output).
    """
    return sum(1 for document in corpus if word in document.split())

In [None]:
def compute_idf(corpus, vocab):
    """Compute the inverse document frequency of every vocabulary word.

    idf(word) = log10(N / df(word)), where N is the number of documents in
    `corpus` and df(word) is the number of documents containing `word` as a
    whitespace-separated token.

    Document frequencies are gathered in a single pass over the corpus, so
    the cost is proportional to the corpus size plus the vocabulary size
    rather than to their product.

    Args:
        corpus: sequence of document strings.
        vocab: iterable of words to compute the idf for.

    Returns:
        Dict mapping each word of `vocab` to its idf. Words that occur in no
        document get 0.0 instead of triggering a division by zero.
    """
    doc_freq = {}
    for document in corpus:
        # set() so a word repeated inside one document is counted once.
        for word in set(document.split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    num_docs = len(corpus)
    idf = {}
    for word in vocab:
        freq = doc_freq.get(word, 0)
        idf[word] = math.log10(num_docs / freq) if freq else 0.0
    return idf

In [None]:
def tf_weight(term, doc_tfs, scheme):
    """Return the term-frequency weight of `term` under a weighting scheme.

    Args:
        term: the term to weight.
        doc_tfs: dict mapping each term of one document (or query) to its raw
            occurrence count.
        scheme: one of
            "binary"      -> 1 if the term is present, else 0
            "raw"         -> the raw count
            "tf"          -> count / total number of term occurrences
            "log_norm"    -> log10(1 + count)
            "double_norm" -> 0.5 + 0.5 * count / (largest count in doc_tfs)

    Returns:
        The weight. For a term missing from `doc_tfs` this is 0, except under
        "double_norm" where it is 0.5 (the formula evaluated at count 0).

    Raises:
        ValueError: if `scheme` is not one of the names above. Without this
            check an unknown scheme would yield None and only fail later, far
            from the cause, when the weight is multiplied by an idf.
    """
    known_schemes = ("binary", "raw", "tf", "log_norm", "double_norm")
    if scheme not in known_schemes:
        raise ValueError(
            "unknown tf weighting scheme %r; expected one of %s"
            % (scheme, ", ".join(known_schemes))
        )

    if term not in doc_tfs:
        return 0.5 if scheme == "double_norm" else 0

    count = doc_tfs[term]
    if scheme == "binary":
        return 1
    if scheme == "raw":
        return count
    if scheme == "tf":
        return count / sum(doc_tfs.values())
    if scheme == "log_norm":
        return math.log10(1 + count)
    # scheme == "double_norm"
    return 0.5 + (0.5) * (count / max(doc_tfs.values()))

In [None]:
def gen_tf_idf(doc_tokens, vocab, scheme, term_idfs=None):
    """Build the document-by-term tf-idf matrix for one tf weighting scheme.

    Args:
        doc_tokens: sequence of token lists, one per document.
        vocab: sequence of terms; position in `vocab` is the matrix column.
        scheme: tf weighting scheme name understood by `tf_weight`.
        term_idfs: optional precomputed {term: idf} mapping covering every
            term of `vocab`. When omitted, idf is computed from `doc_tokens`
            itself, so the result depends only on the arguments and not on
            any module-level corpus variable. Passing a precomputed table
            avoids recomputing idf for every scheme.

    Returns:
        numpy array of shape (len(doc_tokens), len(vocab)) where entry
        [d, t] is tf_weight(vocab[t], counts of document d) * idf(vocab[t]).
    """
    num_docs = len(doc_tokens)
    num_words = len(vocab)

    raw_tfs = raw_tf(doc_tokens)
    if term_idfs is None:
        # compute_idf works on whitespace-separated document strings, so the
        # token lists are re-joined. Assumes tokens contain no whitespace
        # themselves (true when they come from str.split()).
        documents = [' '.join(tokens) for tokens in doc_tokens]
        term_idfs = compute_idf(documents, vocab)

    tf_idf = np.zeros((num_docs, num_words))

    for doc_id in tqdm(range(num_docs)):
        doc_tfs = raw_tfs[doc_id]
        for col, term in enumerate(vocab):
            tf_idf[doc_id, col] = tf_weight(term, doc_tfs, scheme) * term_idfs[term]

    return tf_idf


In [None]:
from tqdm import tqdm
# Vocabulary = every distinct token seen in any document. The list order comes
# from set iteration, so it is arbitrary; it only has to stay fixed for the
# rest of the session because it defines the tf-idf column order.
unique_words = set()
for tokens in doc_tokens:
    unique_words.update(tokens)
vocab = list(unique_words)

In [None]:
# One tf-idf matrix per tf weighting scheme, built in this order.
(
    binary_tf_idf,
    raw_tf_idf,
    tf_tf_idf,
    log_norm_tf_idf,
    double_norm_tf_idf,
) = (
    gen_tf_idf(doc_tokens, vocab, scheme)
    for scheme in ('binary', 'raw', 'tf', 'log_norm', 'double_norm')
)

100%|██████████| 8960/8960 [01:41<00:00, 88.27it/s]
100%|██████████| 1399/1399 [00:10<00:00, 129.42it/s]
100%|██████████| 8960/8960 [01:42<00:00, 87.64it/s]
100%|██████████| 1399/1399 [00:13<00:00, 106.31it/s]
100%|██████████| 8960/8960 [01:44<00:00, 85.88it/s]
100%|██████████| 1399/1399 [00:13<00:00, 103.65it/s]
100%|██████████| 8960/8960 [01:41<00:00, 88.14it/s]
100%|██████████| 1399/1399 [00:12<00:00, 114.69it/s]
100%|██████████| 8960/8960 [01:40<00:00, 88.98it/s] 
100%|██████████| 1399/1399 [00:14<00:00, 99.29it/s] 


In [None]:
# Corpus-wide idf table; the query-vector cells below pass it to
# gen_query_vector to weight the query terms.
term_idfs = compute_idf(corpus, vocab)

100%|██████████| 8960/8960 [01:45<00:00, 85.13it/s]


In [None]:
def gen_query_vector(query_toks, scheme, term_idfs, vocab):
    """Build the tf-idf vector of a query, aligned with the vocabulary columns.

    Each query term's weight is stored at that term's index in `vocab`, so
    the vector can be dotted with rows of a document matrix whose columns
    follow the same `vocab` order.

    Args:
        query_toks: list of query tokens (duplicates raise the term's count).
        scheme: tf weighting scheme name understood by `tf_weight`.
        term_idfs: {term: idf} mapping; terms missing from it get idf 0.
        vocab: sequence of terms defining the vector's dimensions.

    Returns:
        List of length len(vocab). Entries for terms that do not occur in the
        query stay 0; query tokens that are not in `vocab` have no dimension
        and are therefore ignored.
    """
    # Raw occurrence count of every query token.
    query_tfs = {}
    for token in query_toks:
        query_tfs[token] = query_tfs.get(token, 0) + 1

    query_vector = [0] * len(vocab)
    for col, term in enumerate(vocab):
        if term in query_tfs:
            term_tf_weight = tf_weight(term, query_tfs, scheme)
            query_vector[col] = term_tf_weight * term_idfs.get(term, 0)
    return query_vector

In [None]:
def calc_top_5(arr, k=10):
    """Return the indices of the `k` largest values, best first.

    Args:
        arr: sequence or array of numeric scores.
        k: how many indices to return. Defaults to 10, the cut-off this
            function has always applied.

    Returns:
        numpy array holding at most `k` indices into `arr`, ordered by
        descending score. The relative order of tied scores is whatever
        numpy's default argsort produces.

    NOTE(review): the function name and the "top 5" labels printed by the
    callers suggest 5 results, yet 10 are returned by default. Pass k=5 if
    five results are what is wanted.
    """
    scores = np.asarray(arr)
    # Negating turns argsort's ascending order into a descending ranking.
    return (-scores).argsort()[:k]

In [None]:
# Clean the free-text query with the same pipeline helper, then build one
# query vector per tf weighting scheme.
query = 'experimental slipstream sHear in simple'
query = preprocess(query).split()
(
    q_vec_binary,
    q_vec_raw,
    q_vec_tf,
    q_vec_log_norm,
    q_vec_double_norm,
) = (
    gen_query_vector(query, scheme, term_idfs, vocab)
    for scheme in ('binary', 'raw', 'tf', 'log_norm', 'double_norm')
)

In [None]:
# Score of every document = dot product of its tf-idf row with the query
# vector built under the same weighting scheme.
binary_scores = np.dot(binary_tf_idf, q_vec_binary)
raw_scores = np.dot(raw_tf_idf, q_vec_raw)
tf_scores = np.dot(tf_tf_idf, q_vec_tf)
log_norm_scores = np.dot(log_norm_tf_idf, q_vec_log_norm)
double_norm_scores = np.dot(double_norm_tf_idf, q_vec_double_norm)

In [None]:
# Ranked document ids with their scores under binary tf weighting.
print("Binary top 5")
top_5 = calc_top_5(binary_scores)
for doc_id, score in zip(top_5, binary_scores[top_5]):
    print(doc_id, score)

Binary top 5
492 5.515225426973093
1163 5.515225426973093
429 5.515225426973093
481 3.748478388203395
82 2.7863964844710054
398 1.4490950836895478
639 1.4490950836895478
892 1.4490950836895478
1059 1.4490950836895478
11 1.4490950836895478


In [None]:
# Ranked document ids with their scores under raw-count tf weighting.
print("Raw top 5")
top_5 = calc_top_5(raw_scores)
for doc_id, score in zip(top_5, raw_scores[top_5]):
    print(doc_id, score)

Raw top 5
429 27.576127134865466
492 11.030450853946187
1163 11.030450853946187
481 3.748478388203395
82 2.7863964844710054
398 1.4490950836895478
639 1.4490950836895478
892 1.4490950836895478
1059 1.4490950836895478
11 1.4490950836895478


In [None]:
# Ranked document ids with their scores under relative term-frequency weighting.
print("Term Frequency top 5")
top_5 = calc_top_5(tf_scores)
for doc_id, score in zip(top_5, tf_scores[top_5]):
    print(doc_id, score)

Term Frequency top 5
1163 0.056277810479317285
429 0.053442106850514474
492 0.05106690210160272
82 0.00849511123314331
1331 0.004364744227980565
1167 0.004164066332441229
481 0.0040567947924279165
11 0.003937758379591162
639 0.002058373698422653
892 0.0020238758152088658


In [None]:
# Ranked document ids with their scores under log-normalized tf weighting.
print("Log Normalized top 5")
top_5 = calc_top_5(log_norm_scores)
for doc_id, score in zip(top_5, log_norm_scores[top_5]):
    print(doc_id, score)

Log Normalized top 5
429 1.2919242799842412
492 0.7921397455378736
1163 0.7921397455378736
481 0.33968358155737155
82 0.2525006254438148
398 0.13131563185582804
639 0.13131563185582804
892 0.13131563185582804
1059 0.13131563185582804
11 0.13131563185582804


In [None]:
# Ranked document ids with their scores under double-normalized tf weighting.
print("Double Normalized top 5")
top_5 = calc_top_5(double_norm_scores)
for doc_id, score in zip(top_5, double_norm_scores[top_5]):
    print(doc_id, score)

Double Normalized top 5
492 9.507210405155067
429 9.507210405155067
1163 8.128404048411795
481 7.06197089068547
82 7.028237340115621
11 6.991113538950112
1167 6.930734577129714
1331 6.894507200037475
639 6.870355615309316
892 6.870355615309316


In [None]:
def jaccard_coefficient(doc, query):
    """Return the Jaccard coefficient between a document and a query.

    The coefficient is |D ∩ Q| / |D ∪ Q|, where D and Q are the sets of
    distinct tokens in `doc` and `query`.

    Args:
        doc: iterable of document tokens.
        query: iterable of query tokens.

    Returns:
        A float in [0, 1]. When both inputs are empty the union is empty too;
        0.0 is returned in that case instead of dividing by zero.
    """
    doc_set = set(doc)
    query_set = set(query)

    union_size = len(doc_set | query_set)
    if union_size == 0:
        return 0.0
    return len(doc_set & query_set) / union_size

In [None]:
# Jaccard coefficient of every corpus document against the lower-cased,
# whitespace-split query.
query = 'experimental slipstream shear simple'
query = query.lower().split()
jcs = [jaccard_coefficient(document.split(), query) for document in corpus]

In [None]:
# Ranked document ids with their Jaccard coefficients.
print("Jaccard coefficient")
top_5 = calc_top_5(jcs)
for doc_id in top_5:
    print(doc_id, jcs[doc_id])

Jaccard coefficient
838 0.11764705882352941
959 0.08
532 0.0625
611 0.061224489795918366
501 0.061224489795918366
705 0.06060606060606061
986 0.05405405405405406
657 0.05405405405405406
781 0.05263157894736842
878 0.05263157894736842
