### Importing necessary libraries 

In [144]:
import pickle
import os
import math
import string
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Shared lemmatizer instance used by preprocessing_words().
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words, minus three terms that must stay searchable.
stop_words = set(stopwords.words('english'))
for kept_word in ('where', 'to', 'is'):
    stop_words.remove(kept_word)

# Every token that survives preprocessing_words() is added to this set.
vocab = set()

### For loading and storing Data Structures

In [145]:
def pickle_data(filename, file):
    '''
    Pickle an object to 'New folder/<filename>.pkl'.

    filename : base name of the output file, without the '.pkl' extension.
    file     : the Python object to serialise.

    The directory 'New folder' must already exist; open() raises otherwise.
    '''
    target = os.path.join('New folder', filename + '.pkl')
    # The context manager closes the handle even when pickle.dump raises.
    with open(target, 'wb') as out:
        pickle.dump(file, out)
    
def load_pickle_data(file):
    '''
    Load and return the object pickled in 'New folder/<file>'.

    file : file name including its extension (e.g. 'positional_dict.pkl').

    NOTE: pickle.load can execute arbitrary code while deserialising, so
    only use this on files that this notebook wrote itself.
    '''
    source = os.path.join('New folder', file)
    # The context manager closes the handle even when pickle.load raises.
    with open(source, 'rb') as handle:
        return pickle.load(handle)

## Preprocessing

In [146]:
def remove_metadata(lines):
    '''
    Drop everything up to and including the first blank line.

    A blank line is an element that is exactly '\n'. When no such element
    exists, the input is returned unchanged (as a slice copy).
    '''
    body_start = next(
        (idx + 1 for idx, line in enumerate(lines) if line == '\n'),
        0,
    )
    return lines[body_start:]

In [147]:
def preprocessing_words(words):
    '''
    Normalise a list of raw tokens and return the ones worth indexing.

    Each token has tabs and punctuation removed, is lower-cased, and is
    lemmatized if it is purely alphanumeric. Tokens shorter than two
    characters or present in stop_words are dropped.

    Side effect: every returned token is also added to the global vocab,
    whoever the caller is.
    '''
    global vocab

    # Strip tabs and punctuation in one pass, then discard tokens left empty.
    strip_table = str.maketrans('', '', '\t' + string.punctuation)
    cleaned = [token.translate(strip_table) for token in words]
    cleaned = [token for token in cleaned if token]

    mod_words = []
    for token in cleaned:
        token = token.lower()
        if not token.isalnum():
            continue
        lemma = lemmatizer.lemmatize(token)
        if len(lemma) < 2 or lemma in stop_words:
            continue
        vocab.add(lemma)
        mod_words.append(lemma)
    return mod_words

### TF-IDF Logic

In [148]:
def compute_tf(words):
    '''
    Compute raw and log-weighted term frequencies for one token list.

    words : iterable of tokens (hashable values).

    Returns (tf_dict, wtf_dict):
        tf_dict  : token -> number of occurrences in words.
        wtf_dict : token -> 1 + log10(occurrences).

    The two dictionaries are separate objects, so writing the weights
    leaves the raw counts intact.
    '''
    tf_dict = {}
    for key in words:
        tf_dict[key] = tf_dict.get(key, 0) + 1

    # Build the weights in a new dict; aliasing tf_dict would overwrite the counts.
    wtf_dict = {key: 1 + math.log10(count) for key, count in tf_dict.items()}

    return tf_dict, wtf_dict

In [149]:
def compute_idf(tf, positional_dict, N=10):
    '''
    Compute log10(N / frequency) for every term in tf.

    tf              : dict whose keys are the terms to score.
    positional_dict : term -> [frequency, postings]; only element 0 is read.
    N               : number of documents in the collection. Defaults to 10,
                      the value that used to be hard-coded here.

    Returns a dict term -> idf. A term that is missing from positional_dict,
    or whose stored frequency is not positive, gets 0; this is decided per
    term, so one unknown term does not stop the others from being scored.
    '''
    idf_dict = {}
    for key in tf:
        entry = positional_dict.get(key)
        count = entry[0] if entry else 0
        # Guard the division per term instead of wrapping the whole loop.
        idf_dict[key] = math.log10(N / count) if count > 0 else 0

    return idf_dict

def idf_wrapper(docs_tf, positional_dict):
    '''
    Apply compute_idf to each per-document tf dict.

    Returns a list of idf dicts in the same order as docs_tf.
    '''
    return [compute_idf(doc_tf, positional_dict) for doc_tf in docs_tf]

In [150]:
def compute_tfidf(tf,idf):
    '''
    Multiply each term's tf value by its idf value.

    Every key of tf must also be a key of idf; a missing key raises KeyError.
    Returns a new dict term -> tf * idf.
    '''
    return {term: weight * idf[term] for term, weight in tf.items()}
    

def tfidf_wrapper(docs_tf,docs_idf):
    '''
    Pair up the per-document tf and idf dicts and compute tf-idf for each pair.

    Pairing uses zip, so the result is as long as the shorter input.
    '''
    return [compute_tfidf(doc_tf, doc_idf) for doc_tf, doc_idf in zip(docs_tf, docs_idf)]

In [151]:
def create_vector(vocab,tfidf):
    '''
    Expand a sparse tf-idf dict into a vector over the whole vocabulary.

    Returns a dict with one entry per term in vocab: the tf-idf weight when
    the term is in tfidf, otherwise 0.0.
    '''
    return {term: tfidf.get(term, 0.0) for term in vocab}

def vectors_wrapper(vocab,docs_tfidf):
    '''
    Build one vocabulary-wide vector per document.

    Returns a list of create_vector results in the same order as docs_tfidf.
    '''
    return [create_vector(vocab, doc_tfidf) for doc_tfidf in docs_tfidf]

### Positional Index Construction

In [152]:
def process_text(lines, positional_dict,doc_ID):
    ''' Process the text of one file and add it to the positional index.

        The metadata header is removed, the remaining text is tokenized and
        preprocessed, and each surviving token's position is recorded.

        positional dictionary format :
            pos_dict = {
                        token : [doc_freq,{
                            file_id:[pos1, pos2,.....]
                            }]
                        }

        doc_freq counts the documents that contain the token. It is increased
        once per document, not once per occurrence.

        Returns (doc_tf, doc_wtf, positional_dict); positional_dict is the
        same object that was passed in, updated in place.
    '''
    lines = remove_metadata(lines)
    text = ' '.join(lines)
    # Tokenize, then normalise / filter the tokens.
    words = preprocessing_words(word_tokenize(text))
    doc_tf, doc_wtf = compute_tf(words)
    for pos, word in enumerate(words):
        # First encounter of a term creates an empty entry.
        entry = positional_dict.setdefault(word, [0, {}])
        postings = entry[1]
        if doc_ID not in postings:
            # First occurrence of this term in this document.
            entry[0] += 1
            postings[doc_ID] = []
        postings[doc_ID].append(pos)

    return doc_tf, doc_wtf, positional_dict

### Processing Directories

In [153]:
file_mapper = {}
docs_tf = []
docs_wtf = []

root_dirs = ['New folder']
doc_ID = 0
positional_dict = {}
for fold in root_dirs:
    print('Processing files in : {}'.format(fold))

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # doc_ID -> file assignment stable from one run to the next.
    for file in sorted(os.listdir(fold)):
        path = os.path.join(fold,file)

        # pickle_data() writes its .pkl output into 'New folder' as well, so
        # a re-run would otherwise index those pickles as documents.
        # Sub-directories are skipped because open() cannot read them.
        if not os.path.isfile(path) or file.endswith('.pkl'):
            continue

        with open(path , 'r') as f:
            lines = f.readlines()
        doc_tf, doc_wtf, positional_dict = process_text(lines,positional_dict,doc_ID)
        docs_tf.append(doc_tf)
        docs_wtf.append(doc_wtf)
        file_mapper[doc_ID] = path
        doc_ID += 1
        if doc_ID % 100 == 0:
            print('Processing of {} files in {} is completed '.format(doc_ID,fold))

# idf depends on the finished index, so compute it once after all folders.
docs_idf = idf_wrapper(docs_tf, positional_dict)
docs_tfidf = tfidf_wrapper(docs_tf,docs_idf)
docs_vectors = vectors_wrapper(vocab,docs_tfidf)

# pickling the dictionary
pickle_data('positional_dict',positional_dict)
pickle_data('pos_file_mapper',file_mapper)

Processing files in : New folder


### Loading Files

In [154]:
# Reload the two pickled structures from disk.
positional_dict, file_mapper = (
    load_pickle_data(pickle_name)
    for pickle_name in ('positional_dict.pkl', 'pos_file_mapper.pkl')
)

# Collection statistics: number of document vectors and vocabulary size.
N, V = len(docs_vectors), len(vocab)

print('Num of docs :',N)
print('Vocab Size : ',V)
# Number of distinct terms in the positional index.
print(len(positional_dict))

Num of docs : 10
Vocab Size :  14
14


In [155]:
# Wrap each list of per-document dicts in a DataFrame (one row per dict).
df_tf, df_wtf, df_idf, df_vec = (
    pd.DataFrame(per_doc_records)
    for per_doc_records in (docs_tf, docs_wtf, docs_idf, docs_vectors)
)

# Write every table to its own CSV file.
for frame, csv_name in (
    (df_tf, 'docs_tf.csv'),
    (df_wtf, 'docs_wtf.csv'),
    (df_idf, 'docs_idf.csv'),
    (df_vec, 'docs_vectors.csv'),
):
    frame.to_csv(csv_name, index=False, encoding='utf-8-sig')

In [156]:
# Read the term-frequency table back and transpose it.
df_tf = pd.read_csv("docs_tf.csv").T
# Display only: the zero-filled frame is not stored back into df_tf.
df_tf.fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
antony,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
brutus,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
caeser,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mercy,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
worser,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
fool,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
fear,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
rush,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
to,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [157]:
# Read the weighted term-frequency table back and transpose it.
df_wtf = pd.read_csv("docs_wtf.csv").T
# Display only: the zero-filled frame is not stored back into df_wtf.
df_wtf.fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
antony,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
brutus,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
caeser,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mercy,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
worser,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
fool,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
fear,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
rush,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
to,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [158]:
# Read the idf table back and transpose it.
df_idf = pd.read_csv("docs_idf.csv").T
# Display only: the zero-filled frame is not stored back into df_idf.
df_idf.fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
antony,0.522879,0.0,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0
brutus,0.522879,0.0,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0
caeser,0.30103,0.0,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mercy,0.30103,0.0,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0
worser,0.39794,0.0,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0
fool,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794
fear,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0
rush,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794
to,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794


In [159]:
# Read the document vectors back; df_vec itself stays untransposed.
df_vec = pd.read_csv("docs_vectors.csv")
# Display the transposed view.
df_vec.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
caeser,0.30103,0.0,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0
to,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794
mercy,0.30103,0.0,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0
worser,0.39794,0.0,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0
antony,0.522879,0.0,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0
fear,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0
fool,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794
brutus,0.522879,0.0,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tread,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794


In [160]:
def retrieve_list(word):
    '''
    Look up a token in the global positional_dict.

    Returns the token's index entry when it exists. Otherwise prints a
    "not present" message and returns an empty list.
    '''
    if word not in positional_dict:
        print('Term : {} not present in dictionary'.format(word))
        return []
    return positional_dict[word]

In [161]:
def positional_intersect(pos_list_1,pos_list_2,k):
    '''
    Find the documents where the second term occurs exactly k positions
    after the first term.

    pos_list_1, pos_list_2 : dicts file_id -> list of positions.
    k                      : required value of (pos2 - pos1); may be negative.

    Returns the matching file ids, in the iteration order of pos_list_1.
    '''
    matches = []
    for doc in pos_list_1:
        if doc not in pos_list_2:
            continue
        second_positions = set(pos_list_2[doc])
        # pos2 - pos1 == k  is the same as  pos1 + k being a position of term 2.
        if any(pos + k in second_positions for pos in pos_list_1[doc]):
            matches.append(doc)
    return matches

In [162]:
def process_query(query):
    '''
    Run the pairwise positional matching for a phrasal query.

    query : list of raw query tokens.

    The tokens are preprocessed, then every pair of terms (i, j) with i < j
    is intersected, requiring the two terms to sit j - i positions apart.

    Returns (query, results): query is the preprocessed token list and
    results holds one list of matching file ids per pair. Pairs that
    involve a term missing from the index are skipped.
    '''
    results = []
    query = preprocessing_words(query)
    print('final query after preprocessing :')
    print(query)

    # Look every term up once, so a missing term is reported only once.
    postings = [retrieve_list(term) for term in query]

    for i in range(len(query)):
        if not postings[i]:
            continue
        for j in range(i + 1, len(query)):
            if not postings[j]:
                continue
            # Use per-pair locals: swapping must not change which entry
            # stands for term i on the next value of j.
            first, second, k = postings[i], postings[j], j - i
            if first[0] > second[0]:
                # Put the entry with the smaller frequency first; the
                # expected offset changes sign with the order.
                first, second, k = second, first, i - j
            results.append(positional_intersect(first[1], second[1], k))
    return query, results

In [163]:
def cosine(query_vec, matched_vec):
    '''
    Cosine similarity between a query vector and a document vector.

    query_vec, matched_vec : dicts term -> weight.

    Only the keys of query_vec are compared. A key that is missing from
    matched_vec counts as weight 0.0 rather than raising KeyError; this
    happens when the query vector has a term the document vector lacks.

    Returns 0 when either vector has zero length, otherwise
    dot(a, b) / (|a| * |b|).
    '''
    vec_a = np.array([query_vec[key] for key in query_vec], dtype=float)
    vec_b = np.array([matched_vec.get(key, 0.0) for key in query_vec], dtype=float)

    mod_a = np.sqrt(np.dot(vec_a, vec_a))
    mod_b = np.sqrt(np.dot(vec_b, vec_b))
    if mod_a == 0.0 or mod_b == 0.0:
        return 0
    return np.dot(vec_a, vec_b) / (mod_a * mod_b)


def cosine_wrapper(vector, matched_vectors):
    '''
    Score one query vector against each candidate document vector.

    Returns the list of cosine scores in the order of matched_vectors.
    '''
    return [cosine(vector, doc_vector) for doc_vector in matched_vectors]

In [164]:
def find_query_score(query, positional_dict, matched_vectors):
    '''
    Turn the query into a tf-idf vector over the global vocab and score it
    against every vector in matched_vectors.

    Returns the list produced by cosine_wrapper.
    '''
    # Only the first dict returned by compute_tf is used for the query.
    query_tf = compute_tf(query)[0]
    query_idf = compute_idf(query_tf, positional_dict)
    query_tfidf = compute_tfidf(query_tf, query_idf)

    query_vector = create_vector(vocab, query_tfidf)
    return cosine_wrapper(query_vector, matched_vectors)

In [165]:
def calculate_similarity(query, positional_dict, matched_vectors):
    '''
    Thin wrapper: forwards its arguments to find_query_score and returns
    that function's result unchanged.
    '''
    return find_query_score(query, positional_dict, matched_vectors)

In [166]:
def construct_dict(lists):
    '''
    Count how many times each element appears across a list of lists.

    Returns a dict element -> count, keyed in order of first appearance.
    '''
    counts = {}
    for group in lists:
        for item in group:
            counts[item] = counts.get(item, 0) + 1
    return counts


In [167]:
def print_output(results, score):
    '''
    Print the matched files ranked by score, highest first.

    results : dict keyed by document id; each key is resolved to a path
              through the global file_mapper.
    score   : scores in the same order as the keys of results.

    Ties on score are broken by path, also in descending order, because
    the (score, path) tuples are sorted as a whole.
    '''
    print('-----------------------')
    print('The most probable files for given phrasal query in descending order is : ')

    paths = [file_mapper[doc_id] for doc_id in results]
    ranked = sorted(zip(score, paths), reverse=True)

    for rank, (doc_score, doc_path) in enumerate(ranked, start=1):
        print('file {} is {} with score: {}'.format(rank, doc_path, doc_score))
                

In [168]:
def matched_docs(results, docs_vectors):
    '''
    Collect the document vector for every document id in results.

    results      : dict keyed by document id (used as an index into docs_vectors).
    docs_vectors : sequence of document vectors indexed by document id.

    Returns the vectors in the key order of results.
    '''
    return [docs_vectors[doc_id] for doc_id in results]

In [169]:
def read_query():
    '''
    Read one whitespace-separated query from standard input, match it
    against the index, score the matched documents and print the ranking.
    '''
    raw_terms = input().split()
    query, pair_hits = process_query(raw_terms)
    # Number of term pairs that matched in each document.
    doc_hits = construct_dict(pair_hits)
    candidate_vectors = matched_docs(doc_hits, docs_vectors)
    scores = calculate_similarity(query, positional_dict, candidate_vectors)
    print_output(doc_hits, scores)

In [125]:
# Run one interactive query: reads the query from standard input and prints the ranked files.
read_query()

brutus ahmed
final query after preprocessing :
['brutus', 'ahmed']
Term : ahmed not present in dictionary
-----------------------
The most probable files for given phrasal query in descending order is : 
