# Retrival

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install --upgrade gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)


In [None]:
# !pip install python-terrier
# Import required library
import pprint
import os
import re
import json
import gzip
import logging
import nltk
import ast
import json
import string
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import pickle
from smart_open import open, register_compressor
# from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# nltk.download('wordnet')
nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

SAVE_PATH = "/content/drive/Shareddrives/IR project _ GRAS/dataset"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Classes and Function Definitions

In [None]:
class GRASGujaratiStemmer:
    def __init__(self, csv_file_path, root_words,stemmed_tokens = []):
        self.csv_file_path = csv_file_path
        self.root_words = root_words
        self.stemmed_tokens = stemmed_tokens
        
        print("Loading data...")
        self.dataset = pd.read_csv(self.csv_file_path)
        print("Data loaded successfully.")
        if self.stemmed_tokens == []:
            self.preprocess_data()
        print("Data cleaned and stemmed.")

    def preprocess_data(self):
        # for tokens in tqdm(self.dataset['tokens']):
        # self.dataset['tokens'] = ast.literal_eval(re.sub("[()',‘’A-Za-z]", "", self.dataset['tokens']))
            #unstemmed_tokens = [re.sub("[()',‘’A-Za-z]", "", token) for token in tokens]
        self.stemmed_tokens = [' '.join([self.stem(token) for token in ast.literal_eval(tokens)]) for tokens in tqdm(self.dataset['tokens'])]

            # self.dataset['tokens'][i] = unstemmed_tokens
            # self.dataset['stemmed_tokens'][i] = stemmed_tokens
    
    def stem(self, word):
        return self.root_words.get(word, word)

In [None]:
class QueryQrelsExtractor:
    
    def extract_queries(self, queries_path, root_words_dict):
        from bs4 import BeautifulSoup
        with open(queries_path) as topics_file:
            soup = BeautifulSoup(topics_file, features="html.parser")

            qid_all = [int(num.text) for num in soup.find_all("num")]
            # Here we are taking text/query from the <desc> tag.
            tokens_all = [self.__preprocess_string(desc.text,root_words_dict) for desc in soup.find_all("desc")]

        return {qid: query for qid, query in zip(qid_all, tokens_all)}
    
    def extract_qrels(self, qrels_path):
        qrels = {}
        with open(qrels_path) as qrels_file:
            lines = qrels_file.readlines()
            for line in lines:
                line = line.strip()
                if line.endswith('1'):
                    line = line.split()
                    qid = int(line[0])
                    qrels[qid] = qrels.get(qid, []) + [line[2]]
        return qrels
    
    def __preprocess_string(self, text,root_words_dict):
        tokenizer = nltk.RegexpTokenizer("[^*.()',‘’૦૧૨૩૪૫૬૭૮૯A-Za-z0-9\s]+")
        tokens = tokenizer.tokenize(text)
        new_tokens = [re.sub("[()',‘’A-Za-z]", "", token) for token in tokens 
                      if token not in string.punctuation 
                      and token not in guj_stopwords]
        new_tokens = [root_words_dict.get(token,token) for token in new_tokens]
        return new_tokens

In [None]:
class Retrieval:

    def cosine_similarity(self, vec1, vec2):
        return np.dot(vec1, vec2) / ((np.linalg.norm(vec1))*(np.linalg.norm(vec2)))

    def perform_retrieval(self, doc_vectors, query_vectors):
        '''
        Calculates cosine similarity of query vectors with document vectors and
        retrieves top 10 documents for each query with their scores.
        '''
        top10docs_per_query = {qid: dict() for qid in query_vectors.keys()}
        
        for i, (docno, doc_vec) in enumerate(doc_vectors.items()):
            is_doc_empty = False
            for qid, query_vec in query_vectors.items():
                if all(doc_vec == 0):
                    top10docs_per_query[qid].update({docno: 0.0})
                    is_doc_empty = True
                else:
                    top10docs_per_query[qid].update({
                        docno: self.cosine_similarity(doc_vec, query_vec) 
                    })
            
            if is_doc_empty:
                print(f"Assigned 0.0 score to empty document {docno} for all queries.")
            if i % 10000 == 0:
                print(f"Processed {i+1} documents.")

        print("\nRetrieving top 10 documents for each query...")
        top10docs_per_query = {
            qid: {
                docno: score 
                for docno, score 
                in sorted(top10docs_per_query[qid].items(), 
                        key= lambda x: x[1], reverse=True)[:10]
            } 
            for qid in top10docs_per_query.keys()
        }

        return top10docs_per_query
    
    def get_avg_precision(self, rank_list, qrels):
        '''Returns a dictionary with qids and their relevant average precisions.'''
        # Store (precision x relevance) as a list for a query
        q_prec_rel = {qid: 0 for qid in qrels.keys()}

        for qid, rank_dict in sorted(rank_list.items()):
            rel_doc_ctr = 0
            avg_prec = 0.0
            for pos, (doc, _) in enumerate(sorted(rank_dict.items(), key=lambda x: x[1], reverse=True), 1):
                # Calculate precision@pos+1 x relevance@pos+1 for the retrieved documents
                if doc in qrels[qid]:
                    rel_doc_ctr += 1
                    avg_prec += rel_doc_ctr / (pos + 1)
                # print(f"doc in qrels: {doc in qrels[qid]}, pos+1: {pos+1}, rel_doc_ctr: {rel_doc_ctr}, avg_prec: {avg_prec}")
            if rel_doc_ctr != 0:
                q_prec_rel[qid] = ( 1 / rel_doc_ctr ) * avg_prec
            else:
                q_prec_rel[qid] = avg_prec
            # print(f"Qid: {qid}, Avg_prec: {q_prec_rel[qid]}")
        
        return q_prec_rel
    
    def get_map(self, avg_prec_dict):
        return (1 / len(avg_prec_dict)) * sum(avg_prec_dict.values())

In [None]:
# Define functions to build document vectors and query vectors from word embedding matrices
def build_document_vectors(docno_list,tokens_list, wordvectors):
    doc_vectors = {}

    for idx in range(len(docno_list)):
        doc_vec = np.zeros((wordvectors.vectors.shape[1],))
        vectors_added = 0
        for token in tokens_list[idx].split():
            try:
                doc_vec += wordvectors.get_vector(token)
                vectors_added += 1
            except:
                continue
        if vectors_added != 0:
            doc_vec = doc_vec / vectors_added
        doc_vectors[docno_list[idx]] = doc_vec
    
    return doc_vectors

def build_query_vectors(queries_dict, wordvectors):
    query_vectors = {}

    for qid, query in queries_dict.items():
        query_vec = np.zeros((wordvectors.vectors.shape[1],))
        vectors_added = 0
        for qtoken in query:
            try:
                query_vec += wordvectors.get_vector(qtoken)
                vectors_added += 1
            except:
                continue
        if vectors_added != 0:
            query_vec = query_vec / vectors_added
        query_vectors[qid] = query_vec
    
    return query_vectors

In [None]:
class IterTokens:
    def __init__(self,stemmed_tokens_str):
        self.stemmed_tokens_str = stemmed_tokens_str

    def __iter__(self):
        for tokens in self.stemmed_tokens_str:
            yield tokens.split()

# def itr_tokens(stemmed_tokens_str):
#     return [tokens.split() for tokens in stemmed_tokens_str]



In [None]:
with open("/content/drive/Shareddrives/IR project _ GRAS/dataset/Gujarati.Stop.Words.txt") as stopw_file:
    guj_stopwords = [word.strip() for word in stopw_file.readlines()]
    guj_stopwords[0] = guj_stopwords[0].lstrip("\ufeff")

csv_path = "/content/drive/Shareddrives/IR project _ GRAS/dataset/guj_corpus_processed.csv"
# Next: /content/drive/Shareddrives/IR project _ GRAS/dataset/root_word_dir_7_4_0.9
root_words_dict = pickle.load(open("/content/drive/Shareddrives/IR project _ GRAS/dataset/root_word_dir_6_4_0.9", "rb"))

### Running GRAS stemmer based retrieval process

In [None]:
# root_words_dict = pickle.load(open("/content/drive/Shareddrives/IR project _ GRAS/dataset/root_word_dir", "rb"))
# stemmed_tokens = pickle.load(open("/content/drive/Shareddrives/IR project _ GRAS/dataset/stemmed_tokens", "rb"))

In [None]:
gras_guj_stemmer = GRASGujaratiStemmer(csv_path, root_words_dict)

Loading data...
Data loaded successfully.


HBox(children=(FloatProgress(value=0.0, max=313163.0), HTML(value='')))


Data cleaned and stemmed.


In [None]:
# pickle.dump( gras_guj_stemmer.stemmed_tokens, open(os.path.join(SAVE_PATH,"stemmed_tokens"), "wb" ))

In [None]:
# gras_guj_stemmer.stemmed_tokens = pickle.load(open(os.path.join(SAVE_PATH,"stemmed_tokens"), "rb"))

In [None]:
# del gras_guj_stemmer

In [None]:
itr_tokens = IterTokens(gras_guj_stemmer.stemmed_tokens)

In [None]:
# time = 1hr
from gensim import corpora, models

fire_cbow_model = models.Word2Vec(
    sentences = itr_tokens,
    vector_size=350,
    window=5,
    min_count=1,
    workers=4,
    sg=0,
    hs=1,
    negative=0,
    epochs=3
)

2021-04-24 08:29:18,092 : INFO : collecting all words and their counts
2021-04-24 08:29:18,093 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-24 08:29:19,457 : INFO : PROGRESS: at sentence #10000, processed 3273172 words, keeping 158274 word types
2021-04-24 08:29:20,925 : INFO : PROGRESS: at sentence #20000, processed 6610289 words, keeping 225423 word types
2021-04-24 08:29:22,394 : INFO : PROGRESS: at sentence #30000, processed 9982146 words, keeping 278207 word types
2021-04-24 08:29:23,840 : INFO : PROGRESS: at sentence #40000, processed 13307872 words, keeping 324879 word types
2021-04-24 08:29:25,255 : INFO : PROGRESS: at sentence #50000, processed 16404460 words, keeping 376524 word types
2021-04-24 08:29:26,647 : INFO : PROGRESS: at sentence #60000, processed 19556873 words, keeping 421217 word types
2021-04-24 08:29:28,011 : INFO : PROGRESS: at sentence #70000, processed 22632367 words, keeping 459997 word types
2021-04-24 08:29:29,387 : IN

In [None]:
# pickle.dump( fire_cbow_model, open(os.path.join(SAVE_PATH,"word2vec_model"), "wb" ))

In [None]:
# time = 7.5 minute
corpus_vector_dict = build_document_vectors(gras_guj_stemmer.dataset['docno'],gras_guj_stemmer.stemmed_tokens,fire_cbow_model.wv)

In [None]:
# pickle.dump( corpus_vector_dict, open(os.path.join(SAVE_PATH,"corpus_vector_dict"), "wb" ))

In [None]:
query_path = '/content/drive/Shareddrives/IR project _ GRAS/dataset/gu.topics.126-175.2011.txt'
qrels_path = '/content/drive/Shareddrives/IR project _ GRAS/dataset/gu.qrels.126-175.2011.txt'

query_qrels = QueryQrelsExtractor()

In [None]:
query_dict = query_qrels.extract_queries(query_path,root_words_dict)

In [None]:
qrels_dict = query_qrels.extract_qrels(qrels_path)

In [None]:
query_vector_dict = build_query_vectors(query_dict,fire_cbow_model.wv)

In [None]:
del fire_cbow_model

In [None]:
retrieve = Retrieval()
top_10 = retrieve.perform_retrieval(corpus_vector_dict,query_vector_dict)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Processed 30001 documents.
Processed 40001 documents.
Assigned 0.0 score to empty document gujarat_samachar_date_20080903_guj_supplement_7_career for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20081015_guj_supplement_7_career for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20080206_guj_supplement_30_jivan_panth for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20081018_guj_sports_5_news4 for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20080409_guj_supplement_30_jivan_panth for all queries.
Processed 50001 documents.
Assigned 0.0 score to empty document gujarat_samachar_date_20080220_guj_supplement_30_jivan_panth for all queries.
Processed 60001 documents.
Assigned 0.0 score to empty document gujarat_samachar_date_20080312_guj_supplement_30_jivan_panth for all queries.
Processed 70001 documents.
Assign

In [None]:
avg_pr_dict = retrieve.get_avg_precision(top_10,qrels_dict)

In [None]:
MAP = retrieve.get_map(avg_pr_dict)
MAP

0.348505248917749

<table style="width:100%">
  <tr>
    <th>L</th>
    <th>Alpha</th>
    <th>Delta</th>
    <th> MAP </th>
  </tr>
    <tr>
    <td>NA</td>
    <td>NA</td>
    <td>NA</td>
    <td>0.30810024908953476</td>
  </tr>
  <tr>
    <td>6</td>
    <td>4</td>
    <td>0.5</td>
    <td>0.35627292511280606</td>
  </tr>
   <tr>
    <td>6</td>
    <td>4</td>
    <td>0.9</td>
    <td>0.348505248917749</td>
  </tr>
  <tr>
    <td>6</td>
    <td>6</td>
    <td>0.5</td>
    <td>0.3465313157081014</td>
  </tr>
  <tr>
    <td>7</td>
    <td>4</td>
    <td>0.5</td>
    <td>0.32438315982958843</td>
  </tr>
  <tr>
    <td>7</td>
    <td>4</td>
    <td>0.8</td>
    <td>0.3331921957671958</td>
  </tr>

</table>


## non Stemmed Data

In [None]:
dataset = pd.read_csv(csv_path)

In [None]:
real_tokens = [' '.join(ast.literal_eval(tokens)) for tokens in tqdm(dataset['tokens'])]

HBox(children=(FloatProgress(value=0.0, max=313163.0), HTML(value='')))




In [None]:
# pickle.dump( real_tokens, open(os.path.join(SAVE_PATH,"real_tokens"), "wb" ))

In [None]:
real_tokens = pickle.load(open(os.path.join(SAVE_PATH,"real_tokens"), "rb"))

In [None]:
del dataset

In [None]:
itr_tokens = IterTokens(real_tokens)

In [None]:
from gensim import corpora, models

fire_cbow_model = models.Word2Vec(
    sentences = itr_tokens,
    vector_size=350,
    window=5,
    min_count=1,
    workers=4,
    sg=0,
    hs=1,
    negative=0,
    epochs=3
)

2021-04-24 05:55:54,123 : INFO : collecting all words and their counts
2021-04-24 05:55:54,124 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-24 05:55:55,534 : INFO : PROGRESS: at sentence #10000, processed 3273172 words, keeping 247870 word types
2021-04-24 05:55:57,033 : INFO : PROGRESS: at sentence #20000, processed 6610289 words, keeping 369814 word types
2021-04-24 05:55:58,525 : INFO : PROGRESS: at sentence #30000, processed 9982146 words, keeping 468146 word types
2021-04-24 05:56:00,026 : INFO : PROGRESS: at sentence #40000, processed 13307872 words, keeping 555264 word types
2021-04-24 05:56:01,438 : INFO : PROGRESS: at sentence #50000, processed 16404460 words, keeping 649224 word types
2021-04-24 05:56:02,938 : INFO : PROGRESS: at sentence #60000, processed 19556873 words, keeping 731808 word types
2021-04-24 05:56:04,327 : INFO : PROGRESS: at sentence #70000, processed 22632367 words, keeping 805010 word types
2021-04-24 05:56:05,707 : IN

In [None]:
pickle.dump( fire_cbow_model, open(os.path.join(SAVE_PATH,"unstemmed_word2vec_model"), "wb" ))

In [None]:
corpus_vector_dict = build_document_vectors(dataset['docno'],real_tokens,fire_cbow_model.wv)

In [None]:
pickle.dump( corpus_vector_dict, open(os.path.join(SAVE_PATH,"unstemmed_corpus_vector_dict"), "wb" ))

In [None]:
query_path = '/content/drive/Shareddrives/IR project _ GRAS/dataset/gu.topics.126-175.2011.txt'
qrels_path = '/content/drive/Shareddrives/IR project _ GRAS/dataset/gu.qrels.126-175.2011.txt'

query_qrels = QueryQrelsExtractor()

In [None]:
query_dict = query_qrels.extract_queries(query_path,dict())

In [None]:
qrels_dict = query_qrels.extract_qrels(qrels_path)

In [None]:
query_vector_dict = build_query_vectors(query_dict,fire_cbow_model.wv)

In [None]:
del fire_cbow_model

In [None]:
retrieve = Retrieval()
top_10 = retrieve.perform_retrieval(corpus_vector_dict,query_vector_dict)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Processed 30001 documents.
Processed 40001 documents.
Assigned 0.0 score to empty document gujarat_samachar_date_20080903_guj_supplement_7_career for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20081015_guj_supplement_7_career for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20080206_guj_supplement_30_jivan_panth for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20081018_guj_sports_5_news4 for all queries.
Assigned 0.0 score to empty document gujarat_samachar_date_20080409_guj_supplement_30_jivan_panth for all queries.
Processed 50001 documents.
Assigned 0.0 score to empty document gujarat_samachar_date_20080220_guj_supplement_30_jivan_panth for all queries.
Processed 60001 documents.
Assigned 0.0 score to empty document gujarat_samachar_date_20080312_guj_supplement_30_jivan_panth for all queries.
Processed 70001 documents.
Assign

In [None]:
avg_pr_dict = retrieve.get_avg_precision(top_10,qrels_dict)

In [None]:
MAP = retrieve.get_map(avg_pr_dict)
MAP

0.30810024908953476