In [1]:
import os
import zipfile
from functools import partial
import nltk
import requests
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

from ipywidgets import widgets
from IPython.display import display, HTML
#from IPython.html import widgets
from collections import namedtuple

from gensim.corpora import Dictionary
from gensim.models import LdaModel, LsiModel, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import downloader as g_downloader
import itertools

# gensim uses logging, so set it up 
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline

In [2]:
def download_dataset():
    folder_path = os.environ.get("IR1_DATA_PATH")
    if not folder_path:
        folder_path = "./datasets/"
    os.makedirs(folder_path, exist_ok=True)
    
    file_location = os.path.join(folder_path, "cacm.zip")
    
    # download file if it doesn't exist
    if not os.path.exists(file_location):
        
        url = "https://surfdrive.surf.nl/files/index.php/s/M0FGJpX2p8wDwxR/download"

        with open(file_location, "wb") as handle:
            print(f"Downloading file from {url} to {file_location}")
            response = requests.get(url, stream=True)
            for data in tqdm(response.iter_content()):
                handle.write(data)
            print("Finished downloading file")
    
    if not os.path.exists(os.path.join(folder_path, "train.txt")):
        
        # unzip file
        with zipfile.ZipFile(file_location, 'r') as zip_ref:
            zip_ref.extractall(folder_path)
        
download_dataset()

In [5]:
def read_cacm_docs(root_folder = "./datasets/"):
    """
        Reads in the CACM documents. The dataset is assumed to be in the folder "./datasets/" by default
        Returns: A list of 2-tuples: (doc_id, document), where 'document' is a single string created by 
            appending the title and abstract (separated by a "\n"). 
            In case the record doesn't have an abstract, the document is composed only by the title
    """

    file_dir = root_folder + "cacm.all"
    keeping = False
    temp = ""
    doc_list =[]

    with open(file_dir) as cacm_file:
        for line in cacm_file:
            line_begin = line[0:2]

            if line_begin == ".I":
                doc_index = line.split(" ")[1].replace("\n","")
                temp = ""

            elif line_begin == ".T":
                keeping = True

            elif line_begin == ".W":
                keeping = True
                temp += "\n "

            elif line_begin in ['.B', '.A', '.N', '.K']:
                keeping = False

            elif (line_begin not in ['.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K']):
                if keeping:
                    temp += line
            else:
                doc_list.append((doc_index, temp))
                keeping = False

    return doc_list



In [8]:
def read_queries(root_folder = "./datasets/"):
    """
        Reads in the CACM queries. The dataset is assumed to be in the folder "./datasets/" by default
        Returns: A list of 2-tuples: (query_id, query)
    """
    
    file_dir = root_folder + "query.text"
    keeping = False
    temp = ""
    query_list =[]

    with open(file_dir, "r") as query_file:
        for line in query_file:
            line_begin = line[0:2]

            if line_begin == ".I":
                doc_index = line.split(" ")[1].replace("\n","")
                temp = ""

            elif line_begin == ".W":
                keeping = True   

            elif line_begin == ".A":
                keeping = False

            elif (line_begin not in ['.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K']):
                if keeping:
                    temp += line
            else:
                query_list.append((doc_index, temp))
                keeping = False
                
    return query_list

In [11]:
def load_stopwords(root_folder = "./datasets/"):
    """
        Loads the stopwords. The dataset is assumed to be in the folder "./datasets/" by default
        Output: A set of stopwords
    """
    file_dir = root_folder + "common_words"

    with open(file_dir , 'r') as f:
        stopwords = [line.strip() for line in f]
        
    return set(stopwords)

In [13]:
def tokenize(text):
    """
        Tokenizes the input text. Use the WordPunctTokenizer
        Input: text - a string
        Output: a list of tokens
    """
    tokenized = nltk.WordPunctTokenizer().tokenize(text)
    return tokenized
    

In [15]:
def stem_token(token):
    """
        Stems the given token using the PorterStemmer from the nltk library
        Input: a single token
        Output: the stem of the token
    """
    ps = nltk.stem.PorterStemmer() 
    return ps.stem(token)
    

In [17]:
def process_text(text, stem=False, remove_stopwords=False, lowercase_text=False):
    
    tokens = []
    for token in tokenize(text):
        if remove_stopwords and token.lower() in stopwords:
            continue
        if stem:
            token = stem_token(token)
        if lowercase_text:
            token = token.lower()
        tokens.append(token)

    return tokens


In [18]:
# In this configuration:
# Don't preprocess the text, except to tokenize 
config_1 = {
  "stem": False,
  "remove_stopwords" : False,
  "lowercase_text": True
} 


# In this configuration:
# Preprocess the text, stem and remove stopwords
config_2 = {
  "stem": True,
  "remove_stopwords" : True,
  "lowercase_text": True, 
} 

####
doc_repr_1 = []
doc_repr_2 = []
for (doc_id, document) in docs:
    doc_repr_1.append((doc_id, process_text(document, **config_1)))
    doc_repr_2.append((doc_id, process_text(document, **config_2)))

####

In [19]:
def build_tf_index(documents):
    """
        Build an inverted index (with counts). The output is a dictionary which takes in a token
        and returns a list of (doc_id, count) where 'count' is the count of the 'token' in 'doc_id'
        Input: a list of documents - (doc_id, tokens) 
        Output: An inverted index. [token] -> [(doc_id, token_count)]
    """
    tf_index = {}

    for doc in documents:
        for token in np.unique(doc[1]):
            doc_list = (doc[0], doc[1].count(token))

            if token in tf_index.keys():
                tf_index[token].append(doc_list)
            else:
                tf_index[token] = [doc_list]
    
    return tf_index



In [20]:
#### Indexed documents based on the two configs

# Create the 2 indices
tf_index_1 = build_tf_index(doc_repr_1)
tf_index_2 = build_tf_index(doc_repr_2)

# This function returns the tf_index of the corresponding config
def get_index(index_set):
    assert index_set in {1, 2}
    return {
        1: tf_index_1,
        2: tf_index_2
    }[index_set]

####
#### Preprocessed query based on the two configs

# This function preprocesses the text given the index set, according to the specified config
def preprocess_query(text, index_set):
    assert index_set in {1, 2}
    if index_set == 1:
        return process_text(text, **config_1)
    elif index_set == 2:
        return process_text(text, **config_2)

#### 

In [24]:
def bow_search(query, index_set):
    """
        Perform a search over all documents with the given query. 
        Note: You have to use the `get_index` function created in the previous cells
        Input: 
            query - a (unprocessed) query
            index_set - the index to use
        Output: a list of (document_id, score), sorted in descending relevance to the given query 
    """

    
    index = get_index(index_set)
    processed_query = preprocess_query(query, index_set)
    
    bag_dict = {}
    for q in processed_query:

        if q not in index:
            continue 

        for doc_id, tf in index[q]:        
            if doc_id not in bag_dict:
                    bag_dict[doc_id] = 0.0
            
            bag_dict[doc_id] += 1.0 

    sorted_result = sorted(bag_dict.items(), key=lambda tup: tup[1], reverse = True)

    return sorted_result



In [29]:
def compute_df(documents):
    """
        Compute the document frequency of all terms in the vocabulary
        Input: A list of documents
        Output: A dictionary with {token: document frequency)
    """
    doc_freq = {}
    
    for i in range(len(documents)):
        tokens = documents[i]
        for token in tokens:
            if token not in doc_freq:
                doc_freq[token] = {i}
          
            else:
                doc_freq[token].add(i)

    for token in doc_freq:
        doc_freq[token] = len(doc_freq[token])

    return doc_freq
    


In [30]:
#### Compute df based on the two configs

# get the document frequencies of each document
df_1 = compute_df([d[1] for d in doc_repr_1])
df_2 = compute_df([d[1] for d in doc_repr_2])

def get_df(index_set):
    assert index_set in {1, 2}
    return {
        1: df_1,
        2: df_2
    }[index_set]
####

In [32]:
def tfidf_search(query, index_set):
    """
        Perform a search over all documents with the given query using tf-idf. 
        Note #1: You have to use the `get_index` (and the `get_df`) function created in the previous cells
        Input: 
            query - a (unprocessed) query
            index_set - the index to use
        Output: a list of (document_id, score), sorted in descending relevance to the given query 
    """
    index = get_index(index_set)
    df = get_df(index_set)
    processed_query = preprocess_query(query, index_set)
    
    n_doc = len(doc_repr_1) if index_set == 1 else len(doc_repr_2)

    tfidf_dict = {}

    for q in processed_query:

        if q not in index:
            continue 

        for doc_id, tf in index[q]:        
            if doc_id not in tfidf_dict:
                    tfidf_dict[doc_id] = 0
            
            tfidf_dict[doc_id] += tf*np.log(n_doc/df[q])

    sorted_result = sorted(tfidf_dict.items(), key=lambda tup: tup[1], reverse = True)

    return sorted_result


In [37]:
#### Document length for normalization

def doc_lengths(documents):
    doc_lengths = {doc_id:len(doc) for (doc_id, doc) in documents}
    return doc_lengths

doc_lengths_1 = doc_lengths(doc_repr_1)
doc_lengths_2 = doc_lengths(doc_repr_2)

def get_doc_lengths(index_set):
    assert index_set in {1, 2}
    return {
        1: doc_lengths_1,
        2: doc_lengths_2
    }[index_set]
####

In [38]:
def naive_ql_search(query, index_set):
    """
        Perform a search over all documents with the given query using a naive QL model. 
        Note #1: You have to use the `get_index` (and get_doc_lengths) function created in the previous cells
        Input: 
            query - a (unprocessed) query
            index_set - the index to use
        Output: a list of (document_id, score), sorted in descending relevance to the given query 
    """
    index = get_index(index_set)
    doc_lengths = get_doc_lengths(index_set)
    processed_query = preprocess_query(query, index_set)
    unigram_probs = {}


    for i, q in enumerate(processed_query):
      if q not in index:
        continue
      
      if i > 0:
        tf_dicts = dict(index[q])
        for doc_id in unigram_probs:
          if doc_id in tf_dicts:
            unigram_probs[doc_id] *= 1.0 * tf_dicts[doc_id] / doc_lengths[doc_id]  
        
          else:
            unigram_probs[doc_id] = 0

      else:    
        for doc_id, tf in index[processed_query[0]]:
          unigram_probs[doc_id] = tf / doc_lengths[doc_id] 

    sorted_rank = sorted(unigram_probs.items(), key = lambda d: d[1], reverse = True)
    
    return sorted_rank


In [44]:
def get_doc_ids(query, index_set):
  "return doc_id list of documents that contain the query terms"
  index = get_index(index_set)
  doc_ids = []
  for q in query:
    if q not in index:
      continue
      
    for doc_id, _ in index[q]:
      if doc_id not in doc_ids:
        doc_ids.append(doc_id)
    
  return doc_ids


def ql_search(query, index_set):
    """
        Perform a search over all documents with the given query using a QL model 
        with Jelinek-Mercer Smoothing (set smoothing=0.1). 
        
        
        Note #1: You have to use the `get_index` (and get_doc_lengths) function created in the previous cells
        Note #2: You might have to create some variables beforehand and use them in this function
        
        
        Input: 
            query - a (unprocessed) query
            index_set - the index to use
        Output: a list of (document_id, score), sorted in descending relevance to the given query 
    """
    index = get_index(index_set)
    doc_lengths = get_doc_lengths(index_set)
    processed_query = preprocess_query(query, index_set)
    
    doc_ids = get_doc_ids(processed_query, index_set)
    cl = sum(doc_lengths.values())
    lamb = 0.1
    unigram_probs = dict(zip(doc_ids, np.zeros(len(doc_ids))))
    
    for i, q in enumerate(processed_query):
      if q not in index:
        continue
      
      tf_dict = dict(index[q])
      cf = sum(tf_dict.values())

      for doc_id in doc_ids:
        tf = tf_dict[doc_id] if doc_id in tf_dict else 0
        unigram_probs[doc_id] += np.log((1 - lamb) * tf / doc_lengths[doc_id] + lamb * cf / cl)
          
    sorted_rank = sorted(unigram_probs.items(), key = lambda d: d[1], reverse = True)
    return sorted_rank
    

In [50]:
def bm25_search(query, index_set):
    """
        Perform a search over all documents with the given query using BM25. Use k_1 = 1.5 and b = 0.75
        Note #1: You have to use the `get_index` (and `get_doc_lengths`) function created in the previous cells
        Note #2: You might have to create some variables beforehand and use them in this function
        
        Input: 
            query - a (unprocessed) query
            index_set - the index to use
        Output: a list of (document_id, score), sorted in descending relevance to the given query 
    """
    
    index = get_index(index_set)
    df = get_df(index_set)
    doc_lengths = get_doc_lengths(index_set)
    processed_query = preprocess_query(query, index_set)
    
    k_1, b = 1.5, 0.75
    bm25_dict = {}
    dl_avg = 1.0 * sum(doc_lengths.values()) / len(doc_lengths)
 
    for q in processed_query:
      if q not in index:
        continue
      
      for doc_id, tf in index[q]:
        if doc_id not in bm25_dict:
          bm25_dict[doc_id] = 0
        
        idf = np.log(len(doc_lengths)/df[q])
        bm25_dict[doc_id] += idf * (k_1 + 1) * tf / (k_1 * (1-b + b * doc_lengths[doc_id]/ dl_avg) + tf)
    
    sorted_rank = sorted(bm25_dict.items(), key = lambda d: d[1], reverse = True)
    return sorted_rank


In [56]:
#### Highlighter function
# class for results
ResultRow = namedtuple("ResultRow", ["doc_id", "snippet", "score"])
docs_by_id = dict((d[0], d[1]) for d in docs)

def highlight_text(document, query, tol=17):
    import re
    tokens = tokenize(query)
    regex = "|".join(f"(\\b{t}\\b)" for t in tokens)
    regex = re.compile(regex, flags=re.IGNORECASE)
    output = ""
    i = 0
    for m in regex.finditer(document):
        start_idx = max(0, m.start() - tol)
        end_idx = min(len(document), m.end() + tol)
        output += "".join(["...",
                        document[start_idx:m.start()],
                        "<strong>",
                        document[m.start():m.end()],
                        "</strong>",
                        document[m.end():end_idx],
                        "..."])
    return output.replace("\n", " ")


def make_results(query, search_fn, index_set):
    results = []
    for doc_id, score in search_fn(query, index_set):
        highlight = highlight_text(docs_by_id[doc_id], query)
        if len(highlight.strip()) == 0:
            highlight = docs_by_id[doc_id]
        results.append(ResultRow(doc_id, highlight, score))
    return results


In [57]:
search_fn = bm25_search
index_set = 1

text = widgets.Text(description="Search Bar", width=200)
display(text)

def handle_submit(sender):
    print(f"Searching for: '{sender.value}'")
    
    results = make_results(sender.value, search_fn, index_set)
    
    # display only the top 5
    results = results[:5]
    
    body = ""
    for idx, r in enumerate(results):
        body += f"<li>Document #{r.doc_id}({r.score}): {r.snippet}</li>"
    display(HTML(f"<ul>{body}</ul>"))
    

text.on_submit(handle_submit)

Text(value='', description='Search Bar')

In [59]:
def read_qrels(root_folder = "./datasets/"):
    """
        Reads the qrels.text file. 
        Output: A dictionary: query_id -> [list of relevant documents]
    """
    # YOUR CODE HERE
    query_f = open(os.path.join(root_folder, "qrels.text"), 'r')
    query_dic = {}

    for line in query_f:
      q_id, doc_id, _, _ = line.split()
      
      # make queries and qrels consistent in query_id ('01'->'1')
      q_id = str(int(q_id))
      
      if q_id not in query_dic:
        query_dic[q_id] = []
      query_dic[q_id].append(doc_id)
    
    return query_dic


In [61]:
def precision_k(results, relevant_docs, k):
    """
        Compute Precision@K
        Input: 
            results: A sorted list of 2-tuples (document_id, score), 
                    with the most relevant document in the first position
            relevant_docs: A set of relevant documents. 
            k: the cut-off
        Output: Precision@K
    """
    relevant_cnt = 0

    for i, (doc_id, _) in enumerate(results):
      if doc_id in relevant_docs:
        relevant_cnt += 1
      if i == k - 1:
        break

    return relevant_cnt / k


In [63]:
def recall_k(results, relevant_docs, k):
    """
        Compute Recall@K
        Input: 
            results: A sorted list of 2-tuples (document_id, score), with the most relevant document in the first position
            relevant_docs: A set of relevant documents. 
            k: the cut-off
        Output: Recall@K
    """
    relevant_cnt = 0

    for i, (doc_id, _) in enumerate(results):
      if doc_id in relevant_docs:
        relevant_cnt += 1
      if i == k - 1:
        break

    return relevant_cnt / len(relevant_docs)

In [65]:
def average_precision(results, relevant_docs):
    """
        Compute Average Precision (for a single query - the results are 
        averaged across queries to get MAP in the next few cells)
        Hint: You can use the recall_k and precision_k functions here!
        Input: 
            results: A sorted list of 2-tuples (document_id, score), with the most 
                    relevant document in the first position
            relevant_docs: A set of relevant documents. 
        Output: Average Precision
    """
    relevant_cnt = 0
    sum = 0
    search_cnt = 0

    while relevant_cnt < len(relevant_docs) and search_cnt < len(results):
      doc_id, _ = results[search_cnt]
      search_cnt += 1
      if doc_id in relevant_docs:
        relevant_cnt += 1
        sum += relevant_cnt / search_cnt

    return sum / len(relevant_docs)


In [67]:
def err(results, relevant_docs):
    """
        Compute the expected reciprocal rank.
        Input: 
            results: A sorted list of 2-tuples (document_id, score), with the most 
                    relevant document in the first position
            relevant_docs: A set of relevant documents. 
        Output: ERR
        
    """
    # YOUR CODE HERE
    err_score = 0
    r_prod = 1

    for i, (doc_id, _) in enumerate(results):
      if doc_id in relevant_docs:
        err_score += 1/(i+1) * r_prod * 0.5
        r_prod *= 1 - 0.5
    
    return err_score


In [69]:
#### metrics@k functions

recall_at_1 = partial(recall_k, k=1)
recall_at_5 = partial(recall_k, k=5)
recall_at_10 = partial(recall_k, k=10)
precision_at_1 = partial(precision_k, k=1)
precision_at_5 = partial(precision_k, k=5)
precision_at_10 = partial(precision_k, k=10)


list_of_metrics = [
    ("ERR", err),
    ("MAP", average_precision),
    ("Recall@1",recall_at_1),
    ("Recall@5", recall_at_5),
    ("Recall@10", recall_at_10),
    ("Precision@1", precision_at_1),
    ("Precision@5", precision_at_5),
    ("Precision@10", precision_at_10)]
####

In [70]:
#### Evaluate a search function

list_of_search_fns = [
    ("BOW", bow_search),
    ("TF-IDF", tfidf_search),
    ("NaiveQL", naive_ql_search),
    ("QL", ql_search),
    ("BM25", bm25_search)
]

def evaluate_search_fn(search_fn, metric_fns, index_set=None):
    # build a dict query_id -> query 
    queries_by_id = dict((q[0], q[1]) for q in queries)
    
    metrics = {}
    for metric, metric_fn in metric_fns:
        metrics[metric] = np.zeros(len(qrels), dtype=np.float32)
    
    for i, (query_id, relevant_docs) in enumerate(qrels.items()):
        query = queries_by_id[query_id]
        if index_set:
            results = search_fn(query, index_set)
        else:
            results = search_fn(query)
        
        for metric, metric_fn in metric_fns:
            metrics[metric][i] = metric_fn(results, relevant_docs)

    
    
    final_dict = {}
    for metric, metric_vals in metrics.items():
        final_dict[metric] = metric_vals.mean()
    
    return final_dict


In [74]:
def dot(vec_1,vec_2): 
    """
        vec_1 and vec_2 are of the form: [(int, float), (int, float), ...]
        Return the dot product of two such vectors, computed only on the floats
        You can assume that the lengths of the vectors are the same, and the dimensions are aligned 
            i.e you won't get: vec_1 = [(1, 0.2)] ; vec_2 = [(2, 0.3)] 
                                (dimensions are unaligned and lengths are different)
    """
    return sum( [vec_1[i][1]*vec_2[i][1] for i in range(len(vec_1))] )


def cosine_sim(vec_1, vec_2):
    return dot(vec_1, vec_2)/((np.sqrt(dot(vec_1, vec_1)) * np.sqrt(dot(vec_2, vec_2)))+1e-6)


In [77]:
class VectorSpaceRetrievalModel:
    """
        Parent class for Dense Vector Retrieval models
    """
    def __init__(self, doc_repr):
        """
            document_collection: 
                [
                    (doc_id_1, [token 1, token 2, ...]), 
                    (doc_id_2, [token 1, token 2, ....]) 
                    ...
                ]

        """
        self.doc_repr = doc_repr
        self.documents = [_[1] for _ in self.doc_repr]
        
        # construct a dictionary
        self.dictionary = Dictionary(self.documents)
        # Filter out words that occur less than 20 documents, or more than 50% of the documents.
        self.dictionary.filter_extremes(no_below=10)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.documents]
    
        # Make a index to word dictionary.
        temp = self.dictionary[0]  # This is only to "load" the dictionary.
        self.id2word = self.dictionary.id2token
        
        # this is set by the train_model function
        self.model = None
        
        
    def vectorize_documents(self):
        """
            Returns a doc_id -> vector dictionary
        """
        vectors = {}
        for (doc_id, _), cc in zip(self.doc_repr, self.corpus):
            vectors[doc_id] = self.model[cc]
        return vectors

    def vectorize_query(self, query):
        # Note the use of config_2 here!
        query = process_text(query, **config_2)
        query_vector = self.dictionary.doc2bow(query)
        return self.model[query_vector]
    
    def train_model(self):
        """
            Trains a model and sets the 'self.model' variable. 
            Make sure to use the variables created in the __init__ method.
            e.g the variables which may be useful: {corpus, dictionary, id2word}
        """
        raise NotImplementedError()

In [78]:
class LsiRetrievalModel(VectorSpaceRetrievalModel):
    def __init__(self, doc_repr):
        super().__init__(doc_repr)
        
        self.num_topics = 100
        self.chunksize = 2000
    
    def train_model(self):
        self.model = LsiModel(self.corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize)


In [80]:
class DenseRetrievalRanker:
    def __init__(self, vsrm, similarity_fn):
        """
            vsrm: instance of `VectorSpaceRetrievalModel`
            similarity_fn: function instance that takes in two vectors 
                            and returns a similarity score e.g cosine_sim defined earlier
        """
        self.vsrm = vsrm 
        self.vectorized_documents = self.vsrm.vectorize_documents()
        self.similarity_fn = similarity_fn
    
    def _compute_sim(self, query_vector):
        """
            Compute the similarity of `query_vector` to documents in 
            `self.vectorized_documents` using `self.similarity_fn`
            Returns a list of (doc_id, score) tuples
        """
        empty_list = []

        for key in self.vectorized_documents:
          doc_id = key
          document_single = self.vectorized_documents.get(key)

          if document_single != [] and query_vector != []:
            score = self.similarity_fn(document_single,query_vector) # compute similary between the query vector to each vectorized document
            empty_list.append(tuple((doc_id, score)))
          else:
            score = 0
            empty_list.append(tuple((doc_id, score)))


        return empty_list
    
    def search(self, query):
        scores = self._compute_sim(self.vsrm.vectorize_query(query))
        scores.sort(key=lambda _:-_[1])
        return scores 

In [82]:
# test your LSI model
search_fn = drm_lsi.search

text = widgets.Text(description="Search Bar", width=200)
display(text)

def make_results_2(query, search_fn):
    results = []
    for doc_id, score in search_fn(query):
        highlight = highlight_text(docs_by_id[doc_id], query)
        if len(highlight.strip()) == 0:
            highlight = docs_by_id[doc_id]
        results.append(ResultRow(doc_id, highlight, score))
    return results

def handle_submit_2(sender):
    print(f"Searching for: '{sender.value}' (SEARCH FN: {search_fn})")
    
    results = make_results_2(sender.value, search_fn)
    
    # display only the top 5
    results = results[:5]
    
    body = ""
    for idx, r in enumerate(results):
        body += f"<li>Document #{r.doc_id}({r.score}): {r.snippet}</li>"
    display(HTML(f"<ul>{body}</ul>"))
    

text.on_submit(handle_submit_2)

Text(value='', description='Search Bar')

In [83]:
def jenson_shannon_divergence(vec_1, vec_2, assert_prob=False):
    """
        Computes the Jensen-Shannon divergence between two probability distributions. 
        NOTE: DO NOT RETURN 1 - JSD here, that is handled by the next function which is already implemented! 
        The inputs are *gensim* vectors - same as the vectors for the cosine_sim function
        assert_prob is a flag that checks if the inputs are proper probability distributions 
            i.e they sum to 1 and are positive - use this to check your inputs if needed. 
                (This is optional to implement, but recommended - 
                you can the default to False to save a few ms off the runtime)
    """
    _vec_1 = np.asarray(vec_1) / sum(n for _, n in vec_1)
    _vec_2 = np.asarray(vec_2) / sum(n for _, n in vec_2)
    _avg = 0.5 * (_vec_1 + _vec_2)
    
    def KL(a,b):
      a = np.asarray([x[1] for x in a], dtype=np.float)
      b = np.asarray([x[1] for x in b], dtype=np.float)

      return np.sum(np.where(a != 0, a * np.log2(a / b), 0))

    return 0.5 * (KL(_vec_1, _avg) + KL(_vec_2, _avg))

def jenson_shannon_sim(vec_1, vec_2, assert_prob=False):
    return 1 - jenson_shannon_divergence(vec_1, vec_2)



In [85]:
class LdaRetrievalModel(VectorSpaceRetrievalModel):
    def __init__(self, doc_repr):
        super().__init__(doc_repr)
        
        # use these parameters in the train_model method
        self.num_topics = 100
        self.chunksize = 2000
        self.passes = 20
        self.iterations = 400
        self.eval_every = 10
        
        # this is need to get full vectors
        self.minimum_probability=0.0
        self.alpha='auto'
        self.eta='auto'
    
    
    def train_model(self):
        self.model = LdaModel(self.corpus, self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
                              passes=self.passes, iterations=self.iterations, eval_every=self.eval_every, 
                              minimum_probability=self.minimum_probability, alpha=self.alpha, eta=self.eta)


In [87]:
drm_lda = DenseRetrievalRanker(lda, jenson_shannon_sim)

search_fn = drm_lda.search

text = widgets.Text(description="Search Bar", width=200)
display(text)

text.on_submit(handle_submit_2)

Text(value='', description='Search Bar')

In [88]:
class W2VRetrievalModel(VectorSpaceRetrievalModel):
    def __init__(self, doc_repr):
        super().__init__(doc_repr)
        
        self.size = 100 
        self.min_count = 1
    
    def train_model(self):
        """
        Trains the W2V model
        """
        self.model = Word2Vec(self.documents, size=self.size, min_count = self.min_count)
        self.model.save("word2vec-google-news-300.model")
        
    def vectorize_documents(self):
        """
            Returns a doc_id -> vector dictionary
        """
        vectors = {}
        for (doc_id, _), cc in zip(self.doc_repr, self.documents):
          vector_dim = self.model.vector_size
          arr = np.empty((0,vector_dim), dtype='f')

          for wrd in cc:
            if wrd in self.model.wv.vocab:
              word_array = self.model.wv[wrd]
              norm = np.linalg.norm(word_array)
              word_array = (word_array/norm).reshape(1, -1)
              arr = np.append(arr,np.array(word_array), axis=0)
            else:
              word_array = np.zeros(self.size).reshape(1, -1)
              arr = np.append(arr,np.array(word_array), axis=0)

          list1 = np.mean(arr, axis=0)
          list2 = list(range(self.size))
          vectors[doc_id] = list(zip(list2, list1)) # save vectorized query for each doc
        return vectors

    def vectorize_query(self, query):
        """
        Vectorizes the query using the W2V model
        """
        query = process_text(query, **config_2)
        vector_dim = self.model.vector_size
        arr = np.empty((0,vector_dim), dtype='f')
        
        for wrd in query:
          if wrd in self.model.wv.vocab:
            word_array = self.model.wv[wrd] # infer vector for each word

            norm = np.linalg.norm(word_array)
            word_array = (word_array/norm).reshape(1, -1) # normalize the inferred vector

            arr = np.append(arr,np.array(word_array), axis=0)
          else:
            word_array = np.zeros(self.size).reshape(1, -1) # if the word is not present, return 0 for all dimension

            arr = np.append(arr,np.array(word_array), axis=0)

        list1 = np.mean(arr, axis=0) # average over each dimension
        list2 = list(range(self.size))

        return list(zip(list2, list1))
      
    
class W2VPretrainedRetrievalModel(W2VRetrievalModel):
    def __init__(self, doc_repr):
        super().__init__(doc_repr)
        self.model_name = "word2vec-google-news-300"
        self.size = 300
    
    def train_model(self):
        """
        Loads the pretrained model
        """
        self.model = g_downloader.load(self.model_name)

w2v = W2VRetrievalModel(doc_repr_2)
w2v.train_model()

w2v.vectorize_query("report")

2021-02-19 22:52:44,336 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-19 22:52:44,438 : INFO : built Dictionary(5937 unique tokens: ['-', 'algebra', 'intern', 'languag', 'preliminari']...) from 3204 documents (total 115969 corpus positions)
2021-02-19 22:52:44,443 : INFO : discarding 4740 tokens: [('repeat', 8), ('glossari', 7), ('inspect', 8), ('uncol', 2), ('rung', 9), ('secant', 2), ('.', 1603), ('acceler', 6), ('diverg', 3), ('induc', 9)]...
2021-02-19 22:52:44,444 : INFO : keeping 1197 tokens which were in no less than 10 and no more than 1602 (=50.0%) documents
2021-02-19 22:52:44,446 : INFO : resulting dictionary: Dictionary(1197 unique tokens: ['-', 'algebra', 'intern', 'languag', 'preliminari']...)
2021-02-19 22:52:44,608 : INFO : collecting all words and their counts
2021-02-19 22:52:44,608 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-19 22:52:44,624 : INFO : collected 5937 word types from a corpus of 115969 raw w

[(0, 0.10895367),
 (1, 0.12217637),
 (2, -0.07480104),
 (3, 0.19750515),
 (4, -0.013826887),
 (5, 0.069984645),
 (6, 0.014467405),
 (7, 0.059876934),
 (8, 0.11729815),
 (9, 0.16905047),
 (10, 0.10974939),
 (11, -0.026893523),
 (12, 0.03642345),
 (13, 0.067993626),
 (14, -0.024139423),
 (15, 0.06633035),
 (16, 0.1302095),
 (17, -0.063747145),
 (18, 0.085926525),
 (19, -0.031735405),
 (20, 0.08511715),
 (21, -0.075270005),
 (22, -0.0049473816),
 (23, -0.1396215),
 (24, 0.12844622),
 (25, -0.0039034665),
 (26, -0.13227499),
 (27, 0.11614388),
 (28, 0.26218578),
 (29, -0.04970734),
 (30, -0.033200137),
 (31, 0.12476612),
 (32, -0.14957774),
 (33, 0.16482496),
 (34, 0.08254256),
 (35, -0.14865686),
 (36, 0.038189203),
 (37, 0.016183285),
 (38, -0.06224189),
 (39, -0.10534956),
 (40, 0.085742265),
 (41, -0.01841159),
 (42, 0.116705224),
 (43, 0.035525147),
 (44, -0.017305037),
 (45, 0.12868994),
 (46, 0.0064053936),
 (47, -0.030771768),
 (48, -0.07658876),
 (49, 0.087660454),
 (50, 0.0332776

In [90]:
w2v_pretrained = W2VPretrainedRetrievalModel(doc_repr_2)
w2v_pretrained.train_model()

# you can now get an W2V vector for a given query in the following way:
w2v_pretrained.vectorize_query("report")

2021-02-19 22:52:45,994 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-19 22:52:46,092 : INFO : built Dictionary(5937 unique tokens: ['-', 'algebra', 'intern', 'languag', 'preliminari']...) from 3204 documents (total 115969 corpus positions)
2021-02-19 22:52:46,099 : INFO : discarding 4740 tokens: [('repeat', 8), ('glossari', 7), ('inspect', 8), ('uncol', 2), ('rung', 9), ('secant', 2), ('.', 1603), ('acceler', 6), ('diverg', 3), ('induc', 9)]...
2021-02-19 22:52:46,099 : INFO : keeping 1197 tokens which were in no less than 10 and no more than 1602 (=50.0%) documents
2021-02-19 22:52:46,102 : INFO : resulting dictionary: Dictionary(1197 unique tokens: ['-', 'algebra', 'intern', 'languag', 'preliminari']...)




2021-02-19 22:56:01,189 : INFO : word2vec-google-news-300 downloaded
2021-02-19 22:56:01,199 : INFO : loading projection weights from /Users/xinyichen/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2021-02-19 22:56:42,420 : INFO : loaded (3000000, 300) matrix from /Users/xinyichen/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
  if wrd in self.model.wv.vocab:
  word_array = self.model.wv[wrd] # infer vector for each word


[(0, -0.05876739),
 (1, -0.06762275),
 (2, -0.037232764),
 (3, -0.04628938),
 (4, 0.041257925),
 (5, -0.017006315),
 (6, 0.020125818),
 (7, -0.05635229),
 (8, 0.08090579),
 (9, -0.055547256),
 (10, -0.0072452943),
 (11, 0.013283039),
 (12, 0.039245345),
 (13, -0.043673024),
 (14, -0.07003785),
 (15, 0.016905688),
 (16, -0.108679414),
 (17, -0.0026037777),
 (18, -0.073257975),
 (19, -0.09901902),
 (20, 0.1449059),
 (21, -0.0050314544),
 (22, -0.066817716),
 (23, -0.04971077),
 (24, 0.017811349),
 (25, 0.04528309),
 (26, 0.021635255),
 (27, 0.07366049),
 (28, -0.05997494),
 (29, 0.056754805),
 (30, -0.033811375),
 (31, -0.116729744),
 (32, -0.044880573),
 (33, -0.11914484),
 (34, 0.02978621),
 (35, -0.019522043),
 (36, 0.0166038),
 (37, 0.027974887),
 (38, 0.048100706),
 (39, 0.00034276783),
 (40, 0.028176146),
 (41, 0.04950951),
 (42, -0.03642773),
 (43, 0.13927066),
 (44, -0.018415123),
 (45, -0.012427692),
 (46, 0.0031698162),
 (47, -0.008704416),
 (48, -0.10465425),
 (49, 0.061585),


In [92]:
drm_w2v = DenseRetrievalRanker(w2v, cosine_sim)

# test your LDA model
search_fn = drm_w2v.search

text = widgets.Text(description="Search Bar", width=200)
display(text)


text.on_submit(handle_submit_2)

Text(value='', description='Search Bar')

In [93]:
drm_w2v_pretrained = DenseRetrievalRanker(w2v_pretrained, cosine_sim)

# test your LDA model
search_fn = drm_w2v_pretrained.search

text = widgets.Text(description="Search Bar", width=200)
display(text)


text.on_submit(handle_submit_2)

  if wrd in self.model.wv.vocab:
  word_array = self.model.wv[wrd]


Text(value='', description='Search Bar')

In [94]:
class D2VRetrievalModel(VectorSpaceRetrievalModel):
    def __init__(self, doc_repr):
        super().__init__(doc_repr)
        
        self.vector_size= 100
        self.min_count = 1
        self.epochs = 20
        
        self.taggedDocument = [TaggedDocument(doc, [i]) for i, doc in enumerate(self.documents)]
        
    def train_model(self):
        self.model = Doc2Vec(self.taggedDocument, size=self.vector_size, min_count = self.min_count, epochs=self.epochs)
    
    def vectorize_documents(self):
        """
            Returns a doc_id -> vector dictionary
        """
        vectors = {}

        for (doc_id, _), cc in zip(self.doc_repr, self.taggedDocument):
          list1 = self.model.infer_vector(cc[0]) # infer vector for the query
          list2 = list(range(self.vector_size))

          vectors[doc_id] = list(zip(list2, list1))
        return vectors


    def vectorize_query(self, query):
        query = process_text(query, **config_2)

        list1 = self.model.infer_vector(query) # infer vector for the query
        list2 = list(range(self.vector_size))
        
        return list(zip(list2, list1))

d2v = D2VRetrievalModel(doc_repr_2)
d2v.train_model()


d2v.vectorize_query("mellifluous")

2021-02-19 22:56:52,489 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-19 22:56:52,602 : INFO : built Dictionary(5937 unique tokens: ['-', 'algebra', 'intern', 'languag', 'preliminari']...) from 3204 documents (total 115969 corpus positions)
2021-02-19 22:56:52,607 : INFO : discarding 4740 tokens: [('repeat', 8), ('glossari', 7), ('inspect', 8), ('uncol', 2), ('rung', 9), ('secant', 2), ('.', 1603), ('acceler', 6), ('diverg', 3), ('induc', 9)]...
2021-02-19 22:56:52,608 : INFO : keeping 1197 tokens which were in no less than 10 and no more than 1602 (=50.0%) documents
2021-02-19 22:56:52,610 : INFO : resulting dictionary: Dictionary(1197 unique tokens: ['-', 'algebra', 'intern', 'languag', 'preliminari']...)
2021-02-19 22:56:52,677 : INFO : collecting all words and their counts
2021-02-19 22:56:52,678 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-02-19 22:56:52,698 : INFO : collected 5937 word types and 3204 unique tags fro

2021-02-19 22:56:57,021 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-19 22:56:57,028 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-19 22:56:57,034 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-19 22:56:57,034 : INFO : EPOCH - 16 : training on 115969 raw words (95663 effective words) took 0.2s, 566450 effective words/s
2021-02-19 22:56:57,190 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-19 22:56:57,201 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-19 22:56:57,203 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-19 22:56:57,204 : INFO : EPOCH - 17 : training on 115969 raw words (95559 effective words) took 0.2s, 577754 effective words/s
2021-02-19 22:56:57,360 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-19 22:56:57,365 : INFO : worker thread finished; awaiting finish of 1 more threads
20

[(0, -0.0019213697),
 (1, -0.004106078),
 (2, -0.0028646353),
 (3, 0.001140005),
 (4, 0.002845204),
 (5, 0.00082793355),
 (6, 0.0022050699),
 (7, -3.3529705e-06),
 (8, 0.0022612617),
 (9, 0.0025844753),
 (10, 0.0008105231),
 (11, -0.0036163835),
 (12, -0.00439776),
 (13, -0.0006054907),
 (14, 0.0013738474),
 (15, 0.0017882313),
 (16, -0.00018534577),
 (17, 0.00020566402),
 (18, 0.00022947349),
 (19, 0.00036388924),
 (20, 0.004048445),
 (21, -0.003983369),
 (22, -0.0025260183),
 (23, -0.003400089),
 (24, 0.0041524353),
 (25, 0.00031268262),
 (26, -0.00062451634),
 (27, -0.0020408237),
 (28, -0.0015949851),
 (29, -0.00014304624),
 (30, 0.0044405507),
 (31, 0.0026443126),
 (32, 0.0022147764),
 (33, 0.003634851),
 (34, -0.0021013443),
 (35, -0.004437364),
 (36, 0.0044091693),
 (37, 0.0008734586),
 (38, -0.00043053512),
 (39, -0.0041146106),
 (40, 0.0028440086),
 (41, -0.00081371865),
 (42, 0.0035586727),
 (43, 0.0012667307),
 (44, 0.0019917127),
 (45, -0.0035935107),
 (46, 0.00291651),
 (4

In [96]:
drm_d2v = DenseRetrievalRanker(d2v, cosine_sim)

# test your LDA model
search_fn = drm_d2v.search

text = widgets.Text(description="Search Bar", width=200)
display(text)


text.on_submit(handle_submit_2)

Text(value='', description='Search Bar')

In [98]:
class DenseRerankingModel:
    def __init__(self, initial_retrieval_fn, vsrm, similarity_fn):
        """
            initial_retrieval_fn: takes in a query and returns a list of [(doc_id, score)] (sorted)
            vsrm: instance of `VectorSpaceRetrievalModel`
            similarity_fn: function instance that takes in two vectors 
                            and returns a similarity score e.g cosine_sim defined earlier
        """
        self.ret = initial_retrieval_fn
        self.vsrm = vsrm
        self.similarity_fn = similarity_fn
        self.vectorized_documents = vsrm.vectorize_documents()
        
        assert len(self.vectorized_documents) == len(doc_repr_2)
    
    def search(self, query, K=50):
        """
            First, retrieve the top K results using the retrieval function
            Then, re-rank the results using the VSRM instance
        """

        # use BM25 to retrieve the top K results and store in newdict
        scores = self.ret(query)
        doc_ids = [i[0] for i in scores][0:K]
        newdict = {k: self.vectorized_documents[k] for k in doc_ids}


        empty_list  = []
        query_vector = self.vsrm.vectorize_query(query)
        
        for key in newdict:
          doc_id = key
          document_single = newdict.get(key)

          if document_single !=[] and query_vector != []:
            score = self.similarity_fn(document_single, query_vector) # compute similary between the query vector to each vectorized document
            empty_list.append(tuple((doc_id, score)))
          else:
            score = 0
            empty_list.append(tuple((doc_id, score)))

        empty_list.sort(key=lambda _:-_[1])
        return empty_list

In [99]:
##### Function check
bm25_search_2 = partial(bm25_search, index_set=2)
lsi_rerank = DenseRerankingModel(bm25_search_2, lsi, cosine_sim)
lda_rerank = DenseRerankingModel(bm25_search_2, lda, jenson_shannon_sim)
w2v_rerank = DenseRerankingModel(bm25_search_2, w2v, cosine_sim)
w2v_pretrained_rerank = DenseRerankingModel(bm25_search_2, w2v_pretrained, cosine_sim)
d2v_rerank = DenseRerankingModel(bm25_search_2, d2v, cosine_sim)

##### 

  if wrd in self.model.wv.vocab:
  word_array = self.model.wv[wrd]
