## Installations and getting the dataset in place

In [None]:
# Install PyTerrier and ir_datasets; `&> /dev/null` discards the installer output.
!pip install python-terrier &> /dev/null
!pip install ir_datasets &> /dev/null

In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read and written below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir /root/.ir_datasets
!mkdir /root/.ir_datasets/wapo
!cp /content/drive/MyDrive/WashingtonPost.v2.tar.gz /root/.ir_datasets/wapo
!ls ../root/.ir_datasets/wapo

WashingtonPost.v2.tar.gz


In [None]:
import ir_datasets
import pandas as pd
from google.colab import files
import os
import pyterrier as pt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Document class

In [None]:
# Borrowed from https://github.com/bwanglzu/Maximal-Marginal-Relevance
# and adapted to fit our data and needs 

"""Document class, read document, clean document, get terms."""
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

class Document(object):
    """A corpus document identified by its docno, with text-cleaning helpers.

    Field values are fetched from the docstore of the module-level ``dataset``
    object, which therefore has to exist by the time ``read`` (or any of the
    properties that call it) is used.
    """

    # Deletion table for ``str.translate``. It has to be keyed by code points,
    # which is what ``str.maketrans`` produces; a dict keyed by characters is
    # silently ignored by ``translate``.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, docno):
        self.docno = docno # Actual string of the docid such as 'f233ecdeb87a44a6aa9ac429999d2d4c'
        # Last path component of the docno (the docno itself when it has no '/').
        self._name = docno.split('/')[-1]
        self._term = None
        self._author = None
        self._kicker = None
        self._title = None

    def read(self):
        """Load body, author, kicker and title for this docno.

        Returns:
            self, so that calls can be chained.

        Raises:
            IOError: if the docstore lookup fails with an EnvironmentError.
        """
        try:
            # One lookup serves all four fields.
            record = dataset.docs_store().get(self.docno)
        except EnvironmentError as err:
            raise IOError("Docno not found") from err

        self._term = record.body
        self._author = record.author
        self._kicker = record.kicker
        self._title = record.title
        return self

    def lower(self):
        """Lower-case the body text. Returns self."""
        self._term = self._term.lower()
        return self

    def del_punc(self):
        """Strip every ``string.punctuation`` character from the body text. Returns self."""
        self._term = self._term.translate(self._PUNCT_TABLE)
        return self

    def del_space_stop(self):
        """Collapse whitespace and drop English stopwords from the body text. Returns self."""
        # A set makes each membership test O(1) instead of scanning the stopword list.
        cached = set(stopwords.words("english"))
        self._term = ' '.join([word for word in self._term.split() if word not in cached])
        return self

    @staticmethod
    def _alnum_only(text):
        """Return `text` reduced to its alphanumeric characters ('' for None)."""
        if text is None:
            return ''
        return ''.join([char for char in text if char.isalnum()])

    @property
    def terms(self):
        """Body text after read -> lower-case -> punctuation removal -> stopword removal."""
        self.read().lower().del_punc().del_space_stop()
        return self._term

    @property
    def name(self):
        """Last path component of the docno."""
        return self._name

    @property
    def author(self):
        """Author field with every non-alphanumeric character (spaces included) removed."""
        return self._alnum_only(self.read()._author)

    @property
    def kicker(self):
        """Kicker field with every non-alphanumeric character (spaces included) removed."""
        return self._alnum_only(self.read()._kicker)

    @property
    def title(self):
        """Title exactly as stored (may be None); no cleaning is applied to it."""
        # The body-cleaning chain never touched the title, so only the read is needed.
        self.read()
        return self._title

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Reranking definition

In [None]:
# Skeleton borrowed from https://github.com/bwanglzu/Maximal-Marginal-Relevance
# drastically adapted to fit our needs and the mmr function is changed to an adaptation of https://medium.com/tech-that-works/maximal-marginal-relevance-to-rerank-results-in-unsupervised-keyphrase-extraction-22d95015c7c5

import glob
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


def _build_sim_matrix(initial_ranking=None, alpha=0.5, beta=0.5):
  """
  Build the pairwise document similarity matrix for one ranking.

  Three similarities are blended:
    * author/kicker overlap: dot product of CountVectorizer counts of
      "<author> <kicker>", divided by 2,
    * tf-idf dot product of the cleaned bodies,
    * tf-idf dot product of the titles.

  combined = alpha * author_kicker + (1 - alpha) * (beta * body + (1 - beta) * title)

  Alpha: higher alpha puts more emphasis on the author/kicker overlap.
  Beta: higher beta puts more emphasis on the BODY (beta=1 ignores the title,
  beta=0 ignores the body).

  initial_ranking: DataFrame with a 'docno' column; row order defines the
  row/column order of the result.
  Returns a square pd.DataFrame with positional (0..n-1) index and columns.
  Raises ValueError if no ranking is given.
  """
  if initial_ranking is None:
    raise ValueError("initial_ranking is required")

  terms = []
  titles = []
  a_k = []
  for docno in initial_ranking['docno'].tolist():
    doc = Document(docno)
    terms.append(doc.terms)
    a_k.append(doc.author + " " + doc.kicker)
    titles.append(doc.title)

  tfidf = TfidfVectorizer().fit_transform(terms)
  tfidf_title = TfidfVectorizer().fit_transform(titles)
  counts = CountVectorizer().fit_transform(a_k)

  # .toarray() instead of the `.A` shorthand, which newer SciPy releases no longer provide.
  pairwise_counts = (counts * counts.T).toarray() / 2
  pairwise_similarity_body = (tfidf * tfidf.T).toarray()
  pairwise_similarity_title = (tfidf_title * tfidf_title.T).toarray()

  pairwise_similarity = beta * pairwise_similarity_body + (1 - beta) * pairwise_similarity_title
  combined = alpha * pairwise_counts + (1 - alpha) * pairwise_similarity
  return pd.DataFrame(combined)
  
def _lookup_rel(initial_ranking, doc):
	"""Lookup table for relevance."""
	return initial_ranking.loc[initial_ranking['docno'] == doc, 'score'].iloc[0]

def _lookup_sim(doc1, doc2, sim_matrix, initial_ranking):
	"""Lookup pairwise similarity."""
	try:
	    doc1_idx = initial_ranking.index[initial_ranking['docno'] == doc1].tolist()[0]
	    doc2_idx = initial_ranking.index[initial_ranking['docno'] == doc2].tolist()[0]
			
	except IndexError:
		return 0
	sim_doc1_doc2 = sim_matrix.iat[doc1_idx, doc2_idx]
	return sim_doc1_doc2


def rank(initial_ranking, lambda_score, query, alpha = 0.5, beta = 0.5):
    """Rerank `initial_ranking` with MMR.

    The similarity matrix is built from `initial_ranking` with the given
    alpha/beta, then handed to `maximal_marginal_relevance` together with
    `query` and `lambda_score` (passed on as `lambda_constant`).
    Returns whatever `maximal_marginal_relevance` returns.
    """
    similarities = _build_sim_matrix(
        initial_ranking=initial_ranking,
        alpha=alpha,
        beta=beta,
    )
    return maximal_marginal_relevance(
        initial_ranking=initial_ranking,
        sim_matrix=similarities,
        query=query,
        lambda_constant=lambda_score,
    )
 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def maximal_marginal_relevance(initial_ranking, sim_matrix, query, lambda_constant=0.5):
    """
    Greedy MMR reordering of the documents in `initial_ranking`.

    In every round each not-yet-selected document is scored as
        lambda_constant * relevance - (1 - lambda_constant) * redundancy
    where relevance is its 'score' in `initial_ranking` and redundancy is its
    highest similarity to any already selected document (never below 0).
    The best-scoring document (the earliest one on ties) is moved to the output.

    Returns a list of {'docno', 'score', 'query'} dicts in selection order;
    'score' is the MMR score at the moment of selection.
    """
    selected = []
    remaining = initial_ranking['docno'].to_list()

    while remaining:
        best_score = None
        for candidate in remaining:
            relevance = _lookup_rel(initial_ranking, candidate)
            # Leading 0 keeps the floor the running maximum starts from.
            redundancy = max([0] + [
                _lookup_sim(candidate, chosen['docno'], sim_matrix=sim_matrix, initial_ranking=initial_ranking)
                for chosen in selected
            ])
            mmr_score = lambda_constant*(relevance)-(1-lambda_constant) * redundancy

            if best_score is None or mmr_score > best_score:
                best_score = mmr_score
                best_doc = candidate

        remaining.remove(best_doc)
        selected.append({'docno': best_doc, 'score': best_score, 'query': query})
    return selected



## Initial Indexing + Ranking 

In [None]:
# Load the "wapo/v2/trec-core-2018" dataset through ir_datasets; this `dataset` object is the
# module-level global that Document.read() and the evaluation cells rely on.
dataset = ir_datasets.load("wapo/v2/trec-core-2018")
print(dataset)
# Number of documents in the collection.
print(dataset.docs_count())

Dataset(id='wapo/v2/trec-core-2018', provides=['docs', 'queries', 'qrels'])
595037


Initialise PyTerrier and index the WaPo v2 corpus:

In [None]:
# Start PyTerrier before any other pt.* call in this notebook.
pt.init()
dataset_pt = pt.get_dataset('irds:wapo/v2')
# Index wapo/v2
# meta={"docno": 36}: presumably the reserved length of the stored docno string — TODO confirm
indexer = pt.IterDictIndexer('./indices/wapo_v2', meta={"docno": 36})
try: 
  # Index the corpus iterator over the five listed document fields.
  index_ref = indexer.index(dataset_pt.get_corpus_iter(), fields=['url', 'title', 'author', 'kicker', 'body'])

except Exception as e:
  # Best effort: on any indexing error, point at the properties file of an index that is
  # assumed to exist already under ./indices/wapo_v2, and print the error.
  # NOTE(review): unrelated failures also end up here; a missing index then only surfaces
  # when the index is loaded in a later cell.
  index_ref = './indices/wapo_v2/data.properties'
  print('Warning: Exception was caught: \n', e)

terrier-assemblies 5.7 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.7 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

[INFO] [starting] building docstore
docs_iter: 595037doc [11:48, 840.35doc/s]
[INFO] [finished] docs_iter: [11:48] [595037doc] [840.34doc/s]
[INFO] [finished] building docstore [11:48]


wapo/v2 documents:   0%|          | 0/595037 [00:00<?, ?it/s]

In [None]:
# Load the index referenced by index_ref (freshly built, or the fallback path set above)
# and print its collection statistics.
index = pt.IndexFactory.of(index_ref)
print(index.getCollectionStatistics().toString())

Number of documents: 595037
Number of terms: 674473
Number of postings: 111041554
Number of fields: 5
Number of tokens: 177895814
Field names: [url, title, author, kicker, body]
Positions:   false



In [None]:
# Rebind dataset_pt to the TREC Core 2018 dataset, which is where the topics are taken from.
dataset_pt = pt.get_dataset('irds:wapo/v2/trec-core-2018')
# First-stage retrieval with the BM25 weighting model over the index from above.
pipeline = pt.BatchRetrieve(index_ref, wmodel='BM25')
# Run the 'title' variant of the topics through the pipeline; `ranking` is the
# module-level result that rerank() and the results cell read.
ranking = pipeline(dataset_pt.get_topics('title'))

[INFO] [starting] https://trec.nist.gov/data/core/topics2018.txt
[INFO] [finished] https://trec.nist.gov/data/core/topics2018.txt: [00:00] [24.1kB] [10.2MB/s]


## Reranking

Checking for preprocessing:

In [None]:
import numpy as np 
import pandas as pd

# Count missing (NaN) and empty-string values per field in the first 10000 documents.
# The sample frame is built once; rebuilding it for every print re-reads 10000 documents each time.
sample_docs = pd.DataFrame(dataset.docs_iter()[:10000])

for x in ['author', 'title', 'kicker', 'body']:
  print(x)
  print('isnan: ', sample_docs[x].isna().sum())
  print('empty str: ', (sample_docs[x] == '').sum())


author
isnan:  0
empty str:  2268
title
isnan:  410
empty str:  0
kicker
isnan:  0
empty str:  87
body
isnan:  0
empty str:  49


In [None]:
def get_relevance(docno, query_id, qrels):
    """Return the relevance judgement of `docno` for `query_id` as an int.

    qrels: DataFrame with 'query_id', 'doc_id' and 'relevance' columns.

    Returns None when there is not exactly one matching judgement, or when the
    stored value cannot be converted to an int (e.g. NaN).
    """
    matches = qrels.loc[
        (qrels['query_id'] == query_id) & (qrels['doc_id'] == docno),
        'relevance',
    ]
    # Unjudged pairs (0 rows) and ambiguous ones (>1 rows) both count as "no judgement".
    if len(matches) != 1:
        return None
    try:
        # Convert the scalar, not the Series: int(Series) is deprecated in pandas.
        return int(matches.iloc[0])
    except (TypeError, ValueError):
        return None


def rerank(lambda_score, alpha, beta, top_n=50):
  """
  MMR-rerank the module-level `ranking`, one query at a time.

  Per query: documents without a relevance judgement (looked up in the
  module-level `qrels`), with an empty kicker, or without a title are removed;
  scores are divided by the query's maximum score; the first `top_n` remaining
  documents are passed to `rank`.

  lambda_score, alpha, beta: forwarded to `rank`.
  top_n: number of documents per query that are reranked (default 50).
  Returns the concatenated list of result dicts produced by `rank`.
  """
  rerank_result = []
  for q in np.unique(ranking['query']):
    q_rank = ranking[ranking['query'] == q]
    query_id = np.unique(q_rank['qid'])[0]

    # Collect docs that cannot be used: no judgement (needed later by the evaluation),
    # empty kicker, or missing title. For one query only 62 documents are left afterwards.
    missing_data = []
    for d in q_rank['docno']:
      if get_relevance(d, query_id, qrels) is None:
        missing_data.append(d)
        continue
      doc = Document(d)  # one object per docno instead of one per field check
      if doc.kicker == '' or doc.title is None:
        missing_data.append(d)

    # .copy() so that the score assignment below does not write into a view of `ranking`.
    q_rank = q_rank.drop(q_rank[q_rank['docno'].isin(missing_data)].index).copy()
    if q_rank.empty:
      # Nothing left to rerank; the vectorizers in `rank` cannot be fitted on zero documents.
      continue

    q_rank['score'] = q_rank['score'] / np.max(q_rank['score'])
    q_rank = q_rank.reset_index()

    rerank_result += rank(initial_ranking=q_rank[:top_n], lambda_score=lambda_score, query=q, alpha=alpha, beta=beta)

  return rerank_result

In [None]:
# # getting the rerankings

# qrels = pd.DataFrame(dataset.qrels_iter())

# alpha_schedule = [0, 0.25, 0.5, 0.75, 1]
# beta_schedule = [0, 0.25, 0.5, 0.75, 1]
# lmbd = 0.25                    #[0.25, 0.5, 0.75]

# for alpha in alpha_schedule:
#   for beta in beta_schedule:
#     pd.DataFrame(rerank(lmbd, alpha, beta)).to_csv(f'/content/drive/MyDrive/RU/IR/lambda={lmbd}_alpha={alpha}_beta={beta}.csv')

## Evaluation functions

### Utils

In [None]:
import os 


def read_all_frames(dir):
  """
  Read every file in directory `dir` whose name ends with "csv".

  Each file name is printed as it is loaded. Returns a dict that maps the file
  name without its last four characters to the DataFrame from pd.read_csv.
  """
  loaded = {}
  for entry in os.listdir(dir):
    if not entry.endswith("csv"):
      continue
    print(entry)
    key = entry[:-4]
    loaded[key] = pd.read_csv(os.path.join(dir, entry))
  return loaded


def query_helper(string):
  '''
  preprocessing necessary to find qid

  Replaces 'u.s.' with 'u s', turns every '-' into a space and drops every
  other character that is neither alphanumeric nor a space.
  Case is not changed here; callers lower-case first.
  '''
  string = string.replace('u.s.', 'u s')
  kept = []
  for char in string:
    if char == '-':
      kept.append(' ')
    elif char.isalnum() or char == ' ':
      kept.append(char)
  return ''.join(kept)



def lookup_qid(query, df):
  '''
  takes query title, preprocessed or not
  takes pd.DataFrame(dataset.queries_iter()) as df

  Returns the 'query_id' of the first row whose 'title' equals `query`; if no
  title matches exactly, the titles are lower-cased and passed through
  query_helper before comparing again.
  `df` is left unmodified.
  Raises IndexError if neither comparison finds a match.
  '''
  exact = df.loc[df['title'] == query, 'query_id']
  if len(exact) > 0:
    return exact.iloc[0]

  # Normalise into a temporary Series: writing the result back into df['title']
  # would permanently change the caller's frame.
  normalised_titles = df['title'].str.lower().apply(query_helper)
  return df.loc[normalised_titles == query, 'query_id'].iloc[0]


# Load every CSV stored in the Drive results folder, keyed by file name.
all_frames = read_all_frames('/content/drive/MyDrive/RU/IR') 
# Queries and relevance judgements of the dataset as DataFrames.
# `qrels` is read as a module-level global by rerank() and precision().
df = pd.DataFrame(dataset.queries_iter())
qrels = pd.DataFrame(dataset.qrels_iter())


### Eval funcs

In [None]:
def topic_recall(ranking, query, queries_iter_df, qrels, k=10):
  """
  Share of the relevant topics (kickers) that show up in the top k of a ranking.

  Ranking: a ranking containing different queries ('query' and 'docno' columns)
  Query: the query (as spelled in the ranking) to evaluate
  queries_iter_df: Dataframe which contains both, queries (title) and query_id (preferably already formatted in the way it is also in the ranking)
  qrels: query-relevance dataframe
  k: for which k to get the recall

  Returns (unique kickers of relevant docs in the top k) /
          min(k, unique kickers of all relevant docs of the query),
  i.e. the denominator is capped at k because at most k topics fit into the top k.
  Returns 0.0 when the query has no relevant documents at all.
  """
  q_rank = ranking[ranking['query'] == query].reset_index()[:k]
  qid = lookup_qid(query=query, df=queries_iter_df)
  qrels = qrels[qrels['query_id'] == qid].reset_index()

  TP_topics = set()
  for doc in q_rank['docno']:
    doc_relevance = get_relevance(docno=doc, query_id=qid, qrels=qrels)
    if doc_relevance is not None and doc_relevance > 0:
      TP_topics.add(Document(doc).kicker)

  actually_rel = qrels[qrels['relevance'] > 0]
  unique_rel_topics = {Document(d).kicker for d in actually_rel['doc_id']}

  # Compare the NUMBER of topics with k; comparing the collection itself with k
  # raises for more than one topic.
  reachable_topics = min(k, len(unique_rel_topics))
  if reachable_topics == 0:
    return 0.0

  # TP + FN from the original formulation equals reachable_topics.
  return len(TP_topics) / reachable_topics


In [None]:
from itertools import combinations 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

def avg_dissim(reranked, k): 
  """
  Average pairwise dissimilarity of the top k documents of a ranking.

  reranked: either the list of {'docno': ...} dicts returned by the MMR
    reranking, or a DataFrame with a 'docno' column.
  k: number of top documents to compare.

  For every unordered pair a tf-idf model is fitted on the two cleaned bodies
  and 1 - cosine similarity is recorded (0 if the pair has no usable vocabulary).
  Returns the mean over all pairs (NaN when there are fewer than two documents).
  """
  reranked = reranked[:k]
  if isinstance(reranked, list):
    docnos = [entry['docno'] for entry in reranked]
  else:
    docnos = list(reranked['docno'])

  # Clean every document once instead of once per pair.
  doc_terms = {docno: Document(docno).terms for docno in docnos}

  result = []
  for doc1, doc2 in combinations(docnos, 2):
    # Fresh two-document corpus per pair. Accumulating the terms across pairs would make
    # tfidf[0] and tfidf[1] refer to the first pair's documents on every iteration.
    pair_terms = [doc_terms[doc1], doc_terms[doc2]]
    try:
      tfidf = TfidfVectorizer().fit_transform(pair_terms)
    except ValueError:
      # No vocabulary (e.g. both bodies empty): count as dissimilarity 0, as distance() does.
      result.append(0.0)
      continue

    sim = cosine_similarity(tfidf[0], tfidf[1])
    result.append(1 - sim.squeeze())

  return np.mean(result)

In [None]:
import numpy as np 
import pandas as pd 


def g(relevance, tau): 
    """
    Utility function g used for the probability of relevance of a document
    given a query.

    relevance: graded relevance of the document
    tau: constant subtracted from the grade
    (tau may be 0 given that we only have max(r)=2)

    Returns relevance - tau, floored at 0.
    """
    return max(relevance - tau, 0)


def p_rel(g, g_max): 
    """
    Map a utility value to a probability of relevance.

    g: utility value of the document (as returned by the function g)
    g_max: largest utility value possible for the query

    Returns (2**g - 1) / 2**g_max.
    """
    gain = 2**g-1
    return gain/2**g_max

def disc(l, k): 
    """
    Rank discount of position l relative to position k.

    Returns 1 for every l before k, otherwise l - k (so 0 at l == k).
    NOTE(review): the value grows with the distance behind k — confirm this is the intended discount.
    """
    return 1 if l<k else l-k

def distance(doc1, doc2): 
    """ 
    Takes two doc_no
    Returns cosine dissimilarity (1 - cosine similarity) of the tf-idf vectors
    of their cleaned bodies; the tf-idf model is fitted on just these two texts.

    Returns 0 when the vectorizer rejects the pair with a ValueError
    (e.g. neither text yields any vocabulary).
    """
    terms = [Document(doc1).terms, Document(doc2).terms]
    try:
        tfidf = TfidfVectorizer().fit_transform(terms)
    except ValueError:
        # Only the "nothing to vectorize" case is treated as distance 0;
        # any other error should surface instead of being swallowed.
        return 0

    sim = cosine_similarity(tfidf[0], tfidf[1])
    return 1-sim.squeeze()
    


def ILD(reranked_list, document:str, query_id:str, qrels, k): 
    """
    Relevance- and rank-weighted average distance between `document` and the
    other documents in the top k of a ranking.

    takes 
        reranked_list: ranking with respect to one query (DataFrame with a 'docno' column), 
        document: document number (doc_no) that must be among the top k, 
        query_id: id of the query,
        qrels: DataFrame with 'query_id', 'doc_id', 'relevance'; if None it is
            loaded from the module-level `dataset`,
        k: cutoff

    Every other document at position l contributes
        weight = disc(l, position of `document`) * p_rel(g(relevance, 0), g_max)
    and the result is sum(weight * distance) / sum(weight).
    Documents without a judgement are treated as relevance 0.

    Returns None if the total weight is 0 (zero or one relevant document).
    Raises IndexError if `document` is not among the top k.
    """
    # Work with positions in the top k: the discount compares enumerate()
    # positions, so index labels of the frame must not be used here.
    top_docs = list(reranked_list['docno'][:k])
    try:
        pos_doc_k = top_docs.index(document)
    except ValueError:
        raise IndexError(f"document {document!r} is not in the top {k}") from None

    if qrels is None:
        qrels = pd.DataFrame(dataset.qrels_iter())

    g_max = np.max(qrels['relevance'][qrels['query_id'] ==query_id])

    numerator=0
    C_k = 0 

    for l, doc_l in enumerate(top_docs): 
        if doc_l == document:
            # The document itself is excluded from the normaliser and has distance 0 to itself.
            continue

        rel_l = get_relevance(query_id=query_id, docno= doc_l, qrels=qrels)
        if rel_l is None:
            rel_l = 0  # unjudged: no relevance evidence
        weight = disc(l, pos_doc_k)*p_rel(g(rel_l, 0), g_max=g_max)
        if weight == 0:
            # Contributes nothing to either sum, so skip the costly distance computation.
            continue

        numerator += weight*distance(document, doc_l)
        C_k += weight

    if C_k == 0:
      return None # this is the case if there is only one or zero relevant docs
    return numerator/C_k
  
def avg_ILD(ranking, query_id:str, qrels, k):
  """
  returns average ILD of first k items in ranking 

  ranking: DataFrame with a 'docno' column for one query
  query_id, qrels, k: forwarded to ILD

  Documents for which ILD is undefined (None) are left out of the average;
  that is the case for the only relevant doc if only one is relevant.
  returns zero if no document has a defined ILD (no relevant docs in ranking)
  """
  ild_values = [ILD(ranking, doc, query_id, qrels, k) for doc in ranking['docno'][:k]]
  defined = [value for value in ild_values if value is not None]

  if not defined:
    return 0 # this happens if there are zero relevant docs

  # Divide by the number of values actually summed; the ranking may hold fewer than k docs.
  return sum(defined)/len(defined)




    

In [None]:
# Display the qrels DataFrame.
qrels

Unnamed: 0,query_id,doc_id,relevance,iteration
0,321,004c6120d0aa69da29cc045da0562168,0,0
1,321,005a1f0c2064487a7f15443b2a5f349a,0,0
2,321,00722094-2935-11e2-b4e0-346287b7e56c,0,0
3,321,007d2856-7cc4-11e4-84d4-7c896b90abdc,0,0
4,321,009aafb6-0283-11e6-8bb1-f124a43f84dc,0,0
...,...,...,...,...
26228,825,fd63b1f8-f00e-11e4-a55f-38924fca94f9,0,0
26229,825,fdeefde0-44e1-11e4-b47c-f5889e061e5f,0,0
26230,825,fe320ce7929e640c70458009f73e5241,0,0
26231,825,ff1c523ea149f03d89019adb8782cdd9,0,0


In [None]:
def precision(ranking, k, q_id):
  """
  Precision at k, with the denominator capped by the number of relevant docs.

  ranking: DataFrame with a 'docno' column for one query
  k: cutoff
  q_id: query id, looked up in the module-level `qrels`

  Returns (relevant docs in the top k) / min(k, number of relevant docs of the query),
  where relevant means relevance > 0. Returns 0.0 if the query has no relevant docs.
  """
  ranking = ranking[:k]

  query_qrels = qrels[qrels['query_id'] == q_id]
  # COUNT the relevant docs; summing the grades would count a grade-2 doc twice.
  relevant_docs = set(query_qrels.loc[query_qrels['relevance'] > 0, 'doc_id'])

  # get all relevant docs - divide rel by this if it is lower than k
  denominator = min(k, len(relevant_docs))
  if denominator == 0:
    return 0.0

  # Unjudged docs are simply not in the set, so no None comparison can occur.
  rel = sum(doc in relevant_docs for doc in ranking['docno'])
  return rel/denominator


## Results

In [None]:
df_queries_qid = pd.DataFrame(dataset.queries_iter())
all_queries = np.unique(df_queries_qid['title'])
all_queries = pd.Series(all_queries).str.lower().apply(query_helper) # make the queries findable
all_frames = read_all_frames('/content/drive/MyDrive/RU/IR')

# Evaluate every stored reranking (files named "lambda=...") plus the BM25 base ranking.
frames = {name: df for name, df in all_frames.items() if name.startswith('lambda')}
frames['base_ranking'] = ranking

average_results = []

k = 10  # evaluation cutoff shared by all four metrics

for name, df in frames.items():

  final_data = {'config': name}
  results = []
  for query in all_queries:
    r = {'query': query, 'name': name}

    q_rank = df[df['query'] == query][:50]
    q_id = lookup_qid(query, df_queries_qid)

    if name == 'base_ranking':
      # The rerankings were already filtered in rerank(); apply the same filter to the baseline.
      # for one of them there are only 62 left, title is never a problem here
      missing_data = [d for d in q_rank['docno'] if (get_relevance(d, q_id, qrels) is None or Document(d).kicker == '' or Document(d).title is None)]
      q_rank = q_rank.drop(q_rank[q_rank['docno'].isin(missing_data)].index)

    # Positions 0..n-1 for every configuration: ILD matches index labels against
    # enumerate() positions, so a frame still carrying its original labels gets wrong discounts.
    q_rank = q_rank.reset_index(drop=True)

    r['avg_ild'] = avg_ILD(q_rank, q_id, qrels, k)
    r['precision'] = precision(q_rank, k, q_id)
    r['t_recall'] = topic_recall(q_rank, query, df_queries_qid, qrels=qrels, k=k)
    r['avg_sim'] = avg_dissim(q_rank, k=k)

    results.append(r)

  results_finished = pd.DataFrame(results)
  results_finished.to_csv(f'/content/drive/MyDrive/RU/IR/average_results_{name}.csv')

  final_data['precision_mean'] = np.mean(results_finished['precision'])
  final_data['avg_ild'] = np.mean(results_finished['avg_ild'])
  final_data['recall_mean'] = np.mean(results_finished['t_recall'])
  final_data['avg_diss_mean'] = np.mean(results_finished['avg_sim'])
  average_results.append(final_data)

  # Rewritten after every configuration, so the file always holds the configurations finished so far.
  pd.DataFrame(average_results).to_csv('/content/drive/MyDrive/RU/IR/average_results.csv')


