# inforet 2022 2

In [None]:
# no time to lose:
!wget https://gerdes.fr/saclay/informationRetrieval/our_msmarco.zip
!unzip our_msmarco.zip
# this will be big: 1.2gb!
# you will get three files

In [None]:
# this turns on the autotimer, so that every cell has a timing information below
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime
# stop using:
# %unload_ext autotime

In [None]:
# !pip install dask
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import dask.dataframe as dd

## our dataset

- "TREC stands for the Text Retrieval Conference. Started in 1992 it is a series of workshops that focus on supporting research within the information retrieval community. It provides the infrastructure necessary for large-scale evaluation of text retrieval methodologies. Every year these workshops are organized, which are centered around a set of tracks. These tracks encourage new researches in the area of information retrieval."
- TREC 2019 Deep Learning Track https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019
- data from MS-Marco https://microsoft.github.io/msmarco/
- The dataset contains  367k queries and a corpus of 3.2 million documents. 
___
- if you want to reproduce my selection or get a bigger set, uncomment and execute


In [None]:
#!wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz
#!wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz
#!wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-top100.gz
	
#!gzip -d msmarco-docs.tsv.gz
#!gzip -d msmarco-doctrain-queries.tsv.gz
#!gzip -d msmarco-doctrain-top100.gz


- we have three datasets:
    
    1. the queries: msmarco-doctrain-queries.tsv
    2. the gold: msmarco-doctrain-top100 is a space-separated table containing query_id, Q0, doc_id, rank, score and runstring
    3. the actual documents: msmarco-docs.tsv 21GB of text! doc_id, url, title, text

In [None]:
# Full query set: tab-separated file without a header row; the two columns are named qid and query.
all_queries = pd.read_table('msmarco-doctrain-queries.tsv', header=None).set_axis(
    ['qid', 'query'], axis='columns'
)
print('Shape=>', all_queries.shape)
all_queries.head()

#### reducing the dataset
- here we take 1000 queries. 
- if this is too big for your computer, use this code to build a smaller version, starting with the already reduced 1000 query set that we've downloaded before


In [None]:
# Reproducible random subset of 1000 queries (fixed seed), renumbered from 0.
our_queries = (
    all_queries
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)
print('Shape=>', our_queries.shape)
our_queries.head()

In [None]:
# Save the sampled queries as TSV. The row index is written as the first column
# (pandas default); the cell that reads this file back selects columns 1-2 to skip it.
our_queries.to_csv('our.msmarco.queries.tsv',sep='\t')

#### the gold file
- 36m lines!

In [None]:
# The gold file is space-delimited and has no header row.
gold_top100 = pd.read_table('msmarco-doctrain-top100', delimiter=' ', header=None)
gold_top100.columns = ['qid', 'Q0', 'docid', 'rank', 'score', 'runstring']
print('Shape=>', gold_top100.shape)
display(gold_top100.head())

# Keep only the gold rows that belong to our sampled queries.
# (fix: the filter used to index an undefined name `train_top100`, which raised NameError)
our_gold_top100 = gold_top100[gold_top100['qid'].isin(our_queries['qid'].unique())].reset_index(drop=True)
print('Shape=>', our_gold_top100.shape)
our_gold_top100.head()

In [None]:
# Save the reduced gold table as TSV. The row index is written as the first column
# (pandas default); the cell that reads this file back picks its columns by position.
our_gold_top100.to_csv('our.msmarco.gold.tsv',sep='\t')

#### the data file

- it's so big that it's smarter to use dask: https://docs.dask.org/en/stable/

In [None]:
# Open the document file lazily with dask, in partitions of 100MB.
# blocksize is passed as a byte-size string: dask documents an int or a string here,
# and the former float literal 100e6 is not an integer byte count.
df = dd.read_table('msmarco-docs.tsv', blocksize='100MB', header=None)
df.columns = ['docid', 'url', 'title', 'body']
df.head()

In [None]:
# can't get the number of rows quickly :s
# very slow:
# len(df.index)

# faster:
!wc -l msmarco-docs.tsv

- big dataset with 3m rows!
- we want the top 100 for our queries
- this takes some time!

In [None]:
def create_corpus(result, documents=None):
  """Select the documents referenced by a gold/result table.

  Parameters
  ----------
  result : DataFrame with a 'docid' column
      Rows whose document ids should be kept.
  documents : DataFrame, optional
      Table with at least the columns 'docid' and 'url' to select from.
      Defaults to the notebook-level `df`, so existing one-argument
      calls behave exactly as before.

  Returns
  -------
  DataFrame
      The rows of `documents` whose docid occurs in `result`, without the
      'url' column and with the index reset.
  """
  if documents is None:
    documents = df  # notebook global holding the full document table
  wanted_docids = result['docid'].unique()
  keep = documents['docid'].isin(wanted_docids)
  corpus = documents[keep].reset_index(drop=True)
  corpus = corpus.drop(columns='url')
  # counting rows forces the filter to be evaluated; on the full dask table this is slow
  print('Number of Rows=>', len(corpus))
  return corpus

our_docs=create_corpus(our_gold_top100)
our_docs.head()

In [None]:
# Save the selected documents as TSV. our_docs comes from the dask table, and
# single_file=True asks dask for one output file (presumably instead of one file
# per partition — see the dask to_csv documentation).
our_docs.to_csv('our.msmarco.docs.tsv',sep='\t', single_file=True)

- this is still a big file: 92k documents

# reading in our smaller files
here we use the three files from the archive downloaded at the top of the notebook:

- !wget https://gerdes.fr/saclay/informationRetrieval/our_msmarco.zip
- !unzip our_msmarco.zip

In [None]:
# Columns are selected by position: 1 and 2 are kept, column 0 is skipped.
# Assuming the file was written by the to_csv cell above, column 0 is the saved
# row index and columns 1-2 are qid and query.
queries = pd.read_csv('our.msmarco.queries.tsv',sep='\t',usecols=[1,2])
queries

In [None]:
# Columns are selected by position: 1, 3, 4 and 5 are kept.
# Assuming the file was written by the to_csv cell above, these are qid, docid,
# rank and score; the saved row index, Q0 and runstring are skipped.
gold = pd.read_csv('our.msmarco.gold.tsv',sep='\t',usecols=[1,3,4,5])
gold

In [None]:
# Columns are selected by position: 1, 2 and 3 are kept, column 0 is skipped.
# Assuming the file was written by the to_csv cell above, column 0 is the saved
# row index and columns 1-3 are docid, title and body.
docs = pd.read_csv('our.msmarco.docs.tsv',sep='\t',usecols=[1,2,3])
docs

In [None]:
# Split the queries by position: the first 500 rows are for training ...
training_queries = queries.head(500)
print('Shape=>', training_queries.shape)
display(training_queries.head())

# ... and everything after row 500 is for testing.
testing_queries = queries.iloc[500:]
print('Shape=>', testing_queries.shape)
testing_queries.head()

## exploring the data

### 🚧 todo: check whether there are NaN and take care of them

In [None]:
... isna

In [None]:
... fillna

### let's have a look at some random query:

In [None]:
queries.loc[111]

In [None]:
gold[gold.qid==251898]

### 🚧 todo: let's look at the top-ranked document for that query
- title
- body

In [None]:
# todo: .values[0] can help

### 🚧 todo: let's look at the second document
- let's make a function to make that easier

In [None]:
def titleAndBody(qid,nr):
    display(...)
    display(...)
titleAndBody(251898,1)

#### let's look at the 100th document

In [None]:
titleAndBody(251898,99)

### 🚧 todo: try this with a different queries to get a feel of the quality of the gold

# doing our first baseline retrieval function

- todo: 
    - build and fit a binary CountVectorizer on the **titles**
    - play with and understand build_analyzer, build_tokenizer, and transform
    - transform our query 111
        - understand what happens with yet unseen words in the transform process
    - find the docs with the most words in common
    - write an evaluation function computing the top 10 precision p@10
    - apply to our 500 queries


In [None]:
vectorizer = CountVectorizer(binary=True)
# understand the options: 
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
X = ...
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. The vocabulary is fetched once and reused.
feature_names = vectorizer.get_feature_names_out()
print('we got',len(feature_names),'features, for example',feature_names[33333:33339].tolist())

In [None]:
queries.loc[111].query

In [None]:
vectorizer.build_analyzer()(...)

In [None]:
vectorizer.build_tokenizer()(...)

In [None]:
qv = vectorizer.transform([...])
qv

### 🚧 todo:
- understand what happens with yet unseen words in the transform process


- think of the shape of X, what are the rows, what are the columns?
- how to select the titles that have the words of our query?
       - think of matrix multiplication and transposition

In [None]:
xqv = ...
xqv

### 🚧 todo: 
  - look at argmax and max, 
  - check the numpy 
      - flatnonzero function to find the best match
      - the .A attribute (on a sparse matrix, the same as .toarray()) and the .flat attribute
  - show the best matching doc

In [None]:
... .loc[ ... ]

### 🚧 todo: use argpartition to get the 10 best answers

In [None]:
pred10i = np.argpartition(...
pred10i

In [None]:
docs.loc[pred10i]

In [None]:
docs.loc[pred10i].docid

In [None]:
gold[gold.qid==251898].docid

### 🚧 todo:
- find the relevant documents that are in our top 10
- use intersect1d
- compute the precision p@10

In [None]:
intersection = np.intersect1d(...)
intersection

In [None]:
precision = ...
precision

In [None]:
# 🚧 todo: build a function p@10 that gives the precision at 10
def pAt10(qid):
    ...
    ...
    return ...

pAt10(251898)


### 🚧 todo:
- take our 500 training queries qid
- apply our function
- compute the average

In [None]:
training_queries.qid

In [None]:
training_queries.qid....

In [None]:
training_queries.qid... .mean()

- that looks like a baseline we can beat :)
- what's the query we are doing best in?
    - max?

In [None]:
....max()

- oh, we have just been lucky before...

## 🚧 todo:

- redo the vectorization and evaluation on the whole text, not only the titles
- try the non-binary CountVectorizer
- go for tf-idf
    - play with at least two options and re-evaluate
- find other improvements. these may include:
    - cleaning the text
    - heuristically combining title and body matches
    - looking at bigrams
    - looking at terms (by means of a clean multi-word term list from wikipedia, see notebook 1)
    - by removing stopwords (look at nltk or spacy to do that)
    - trying an implementation of bm25
  
- do a grid search with a few promising parameters
    - maybe get inspired by GridSearchCV and pipelines in https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py
        - you can also check the well-written section "Pipelines" in this book: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
    - make a nice visualization of the results
    
- interpret the complete results in 3 to 5 sentences.
    - what strategy would do best if we switch our evaluation to p@100?

- give some ideas for improving the results





In [None]:
# Function for cleaning text
def clean_text(text):
    """Return a cleaned, single-line version of `text`.

    Steps:
      1. remove urls (done first, so that step 2 cannot break a url into
         fragments that no longer look like a url)
      2. remove words with numbers inside
      3. replace new lines (and any other run of whitespace) by one space
      4. only keep ascii words

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a string (e.g. the float NaN that
        pandas uses for a missing body) is treated as empty text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # raw strings: '\w', '\d' and '\S' are invalid escape sequences in a normal literal
    text = re.sub(r'https?://\S+|\bwww\.\S+', ' ', text)
    text = re.sub(r'\w*\d+\w*', '', text)
    # split() cuts on every kind of whitespace, new lines included
    ascii_words = [word for word in text.split() if word.isascii()]
    return ' '.join(ascii_words)
 
# Add a 'cleaned' column: clean_text applied to every document body
docs['cleaned'] = docs['body'].apply(clean_text)
