# Information Retrieval 3 

In [1]:
# this turns on the autotimer, so that every cell has a timing information below
%load_ext autotime

time: 178 µs (started: 2023-03-28 01:19:27 +02:00)


In [2]:
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from rank_bm25 import BM25Okapi
import spacy
from sense2vec import Sense2Vec
import gc

tqdm.pandas()

time: 2.87 s (started: 2023-03-28 01:19:29 +02:00)


## getting the best combinations from last time and writing them into files

In [3]:
origdocs = pd.read_csv('our.msmarco.docs.tsv',sep='\t',usecols=[1,2,3])
origdocs['title'].fillna('-', inplace=True)
origdocs['body'].fillna('-', inplace=True)
origdocs

Unnamed: 0,docid,title,body
0,D2981241,What do you call a group of lions?,Lions Vocabulary of the English Language Word ...
1,D687756,.,"The A Priori Argument ( also, Rationalization;..."
2,D913099,Everything You Need To Learn How To Cook Veget...,Home > How To Cook Vegetables Everything You N...
3,D328017,"What is the difference between latitude, longi...",Longitude Latitude Geographic Coordinate Syste...
4,D1636347,When was the pulley invented?,Answers.com ® Wiki Answers ® Categories Techno...
...,...,...,...
92560,D3379210,Top 39 Doctor insights on: Can An Iud Cause Ha...,Top 39 Doctor insights on: Can An Iud Cause Ha...
92561,D3068739,How to get back your DirecTV cancellation fees,How to get back your Direc TV cancellation fee...
92562,D1590402,Certification FAQs,Fingerprinting 1. Where can I get fingerprinte...
92563,D2175490,Greenhouse gas emissions by Canadian economic ...,"Access PDF (682 KB)In 2015, Canada's total gre..."


time: 11.1 s (started: 2023-03-28 01:19:34 +02:00)


In [4]:
docs = pd.DataFrame(columns = ['docid', 'text'])
docs['docid']=origdocs.docid
docs['text']=origdocs.title+' '+origdocs.body
docs

Unnamed: 0,docid,text
0,D2981241,What do you call a group of lions? Lions Vocab...
1,D687756,". The A Priori Argument ( also, Rationalizatio..."
2,D913099,Everything You Need To Learn How To Cook Veget...
3,D328017,"What is the difference between latitude, longi..."
4,D1636347,When was the pulley invented? Answers.com ® Wi...
...,...,...
92560,D3379210,Top 39 Doctor insights on: Can An Iud Cause Ha...
92561,D3068739,How to get back your DirecTV cancellation fees...
92562,D1590402,Certification FAQs Fingerprinting 1. Where can...
92563,D2175490,Greenhouse gas emissions by Canadian economic ...


time: 841 ms (started: 2023-03-28 01:21:10 +02:00)


In [5]:
del origdocs # saving memory

time: 193 µs (started: 2023-03-28 01:21:13 +02:00)


In [None]:
docs.to_csv('our.text.msmarco.docs.tsv',sep='\t', columns=['docid','text'])

## reading back in just for checking the files - or for restarting here

In [9]:
# this is a different doc, no longer distinguishing title and body
docs = pd.read_csv('our.text.msmarco.docs.tsv',sep='\t',usecols=[1,2]) 
docs

Unnamed: 0,docid,text
0,D2981241,What do you call a group of lions? Lions Vocab...
1,D687756,". The A Priori Argument ( also, Rationalizatio..."
2,D913099,Everything You Need To Learn How To Cook Veget...
3,D328017,"What is the difference between latitude, longi..."
4,D1636347,When was the pulley invented? Answers.com ® Wi...
...,...,...
92560,D3379210,Top 39 Doctor insights on: Can An Iud Cause Ha...
92561,D3068739,How to get back your DirecTV cancellation fees...
92562,D1590402,Certification FAQs Fingerprinting 1. Where can...
92563,D2175490,Greenhouse gas emissions by Canadian economic ...


time: 12.7 s (started: 2023-03-28 01:22:17 +02:00)


In [10]:
# use only col 1 if you have memory problems and do BM25 only
queries = pd.read_csv('./our.msmarco.queries.tsv',sep='\t',usecols=[1,2]) 
training_queries=queries.iloc[:500]
testing_queries=queries.iloc[500:]
training_queries

Unnamed: 0,qid,query
0,687888,what is a jpe
1,480210,price for asphalt driveway
2,591004,what causes pressure skin bruising
3,260536,how long drive from flagstaff to grand canyon
4,39422,average number of bowel movements per day for ...
...,...,...
495,133970,definition of dietary fiber
496,79788,can you start up a video record?
497,791583,what is rheumatoid spondylosis
498,732078,what is coleman fuel made out of


time: 7.31 ms (started: 2023-03-28 01:22:36 +02:00)


In [11]:
gold = pd.read_csv('our.msmarco.gold.tsv',sep='\t',usecols=[1,3,4,5])
gold

Unnamed: 0,qid,docid,rank,score
0,310290,D579750,1,-5.11498
1,310290,D579754,2,-5.57703
2,310290,D2380815,3,-5.84852
3,310290,D822566,4,-5.95002
4,310290,D2249695,5,-6.08326
...,...,...,...,...
99995,257942,D253854,96,-6.32693
99996,257942,D3056621,97,-6.32837
99997,257942,D1323491,98,-6.32871
99998,257942,D2722485,99,-6.33100


time: 63.7 ms (started: 2023-03-28 01:22:40 +02:00)


# redoing the vectorization for my two best results

### 🚧 todo:
### choose TfidfVectorizer or BM25Okapi (or both if you got time)

In [12]:
def pAt10(qid):
    query = queries[queries.qid==qid]['query']
    qv = vectorizer.transform(query)
    xqv = X*qv.T
    pred10i = np.argpartition(xqv.A.flat, -10)[-10:]
    intersection = np.intersect1d(docs.loc[pred10i].docid,gold[gold.qid==qid].docid)
    return len(intersection)/10

time: 565 µs (started: 2023-03-28 01:23:02 +02:00)


In [13]:
vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode')
X = vectorizer.fit_transform(docs.text)
print(len(vectorizer.get_feature_names_out()),'features, for example',vectorizer.get_feature_names_out()[44444:44449])
tfidfresults = training_queries.qid.progress_apply(pAt10)
tfidfresults.mean()

2067446 features, for example ['0highest' '0highs' '0highways' '0hihow' '0hill']


  0%|          | 0/500 [00:00<?, ?it/s]

0.9348

time: 4min 47s (started: 2023-03-28 01:23:19 +02:00)


# 🔎 manual error mining
- let's look at where things go wrong

### 🚧 todo:
- what's the lowest p@10 we got
- what's the 10 questions that got the worst score, from worst to slightly better?

In [14]:
tfidfresults.min()

0.0

time: 3.4 ms (started: 2023-03-28 01:28:06 +02:00)


In [15]:
worst10bm25i = tfidfresults.argsort()[:10]
worst10bm25i

0    285
1    172
2     65
3     37
4    270
5    256
6    335
7    300
8     21
9     92
Name: qid, dtype: int64

time: 12.4 ms (started: 2023-03-28 01:28:06 +02:00)


In [16]:
training_queries.loc[worst10bm25i]

Unnamed: 0,qid,query
285,127145,define skin doctor
172,729561,what is channeling
65,71027,can lyme disease cause coughing
37,417380,is mark applier?
270,424296,is spain bigger than italy
256,99399,cooking time for roasted beef short ribs
335,850892,what is the the bug std
300,903134,what tests or procedures do they have for chec...
21,1049686,who sang almost paradise
92,393188,in a democracy the idea of the consent of the ...


time: 6.43 ms (started: 2023-03-28 01:28:06 +02:00)


### 🚧 todo:
- write a function showDoc that takes qid, rank, and predicted as parameters
    - if predicted=True, shows the predicted doc of rank rank to the query qid
    - if predicted=False, shows the gold doc
    - prints the first 999 characters of the texts
- for the worst query
    - look at the 10 best gold vs 10 best predicted 
    - hypothetize why the results are so bad for the worst query

In [84]:
qv = vectorizer.transform(training_queries['query'])

def showDoc(qid, rank, predicted=False):
    if predicted:
        q_pos = queries[queries['qid']==qid].index[0]
    else:
        doc_id= gold.loc[(gold['qid']==qid)&(gold['rank']==rank)]['docid'].values[0]
        print(docs[docs['docid']==doc_id]['text'].values[0][:999])
        
showDoc(729561,7)
showDoc(729561,7, predicted=True)

What Channel Is Boomerang? Jerryt26 322 Contributions What Channel Is Boomerang? It depends on what Cable TV company or Satellite TV company you have. On Dish Network, Boomerang is on channel 175. Edit What channel is Discovery Channel on if you have Comcast?32Edit What channel is the tennis channel on Comcast? 735Edit What channel is the music channel on comcast? It is 401-446Edit Ryf4165 114 Contributions What channel is the weather channel on when you have comcast?i like channel 12 but there's 11 too. Although a lot of channels offer some weather, the weather channel is 32. Edit What channel is the outdoor channel when you have comcast? Com Cast doesn't have outdoor channel Edit What channel will the cooking channel be on comcast?channel 122Edit What comcast channel is the food channel on? It's on channel 72 only in Orlando, Florida Edit Ms WWEViper 88 Contributions Which channel is Disney channel for comcast? Channel 50 ! **EDIT** Who ever posted that is totally wrong! It is 33 fo


### 🚧 todo: can we characterize these difficult cases?
- do they have specicific problems?
- do we know when we are doing badly?
    - are the distances between query vector and the best documents bigger than average?

<font color=orange>
Firstly, there is a spelling mistake "endolymphatic" in the query itself. This word is already quite uncommon and with the error, it is unlikely to appear anywhere else. To retrieve the relevant documents for this query, a more algorithm hat takes into account possible spelling mistakes is required. Therefore, the gold label is not very useful, as it matches documents that have the words "system" or "what" appearing many times. As for the predictions, they seem to be completely unrelated to the query, with very few common words within the first 999 characters. The query scores 0 for all documents, so getting the top 10 scoring documents doesn't make sense and we are essentially retrieving random documents from the dataset.
<font>

# 🚀 spacy

- look at https://github.com/explosion/sense2vec/blob/master/README.md

In [25]:
import en_core_web_sm
nlp = en_core_web_sm.load()

time: 406 ms (started: 2023-03-28 01:33:50 +02:00)


### 🚧 todo:
- explain what's going on here:

In [26]:
sent1 = nlp("I am happy")
sent2 = nlp("I am sad")
sent3 = nlp("I am joyful")
sent1.similarity(sent2), sent1.similarity(sent3)

  sent1.similarity(sent2), sent1.similarity(sent3)


(0.9059779715129435, 0.9063014419270309)

time: 20.2 ms (started: 2023-03-28 01:33:52 +02:00)


<font color = orange>
The similarity score between two sentences is determined by analyzing the number of times words appear together in a text, which generates vector embeddings. Words that are related in meaning, such as "happy" and "joyful," are likely to appear in similar contexts and have higher similarity scores. However, even words that are opposite in meaning, such as "happy" and "sad," can have high similarity scores if they are commonly associated with similar words and phrases, such as "heart," "feel," and "emotion."
<font>

### let's try sense2vec

- depending on your machine, download one of the two versions of sense2vec from https://github.com/explosion/sense2vec/blob/master/README.md
  - s2v_reddit_2019_lg 	4 GB 	Reddit comments 2019 (01-07) 	part 1, part 2, part 3
      - cat s2v_reddit_2019_lg.tar.gz.* > s2v_reddit_2019_lg.tar.gz
  - s2v_reddit_2015_md 	573 MB 	Reddit comments 2015 	part 1
- unzip
- try it, and understand what's going on:

In [32]:
# ! cat ./s2v_reddit_2019_lg.tar.gz.* > s2v_reddit_2019_lg.tar.gz

time: 2.23 s (started: 2023-03-28 01:55:29 +02:00)


In [33]:
s2v = Sense2Vec().from_disk("./s2v_reddit_2019_lg")

time: 14 s (started: 2023-03-28 01:56:24 +02:00)


In [34]:
seeds = "natural language processing, machine learning, artificial intelligence".split(',')
seed_keys = [s2v.get_best_sense(seed.strip()) for seed in seeds]
seed_keys

['natural_language_processing|NOUN',
 'machine_learning|NOUN',
 'artificial_intelligence|NOUN']

time: 7.69 ms (started: 2023-03-28 01:56:50 +02:00)


In [35]:
most_similar = s2v.most_similar(seed_keys, n=10)
most_similar

[('deep_learning|NOUN', 0.832),
 ('Machine_Learning|ORG', 0.818),
 ('computer_vision|NOUN', 0.8114),
 ('neural_networks|NOUN', 0.798),
 ('Machine_learning|NOUN', 0.7933),
 ('big_data|NOUN', 0.7931),
 ('machine_learning_algorithms|NOUN', 0.7915),
 ('Artificial_Intelligence|ORG', 0.774),
 ('deep_reinforcement_learning|NOUN', 0.7715),
 ('neural_nets|NOUN', 0.7642)]

time: 20.6 s (started: 2023-03-28 01:57:00 +02:00)


### 🚧 todo: what is it that you couldn't do in Word2Vec?
- just one line of answer.
- answer: We cannot capture multi-word phrases or expressions as a single unit.

- most_similar is very slow. check this to speed things up (optional): https://towardsdatascience.com/how-to-build-a-fast-most-similar-words-method-in-spacy-32ed104fe498
### 🚧 todo:
- try also the following functions: 
    - similarity, get_other_senses, get_freq, s2v[query]

In [37]:
s2v.similarity(s2v.get_best_sense('hot dog'), s2v.get_best_sense('white snow'))

0.1635851

time: 44.2 ms (started: 2023-03-28 01:58:39 +02:00)


In [41]:
s2v.get_other_senses(s2v.get_best_sense('hot dog'))

['Hot_Dog|EVENT', 'Hot_Dog|PROPN', 'hot_dog|PROPN']

time: 2.22 ms (started: 2023-03-28 02:00:46 +02:00)


In [42]:
s2v.get_freq(s2v.get_best_sense('hot dog'))

22276

time: 2.08 ms (started: 2023-03-28 02:01:35 +02:00)


In [45]:
s2v[s2v.get_best_sense('hot dog')]

array([ 2.3108e-01, -1.9360e-01, -1.7003e-01,  9.8480e-02,  5.1809e-02,
       -7.0313e-02,  1.1311e-01, -1.1817e-01, -1.9109e-01,  2.0811e-01,
       -7.4227e-02,  1.5543e-02,  2.5959e-01, -3.0133e-02,  1.6546e-01,
        3.1536e-01, -1.8114e-01, -7.5966e-02,  2.0037e-02,  2.1310e-01,
       -3.0050e-01, -1.4920e-02, -5.6571e-02,  9.6602e-02,  2.0127e-02,
       -3.0549e-01, -9.1782e-02,  2.6700e-01, -1.2370e-01,  3.5850e-01,
       -1.1671e-01,  8.1833e-02,  8.4239e-02,  1.4250e-01, -2.1132e-01,
       -2.1688e-01,  3.2394e-01, -1.7889e-01,  1.3094e-01, -1.3539e-01,
        5.1521e-01, -1.5152e-02,  2.2745e-01,  1.6857e-02,  1.0836e-01,
       -4.7195e-02,  4.4912e-03,  3.7202e-02,  7.1748e-02,  3.6461e-01,
        3.0984e-01, -2.5067e-01,  9.8961e-02,  1.3698e-01, -1.8832e-01,
        3.6232e-01,  1.2284e-01, -2.5241e-01,  5.0017e-01, -1.4389e-01,
       -9.4602e-02,  1.8595e-01, -1.1870e-02, -1.1623e-01,  1.6984e-01,
        4.7125e-02, -1.1045e-01, -4.8309e-04, -6.8791e-02, -1.36

time: 9.43 ms (started: 2023-03-28 02:03:31 +02:00)


### 🚧 todo:
- try whether expanding your query by adding similar terms to the 10 worst queries improves the results

In [51]:
s2v.most_similar(s2v.get_best_sense('sky'), n=3)

[('skies|NOUN', 0.7452), ('clouds|NOUN', 0.736), ('night_sky|NOUN', 0.6994)]

time: 28.3 s (started: 2023-03-28 02:05:57 +02:00)


In [52]:
training_queries.loc[worst10bm25i]

Unnamed: 0,qid,query
285,127145,define skin doctor
172,729561,what is channeling
65,71027,can lyme disease cause coughing
37,417380,is mark applier?
270,424296,is spain bigger than italy
256,99399,cooking time for roasted beef short ribs
335,850892,what is the the bug std
300,903134,what tests or procedures do they have for chec...
21,1049686,who sang almost paradise
92,393188,in a democracy the idea of the consent of the ...


time: 73.4 ms (started: 2023-03-28 02:07:16 +02:00)


In [56]:
terms = ['skin', 'channeling', 'lyme', 'applier', 'spain', 'cooking', 'bug', 'procedures', 'paradise', 'democracy']
matched = [s2v.get_best_sense(term) for term in terms]
matched

['skin|NOUN',
 'channeling|VERB',
 'Lyme|PERSON',
 'applier|NOUN',
 'Spain|GPE',
 'cooking|VERB',
 'bug|NOUN',
 'procedures|NOUN',
 'paradise|PROPN',
 'democracy|NOUN']

time: 6.66 ms (started: 2023-03-28 02:11:41 +02:00)


In [57]:
similar = [s2v.most_similar(term, n=3) for term in matched]
similar

[[('normal_skin|NOUN', 0.7841),
  ('skin-|NOUN', 0.773),
  ('regular_skin|NOUN', 0.769)],
 [('channelling|VERB', 0.9046),
  ('channeled|VERB', 0.8007),
  ('channel|VERB', 0.7373)],
 [('Lyme_disease|NOUN', 0.873),
  ('Lyme|PROPN', 0.8672),
  ('lyme|PROPN', 0.8205)],
 [('Only_highest_value|NOUN', 0.5053),
  ('applie|PROPN', 0.4925),
  ('y_seconds|TIME', 0.4877)],
 [('Italy|GPE', 0.8596), ('Portugal|GPE', 0.8363), ('France|GPE', 0.8066)],
 [('cook|VERB', 0.8391), ('cooking|NOUN', 0.8163), ('cooked|VERB', 0.7729)],
 [('known_bug|NOUN', 0.7786),
  ('new_bug|NOUN', 0.7711),
  ('glitch|NOUN', 0.7699)],
 [('procedure|NOUN', 0.7739),
  ('specific_procedures|NOUN', 0.7493),
  ('certain_procedures|NOUN', 0.7321)],
 [('Paradise|LOC', 0.7969),
  ('paradise|NOUN', 0.7795),
  ('Paradise|PROPN', 0.6687)],
 [('Democracy|NOUN', 0.8686),
  ('democratic_system|NOUN', 0.8461),
  ('liberal_democracy|NOUN', 0.8399)]]

time: 4min 4s (started: 2023-03-28 02:11:43 +02:00)


In [66]:
qid2query = {285: 'define skin doctor', 172: 'what is the meaning of the name kameren cameron'}

for qid, query in qid2query.items():
    print(tfidfresults[qid])

0.0
0.1
time: 1.96 ms (started: 2023-03-28 02:23:08 +02:00)


### 🚧 todo:
- try misspelling a word and see whether you can fix that with sense2vec

In [73]:
s2v.get_best_sense('wordd')

time: 37 ms (started: 2023-03-28 02:25:43 +02:00)


In [75]:
s2v.most_similar(s2v.get_best_sense('wordd'), n=2)

[('Glooks|PERSON', 0.6875), ('epiv|NOUN', 0.6838)]

time: 19.6 s (started: 2023-03-28 02:26:14 +02:00)


### 🚧 todo:
- try embeddings for a few queries (all would take to long except if you have a GPU)
    - are the gold top 10 similar to the query itself?
    - check whether the gold top 10 answers for our most difficult question are really closer to the question than the currently predicted top10
         - how to get every doc as a vector: 
             - https://spacy.io/api/doc#vector "A real-valued meaning representation. Defaults to an average of the token vectors."
        - every doc has a similarity function taking another doc as argument: 
            - https://spacy.io/api/doc#similarity

In [78]:
for qid in [743838, 729561]:
    query = queries[queries['qid']==qid]['query'].values[0]
    print(query)
    
    vector = nlp(query)
    top_10 = gold[gold['qid']==qid].sort_values('rank')['docid'][:5]
    for docid in top_10:
        doc = docs[docs['docid']==docid].iloc[0]
        print('Similarity:', vector.similarity(nlp(doc['text'])))
        print(doc['text'][:200], '\n')


what is endolymhatic system
Similarity: 0.500770622380886
Body Systems Quiz 2 What body system does the PATELLA belong in? Skeletal What body system does the STERNUM belong in? Skeletal What body system does the RADIUS belong in? Skeletal What body system do 

Similarity: 0.3648865543160146
Body Systems 36 terms sc229207Body Systems The following flash cards will help you to study the material we've learned about the body systems Learn Flashcards Write Spell Test Match Gravity Advertisem 



  top_10 = gold[gold['qid']==qid].sort_values('rank')['docid'][:5]
  print('Similarity:', vector.similarity(nlp(doc['text'])))


Similarity: 0.4280795891236894
TEAS V- Human Body Science 158 terms rplauche1TEAS V- Human Body Science Notes from Human Body Science portion of TEAS V Study Guide by ATILearn Flashcards Write Spell Test Match Gravity Advertisement 

Similarity: 0.46693242458042405
CIS 300 Chapter 10 48 terms joselin_theobald CIS 300 Chapter 10Learn Flashcards Write Spell Test Match Gravity Advertisement Upgrade to remove ads Like this study set? Create a free account to save it 

Similarity: 0.4201511423924317
ANS 59 terms jramsey89ANSPathophysiology Learn Flashcards Write Spell Test Match Gravity Sort Name the two subsystems of the Nervous System Central Nervous System and the Peripheral Nervous System Nam 

what is channeling
Similarity: 0.2511835406983005
What channel is NBC on XFINITY? Comcast Xfinity (product) NBC Comcast Products and Services TV Channels Television What channel is NBC on XFINITY?3 Answers Greg Monti, Radio broadcast engineer fascina 

Similarity: 0.13733032058308664
What channel

In [79]:
qid = 127145
vector = nlp('define skin doctor')

time: 5.86 ms (started: 2023-03-28 02:33:55 +02:00)


In [80]:
gold_similar = []
best_docs_ids = gold[gold['qid']==qid].sort_values('rank')['docid'][:10]

for doc_id in best_docs_ids:
    doc = docs[docs['docid']==doc_id].iloc[0]
    gold_similar.append(vector.similarity(nlp(doc['text'])))

  best_docs_ids = gold[gold['qid']==qid].sort_values('rank')['docid'][:10]
  gold_similar.append(vector.similarity(nlp(doc['text'])))


time: 2.8 s (started: 2023-03-28 02:34:43 +02:00)


In [82]:
pd.Series(gold_similar).agg(['min', 'mean', 'max'])

min     0.384936
mean    0.449560
max     0.509249
dtype: float64

time: 16.3 ms (started: 2023-03-28 02:45:08 +02:00)
