# Settings

In [1]:
# Mount Google Drive; the config, embedding, corpus and dataset paths used below
# all live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!git clone https://github.com/koreankiwi99/24_dis_project1.git
%cd /content/24_dis_project1/

Cloning into '24_dis_project1'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 147 (delta 77), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (147/147), 49.58 KiB | 7.08 MiB/s, done.
Resolving deltas: 100% (77/77), done.
/content/24_dis_project1


In [3]:
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
from ast import literal_eval
import gc
import time
from pathlib import Path

from src.bm25.our_bm25 import OurBM25
from src.bm25.utils import settings, retrieval, check_recall, load_preprocessed
from src.rerank.model import ReRanker, HybridSearch

# Utils

# Load BM25

In [4]:
# Same BM25 config and embedding matrix as the ones in the Kaggle submission dataset.
config_path = (
    '/content/drive/MyDrive/DIS/bm25_2000_submission/'
    'config_en_20241101_185354 (1).json'
)
embedding_path = (
    '/content/drive/MyDrive/DIS/bm25_2000_submission/'
    'matrix_en_20241101_185354-001.pkl'
)

In [5]:
%%time
# Build the BM25 object from the saved config and embedding matrix, then print
# its configured top_k as a sanity check.
bm25 = settings(config_path, embedding_path)
print(bm25.top_k)

2000
CPU times: user 42.3 s, sys: 15.6 s, total: 57.9 s
Wall time: 1min 7s


# Load Data

In [6]:
# Load the preprocessed dev queries together with their answers.
# 'en' presumably selects the English subset — TODO confirm against load_preprocessed.
dev_preprocessed_path = '/content/drive/MyDrive/DIS/tf-idf/dev_preprocessed_.csv'
queries, answers = load_preprocessed(dev_preprocessed_path, 'en')

# Check Recall@K of BM25 (Baseline)
- recall@10 : 0.785 (current score)
- recall@20 : 0.81
- recall@30 : 0.855
- recall@40 : 0.86
- recall@50 : 0.865

In [7]:
%%time
# BM25 retrieval for every dev query with cutoff 10.
# return_score=False presumably yields document IDs only — TODO confirm.
top_10 = retrieval(bm25, queries, 10, return_score=False)

CPU times: user 1min 32s, sys: 52.4 ms, total: 1min 32s
Wall time: 1min 32s


In [8]:
%%time
# BM25 retrieval for every dev query with cutoff 20.
# return_score=False presumably yields document IDs only — TODO confirm.
top_20 = retrieval(bm25, queries, 20, return_score=False)

CPU times: user 1min 32s, sys: 96.3 ms, total: 1min 32s
Wall time: 1min 32s


In [9]:
%%time
# BM25 retrieval for every dev query with cutoff 30; these candidates are the
# ones passed to the rerankers in the "ReRank" section.
top_30 = retrieval(bm25, queries, 30, return_score=False)

CPU times: user 1min 32s, sys: 123 ms, total: 1min 32s
Wall time: 1min 32s


In [10]:
%%time
# BM25 retrieval for every dev query with cutoff 40.
# return_score=False presumably yields document IDs only — TODO confirm.
top_40 = retrieval(bm25, queries, 40, return_score=False)

CPU times: user 1min 32s, sys: 148 ms, total: 1min 32s
Wall time: 1min 32s


In [11]:
%%time
# BM25 retrieval for every dev query with cutoff 50.
# NOTE(review): each of the five retrieval cells takes about 1.5 minutes. If
# retrieval() returns ranked lists, the smaller cutoffs could be sliced from this
# result instead of re-running BM25 — confirm the ordering before changing.
top_50 = retrieval(bm25, queries, 50, return_score=False)

CPU times: user 1min 32s, sys: 107 ms, total: 1min 32s
Wall time: 1min 32s


In [12]:
# Report recall for every cutoff retrieved above, one line per cutoff.
for cutoff, retrieved in zip((10, 20, 30, 40, 50),
                             (top_10, top_20, top_30, top_40, top_50)):
    print(f"recall@{cutoff} : {check_recall(retrieved, answers)}")

recall@10 : 0.785
recall@20 : 0.81
recall@30 : 0.855
recall@40 : 0.86
recall@50 : 0.865


# Load Cross Encoder

### Load doc_id2doc

In [13]:
# Pickled lookup handed to every ReRanker below; presumably maps doc_id -> document
# text (per the "Load doc_id2doc" heading) — TODO confirm.
id_doc_map_path = '/content/drive/MyDrive/DIS/corpus/id_doc_map.pkl'

In [14]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing, so
# only load this file from a trusted location (here: our own Drive folder).
with open(id_doc_map_path, 'rb') as f:
  id_doc_map = pickle.load(f)

## Load Huggingface Models

In [15]:
# Reranker built from the "base, no sampling" cross-encoder checkpoint on the
# Hugging Face Hub; reported as "BASE" in the results sections.
model_name = 'kiwi1229/english_cross_encoder_base_no_sampling'
device = 'cuda'  # presumably requires a GPU runtime — verify against ReRanker

base_reranker = ReRanker(model_name, device, id_doc_map)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [16]:
# Reranker built from the "top, no sampling" cross-encoder checkpoint on the
# Hugging Face Hub; reported as "TOP" in the results sections.
model_name = 'kiwi1229/english_cross_encoder_top_no_sampling'
device = 'cuda'  # presumably requires a GPU runtime — verify against ReRanker

top_reranker = ReRanker(model_name, device, id_doc_map)

config.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [17]:
# Reranker built from the "base" cross-encoder checkpoint on the Hugging Face Hub;
# reported as "BASE sampling 4 neg" in the results sections.
model_name = 'kiwi1229/english_cross_encoder_base'
device = 'cuda'  # presumably requires a GPU runtime — verify against ReRanker

base4_reranker = ReRanker(model_name, device, id_doc_map)

config.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [18]:
# Reranker built from the "top" cross-encoder checkpoint on the Hugging Face Hub;
# reported as "TOP sampling 4 neg" in the results sections.
model_name = 'kiwi1229/english_cross_encoder_top'
device = 'cuda'  # presumably requires a GPU runtime — verify against ReRanker

top4_reranker = ReRanker(model_name, device, id_doc_map)

config.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

# ReRank

### Load Original Text Queries

In [19]:
# Load the raw (non-preprocessed) dev set and keep the English queries as plain
# text; these are the query strings passed to the rerankers.
dev_path = '/content/drive/MyDrive/DIS/dis-project-1-document-retrieval/dev.csv'
df = pd.read_csv(dev_path)
dev_df = df[df['lang'] == 'en'].copy()
text_queries = list(dev_df['query'])

# Later cells zip text_queries with per-query BM25 results (presumably one entry
# per preprocessed query). zip() silently truncates on a length mismatch, which
# would misalign queries and candidates, so fail loudly here instead.
assert len(text_queries) == len(queries), (
    f"{len(text_queries)} raw English queries vs {len(queries)} preprocessed queries"
)

### Recall@10
- Rerank the top-30 documents retrieved by our BM25 model with each cross-encoder.
- Every reranker on its own scored below the BM25 baseline recall@10 of 0.785.
---

- BASE : 0.685
- TOP : 0.65
- BASE sampling 4 neg : 0.68
- TOP sampling 4 neg : 0.615


In [20]:
# Rerank each query's top-30 BM25 candidates with the BASE reranker, then
# display the recall of the reranked lists.
output = [
    base_reranker.rerank(query_text, candidate_ids)
    for query_text, candidate_ids in zip(text_queries, top_30)
]
check_recall(output, answers)

0.685

In [21]:
# Rerank each query's top-30 BM25 candidates with the TOP reranker, then
# display the recall of the reranked lists.
output = [
    top_reranker.rerank(query_text, candidate_ids)
    for query_text, candidate_ids in zip(text_queries, top_30)
]
check_recall(output, answers)

0.65

In [22]:
# Rerank each query's top-30 BM25 candidates with the "BASE sampling 4 neg"
# reranker, then display the recall of the reranked lists.
output = [
    base4_reranker.rerank(query_text, candidate_ids)
    for query_text, candidate_ids in zip(text_queries, top_30)
]
check_recall(output, answers)

0.68

In [23]:
# Rerank each query's top-30 BM25 candidates with the "TOP sampling 4 neg"
# reranker, then display the recall of the reranked lists.
output = [
    top4_reranker.rerank(query_text, candidate_ids)
    for query_text, candidate_ids in zip(text_queries, top_30)
]
check_recall(output, answers)

0.615

# Hybrid Search

### BM25 scores

In [24]:
# Collect, for every preprocessed query, the BM25 scores and document IDs of the
# 30 candidates requested from bm25.search.
# The loop variables deliberately avoid the name `id`: a top-level loop variable
# stays in the notebook's global namespace and would shadow the built-in id()
# for every cell that runs afterwards.
score_list, id_list = [], []

for query in queries:
  doc_scores, doc_ids = bm25.search(query, 30, return_score=True)
  score_list.append(doc_scores)
  id_list.append(doc_ids)

### Reranker scores

In [25]:
%%time
top4_score = [top4_reranker.rerank(q, id, return_score=True) for q, id in zip(text_queries,
                                                                              id_list)]

CPU times: user 4min 9s, sys: 813 ms, total: 4min 10s
Wall time: 1min 25s


In [26]:
%%time
base4_score = [base4_reranker.rerank(q, id, return_score=True) for q, id in zip(text_queries,
                                                                              id_list)]

CPU times: user 4min 12s, sys: 802 ms, total: 4min 13s
Wall time: 1min 25s


In [27]:
%%time
base_score = [base_reranker.rerank(q, id, return_score=True) for q, id in zip(text_queries,
                                                                              id_list)]

CPU times: user 4min 23s, sys: 814 ms, total: 4min 24s
Wall time: 1min 27s


In [28]:
%%time
top_score = [top_reranker.rerank(q, id, return_score=True) for q, id in zip(text_queries,
                                                                              id_list)]

CPU times: user 4min 12s, sys: 876 ms, total: 4min 13s
Wall time: 1min 25s


## Recall@10 (hybrid of BM25 and reranker scores; BM25-only baseline is 0.785)
- BASE : 0.805
- TOP : 0.815
- BASE sampling 4 neg : 0.805
- TOP sampling 4 neg : 0.795


In [29]:
%%time
# Hybrid search over the BM25 scores and the BASE reranker scores for the same
# candidate IDs; how the two are fused is defined by HybridSearch in src.rerank.model.
hybrid_base = HybridSearch(score_list, base_score, id_list)
check_recall(hybrid_base.output, answers)

CPU times: user 8.26 ms, sys: 1.98 ms, total: 10.2 ms
Wall time: 9.96 ms


0.805

In [30]:
%%time
# Hybrid search over the BM25 scores and the TOP reranker scores for the same
# candidate IDs; how the two are fused is defined by HybridSearch in src.rerank.model.
hybrid_top = HybridSearch(score_list, top_score, id_list)
check_recall(hybrid_top.output, answers)

CPU times: user 10.1 ms, sys: 14 µs, total: 10.1 ms
Wall time: 9.75 ms


0.815

In [31]:
%%time
# Hybrid search over the BM25 scores and the "BASE sampling 4 neg" reranker scores
# for the same candidate IDs; the fusion is defined by HybridSearch in src.rerank.model.
hybrid_base4 = HybridSearch(score_list, base4_score, id_list)
check_recall(hybrid_base4.output, answers)

CPU times: user 9.24 ms, sys: 1 ms, total: 10.2 ms
Wall time: 10 ms


0.805

In [32]:
%%time
# Hybrid search over the BM25 scores and the "TOP sampling 4 neg" reranker scores
# for the same candidate IDs; the fusion is defined by HybridSearch in src.rerank.model.
hybrid_top4 = HybridSearch(score_list, top4_score, id_list)
check_recall(hybrid_top4.output, answers)

CPU times: user 10 ms, sys: 13 µs, total: 10 ms
Wall time: 9.69 ms


0.795