In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [2]:
import pandas as pd
from sklearn import model_selection
from datasets import Dataset

import zipfile
with zipfile.ZipFile("/kaggle/input/quora-question-pairs/train.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("./train/")

df = pd.read_csv("./train/train.csv")
df = df.dropna(subset=['question1', 'question2', 'is_duplicate'])
df = df.rename(columns={'is_duplicate': 'label'})
df = df[['question1', 'question2', 'label']]

# Stratified split: 70% train, 15% val, 15% test
train_val, test = model_selection.train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)
train, val = model_selection.train_test_split(train_val, test_size=0.1765, stratify=train_val['label'], random_state=42)  # 0.1765 ~ 15% of total

train_ds = Dataset.from_pandas(train.reset_index(drop=True))
val_ds = Dataset.from_pandas(val.reset_index(drop=True))
test_ds = Dataset.from_pandas(test.reset_index(drop=True))

print(f"Train size: {len(train_ds)}, Validation size: {len(val_ds)}, Test size: {len(test_ds)}")


Train size: 282990, Validation size: 60653, Test size: 60644


Bi-Encoder

In [3]:
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics import f1_score

model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode test question pairs
emb1 = model.encode(test_ds['question1'], convert_to_tensor=True)
emb2 = model.encode(test_ds['question2'], convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(emb1, emb2).diagonal()

threshold = 0.7
preds = (cosine_scores > threshold).cpu().numpy().astype(int)
labels = test_ds['label']

baseline_f1 = f1_score(labels, preds)
print("Baseline Bi-Encoder F1 score:", baseline_f1)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1896 [00:00<?, ?it/s]

Batches:   0%|          | 0/1896 [00:00<?, ?it/s]

Baseline Bi-Encoder F1 score: 0.7299178616177086


In [5]:
from sentence_transformers import InputExample

subsample_size = 5000  # Adjust for speed 

def to_input_examples(dataset, size=None):
    if size:
        dataset = dataset.select(range(size))
    return [
        InputExample(texts=[q1, q2], label=float(label))
        for q1, q2, label in zip(dataset['question1'], dataset['question2'], dataset['label'])
    ]

train_samples = to_input_examples(train_ds, size=subsample_size)
val_samples = to_input_examples(val_ds, size=1000)  # smaller validation for speed


In [6]:
#Bi-Encoder with Cosine Similarity Loss (subsample)
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

train_loader = DataLoader(train_samples, shuffle=True, batch_size=32)

model_cosine = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')

cosine_loss = losses.CosineSimilarityLoss(model=model_cosine)

model_cosine.fit(
    train_objectives=[(train_loader, cosine_loss)],
    epochs=3,
    warmup_steps=int(0.1 * len(train_loader)),
    show_progress_bar=True,
    use_amp=True
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [7]:
#Bi-Encoder with Contrastive Loss (subsample)
train_loader = DataLoader(train_samples, shuffle=True, batch_size=1)

# model_contrastive = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')
model_contrastive = SentenceTransformer('paraphrase-MiniLM-L6-v2')

contrastive_loss = losses.ContrastiveLoss(model=model_contrastive, margin=0.5)

model_contrastive.fit(
    train_objectives=[(train_loader, contrastive_loss)],
    epochs=3,
    warmup_steps=int(0.1 * len(train_loader)),
    show_progress_bar=True,
    use_amp=True
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [16]:
#Bi-Encoder with Multiple Negative Ranking Loss (subsample)
train_loader = DataLoader(train_samples, shuffle=True, batch_size=32)

model_multiple_neg = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')

multiple_neg_loss = losses.MultipleNegativesRankingLoss(model=model_multiple_neg)

model_multiple_neg.fit(
    train_objectives=[(train_loader, multiple_neg_loss)],
    epochs=3,
    warmup_steps=int(0.1 * len(train_loader)),
    show_progress_bar=True,
    use_amp=True
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 46.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 36.12 MiB is free. Process 2741 has 14.70 GiB memory in use. Of the allocated memory 14.44 GiB is allocated by PyTorch, and 64.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Cross-Encoder 

In [8]:
from sentence_transformers import CrossEncoder

cross_train_samples = [(q1, q2, label) for q1, q2, label in zip(train_ds['question1'][:subsample_size], train_ds['question2'][:subsample_size], train_ds['label'][:subsample_size])]

model_cross = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', num_labels=2)

model_cross.fit(
    cross_train_samples,
    epochs=3,
    batch_size=16,
    evaluation_steps=1000,
    show_progress_bar=True,
    use_amp=True
)


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

RuntimeError: Error(s) in loading state_dict for Linear:
	size mismatch for bias: copying a param with shape torch.Size([1]) from checkpoint, the shape in current model is torch.Size([2]).

 Evaluate Models on Constant Test Set

In [None]:
def evaluate_bi_encoder(model, test_ds, threshold=0.7):
    emb1 = model.encode(test_ds['question1'], convert_to_tensor=True)
    emb2 = model.encode(test_ds['question2'], convert_to_tensor=True)
    scores = util.pytorch_cos_sim(emb1, emb2).diagonal()
    preds = (scores > threshold).cpu().numpy().astype(int)
    return f1_score(test_ds['label'], preds)

def evaluate_cross_encoder(model, test_ds):
    pairs = list(zip(test_ds['question1'], test_ds['question2']))
    out = model.predict(pairs)
    preds = (out[:, 1] > 0.5).astype(int)
    return f1_score(test_ds['label'], preds)

print("Cosine Loss Bi-Encoder F1:", evaluate_bi_encoder(model_cosine, test_ds))
print("Contrastive Loss Bi-Encoder F1:", evaluate_bi_encoder(model_contrastive, test_ds))
print("Multiple Negatives Ranking Bi-Encoder F1:", evaluate_bi_encoder(model_multiple_neg, test_ds))
print("Cross Encoder F1:", evaluate_cross_encoder(model_cross, test_ds))
