In [8]:
# Mount Google Drive if needed
from google.colab import drive
drive.mount('/content/drive')

import os
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
os.makedirs(PROJECT_PATH, exist_ok=True)

# Install/Import SentEval
!git clone https://github.com/facebookresearch/SentEval.git
import sys
sys.path.insert(0, './SentEval')
import inspect
if not hasattr(inspect, "getargspec"):
    # For Python 3 compatibility with older SentEval
    inspect.getargspec = inspect.getfullargspec
import senteval

# Install/Import SentenceTransformers and other needed libraries
!pip install datasets sentence_transformers
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util
from torch.utils.data import DataLoader
from datasets import load_dataset
from scipy.stats import spearmanr
import pandas as pd

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path 'SentEval' already exists and is not an empty directory.


## 1. Prepare Data for STS-B Fine-tuning

In [10]:
# Fetch the English configuration ('en') of the stsb_multi_mt dataset
sts_dataset = load_dataset('stsb_multi_mt', name='en')

train_data_sts = sts_dataset['train']
test_data_sts = sts_dataset['test']

# Wrap every training row as an InputExample: the two sentences become the
# text pair and the similarity score, divided by 5.0, becomes the label
train_examples_sts = [
    InputExample(
        texts=[row['sentence1'], row['sentence2']],
        label=float(row['similarity_score']) / 5.0,
    )
    for row in train_data_sts
]

## 2. Prepare SentEval Toolkit

In [11]:
"""
  data download link: https://github.com/facebookresearch/SentEval
  follow steps in README.md to download the data
"""
# Settings handed to the SentEval engine. 'task_path' is the folder that must
# hold the downstream task data:
# /content/drive/MyDrive/CS6120_project/data/raw
params_senteval = dict(
    task_path='/content/drive/MyDrive/CS6120_project/data/raw',
    usepytorch=True,
    kfold=10,  # cross-validation folds (can be 5 or 10)
)

# Downstream tasks to evaluate
transfer_tasks = ['TREC', 'MRPC']

# 'prepare' hook handed to the SentEval engine; SBERT needs no setup here
def prepare(params, samples):
    """Intentional no-op: both arguments are ignored and None is returned."""
    return None

# Batcher function: turns one batch of tokenized sentences into embeddings
def batcher(params, batch):
    """
    Encode a batch with the global SentenceTransformer ``model``.

    Each element of ``batch`` is a list of tokens, e.g. [['hello'], ['world']].
    Tokens are joined with single spaces; an empty token list is replaced by
    the placeholder sentence '.'. ``params`` is not used.

    Returns whatever ``model.encode(..., convert_to_tensor=False)`` returns
    for the resulting list of strings.
    """
    texts = []
    for tokens in batch:
        texts.append(' '.join(tokens) if len(tokens) != 0 else '.')
    return model.encode(texts, convert_to_tensor=False)

## 3. Fine-tune SBERT on STS-B

In [20]:
# Initialize pre-trained SBERT
model_sbert_sts = SentenceTransformer('bert-base-nli-mean-tokens')

# Build training dataloader.
# The list of InputExample objects is passed to the DataLoader directly: the
# SentencesDataset wrapper is deprecated in sentence-transformers and is no
# longer needed. 'train_dataset_sts' is kept as a name for the training data.
train_dataset_sts = train_examples_sts
train_dataloader_sts = DataLoader(train_dataset_sts, shuffle=True, batch_size=16)

# Define loss function (CosineSimilarityLoss for STS tasks): it is built here
# around the model being trained and consumes the float labels of the examples
train_loss_sts = losses.CosineSimilarityLoss(model_sbert_sts)

# Fine-tune the model; later cells load the checkpoint from 'output_path'
model_sbert_sts.fit(
    train_objectives=[(train_dataloader_sts, train_loss_sts)],
    epochs=4,
    warmup_steps=100,
    output_path='/content/drive/MyDrive/CS6120_project/model/stsb_finetuned_model',
    show_progress_bar=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0275
1000,0.0102


In [13]:
# Load the STS-B fine-tuned checkpoint from the path the training cell uses as
# its output_path. Loading it here, instead of relying on a variable left over
# from an earlier session, keeps this cell runnable on a fresh kernel and when
# the training step is skipped.
model_sts_ft = SentenceTransformer("/content/drive/MyDrive/CS6120_project/model/stsb_finetuned_model")

# Pull the test columns once so both sides can be encoded in batches
sentences1 = list(test_data_sts['sentence1'])
sentences2 = list(test_data_sts['sentence2'])
gold_scores = [float(score) / 5.0 for score in test_data_sts['similarity_score']]  # normalized to 0..1

# Batched encoding replaces two encode() calls per test pair
emb1 = model_sts_ft.encode(sentences1, convert_to_tensor=True)
emb2 = model_sts_ft.encode(sentences2, convert_to_tensor=True)

# Row-wise cosine similarity: entry i compares sentences1[i] with sentences2[i]
predicted_scores = util.pairwise_cos_sim(emb1, emb2).cpu().tolist()

spearman_sts, _ = spearmanr(gold_scores, predicted_scores)
print(f"[STS-B] Spearman correlation after STS fine-tuning: {spearman_sts:.4f}")


[STS-B] Spearman correlation after STS fine-tuning: 0.8541


## 4. Further Fine-tune on MSMARCO

In [24]:
# Start from the STS-B fine-tuned checkpoint saved by the previous stage
model_sts_ft = SentenceTransformer("/content/drive/MyDrive/CS6120_project/model/stsb_finetuned_model")

# Load MSMARCO dataset (train split)
# NOTE: This dataset is quite large. Consider sampling or partial loading for demonstration.
dataset_msmarco = load_dataset("ms_marco", "v1.1", split="train")

# Keep one (query, passage) pair for every passage whose 'is_selected' flag is 1
train_examples_msmarco = []
for example in dataset_msmarco:
    query = example["query"]
    passages = example["passages"]

    # 'is_selected' indicates whether the passage is a positive example
    for text, is_sel in zip(passages["passage_text"], passages["is_selected"]):
        if is_sel == 1:
            # Each positive example
            train_examples_msmarco.append(InputExample(texts=[query, text]))

print(f"[MSMARCO] Total training pairs: {len(train_examples_msmarco)}")

# Build Dataloader.
# The list of InputExample objects is passed to the DataLoader directly: the
# SentencesDataset wrapper is deprecated in sentence-transformers and is no
# longer needed. 'train_dataset_msmarco' is kept as a name for the data.
train_dataset_msmarco = train_examples_msmarco
train_dataloader_msmarco = DataLoader(train_dataset_msmarco, shuffle=True, batch_size=16)

# Define loss: MultipleNegativesRankingLoss is common for retrieval tasks
train_loss_msmarco = losses.MultipleNegativesRankingLoss(model_sts_ft)

# Warm up over the first 10% of all training steps (batches per epoch * epochs)
MSMARCO_EPOCHS = 1
msmarco_warmup_steps = int(len(train_dataloader_msmarco) * MSMARCO_EPOCHS * 0.1)

# Fine-tune the model further
model_sts_ft.fit(
    train_objectives=[(train_dataloader_msmarco, train_loss_msmarco)],
    epochs=MSMARCO_EPOCHS,
    warmup_steps=msmarco_warmup_steps,
    output_path='/content/drive/MyDrive/CS6120_project/model/msmarco_stsb_finetuned_model',
    show_progress_bar=True
)

[MSMARCO] Total training pairs: 88523


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1016
1000,0.0341
1500,0.0254
2000,0.0277
2500,0.0194
3000,0.0179
3500,0.0194
4000,0.0158
4500,0.0166
5000,0.0136


## 5. Model Evaluation & Summary Metrics

In [25]:
# Reload both fine-tuned checkpoints from disk so this cell also works when the
# training cells were skipped in the current session
model_sts_ft = SentenceTransformer("/content/drive/MyDrive/CS6120_project/model/stsb_finetuned_model")
model_final = SentenceTransformer("/content/drive/MyDrive/CS6120_project/model/msmarco_stsb_finetuned_model")

# 'spearman_sts' comes from the STS-B evaluation cell
final_metrics = {
    's1_sts_spearman': spearman_sts
}


def _evaluate_stage(engine, stage_model, metric_prefix, banner):
    """
    Run the SentEval transfer tasks for one model and record its accuracies.

    Args:
        engine: SentEval engine whose ``eval`` method is called with
            ``transfer_tasks``.
        stage_model: model to evaluate; it is assigned to the global ``model``
            because ``batcher`` reads that global.
        metric_prefix: prefix for the keys written to ``final_metrics``
            (the key is '<prefix>_<task in lower case>_acc').
        banner: headline printed before the evaluation starts.

    Returns:
        The per-task results returned by ``engine.eval``.
    """
    global model
    print(banner)
    model = stage_model  # Important: SentEval uses the global 'model' object in the batcher
    results = engine.eval(transfer_tasks)

    for task in transfer_tasks:
        task_result = results[task]
        if 'acc' in task_result:
            print(f"{task}: {task_result['acc']:.2f}")
            final_metrics[f'{metric_prefix}_{task.lower()}_acc'] = task_result['acc']
        else:
            # No accuracy entry for this task: show the raw result instead
            print(f"{task}: {task_result}")
    return results


# Evaluate with SentEval (a new engine is created for each stage)
se = senteval.engine.SE(params_senteval, batcher, prepare)
results_stage1 = _evaluate_stage(
    se, model_sts_ft, 's1', "\n=== Evaluate after STS-B Fine-tuning ==="
)

se = senteval.engine.SE(params_senteval, batcher, prepare)
results_stage2 = _evaluate_stage(
    se, model_final, 's2', "\n=== Evaluate after MSMARCO + STS-B Fine-tuning ==="
)


=== Evaluate after STS-B Fine-tuning ===
TREC: 84.20
MRPC: 75.13

=== Evaluate after MSMARCO + STS-B Fine-tuning ===
TREC: 86.40
MRPC: 70.96


In [26]:
# One row per training stage; a metric missing from final_metrics shows as None
summary_data = {
    'Stage': ['After STS-B Fine-tuning', 'After MSMARCO Fine-tuning'],
    'TREC Accuracy': [
        final_metrics.get(key) for key in ('s1_trec_acc', 's2_trec_acc')
    ],
    'MRPC Accuracy': [
        final_metrics.get(key) for key in ('s1_mrpc_acc', 's2_mrpc_acc')
    ],
    # STS-B was not re-evaluated after the MSMARCO stage, hence the None
    'STS-B Spearman': [final_metrics.get('s1_sts_spearman'), None],
}

df_summary = pd.DataFrame(summary_data)
print("\n=== Final Metrics Summary ===")
print(df_summary)


=== Final Metrics Summary ===
                       Stage  TREC Accuracy  MRPC Accuracy  STS-B Spearman
0    After STS-B Fine-tuning           84.2          75.13        0.854054
1  After MSMARCO Fine-tuning           86.4          70.96             NaN
