## Libraries

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
from torch.utils.data import DataLoader
from collections import Counter
from tqdm import tqdm
import numpy as np
import logging
import pickle
import joblib
import random
import faiss
import torch
import json
import re
import os

2025-05-21 06:19:32.213581: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747808372.394148      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747808372.450009      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Load the previous model

In [None]:
# Load the previously fine-tuned SentenceTransformer checkpoint from disk;
# this is the model that is encoded with and trained further in the cells below.
model = SentenceTransformer("/fine-tuned-embedding-model-2020")

In [None]:
# Read the raw dataset from JSON and display how many entries it holds.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open("/dataset_2005.json", "r", encoding="utf-8") as f:
    data = json.load(f)

len(data)

1146

In [5]:
# Collect the "content" field of every dataset entry, skipping empty/falsy values.
documents = [text for text in (entry["content"] for entry in data) if text]

In [None]:
def chunk_text(text, size=512, stride=50, keep_tail=False):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    A window of ``size`` words slides over the text in steps of ``stride``
    words; each full window becomes one chunk (words re-joined with single
    spaces).

    Args:
        text: The string to split.
        size: Number of words per chunk.
        stride: Number of words the window advances between chunks.
        keep_tail: When False (the default, identical to the historical
            behaviour) only full windows are emitted, so a text with fewer
            than ``size`` words yields no chunks and any words after the last
            full window are dropped. When True, one extra chunk holding the
            last ``size`` words (or the whole text if it is shorter) is
            appended whenever the full windows do not reach the end.
            Keep this off when the chunk list is paired by position with
            precomputed cluster labels, since it changes the chunk count.

    Returns:
        A list of chunk strings in document order.
    """
    words = text.split()
    chunks = [
        " ".join(words[start:start + size])
        for start in range(0, len(words) - size + 1, stride)
    ]

    if keep_tail and words:
        # Index one past the last word covered by the full windows (0 if none).
        covered = (len(chunks) - 1) * stride + size if chunks else 0
        if covered < len(words):
            chunks.append(" ".join(words[-size:]))

    return chunks


In [7]:
# Flatten the per-document chunk lists into one corpus-wide list,
# keeping documents (and the chunks inside each) in their original order.
all_chunks = []
for doc in documents:
    all_chunks += chunk_text(doc)

In [None]:
# Encode every chunk with the loaded model, returning a NumPy array and showing a progress bar.
# NOTE(review): the model repr printed later in this notebook lists max_seq_length 128,
# while chunk_text defaults to 512-word chunks, so each chunk is presumably truncated
# to its beginning before encoding — confirm this is intended.
chunk_embeddings = model.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)

In [None]:
# Sanity-check the embedding matrix: print its dimensions and whether any entry is NaN.
print(f"Shape: {chunk_embeddings.shape}")
print(f"Contains NaNs: {np.isnan(chunk_embeddings).any()}")

Shape: (270729, 384)
Contains NaNs: False


In [None]:
# Load the previously fitted KMeans model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
# NOTE(review): later cells pair clustering.labels_ with all_chunks by position, so this
# model is presumably the one fitted on exactly these chunks in this order — confirm.
clustering = joblib.load('/kmeans_model.pkl')

In [None]:
# Summarise the cluster size distribution: smallest, largest, and how many
# clusters hold fewer than three chunks.
counts = Counter(clustering.labels_)
sizes = list(counts.values())
num_tiny = len([size for size in sizes if size < 3])
print(f"Min cluster size: {min(sizes)}")
print(f"Max cluster size: {max(sizes)}")
print(f"Clusters with < 3 chunks: {num_tiny}")

Min cluster size: 1
Max cluster size: 414
Clusters with < 3 chunks: 143


In [None]:
# Group chunk texts by cluster label: {label: [chunk, ...]}, chunks kept in corpus order.
# The labels come from a model loaded from disk and are paired with all_chunks purely by
# position, so check the two line up first; a length mismatch would otherwise either raise
# midway through the loop or silently leave chunks ungrouped.
if len(clustering.labels_) != len(all_chunks):
    raise ValueError(
        f"Cluster labels ({len(clustering.labels_)}) and chunks ({len(all_chunks)}) "
        "differ in length; the loaded clustering does not match these chunks."
    )

clustered_chunks = {}
for label, chunk in tqdm(zip(clustering.labels_, all_chunks), total=len(all_chunks)):
    clustered_chunks.setdefault(label, []).append(chunk)

100%|██████████| 270729/270729 [00:00<00:00, 476281.44it/s]


In [None]:
train_examples = []

# Positive pairs (label 1.0): each chunk paired with the next chunk of the same cluster.
for cluster_chunks in tqdm(clustered_chunks.values(), desc="Generating positive pairs"):
    # Zipping the list with its one-step shift yields nothing for clusters
    # that have fewer than two chunks, so no explicit size guard is needed.
    for first, second in zip(cluster_chunks, cluster_chunks[1:]):
        train_examples.append(InputExample(texts=[first, second], label=1.0))

# Negative pairs (label 0.0): as many as there are positives, each made of two chunks
# drawn uniformly at random that are verified to belong to DIFFERENT clusters.
# Drawing two unconstrained random chunks could pair a chunk with itself or with a
# same-cluster neighbour and label that pair 0.0, contradicting the positive pairs.
num_positive = len(train_examples)
chunk_labels = clustering.labels_
num_labelled = len(chunk_labels)

# With a single cluster no valid negative exists and the sampling loop would never end.
if num_positive and len(clustered_chunks) < 2:
    raise ValueError("Need at least two clusters to draw negative pairs.")

for _ in tqdm(range(num_positive), desc="Generating negative pairs"):
    # Rejection sampling: redraw until the two chunks come from different clusters.
    while True:
        idx1 = random.randrange(num_labelled)
        idx2 = random.randrange(num_labelled)
        if chunk_labels[idx1] != chunk_labels[idx2]:
            break
    train_examples.append(
        InputExample(texts=[all_chunks[idx1], all_chunks[idx2]], label=0.0)
    )


Generating positive pairs: 100%|██████████| 13752/13752 [00:01<00:00, 9429.17it/s]
Generating negative pairs: 100%|██████████| 256977/256977 [00:01<00:00, 223596.75it/s]


## Build the training DataLoader and loss

In [None]:
# Wrap the training pairs in a shuffling DataLoader: batches of 16,
# loaded in the main process (num_workers=0).
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16, num_workers=0)

In [None]:
# Quick look at the training data: the container type, the element type,
# and the label of the first example.
first_example = train_examples[0]
print(type(train_examples))
print(type(first_example))
print(first_example.label)

<class 'list'>
<class 'sentence_transformers.readers.InputExample.InputExample'>
1.0


In [None]:
class ProgressLogger:
    """Count training steps and print a progress line at regular intervals.

    Args:
        total_steps: Number of steps in ONE epoch (the name is kept for
            compatibility with existing callers).
        epochs: Number of epochs the run covers; the overall step count shown
            in the progress line is ``total_steps * epochs``. Defaults to 3,
            the multiplier that was previously hard-coded in the message.
        log_every: Print a line every this many steps. Defaults to 1000.
    """

    def __init__(self, total_steps, epochs=3, log_every=1000):
        self.total_steps = total_steps
        self.epochs = epochs
        self.log_every = log_every
        self.current_step = 0

    def on_step(self):
        """Register one step; print on every ``log_every``-th step and on the final step."""
        self.current_step += 1
        overall_steps = self.total_steps * self.epochs
        # The end-of-run line is tied to the overall step count, not to the end
        # of the first epoch, so it is printed exactly once when training finishes.
        if self.current_step % self.log_every == 0 or self.current_step == overall_steps:
            print(f"Training step {self.current_step}/{overall_steps}")

# Number of batches the DataLoader yields in one pass over the training pairs.
batches_per_epoch = len(train_dataloader)
# The "* 1" means total_steps covers a single epoch's worth of batches.
total_steps = batches_per_epoch * 1
# Module-level logger instance; LossWithProgress.forward calls its on_step().
progress_logger = ProgressLogger(total_steps)

class LossWithProgress(losses.CosineSimilarityLoss):
    """CosineSimilarityLoss that also ticks the module-level ``progress_logger`` on every forward call."""
    def forward(self, sentence_features, labels):
        """Record one step on ``progress_logger``, then return the parent class's forward result."""
        # The step is counted before the parent forward runs.
        progress_logger.on_step()
        return super().forward(sentence_features, labels)

# Loss object handed to model.fit as the training objective for this model.
train_loss = LossWithProgress(model)

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [22]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the model there. As the cell's last expression, model.to(...)
# also displays the model's repr.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [23]:
# Configure root logging at INFO level with a "timestamp - message" format.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

In [24]:
# Report the CUDA device in use. The device queries fail on a machine without
# CUDA, so they are only made when a GPU is present, matching the CPU fallback
# used when the model is placed on a device.
# torch is already imported in the notebook's import cell, so it is not re-imported here.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
if cuda_ready:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

True
0
Tesla P100-PCIE-16GB


In [25]:
# Record the library versions this run was produced with.
print(f"Torch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")
print(f"Is CUDA available: {torch.cuda.is_available()}")

Torch version: 2.6.0+cu124
CUDA version: 12.4
cuDNN version: 90100
Is CUDA available: True


In [26]:
# Put the model into training mode before fine-tuning.
# As the cell's last expression this also displays the model's repr.
model.train()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [None]:
# Turn off Weights & Biases reporting through an environment variable.
# NOTE(review): the training output in this notebook warns that WANDB_DISABLED is
# deprecated in favour of the report_to setting — consider migrating.
os.environ["WANDB_DISABLED"] = "true"

In [28]:
# Preview the warm-up step count passed to model.fit below:
# 10% of (batches per epoch x 3 epochs), truncated to an integer.
int(0.1 * len(train_dataloader) * 3)

9636

## Fine-tuning the model

In [29]:
 # Fine-tune the model on the pair DataLoader with the progress-reporting loss.
# The epoch count is named once so the warm-up length cannot drift away from it.
num_epochs = 3
# Warm-up length: 10% of all training steps (batches per epoch x epochs).
warmup_steps = int(0.1 * len(train_dataloader) * num_epochs)

print("Starting training...")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    # No output path is given to fit; the model is saved explicitly in a later cell.
    output_path=None,
)

print("end training...")

Starting training...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0188
1000,0.0175
1500,0.0177
2000,0.0166
2500,0.0178
3000,0.0177
3500,0.0178
4000,0.0173
4500,0.0175
5000,0.0176


Training step 1000/96369
Training step 2000/96369
Training step 3000/96369
Training step 4000/96369
Training step 5000/96369
Training step 6000/96369
Training step 7000/96369
Training step 8000/96369
Training step 9000/96369
Training step 10000/96369
Training step 11000/96369
Training step 12000/96369
Training step 13000/96369
Training step 14000/96369
Training step 15000/96369
Training step 16000/96369
Training step 17000/96369
Training step 18000/96369
Training step 19000/96369
Training step 20000/96369
Training step 21000/96369
Training step 22000/96369
Training step 23000/96369
Training step 24000/96369
Training step 25000/96369
Training step 26000/96369
Training step 27000/96369
Training step 28000/96369
Training step 29000/96369
Training step 30000/96369
Training step 31000/96369
Training step 32000/96369
Training step 32123/96369
Training step 33000/96369
Training step 34000/96369
Training step 35000/96369
Training step 36000/96369
Training step 37000/96369
Training step 38000/9

In [None]:
# Persist the fine-tuned model to the relative path "embedding_model".
model.save("embedding_model")