In [None]:
import random
from collections import defaultdict
from sentence_transformers import CrossEncoder, SentenceTransformer, SentencesDataset, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses, SimilarityFunction
from sentence_transformers.sampler import BatchSampler
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.losses import TripletLoss
from sentence_transformers.readers import LabelSentenceReader, InputExample
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset, Value
import chromadb

# Base checkpoint to fine-tune - we are using a Sentence-BERT model.
model_name = "all-MiniLM-L6-v2"
# Destination for the fine-tuned model. Derived from `model_name` so the two
# cannot drift apart if the base checkpoint is swapped.
trained_model_path = f"models/{model_name}-trained"

# Load the pre-trained weights that the later cells fine-tune.
model = SentenceTransformer(model_name)

# Load the Q&A sheet. The columns used below are "Questions", "Answers" and
# "Link to Guide/Policy".
raw_dataset = Dataset.from_csv("trading-v0.csv")
# Alternative source kept for reference:
# load_dataset("json", data_files="registration-v1.jsonl")
print(f"DataSet shape (original) = {raw_dataset.shape}")

# Rename the sheet headers to the anchor/positive names used for training.
dataset = raw_dataset.rename_columns(
    {
        "Questions": "anchor",
        "Answers": "positive",
        "Link to Guide/Policy": "reference",
    }
)

# Add both derived columns in a single pass:
#   negative - the same fixed fallback answer for every row
#   qna      - question and answer joined by a space
# Both values are plain Python strings, so no explicit cast is needed.
dataset = dataset.map(
    lambda row: {
        "negative": "I do not know",
        "qna": f"{row['anchor']} {row['positive']}",
    }
)
print(f"DataSet shape (after renaming) = {dataset.shape}")

# Fixed split: the first TRAIN_SIZE rows are used for training and the next
# EVAL_SIZE rows for evaluation. Rows beyond TRAIN_SIZE + EVAL_SIZE are unused.
TRAIN_SIZE = 30
EVAL_SIZE = 10

# Keep only the (anchor, positive, negative) columns for the trainer.
triplets = dataset.select_columns(["anchor", "positive", "negative"])
train_dataset = triplets.take(TRAIN_SIZE)
eval_dataset = triplets.skip(TRAIN_SIZE).take(EVAL_SIZE)

print(dataset)

# Dump every question/answer pair with a 1-based row number. Iterating the
# dataset directly avoids looking up each row twice by index.
for row_number, row in enumerate(dataset, start=1):
    print(f"{row_number}\t: {row['anchor']} \t==>\t {row['positive']}")

In [None]:
# Training configuration for the triplet fine-tuning run.
args = SentenceTransformerTrainingArguments(
    # Required parameter: directory the trainer writes checkpoints to.
    output_dir=trained_model_path,
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Plain batching. The no-duplicates sampler is meant for losses that use
    # in-batch negatives; the triplet loss used here takes an explicit negative
    # column instead. Every row also carries the same negative text, so a
    # no-duplicates sampler could never place two rows in the same batch.
    batch_sampler=BatchSamplers.BATCH_SAMPLER,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    run_name=f"{model_name}-triplet",  # Will be used in W&B if `wandb` is installed
)

In [None]:
# Build the triplet objective and the trainer that ties together the model,
# the training arguments and the train/eval splits.
loss = TripletLoss(model=model)
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)

# Run the fine-tuning loop.
trainer.train()

# Save the trained model.
model.save_pretrained(trained_model_path)