In [1]:
import logging
import sys
import traceback
import pandas as pd
from datetime import datetime


from datasets import load_dataset, Dataset


from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import BatchSamplers, SentenceTransformerTrainingArguments


In [2]:

# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

model_name = "imvladikon/sentence-transformers-alephbert"
train_batch_size = 128  # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1

# Save path of the model
output_dir = "output/training_nli_v2_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


In [3]:

# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)


2024-10-18 13:13:57 - Use pytorch device_name: mps
2024-10-18 13:13:57 - Load pretrained SentenceTransformer: imvladikon/sentence-transformers-alephbert


In [4]:

# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
# We'll start with 10k training samples, but you can increase this to get a stronger model
# logging.info("Read AllNLI train dataset")
# train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train").select(range(10000))
# eval_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="dev").select(range(1000))
# logging.info(train_dataset)

logging.info("Load the datasets from CSV files")
train_df = pd.read_csv('data/train.csv')
valid_df = pd.read_csv('data/dev.csv')
test_df = pd.read_csv('data/test.csv')

logging.info("Convert pandas DataFrames to Hugging Face Dataset objects")
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

eval_dataset = valid_dataset


2024-10-18 13:14:00 - Load the datasets from CSV files
2024-10-18 13:14:01 - Convert pandas DataFrames to Hugging Face Dataset objects


In [5]:

# 3. Define our training loss: https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss
train_loss = losses.MultipleNegativesRankingLoss(model)


In [6]:


# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
# stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
# dev_evaluator = EmbeddingSimilarityEvaluator(
#     sentences1=stsb_eval_dataset["sentence1"],
#     sentences2=stsb_eval_dataset["sentence2"],
#     scores=stsb_eval_dataset["score"],
#     main_similarity=SimilarityFunction.COSINE,
#     name="sts-dev",
# )
# logging.info("Evaluation before training:")
# dev_evaluator(model)

sts_eval_df = pd.read_csv('data/heb_sts_test.csv')
stsb_eval_dataset = Dataset.from_pandas(sts_eval_df)

dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=stsb_eval_dataset["sentence1"],
    sentences2=stsb_eval_dataset["sentence2"],
    scores=stsb_eval_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts",
)
logging.info("Evaluation before training:")
dev_evaluator(model)


2024-10-18 13:14:02 - Evaluation before training:
2024-10-18 13:14:02 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts dataset:


2024-10-18 13:14:08 - Cosine-Similarity :	Pearson: 0.6693	Spearman: 0.6508
2024-10-18 13:14:08 - Manhattan-Distance:	Pearson: 0.6429	Spearman: 0.6310
2024-10-18 13:14:08 - Euclidean-Distance:	Pearson: 0.6439	Spearman: 0.6317
2024-10-18 13:14:08 - Dot-Product-Similarity:	Pearson: 0.4571	Spearman: 0.4405


{'sts_pearson_cosine': np.float64(0.6693029706135705),
 'sts_spearman_cosine': np.float64(0.6508239891280294),
 'sts_pearson_manhattan': np.float64(0.6429249591660918),
 'sts_spearman_manhattan': np.float64(0.6309807328394371),
 'sts_pearson_euclidean': np.float64(0.6438809257962056),
 'sts_spearman_euclidean': np.float64(0.6317416791343402),
 'sts_pearson_dot': np.float64(0.45707904633475227),
 'sts_spearman_dot': np.float64(0.44048541100202476),
 'sts_pearson_max': np.float64(0.6693029706135705),
 'sts_spearman_max': np.float64(0.6508239891280294)}

In [7]:

# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    logging_steps=100,
    run_name="nli-v2",  # Will be used in W&B if `wandb` is installed
)

In [9]:

# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer.train()


  0%|          | 0/2292 [00:00<?, ?it/s]

TypeError: 'int' object is not subscriptable

In [None]:

# 7. Evaluate the model performance on the STS Benchmark test dataset
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)
test_evaluator(model)


In [None]:

# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
