In [1]:
# Optional one-time environment setup. Both commands are commented out;
# uncomment them to install/upgrade the dependencies used by this notebook.
# !pip install -U sentence-transformers
# !pip install --upgrade pip

In [2]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset

In [3]:
# Report whether PyTorch can use the MPS backend. `mps_device` is only
# bound when the backend is available.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    print("using MPS")
elif torch.backends.mps.is_built():
    # PyTorch was built with MPS support, but the OS/hardware cannot use it.
    print("MPS not available because the current MacOS version is not 12.3+ "
          "and/or you do not have an MPS-enabled device on this machine.")
else:
    # This PyTorch build does not include MPS support at all.
    print("MPS not available because the current PyTorch install was not "
          "built with MPS enabled.")


using MPS


In [4]:
from sentence_transformers import SentenceTransformer, models

###### CREATE MODEL ######
# Token limit applied to the student model below; longer inputs are truncated
# by the Transformer module, which bounds per-batch activation memory.
max_seq_length = 128
train_batch_size = 16


# Load teacher model
# model_name = 'onlplab/alephbert-base'  # e.g., 'onlplab/alephbert-base' 
model_name = 'bert-base-nli-stsb-mean-tokens'
print("Load teacher model")
# NOTE(review): teacher_model is not referenced by any later cell visible in
# this notebook; if it is truly unused, dropping it would free memory.
teacher_model = SentenceTransformer(model_name)

# Create student model
print("Create student model")
# Pass max_seq_length explicitly: without it the module falls back to the
# checkpoint's own (much longer) limit and the constant above has no effect.
word_embedding_model = models.Transformer("xlm-roberta-base", max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

# Student = token embeddings followed by mean pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Load teacher model
Create student model


In [5]:
import os

# PYTORCH_MPS_HIGH_WATERMARK_RATIO is an *environment variable* read by
# PyTorch's MPS allocator; assigning a Python variable of the same name does
# nothing. "0.0" disables the upper allocation limit (may cause system
# failure if memory is exhausted, as the PyTorch error message warns).
# NOTE(review): this presumably has to be set before the MPS allocator is
# first initialised - ideally before `import torch` - so consider moving this
# cell to the very top of the notebook and restarting the kernel.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer, SentencesDataset, losses, evaluation, InputExample

from torch.utils.data import DataLoader
from datetime import datetime

# Numeric similarity target assigned to each NLI gold label.
label_mapping = dict(
    entailment=1.0,
    neutral=0.0,
    contradiction=-1.0,
)

###### Load train sets ######

# Load the CSV data
train_df = pd.read_csv('data/train.csv')

# Map string labels to float targets column-wise. Labels missing from
# label_mapping become NaN here instead of raising KeyError mid-loop.
train_labels = train_df['gold_label'].map(label_mapping)

# Keep only rows that have both sentences and a mapped label; a missing text
# is read by pandas as float NaN and would break tokenization later.
train_usable = (
    train_labels.notna()
    & train_df['translation1'].notna()
    & train_df['translation2'].notna()
)
train_dropped = int((~train_usable).sum())
if train_dropped:
    print(f"Dropping {train_dropped} training rows with missing text or an unmapped gold_label")

# Convert the data into InputExamples (column-wise zip instead of the much
# slower row-by-row DataFrame.iterrows()).
train_examples = [
    InputExample(texts=[first, second], label=target)
    for first, second, target in zip(
        train_df.loc[train_usable, 'translation1'].tolist(),
        train_df.loc[train_usable, 'translation2'].tolist(),
        train_labels[train_usable].tolist(),
    )
]

# SentencesDataset is deprecated upstream; the list of InputExamples can be
# passed to the DataLoader directly.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)

# Define the loss function
# train_loss = losses.MSELoss(model=model)
train_loss = losses.CosineSimilarityLoss(model=model)



###### Load dev and test sets ######

def _load_scored_pairs(csv_path):
    """Read a labelled sentence-pair CSV and return its usable rows.

    The CSV must provide the columns 'translation1', 'translation2' and
    'gold_label'. Labels are converted to floats through `label_mapping`.
    Rows with a missing sentence or a label absent from `label_mapping` are
    skipped (and counted in a printed message), because NaN scores or
    non-string texts would corrupt the similarity evaluation.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Tuple `(frame, sentences1, sentences2, scores)` where `frame` is the
        filtered DataFrame and the three lists are aligned row by row.
    """
    frame = pd.read_csv(csv_path)
    scores = frame['gold_label'].map(label_mapping)
    usable = scores.notna() & frame['translation1'].notna() & frame['translation2'].notna()
    skipped = int((~usable).sum())
    if skipped:
        print(f"{csv_path}: skipping {skipped} rows with missing text or an unmapped gold_label")
    frame = frame[usable]
    return (
        frame,
        frame['translation1'].tolist(),
        frame['translation2'].tolist(),
        scores[usable].tolist(),
    )


# Dev set: sentences plus float scores derived from the gold labels
dev_df, dev_sentences1, dev_sentences2, dev_scores = _load_scored_pairs('data/dev.csv')

# Distinct evaluator names keep the dev and test results in separate CSV
# files / metric keys instead of both using the default unnamed output.
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    dev_sentences1, dev_sentences2, dev_scores, name='dev')

# Test set, loaded the same way
test_df, test_sentences1, test_sentences2, test_scores = _load_scored_pairs('data/test.csv')

# NOTE: despite its name, test_mse is an embedding-similarity evaluator.
test_mse = evaluation.EmbeddingSimilarityEvaluator(
    test_sentences1, test_sentences2, test_scores, name='test')

# Order matters to consumers that index the scores: dev first, test second.
evaluators = [evaluator_sts, test_mse]


###### Train model ######

# Checkpoints and evaluation CSVs go to a per-day output directory.
output_path = "output/model-" + datetime.now().strftime("%Y-%m-%d")

# `evaluators` is [dev, test]. The best checkpoint must be chosen on the DEV
# score (index 0); selecting on the last entry (the test evaluator) would tune
# the model on the test set and make the reported test score optimistic.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[0]),
          epochs=2,
          evaluation_steps=100,
          warmup_steps=1000,
          scheduler='warmupconstant',
          output_path=output_path,
          save_best_model=True,
          optimizer_params= {'lr': 2e-5, 'eps': 1e-6}  # Removed correct_bias
          )

# Persist the final (last-step) model. `save` is the long-standing
# SentenceTransformer API; `save_pretrained` is not available in every release,
# and failing here would lose the result of a long training run.
model.save('./trained_sbert_model_3')



  0%|          | 0/36664 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 7.57 GB, other allocations: 28.69 GB, max allowed: 36.27 GB). Tried to allocate 732.43 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
import os

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datasets import load_dataset
from sentence_transformers import InputExample
import pandas as pd

# Load the dataset
sts_data = load_dataset('csv', data_files='./data/heb_sts_test.csv')

# Access the dataset
dataset = sts_data['train']  # or the appropriate split name

# Convert to DataFrame for inspection
df = pd.DataFrame(dataset)

# Prepare the evaluation dataset: one InputExample per sentence pair, with
# the 'score' column as the gold similarity.
sts_examples = [InputExample(texts=[row['sentence1'], row['sentence2']], label=row['score']) 
                for row in df.to_dict(orient='records')]

# Initialize the evaluator
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_examples, name='hebrew-sts')

# The evaluator writes a results CSV into output_path; create the directory
# first so the write cannot fail on a fresh checkout.
sts_output_path = 'output/sbert_hebrew_model'
os.makedirs(sts_output_path, exist_ok=True)

# Evaluate once (requires `model` from the training cells). The previous
# extra `model.evaluate(evaluator)` call repeated the same work and discarded
# its result.
print("Evaluation results for the model on Hebrew STS benchmark:")
print(evaluator(model, output_path=sts_output_path))
