<a href="https://colab.research.google.com/github/ipassynk/dating-match-fine-tuning/blob/main/training/simple_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install datasets
!pip install sentence-transformers==2.2.2
!pip install transformers==4.21.0

In [None]:
import os

# Set the WANDB_DISABLED flag in this process's environment before any
# training code is imported or run.
os.environ.update(WANDB_DISABLED="true")

In [None]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datasets import load_dataset, Dataset
import json

In [None]:
# Identifier of the checkpoint to start from.
model_name = "all-MiniLM-L6-v2"

# This single instance is reused further down by the loss, the trainer and the
# final save call.
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
def _strip_prefix(text):
    """Return ``text`` without its leading ``"<tag>: "`` prefix.

    Only the first ``": "`` is treated as the separator, so sentences that
    themselves contain ``": "`` are kept intact. Text without any ``": "`` is
    returned unchanged instead of raising.
    """
    _, separator, remainder = text.partition(': ')
    return remainder if separator else text


def load_data(file_name):
    """Load sentence pairs from a JSON Lines file into a ``datasets.Dataset``.

    Each non-blank line must be a JSON object with the keys ``text_1``,
    ``text_2`` and ``label``.

    Args:
        file_name: Path of the ``.jsonl`` file to read.

    Returns:
        A ``Dataset`` with one row per record and the columns ``sentence1``,
        ``sentence2`` (the two texts with their ``"<tag>: "`` prefix removed)
        and ``labels`` (``1.0`` when the record's ``label`` equals ``0``,
        otherwise ``0.0``).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    data_elements = []
    # JSON Lines files are UTF-8; do not depend on the platform default encoding.
    with open(file_name, 'r', encoding='utf-8') as data_file:
        for line in data_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            item = json.loads(line)
            # NOTE(review): label 0 is mapped to the *high* similarity score;
            # presumably 0 marks a matching pair in the source data - confirm.
            similarity_score = 1.0 if item['label'] == 0 else 0.0
            data_elements.append({
                'sentence1': _strip_prefix(item['text_1']),
                'sentence2': _strip_prefix(item['text_2']),
                'labels': similarity_score,
            })
    return Dataset.from_list(data_elements)

# Build the training split first, then the evaluation split, from their
# respective JSON Lines files.
train_dataset, eval_dataset = (
    load_data(path) for path in ("dating_pairs.jsonl", "eval_pairs.jsonl")
)

# Show the first record of each split, one per line, as a quick sanity check.
print(train_dataset[0], eval_dataset[0], sep="\n")

In [None]:
# Pull the three evaluation columns out of the dataset as plain Python lists,
# in the order: first sentences, second sentences, target scores.
sentences1, sentences2, scores = (
    [row[column] for row in eval_dataset]
    for column in ('sentence1', 'sentence2', 'labels')
)

# Evaluator built from the evaluation pairs and their target scores; it is
# handed to the trainer in the next cell.
evaluator = EmbeddingSimilarityEvaluator(
    sentences1,
    sentences2,
    scores,
    show_progress_bar=True,
)

In [None]:
def _as_trainer_dataset(dataset):
    """Return ``dataset`` with its target column named the way the trainer expects.

    sentence-transformers only treats a column called ``label`` or ``score`` as
    the training target; every other column is handled as a text input. The
    datasets built earlier in this notebook store the target under ``labels``,
    so that column is renamed to ``score``. Datasets without a ``labels``
    column are returned unchanged.
    """
    if "labels" in dataset.column_names:
        return dataset.rename_column("labels", "score")
    return dataset


# Define the datasets that the trainer consumes; without these two assignments
# the notebook fails with NameError on a fresh kernel.
train_dataset_formatted = _as_trainer_dataset(train_dataset)
eval_dataset_formatted = _as_trainer_dataset(eval_dataset)

args = SentenceTransformerTrainingArguments(
    num_train_epochs=4,
    eval_strategy="steps",
    eval_steps=500,
    warmup_steps=100,
    load_best_model_at_end=True,
    # Must name a metric that is actually reported. The evaluator above is an
    # unnamed EmbeddingSimilarityEvaluator, whose cosine Spearman correlation is
    # logged by the trainer as "eval_spearman_cosine" (higher is better).
    metric_for_best_model="eval_spearman_cosine",
    optim="adamw_torch",
    output_dir="models/dating_model",
    learning_rate=2e-5
)

trainer = SentenceTransformerTrainer(
    model=model,
    eval_dataset=eval_dataset_formatted,
    train_dataset=train_dataset_formatted,
    # CosineSimilarityLoss consumes (sentence1, sentence2) pairs plus a float
    # target, which is what the two formatted datasets provide.
    loss=losses.CosineSimilarityLoss(model=model),
    evaluator=evaluator,
    args=args,
)

trainer.train()

# Persist the fine-tuned model next to the training checkpoints.
model.save_pretrained("models/dating_model")
