In [1]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from sklearn.model_selection import train_test_split

import pandas as pd
from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Build the modelling frame in a single chain: read the tab-separated corpus,
# discard the metadata columns, encode each author as an integer category
# code, and expose the raw text under the `text` column name.
df = (
    pd.read_csv('srpski.csv', sep='\t')
    .drop(columns=['Rbr', 'SR', 'sr/sr', 'Naslov', 'Jezik'])
    .assign(label=lambda frame: frame['Autor'].astype('category').cat.codes)
    .drop(columns=['Autor'])
    .rename(columns={'Tekst': 'text'})
)

# 80/20 train/test split with a fixed seed so the partition is repeatable.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 110
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 28
    })
})

In [3]:
# Checkpoint identifier handed to SetFit; the recorded output shows the
# weights being fetched from the Hugging Face Hub on first use.
model_id = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
model = SetFitModel.from_pretrained(model_id)


config.json: 100%|██████████| 718/718 [00:00<00:00, 373kB/s]
.gitattributes: 100%|██████████| 345/345 [00:00<00:00, 694kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 583kB/s]
README.md: 100%|██████████| 3.78k/3.78k [00:00<00:00, 8.83MB/s]
config.json: 100%|██████████| 718/718 [00:00<00:00, 762kB/s]
config_sentence_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 153kB/s]
pytorch_model.bin:  70%|██████▉   | 776M/1.11G [11:29<02:44, 2.04MB/s] Error while downloading from https://cdn-lfs.huggingface.co/sentence-transformers/paraphrase-xlm-r-multilingual-v1/224f40dde228eb17d15e7d26caa90f35806def427cd2e48e6f892fb62d64c6fb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1708290261&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwODI5MDI2MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZ

In [4]:
# Wire the pretrained model to the two splits produced earlier.
trainer = SetFitTrainer(
    model=model,
    # Data: train on the 'train' split, score on the held-out 'test' split.
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Columns are already named 'text' and 'label', so this maps each to itself.
    column_mapping={"text": "text", "label": "label"},
    # Training objective and the `num_iterations` setting passed to SetFit.
    loss_class=CosineSimilarityLoss,
    num_iterations=20,
)


In [5]:
def hyperparameter_search_function(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``
            (presumably an Optuna trial -- confirm against the caller).

    Returns:
        dict with two entries, in this order:
        ``"learning_rate"`` -- value suggested from [1e-5, 1e-3] with ``log=True``;
        ``"batch_size"`` -- one of 4, 8, 16 or 32.
    """
    # Ask for the learning rate first, then the batch size, so the trial
    # sees the suggestions in a fixed order.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16, 32])
    return {"learning_rate": learning_rate, "batch_size": batch_size}


In [6]:
# Run training with the trainer's configured settings.
# NOTE: the recorded run below took about 42 minutes (275 steps at ~9 s each),
# so avoid re-executing this cell casually.
trainer.train()

Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 20/20 [00:00<00:00, 52.25it/s]
***** Running training *****
  Num examples = 4400
  Num epochs = 1
  Total optimization steps = 275
  Total train batch size = 16
Iteration: 100%|██████████| 275/275 [41:44<00:00,  9.11s/it]
Epoch: 100%|██████████| 1/1 [41:44<00:00, 2504.30s/it]


In [7]:
# Evaluate the trained model; the recorded output is a dict with an
# 'accuracy' entry. The bare name on the last line displays the result.
metrics = trainer.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'accuracy': 0.2857142857142857}