In [15]:
import pandas as pd
import numpy as np
# %pip install setfit transformers==4.39.0

from setfit import SetFitModel, TrainingArguments, Trainer, sample_dataset
from datasets import load_dataset, Dataset, DatasetDict

all_valids = pd.read_csv('../data/train_valid_all.csv')
print(all_valids.shape)

all_valids = all_valids[['Response', 'Label']]

# Split all_valids into train, test, and validation datasets
train_samples = all_valids.groupby('Label').apply(lambda x: x.sample(5)).reset_index(drop=True)
remaining_samples = all_valids.drop(train_samples.index).reset_index(drop=True)
test_samples = remaining_samples.groupby('Label').apply(lambda x: x.sample(frac=0.67)).reset_index(drop=True)
validation_samples = remaining_samples.drop(test_samples.index).reset_index(drop=True)


(678, 10)


In [16]:
print("train shape: ", train_samples.shape)
print("test shape: ", test_samples.shape)
print("validation shape: ", validation_samples.shape)

train_dataset = Dataset.from_pandas(train_samples)
val_dataset = Dataset.from_pandas(validation_samples)
test_dataset = Dataset.from_pandas(test_samples)

train shape:  (25, 2)
test shape:  (437, 2)
validation shape:  (216, 2)


In [17]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Response', 'Label'],
        num_rows: 25
    })
    validation: Dataset({
        features: ['Response', 'Label'],
        num_rows: 216
    })
    test: Dataset({
        features: ['Response', 'Label'],
        num_rows: 437
    })
})

In [1]:
import pandas as pd
import numpy as np
from setfit import SetFitModel, TrainingArguments, Trainer, sample_dataset
from datasets import load_dataset, Dataset, DatasetDict

model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")



config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [2]:
dataset = load_dataset("SetFit/SentEval-CR")

Downloading readme:   0%|          | 0.00/447 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/427k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3012 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/753 [00:00<?, ? examples/s]

In [3]:
train_ds = dataset["train"].shuffle(seed=42).select(range(8 * 2))
test_ds = dataset["test"]

In [5]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer

trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=20, # Number of text pairs to generate for contrastive learning
    num_epochs=1 # Number of epochs to use for contrastive learning
)

  trainer = SetFitTrainer(
  obj.co_lnotab,  # for < python 3.10 [not counted in args]


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [6]:
trainer.train()


***** Running training *****
  Num unique pairs = 640
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 40


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

{'embedding_loss': 0.2251, 'learning_rate': 5e-06, 'epoch': 0.03}
{'train_runtime': 94.5332, 'train_samples_per_second': 6.77, 'train_steps_per_second': 0.423, 'epoch': 1.0}


In [7]:
metrics = trainer.evaluate()

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
metrics

{'accuracy': 0.8632138114209827}

# hered down has not been proven to work


In [23]:
dataset = load_dataset("SetFit/sst2")
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=8)

test_dataset = dataset["test"]
model.labels = ["negative", "positive"]


args = TrainingArguments(
    batch_size=32,
    num_epochs=10,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
)

trainer.train()

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 16
})

In [7]:
args = TrainingArguments(
    batch_size=8,
    num_epochs=1,
    evaluation_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    column_mapping={"Response": "text", "Label": "label"}
)

trainer.train()

***** Running training *****
  Num unique pairs = 500
  Batch size = 8
  Num epochs = 1
  Total optimization steps = 63


  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'embedding_loss': 0.2289, 'learning_rate': 2.8571428571428573e-06, 'epoch': 0.02}
{'embedding_loss': 0.1546, 'learning_rate': 4.642857142857144e-06, 'epoch': 0.79}


  0%|          | 0/3538 [00:00<?, ?it/s]

KeyboardInterrupt: 