In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset

# Seed shared by every stochastic step below (class subsampling and the
# train/eval split) so that "Restart Kernel -> Run All" reproduces the same
# training set, the same eval set, and therefore the same reported accuracy.
SEED = 42
# Number of sentences drawn per class for the small-data experiment.
SAMPLES_PER_CLASS = 50
# Fraction of the balanced sample held out for evaluation.
EVAL_FRACTION = 0.2

# Load the data into a pandas DataFrame
df = pd.read_csv('data/sentences.csv', index_col=0)

# Keep only rows that carry a gold policy-area annotation. The explicit
# .copy() makes this an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning.
df = df[df['policy_area_gold'].notna()].copy()

# Derive a single categorical label per sentence from the gold scale columns.
# The social assignments run after the economic ones, so a row that matched
# both would end up with the social label.
# NOTE(review): presumably a sentence is coded on only one scale - confirm.
df['label'] = 'Neutral'
df.loc[df['econ_scale_gold'] == 1, 'label'] = 'Econ right'
df.loc[df['econ_scale_gold'] == -1, 'label'] = 'Econ left'
df.loc[df['soc_scale_gold'] == 1, 'label'] = 'Social con'
df.loc[df['soc_scale_gold'] == -1, 'label'] = 'Social lib'
df['text'] = df['sentence_text']

# Restrict to the binary economic task and encode the classes as integers:
# 0 = 'Econ right', 1 = 'Econ left'.
df = df[df['label'].isin(['Econ right', 'Econ left'])].copy()
df['label'] = df['label'].map({'Econ right': 0, 'Econ left': 1})

# Draw a balanced subsample (same number of rows from each class).
# Sampling is without replacement, so this raises if either class has fewer
# than SAMPLES_PER_CLASS rows.
df = df.groupby('label', as_index=False).sample(
    n=SAMPLES_PER_CLASS, random_state=SEED
)
df = df[['text', 'label']]

# Stratified split keeps the class balance identical in both partitions.
train_df, eval_df = train_test_split(
    df, stratify=df['label'], test_size=EVAL_FRACTION, random_state=SEED
)


# Convert the pandas DataFrames to Dataset objects.
# The frames were filtered and shuffled upstream, so their index is no longer
# a plain RangeIndex; preserve_index=False keeps it from being carried into
# the Dataset as an extra `__index_level_0__` column next to text/label.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

# Optional: simulate the few-shot regime by sampling 8 examples per class.
# Currently disabled, so every row of train_dataset is used for training.
# train_dataset = sample_dataset(train_dataset, label_column="label", num_samples=8)

# Sentence-embedding checkpoint pulled from the Hub to serve as the SetFit
# body. It ships no model_head.pkl, so the classification head starts from
# random weights and has to be trained before the model is usable.
BASE_CHECKPOINT = "sentence-transformers/all-MiniLM-L12-v2"
model = SetFitModel.from_pretrained(BASE_CHECKPOINT)

# Training configuration, gathered in one place so it is easy to tweak.
trainer_settings = dict(
    loss_class=CosineSimilarityLoss,
    metric="accuracy",        # reported by trainer.evaluate()
    batch_size=16,
    num_iterations=20,        # rounds of training-pair generation
    num_epochs=1,
    # Identity mapping: the datasets already use the column names
    # "text" and "label".
    column_mapping={"text": "text", "label": "label"},
)

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    **trainer_settings,
)

# Train and evaluate.
# train() fits the model on train_dataset; evaluate() then scores it on
# eval_dataset using the metric configured on the trainer ("accuracy").
trainer.train()
metrics = trainer.evaluate()
# Bare last expression so the notebook displays the metrics dict as the
# cell output.
metrics

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 3200
  Num epochs = 1
  Total optimization steps = 200
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/200 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'accuracy': 0.8}

In [3]:
# Publish the trained model to the Hugging Face Hub under this repo id.
# The call's return value (the repository URL) is shown as the cell output.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
trainer.push_to_hub("jwhandley/setfit-gb-manifestos-econ")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model_head.pkl:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

'https://huggingface.co/jwhandley/setfit-gb-manifestos-econ/tree/main/'

In [4]:
# Reload the published model from the Hub. This rebinds `model`, so the
# predictions below come from the uploaded artefact rather than from the
# in-memory object that was just trained.
model = SetFitModel.from_pretrained("jwhandley/setfit-gb-manifestos-econ")

# Two hand-written probe sentences used as a quick sanity check.
probe_sentences = [
    "We will redistribute income from the richest to the poorest",
    "Wealth creators should keep more of their income",
]

# Run inference. Predictions are integer class ids using the training
# encoding: 0 = 'Econ right', 1 = 'Econ left'.
preds = model(probe_sentences)
preds

Downloading (…)lve/main/config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model_head.pkl:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

tensor([1, 0], dtype=torch.int32)