In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data into a pandas DataFrame
df = pd.read_csv('data/sentences.csv',index_col=0)
df = df[~df['policy_area_gold'].isna()]
df['label'] = 'Neutral'
df.loc[df['econ_scale_gold']==1,'label'] = 'Econ right'
df.loc[df['econ_scale_gold']==-1,'label'] = 'Econ left'
df.loc[df['soc_scale_gold']==1,'label'] = 'Social con'
df.loc[df['soc_scale_gold']==-1,'label'] = 'Social lib'
df['text'] = df['sentence_text']
df = df[['text','label']]

# Convert string labels to unique integers
# label2int = {label: i for i, label in enumerate(df['label'].unique())}
# int2label = {i: label for label, i in label2int.items()}
# df['label'] = df['label'].map(label2int)


train_df, val_df = train_test_split(df, test_size=0.2)
train_texts, train_labels = train_df['text'].tolist(), train_df['label'].tolist()
val_texts, val_labels = val_df['text'].tolist(), val_df['label'].tolist()

In [4]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Tokenize the data
train_dataset = train_dataset.map(tokenize_function, batched=True).class_encode_column("label")
val_dataset = val_dataset.map(tokenize_function, batched=True).class_encode_column("label")

# Load the model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(df['label'].unique()))

Map:   0%|          | 0/397 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/397 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    output_dir='./results',
    overwrite_output_dir=True,
    save_total_limit=3,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()




  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 1.533, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}
{'loss': 1.3871, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 1.4157, 'learning_rate': 4e-05, 'epoch': 0.6}
{'loss': 1.5386, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}
{'loss': 1.4793, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 1.4853203296661377, 'eval_runtime': 59.1781, 'eval_samples_per_second': 1.69, 'eval_steps_per_second': 0.22, 'epoch': 1.0}
{'loss': 1.4167, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 1.197, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.4}
{'loss': 1.0293, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.7873, 'learning_rate': 2e-05, 'epoch': 1.8}
{'loss': 1.1796, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.9683772921562195, 'eval_runtime': 59.2507, 'eval_samples_per_second': 1.688, 'eval_steps_per_second': 0.219, 'epoch': 2.0}
{'loss': 0.6517, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.2}
{'loss': 0.6177, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.4664, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.6}
{'loss': 0.5291, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}
{'loss': 0.5337, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6946234107017517, 'eval_runtime': 61.1835, 'eval_samples_per_second': 1.634, 'eval_steps_per_second': 0.212, 'epoch': 3.0}
{'train_runtime': 4650.8151, 'train_samples_per_second': 0.256, 'train_steps_per_second': 0.032, 'train_loss': 1.0508178329467774, 'epoch': 3.0}


TrainOutput(global_step=150, training_loss=1.0508178329467774, metrics={'train_runtime': 4650.8151, 'train_samples_per_second': 0.256, 'train_steps_per_second': 0.032, 'train_loss': 1.0508178329467774, 'epoch': 3.0})

In [6]:
results = trainer.evaluate()
print(results)

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6946234107017517, 'eval_runtime': 58.9535, 'eval_samples_per_second': 1.696, 'eval_steps_per_second': 0.221, 'epoch': 3.0}


In [7]:
trainer.save_model('./roberta-gb-manifesto')

In [10]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

test_model = AutoModelForSequenceClassification.from_pretrained("./roberta-gb-manifesto")
test_tokenizer = AutoTokenizer.from_pretrained("./roberta-gb-manifesto")

pipe = pipeline("text-classification",model=test_model,tokenizer=test_tokenizer)

pipe('This is a test')

[{'label': 'LABEL_2', 'score': 0.9665055274963379}]

In [15]:
pipe('We will never decrmininalise homosexuality')

[{'label': 'LABEL_4', 'score': 0.4578115940093994}]