In [1]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BartTokenizer
from datasets import load_dataset
import torch
import numpy as np
import evaluate

In [2]:
dataset = load_dataset("silicone", "swda")
dataset = dataset.remove_columns(["Dialogue_Act", "From_Caller", "To_Caller", "Topic", "Idx"])
dataset = dataset.rename_column("Label", "label")
dataset = dataset.rename_column("Utterance", "text")
dataset["train"]

Found cached dataset silicone (/Users/mangrove/.cache/huggingface/datasets/silicone/swda/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'Dialogue_ID', 'Conv_ID', 'label'],
    num_rows: 190709
})

In [3]:
# Split the dataset into training, validation, and test sets
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

In [4]:
from itertools import groupby

def group_utterances_by_conversations(dataset):
    conv_utterances = dataset['text']
    conv_labels = dataset['label']
    conv_ids = dataset['Dialogue_ID']
    conversations = []
    for conv_id, group in groupby(zip(conv_ids, conv_utterances, conv_labels), lambda x: x[0]):
        conv_utterances, conv_labels = zip(*[(x[1], x[2]) for x in group])
        conversation = ' </s> '.join(conv_utterances)
        conversations.append({'conversation': conversation, 'labels': conv_labels})
    return conversations


In [5]:
train_conversations = group_utterances_by_conversations(train_dataset)
val_conversations = group_utterances_by_conversations(val_dataset)
test_conversations = group_utterances_by_conversations(test_dataset)

In [6]:
train_conversations

[{'conversation': "so i 've been concerned about crime lately . </s> uh-huh . </s> uh , it 's really scary to listen to the news every night and -- </s> uh-huh . </s> -- to hear about all the problems . </s> i wondered if you were taking any special precautions in your neighborhood ? </s> well , i , i think we have a neighborhood watch . </s> uh-huh . </s> i think . </s> i 'm not real , </s> we do n't get real involved . </s> we 're never home , </s> so -- </s> uh-huh </s> . </s> -- uh , uh , well , i know they were going to start one , </s> but , uh , i have n't heard any more since , </s> so i do n't really know . </s> but as far as personally doing something , no . </s> no , um . </s> how about you ? </s> well , we moved in , when we moved in , there , there was n't any outside lights </s> and -- </s> uh-huh . </s> -- so we 've been trying to install some , uh , outside lights </s> and -- </s> uh-huh . </s> -- we put up a fence in the backyard . </s> mostly , you know , not so much 

In [19]:
from transformers import BartForSequenceClassification

# Initialize the tokenizer and the model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large', num_labels=43)

loading file vocab.json from cache at /Users/mangrove/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/vocab.json
loading file merges.txt from cache at /Users/mangrove/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /Users/mangrove/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dr

In [20]:
def tokenize_conversations(conversations):
    tokenized_conversations = []
    for conversation in conversations:
        tokenized_conversation = tokenizer.encode_plus(
            conversation['conversation'],
            max_length=512,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
            return_token_type_ids=False,
        )
        tokenized_conversation['labels'] = torch.tensor(conversation['labels'])
        tokenized_conversations.append(tokenized_conversation)
    return tokenized_conversations

train_tokenized = tokenize_conversations(train_conversations)
val_tokenized = tokenize_conversations(val_conversations)
test_tokenized = tokenize_conversations(test_conversations)

In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Define the Trainer and TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=1,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=lambda data: {'input_ids': torch.cat([item['input_ids'] for item in data], dim=0),
                                'attention_mask': torch.cat([item['attention_mask'] for item in data], dim=0),
                                'labels': torch.tensor([item['labels'][0] for item in data], dtype=torch.long)}
)

# Train the model
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 407385131


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 115
  Batch size = 1


IndexError: Target 44 is out of bounds.