**BART Fine Tuning for deriving plate meaning**

*Note: the original code was run locally in PyCharm, so I am unsure whether all of it will work on Colab.*

Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import Trainer, TrainingArguments
import torch

Set up

In [None]:

# Load the labelled plate/meaning pairs, discarding any row with a missing value.
df = pd.read_csv("golden_standard.csv").dropna()

# Split the data into train and test sets (80/20); the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


# Wrap each split in a Hugging Face Dataset so it can be tokenized with `.map`.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


# Load the BART model and tokenizer
model_name = 'facebook/bart-large'
# Use the GPU when one is present; otherwise fall back to the CPU rather than
# raising on a hard-coded `.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = BartTokenizer.from_pretrained(model_name)

Tokenizing (converting text to token IDs)

In [None]:

def preprocess_data(batch):
    """Tokenize one batch of plates and their meanings.

    Reads the 'plate' and 'meaning' entries of ``batch`` and stores three new
    entries on it: 'input_ids' and 'attention_mask' from the tokenized plates,
    and 'labels' from the token ids of the tokenized meanings. Both sides are
    truncated and padded to 128 tokens. The same ``batch`` object is returned.
    """
    # The raw plate text is the model input; no instruction prefix is added.
    encode_options = dict(
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    encoded_plates = tokenizer(batch['plate'], **encode_options)
    encoded_meanings = tokenizer(batch['meaning'], **encode_options)

    batch['input_ids'] = encoded_plates['input_ids']
    batch['attention_mask'] = encoded_plates['attention_mask']
    # Labels are the target token ids; padding positions keep the pad token id.
    batch['labels'] = encoded_meanings['input_ids']
    return batch

# Tokenize both splits batch-wise (train first, then test) and rebind the names
# to the tokenized datasets.
train_dataset, test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
)

# Quick look at the tokenized training split: its summary, then the underlying table.
print(train_dataset)
print(train_dataset.data)

Training

In [None]:

def collate_with_masked_label_padding(features):
    """Stack tokenized examples into tensors and hide label padding from the loss.

    Args:
        features: list of examples, each providing 'input_ids',
            'attention_mask' and 'labels' sequences of equal length.

    Returns:
        dict with the three stacked tensors. Every label position equal to the
        tokenizer's pad token id is replaced by -100, the index the loss
        ignores, so padded positions do not contribute to the loss.
    """
    batch = {
        key: torch.stack([torch.as_tensor(example[key]) for example in features])
        for key in ("input_ids", "attention_mask", "labels")
    }
    batch["labels"] = batch["labels"].masked_fill(
        batch["labels"] == tokenizer.pad_token_id, -100
    )
    return batch


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=10,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    # Labels were padded to a fixed length with the pad token; mask them per
    # batch so training and evaluation loss only cover real target tokens.
    data_collator=collate_with_masked_label_padding,
)

trainer.train()
print("Done Training")

Generating Predictions

In [None]:
def generate_predictions(dataset):
    """Generate a predicted meaning for every example in ``dataset``.

    Args:
        dataset: iterable of examples, each providing 'input_ids',
            'attention_mask' and 'labels' token-id sequences.

    Returns:
        pandas.DataFrame with one row per example and the columns 'plate'
        (decoded input ids), 'true_meaning' (decoded labels) and
        'predicted_meaning' (decoded model output).
    """
    model.eval()
    records = []

    for item in dataset:
        # Add a batch dimension of 1 and move the example onto the model's device.
        input_ids = torch.tensor(item['input_ids']).unsqueeze(0).to(model.device)
        attention_mask = torch.tensor(item['attention_mask']).unsqueeze(0).to(model.device)

        with torch.no_grad():
            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=128)

        # -100 is the loss ignore index, not a vocabulary id; drop it so labels
        # that were masked for training can still be decoded.
        label_ids = [token_id for token_id in item['labels'] if token_id != -100]

        records.append({
            "plate": tokenizer.decode(item['input_ids'], skip_special_tokens=True),
            "true_meaning": tokenizer.decode(label_ids, skip_special_tokens=True),
            "predicted_meaning": tokenizer.decode(outputs[0], skip_special_tokens=True),
        })

    # Passing the columns explicitly keeps the frame's shape for an empty dataset.
    return pd.DataFrame(records, columns=["plate", "true_meaning", "predicted_meaning"])

# Run the fine-tuned model over the held-out split.
predictions_df = generate_predictions(test_dataset)

print("Done Predicting")
# Save plates, reference meanings and predictions side by side, without the row index.
predictions_df.to_csv(path_or_buf="test_predictions.csv", index=False)