In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import pandas as pd

In [9]:
data = pd.read_csv("train.adverbs.csv")
data["sentence"] = data.apply(lambda x: x["sentence"].lower().replace(x["adverb"], f"<t>{x["adverb"]}</t>"), axis=1)

In [10]:
data.head()

Unnamed: 0,adverb,type,sentence,correct,Unnamed: 4
0,abominably,adverb.manner,i don't know anyone who could have behaved so ...,yes,
1,absently,adverb.manner,he read the letter <t>absently</t>,yes,
2,absolutely,adverb.degree,"<t>absolutely</t> right! fuck destiny, fate. a...",yes,
3,abstemiously,adverb.manner,"the monk lived <t>abstemiously</t>, avoiding a...",yes,
4,abstractly,adverb.domain,"<t>abstractly</t>, i knew it could happen.",yes,


In [11]:
data = Dataset.from_pandas(data)

In [12]:
label_list = sorted(list(set(data['type'])))  # might be strings
label2id = {l:i for i,l in enumerate(label_list)}
id2label = {i:l for i,l in enumerate(label_list)}

def map_labels(batch):
    return {"label": [label2id[l] for l in batch['type']]}

data = data.map(map_labels, batched=True)

Map:   0%|          | 0/767 [00:00<?, ? examples/s]

In [14]:
model_name = "roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list), device_map="cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_tokens(["<t>", "</t>"])
model.resize_token_embeddings(len(tokenizer))


model.config.label2id = label2id
model.config.id2label = id2label

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [None]:
def tokenize(examples):
    output = tokenizer(examples["sentence"], padding="max_length", max_length=64, truncation=True, return_tensors="pt")
    return output

In [None]:
data = data.map(tokenize, remove_columns=["sentence", "type", "adverb"], batched=True)

In [None]:
# Optionally set format to return torch tensors (so collator sees tensors)
data.set_format(type="torch", columns=["input_ids","attention_mask","label"])

In [None]:
data = data.train_test_split()

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = logits.argmax(axis=-1)
    accuracy = accuracy_score(predictions, labels)
    p, r, fscore, _ = precision_recall_fscore_support(predictions, labels, average="weighted", zero_division=0.0)
    return {
        "acc": accuracy,
        "precision": p,
        "recall": r,
        "fscore": fscore,
    }

In [None]:
training_args = TrainingArguments(
    output_dir="adverbs_classifier",
    eval_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="acc",
    save_strategy="epoch",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
test = pd.read_csv("adverb_os_llm.csv")
test["label"] = "adverb." + test["jooyoung"].str.replace("-", "_")
test = test[["sentence", "label"]]
test = test.dropna(axis=0)

In [None]:
test = Dataset.from_pandas(test)
test = test.map(tokenize, batched=True, remove_columns=["sentence"])
def map_labels(batch):
    return {"label": [label2id[l] for l in batch['label']]}

test = test.map(map_labels, batched=True)

test.set_format(type="torch", columns=["input_ids","attention_mask","label"])

In [None]:
pred = trainer.predict(test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

id2label = {v:k for k,v in label2id.items()}
trues = [id2label[x.item()] for x in test["label"]]
preds = [id2label[x] for x in pred.predictions.argmax(-1)]

# Compute confusion matrix
cm = confusion_matrix(trues, preds, labels=label_list)

# Plot confusion matrix

plt.figure(figsize=(13,25))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_list)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix with Class Names")
plt.tight_layout()
plt.savefig("cm.png")

In [None]:
pred.metrics