In [1]:
import pandas as pd
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

## Loading/exploring the data

In [2]:
df = pd.read_csv("overview-of-recordings.csv")
df = df[["phrase", "prompt"]]
# i want it to be called phrase, label for column names
df = df.rename(columns={"phrase": "phrase", "prompt": "label"})
df.head()

Unnamed: 0,phrase,label
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound


In [3]:
df.shape

(6661, 2)

In [4]:
df["label"].value_counts()

label
Acne                  328
Shoulder pain         320
Joint pain            318
Infected wound        306
Knee pain             305
Cough                 293
Feeling dizzy         283
Muscle pain           282
Heart hurts           273
Ear ache              270
Hair falling out      264
Head ache             263
Feeling cold          263
Skin issue            262
Stomach ache          261
Back pain             259
Neck pain             251
Internal pain         248
Blurry vision         246
Body feels weak       241
Hard to breath        233
Emotional pain        231
Injury from sports    230
Foot ache             223
Open wound            208
Name: count, dtype: int64

## Preprocess data

In [5]:
# this creates an integer for each label
# creates column label_id with the number label for each of the labels

unique_labels = df["label"].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

df["label_id"] = df["label"].map(label2id)
df.head()

Unnamed: 0,phrase,label,label_id
0,When I remember her I feel down,Emotional pain,0
1,When I carry heavy things I feel like breaking...,Hair falling out,1
2,there is too much pain when i move my arm,Heart hurts,2
3,My son had his lip pierced and it is swollen a...,Infected wound,3
4,My muscles in my lower back are aching,Infected wound,3


In [6]:
# train, test, validation split
# adjust to 80/20 or whatever as needed
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.20, random_state=42)

In [7]:
# now coverting the pandas dataframes into Hugging Face dataset objects
train_dataset = Dataset.from_pandas(train_df[["phrase", "label_id"]])
val_dataset = Dataset.from_pandas(val_df[["phrase", "label_id"]])
test_dataset = Dataset.from_pandas(test_df[["phrase", "label_id"]])

## Tokenize the Text

### BERT-base

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    return tokenizer(
        example["phrase"], padding="max_length", truncation=True, max_length=128
    )

In [9]:
# apply the tokenizer to our data
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# rename columns to the format Trainer expects
train_dataset = train_dataset.rename_column("label_id", "labels")
val_dataset = val_dataset.rename_column("label_id", "labels")
test_dataset = test_dataset.rename_column("label_id", "labels")

# set format to PyTorch tensors
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/4262 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1333 [00:00<?, ? examples/s]

## Define the Model

In [10]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training


In [12]:
# defining the training hyperparams
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Save checkpoint every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Adjust based on dataset size
    weight_decay=0.01,
    logging_steps=50,
)


# evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}


# initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# now we actually trian
trainer.train()

  trainer = Trainer(


  0%|          | 0/1599 [00:00<?, ?it/s]

{'loss': 3.0871, 'grad_norm': 8.741687774658203, 'learning_rate': 4.843652282676673e-05, 'epoch': 0.09}
{'loss': 2.4788, 'grad_norm': 11.904923439025879, 'learning_rate': 4.687304565353346e-05, 'epoch': 0.19}
{'loss': 1.6697, 'grad_norm': 8.758871078491211, 'learning_rate': 4.530956848030019e-05, 'epoch': 0.28}
{'loss': 1.0783, 'grad_norm': 4.06333065032959, 'learning_rate': 4.374609130706692e-05, 'epoch': 0.38}
{'loss': 0.7145, 'grad_norm': 2.2248008251190186, 'learning_rate': 4.218261413383365e-05, 'epoch': 0.47}
{'loss': 0.4453, 'grad_norm': 8.431148529052734, 'learning_rate': 4.061913696060038e-05, 'epoch': 0.56}
{'loss': 0.4056, 'grad_norm': 11.223329544067383, 'learning_rate': 3.905565978736711e-05, 'epoch': 0.66}
{'loss': 0.2482, 'grad_norm': 1.8804333209991455, 'learning_rate': 3.7492182614133836e-05, 'epoch': 0.75}
{'loss': 0.1907, 'grad_norm': 1.1005831956863403, 'learning_rate': 3.5928705440900565e-05, 'epoch': 0.84}
{'loss': 0.16, 'grad_norm': 0.39669471979141235, 'learning

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.1228461042046547, 'eval_accuracy': 0.9718574108818011, 'eval_f1': 0.9716585743470921, 'eval_runtime': 5.5426, 'eval_samples_per_second': 192.328, 'eval_steps_per_second': 24.176, 'epoch': 1.0}
{'loss': 0.1221, 'grad_norm': 0.2567582130432129, 'learning_rate': 3.2801751094434024e-05, 'epoch': 1.03}
{'loss': 0.0658, 'grad_norm': 0.14377978444099426, 'learning_rate': 3.123827392120075e-05, 'epoch': 1.13}
{'loss': 0.0648, 'grad_norm': 0.3524585962295532, 'learning_rate': 2.9674796747967482e-05, 'epoch': 1.22}
{'loss': 0.0368, 'grad_norm': 0.14718036353588104, 'learning_rate': 2.811131957473421e-05, 'epoch': 1.31}
{'loss': 0.0475, 'grad_norm': 0.10879917442798615, 'learning_rate': 2.6547842401500937e-05, 'epoch': 1.41}
{'loss': 0.0515, 'grad_norm': 0.19178684055805206, 'learning_rate': 2.4984365228267666e-05, 'epoch': 1.5}
{'loss': 0.0093, 'grad_norm': 0.07616429030895233, 'learning_rate': 2.34208880550344e-05, 'epoch': 1.59}
{'loss': 0.0267, 'grad_norm': 0.06863011419773102

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.03994685038924217, 'eval_accuracy': 0.9934333958724203, 'eval_f1': 0.9934230807086855, 'eval_runtime': 5.4781, 'eval_samples_per_second': 194.593, 'eval_steps_per_second': 24.461, 'epoch': 2.0}
{'loss': 0.0081, 'grad_norm': 0.05965210869908333, 'learning_rate': 1.5603502188868045e-05, 'epoch': 2.06}
{'loss': 0.0149, 'grad_norm': 0.060965802520513535, 'learning_rate': 1.4040025015634772e-05, 'epoch': 2.16}
{'loss': 0.0255, 'grad_norm': 0.038357894867658615, 'learning_rate': 1.2476547842401502e-05, 'epoch': 2.25}
{'loss': 0.0217, 'grad_norm': 0.046150092035532, 'learning_rate': 1.0913070669168231e-05, 'epoch': 2.35}
{'loss': 0.005, 'grad_norm': 0.03978721424937248, 'learning_rate': 9.34959349593496e-06, 'epoch': 2.44}
{'loss': 0.0176, 'grad_norm': 0.049604762345552444, 'learning_rate': 7.78611632270169e-06, 'epoch': 2.53}
{'loss': 0.0238, 'grad_norm': 0.030209306627511978, 'learning_rate': 6.222639149468418e-06, 'epoch': 2.63}
{'loss': 0.0448, 'grad_norm': 0.0417071431875

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.031215321272611618, 'eval_accuracy': 0.9934333958724203, 'eval_f1': 0.9934230807086855, 'eval_runtime': 6.2693, 'eval_samples_per_second': 170.035, 'eval_steps_per_second': 21.374, 'epoch': 3.0}
{'train_runtime': 276.4993, 'train_samples_per_second': 46.242, 'train_steps_per_second': 5.783, 'train_loss': 0.3532891169870698, 'epoch': 3.0}


TrainOutput(global_step=1599, training_loss=0.3532891169870698, metrics={'train_runtime': 276.4993, 'train_samples_per_second': 46.242, 'train_steps_per_second': 5.783, 'total_flos': 841208168572416.0, 'train_loss': 0.3532891169870698, 'epoch': 3.0})

## Evaluate and Testing

In [13]:
# Evaluate on validation set
val_metrics = trainer.evaluate(eval_dataset=val_dataset)
print("Validation metrics:", val_metrics)

# Evaluate on test set
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", test_metrics)

  0%|          | 0/134 [00:00<?, ?it/s]

Validation metrics: {'eval_loss': 0.031215321272611618, 'eval_accuracy': 0.9934333958724203, 'eval_f1': 0.9934230807086855, 'eval_runtime': 5.5163, 'eval_samples_per_second': 193.246, 'eval_steps_per_second': 24.292, 'epoch': 3.0}


  0%|          | 0/167 [00:00<?, ?it/s]

Test metrics: {'eval_loss': 0.0228365957736969, 'eval_accuracy': 0.994748687171793, 'eval_f1': 0.9947237734422149, 'eval_runtime': 6.9094, 'eval_samples_per_second': 192.927, 'eval_steps_per_second': 24.17, 'epoch': 3.0}
