In [17]:
import pandas as pd
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    RobertaTokenizer,
    RobertaForSequenceClassification,
)
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

## Loading/exploring the data

In [18]:
df = pd.read_csv("1900rows_data.csv")
df = df[["text", "og_label", "LABEL"]]
# i want it to be called phrase, label for column names
df = df.rename(columns={"text": "phrase", "LABEL": "label"})
df.head()

Unnamed: 0,phrase,og_label,label
0,When I remember her I feel down,Emotional pain,Neurological & General Symptoms
1,When I carry heavy things I feel like breaking...,Hair falling out,Dermatological & Skin Conditions
2,there is too much pain when i move my arm,Heart hurts,Chronic Conditions
3,My son had his lip pierced and it is swollen a...,Infected wound,Infections
4,My muscles in my lower back are aching,Infected wound,Infections


phrase and label will be the columns used in the model below

In [19]:
df.shape

(1906, 3)

In [20]:
df["label"].value_counts()

label
Infections                          356
Dermatological & Skin Conditions    340
Chronic Conditions                  281
Pain-related Conditions             262
Respiratory & Sensory Issues        237
Gastrointestinal Conditions         127
Neurological & General Symptoms     126
Allergic/Immunologic Reactions      100
Hepatobiliary                        50
Trauma/Injuries                      27
Name: count, dtype: int64

In [21]:
df.head()

Unnamed: 0,phrase,og_label,label
0,When I remember her I feel down,Emotional pain,Neurological & General Symptoms
1,When I carry heavy things I feel like breaking...,Hair falling out,Dermatological & Skin Conditions
2,there is too much pain when i move my arm,Heart hurts,Chronic Conditions
3,My son had his lip pierced and it is swollen a...,Infected wound,Infections
4,My muscles in my lower back are aching,Infected wound,Infections


In [22]:
df = df.drop("og_label", axis=1)

## Preprocess data

In [23]:
# this creates an integer for each label
# creates column label_id with the number label for each of the labels

unique_labels = df["label"].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

df["label_id"] = df["label"].map(label2id)
df.head()

Unnamed: 0,phrase,label,label_id
0,When I remember her I feel down,Neurological & General Symptoms,0
1,When I carry heavy things I feel like breaking...,Dermatological & Skin Conditions,1
2,there is too much pain when i move my arm,Chronic Conditions,2
3,My son had his lip pierced and it is swollen a...,Infections,3
4,My muscles in my lower back are aching,Infections,3


In [24]:
# train, test, validation split
# adjust to 80/20 or whatever as needed
train_df_og, test_df = train_test_split(df, test_size=0.20, random_state=42)
train_df, val_df = train_test_split(train_df_og, test_size=0.20, random_state=42)

In [25]:
# now coverting the pandas dataframes into Hugging Face dataset objects
train_dataset = Dataset.from_pandas(train_df[["phrase", "label_id"]])
val_dataset = Dataset.from_pandas(val_df[["phrase", "label_id"]])
test_dataset = Dataset.from_pandas(test_df[["phrase", "label_id"]])

# BERT-base

## Tokenize the Text

In [26]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    return tokenizer(
        example["phrase"], padding="max_length", truncation=True, max_length=128
    )

In [27]:
# apply the tokenizer to our data
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# rename columns to the format Trainer expects
train_dataset = train_dataset.rename_column("label_id", "labels")
val_dataset = val_dataset.rename_column("label_id", "labels")
test_dataset = test_dataset.rename_column("label_id", "labels")

# set format to PyTorch tensors
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1219 [00:00<?, ? examples/s]

Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Map:   0%|          | 0/382 [00:00<?, ? examples/s]

## Define the Model

In [28]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training


In [29]:
# defining the training hyperparams
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Save checkpoint every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Adjust based on dataset size
    weight_decay=0.01,
    logging_steps=50,
)


# evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}


# initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# now we actually trian
trainer.train()

  trainer = Trainer(


  0%|          | 0/459 [00:00<?, ?it/s]

{'loss': 1.9532, 'grad_norm': 7.3343000411987305, 'learning_rate': 4.4553376906318085e-05, 'epoch': 0.33}
{'loss': 1.3901, 'grad_norm': 6.511680603027344, 'learning_rate': 3.910675381263617e-05, 'epoch': 0.65}
{'loss': 0.928, 'grad_norm': 8.806724548339844, 'learning_rate': 3.366013071895425e-05, 'epoch': 0.98}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.6580419540405273, 'eval_accuracy': 0.8360655737704918, 'eval_f1': 0.8296646897992638, 'eval_runtime': 3.3377, 'eval_samples_per_second': 91.381, 'eval_steps_per_second': 11.685, 'epoch': 1.0}
{'loss': 0.4787, 'grad_norm': 4.79069185256958, 'learning_rate': 2.8213507625272335e-05, 'epoch': 1.31}
{'loss': 0.3931, 'grad_norm': 16.791837692260742, 'learning_rate': 2.2766884531590417e-05, 'epoch': 1.63}
{'loss': 0.2896, 'grad_norm': 5.896658897399902, 'learning_rate': 1.7320261437908496e-05, 'epoch': 1.96}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.3976519703865051, 'eval_accuracy': 0.8885245901639345, 'eval_f1': 0.8880312381241089, 'eval_runtime': 1.6075, 'eval_samples_per_second': 189.733, 'eval_steps_per_second': 24.261, 'epoch': 2.0}
{'loss': 0.1656, 'grad_norm': 1.2936216592788696, 'learning_rate': 1.187363834422658e-05, 'epoch': 2.29}
{'loss': 0.0631, 'grad_norm': 2.692521572113037, 'learning_rate': 6.427015250544663e-06, 'epoch': 2.61}
{'loss': 0.1495, 'grad_norm': 18.80130958557129, 'learning_rate': 9.80392156862745e-07, 'epoch': 2.94}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.319612979888916, 'eval_accuracy': 0.9016393442622951, 'eval_f1': 0.9018947753292498, 'eval_runtime': 1.6549, 'eval_samples_per_second': 184.298, 'eval_steps_per_second': 23.566, 'epoch': 3.0}
{'train_runtime': 84.7649, 'train_samples_per_second': 43.143, 'train_steps_per_second': 5.415, 'train_loss': 0.6373874616519039, 'epoch': 3.0}


TrainOutput(global_step=459, training_loss=0.6373874616519039, metrics={'train_runtime': 84.7649, 'train_samples_per_second': 43.143, 'train_steps_per_second': 5.415, 'total_flos': 240566560722432.0, 'train_loss': 0.6373874616519039, 'epoch': 3.0})

## Evaluate and Testing

In [30]:
# Evaluate on validation set
val_metrics = trainer.evaluate(eval_dataset=val_dataset)
print("Validation metrics:", val_metrics)

# Evaluate on test set
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", test_metrics)

  0%|          | 0/39 [00:00<?, ?it/s]

Validation metrics: {'eval_loss': 0.319612979888916, 'eval_accuracy': 0.9016393442622951, 'eval_f1': 0.9018947753292498, 'eval_runtime': 1.72, 'eval_samples_per_second': 177.322, 'eval_steps_per_second': 22.674, 'epoch': 3.0}


  0%|          | 0/48 [00:00<?, ?it/s]

Test metrics: {'eval_loss': 0.25221195816993713, 'eval_accuracy': 0.9293193717277487, 'eval_f1': 0.9285292023452953, 'eval_runtime': 2.0517, 'eval_samples_per_second': 186.189, 'eval_steps_per_second': 23.395, 'epoch': 3.0}
