In [55]:
import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizer

In [56]:
# Load and preprocess data
df = pd.read_csv('labelled_questions.csv')
df.head()

Unnamed: 0,Question,Label
0,How do I register for classes?,class
1,Where is the library located?,general
2,What are the registration deadlines?,registration
3,Does UM provide free shuttle bus services?,general
4,"I get KK1, where should I assemble during regi...",registration


In [57]:
labels = df['Label'].unique().tolist()
labels = [s.strip() for s in labels ]
labels

['class', 'general', 'registration', 'credit hours', 'residential college']

In [58]:
NUM_LABELS= len(labels)

id2label={id:label for id,label in enumerate(labels)}

label2id={label:id for id,label in enumerate(labels)}

In [59]:
label2id

{'class': 0,
 'general': 1,
 'registration': 2,
 'credit hours': 3,
 'residential college': 4}

In [60]:
df["labels"]=df.Label.map(lambda x: label2id[x.strip()])
df.head()

Unnamed: 0,Question,Label,labels
0,How do I register for classes?,class,0
1,Where is the library located?,general,1
2,What are the registration deadlines?,registration,2
3,Does UM provide free shuttle bus services?,general,1
4,"I get KK1, where should I assemble during regi...",registration,2


In [61]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

loading file vocab.txt from cache at C:\Users\Asus/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\Asus/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\Asus/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps

In [62]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)

loading configuration file config.json from cache at C:\Users\Asus/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "class",
    "1": "general",
    "2": "registration",
    "3": "credit hours",
    "4": "residential college"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "class": 0,
    "credit hours": 3,
    "general": 1,
    "registration": 2,
    "residential college": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.0",


In [63]:
from sklearn.model_selection import train_test_split

# Train-validation-test split
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df.Question, df.labels, test_size=0.2, stratify=df.labels, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.2, stratify=temp_labels, random_state=42
)

In [64]:
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

In [65]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [66]:
train_dataset = CustomDataset(train_encodings, train_labels.tolist())
val_dataset = CustomDataset(val_encodings, val_labels.tolist())
test_dataset = CustomDataset(test_encodings, test_labels.tolist())

In [67]:
from transformers import TrainingArguments, Trainer

In [68]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [69]:
training_args = TrainingArguments(
    output_dir='./results', 
    do_train=True,
    do_eval=True,
    num_train_epochs=5,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=32,
    warmup_steps=100,                
    weight_decay=0.01,
    logging_dir='./multi-class-logs',            
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps", 
    load_best_model_at_end=True
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [70]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 136
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 45
100%|██████████| 45/45 [01:51<00:00,  2.21s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 45/45 [01:51<00:00,  2.47s/it]

{'train_runtime': 111.056, 'train_samples_per_second': 6.123, 'train_steps_per_second': 0.405, 'train_loss': 1.3967232598198784, 'epoch': 5.0}





TrainOutput(global_step=45, training_loss=1.3967232598198784, metrics={'train_runtime': 111.056, 'train_samples_per_second': 6.123, 'train_steps_per_second': 0.405, 'train_loss': 1.3967232598198784, 'epoch': 5.0})

In [71]:
# Evaluation
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
val_metrics = trainer.evaluate(eval_dataset=val_dataset)
test_metrics = trainer.evaluate(eval_dataset=test_dataset)

metrics_df = pd.DataFrame([train_metrics, val_metrics, test_metrics], index=["train", "val", "test"]).iloc[:,:4]
metrics_df

***** Running Evaluation *****
  Num examples = 136
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
***** Running Evaluation *****
  Num examples = 27
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 1/1 [00:00<00:00, 332.20it/s]
***** Running Evaluation *****
  Num examples = 7
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 1/1 [00:00<00:00, 242.57it/s]


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision
train,0.838916,0.786765,0.620677,0.66069
val,0.892738,0.740741,0.56601,0.722222
test,0.97834,0.571429,0.380952,0.3125


In [74]:
def predict(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(1)
    pred_label_idx = probs.argmax().item()
    pred_label = model.config.id2label[pred_label_idx]
    return probs, pred_label_idx, pred_label

In [75]:
text = "Does the university offer complimentary shuttle bus services?"
predict(text)

(tensor([[0.2028, 0.3463, 0.1104, 0.0933, 0.2472]], grad_fn=<SoftmaxBackward0>),
 1,
 'general')