Class Balance in Training set - Experiment 3 - distilBERT

In [None]:
# Single seed value handed to every seeding call in this cell.
R_SEED = 123

import transformers
from transformers import set_seed
# Seed through the Hugging Face helper first ...
set_seed(R_SEED)

# ... then seed each library explicitly as well, so the seeding does not
# depend on what the helper covers.
import random
random.seed(R_SEED)

import numpy as np
np.random.seed(R_SEED)

import torch
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
torch.manual_seed(R_SEED)
torch.cuda.manual_seed(R_SEED)
torch.cuda.manual_seed_all(R_SEED)
# NOTE(review): these two cuDNN flags are the usual reproducibility settings
# (no autotuned kernel selection, deterministic kernels only) -- confirm
# against the PyTorch reproducibility notes for the installed version.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

cuda


In [None]:
#Import all required packages
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score

Get the dataset from Kaggle (a CSV file) and split the data into training - dev - test.

In [None]:
!gdown 17-D6ZC8k2XFh3XbjMkbeoSVSsf8vGie6

Downloading...
From: https://drive.google.com/uc?id=17-D6ZC8k2XFh3XbjMkbeoSVSsf8vGie6
To: /content/Combined Data.csv
100% 31.5M/31.5M [00:00<00:00, 71.3MB/s]


In [None]:
!head -n 5 Combined\ Data.csv

,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless heart. All out of tune",Anxiety
2,"All wrong, back off dear, forward doubt. Stay in a restless and restless place",Anxiety
3,I've shifted my focus to something else but I'm still worried,Anxiety


Drop the rows that contain NaN values.

In [None]:
# Read the raw CSV, then discard every row that has a missing value.
data = pd.read_csv('/content/Combined Data.csv')
data = data.dropna()

# Model input (free text) and target (string label) columns.
X, y = data['statement'], data['status']

Mapping all the labels with different numbers (from 0 to 6 as there are 7 different labels)

In [None]:
# Label <-> integer mapping, see
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
enc = preprocessing.LabelEncoder()

# Fit on the string labels and get their integer codes in one step.
encoded_y = enc.fit_transform(y)

# Position in this list == integer code of the label.
print(enc.classes_.tolist())
# To map codes back to strings: enc.inverse_transform(encoded_y)

['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal']


In [None]:
# Fraction held out at each of the two splits below: first for the test set,
# then (out of what remains) for the dev set.
dev_pcrt = 0.2

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# Split 1: full data -> (train + dev) / test, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  encoded_y,
  test_size=dev_pcrt,
  random_state=R_SEED,
  shuffle=True,
  stratify=encoded_y,
)

# Split 2: (train + dev) -> train / dev, stratified on the remaining labels.
X_train, X_dev, y_train, y_dev = train_test_split(
  X_train,
  y_train,
  test_size=dev_pcrt,
  random_state=R_SEED,
  shuffle=True,
  stratify=y_train,
)

# Class balance only in the training set: every class is capped at the size
# it would have in a perfectly uniform training set. Classes below the cap
# are kept whole (no oversampling); dev and test keep their distribution.
num_classes = len(enc.classes_)
train_max_per_class = len(X_train) // num_classes
train_df = pd.DataFrame({"X": list(X_train), "y": y_train})

# Sample each group explicitly instead of groupby(...).apply(...): apply()
# on a frame that still contains the grouping column emits a deprecation
# warning in recent pandas. Groups are visited in sorted label order and
# sampled with the same seed, so the selected rows are unchanged.
train_filtered_df = pd.concat(
    [
        group.sample(min(len(group), train_max_per_class), random_state=R_SEED)
        for _, group in train_df.groupby("y")
    ],
    ignore_index=True,
)
X_train = np.array(train_filtered_df["X"].tolist())
y_train = train_filtered_df["y"].values

# Report the per-class sizes after capping; the label range follows the
# encoder instead of a hard-coded class count.
class_counts = train_filtered_df["y"].value_counts()
for label in range(num_classes):
    print(f"Label {label}: {class_counts.get(label, 0)}")


Label 0: 2458
Label 1: 1777
Label 2: 4816
Label 3: 4816
Label 4: 690
Label 5: 1656
Label 6: 4816


  train_filtered_df = train_df.groupby("y").apply(lambda x: x.sample(min(len(x), train_max_per_class), random_state=R_SEED)).reset_index(drop=True)


In [None]:
# Unbind the raw data and split helpers that later cells no longer use.
del encoded_y, R_SEED, X, data, dev_pcrt, y

Initialize the 3 necessary methods:
* `__init__`: This is executed whenever an instance of this is created.
* `__getitem__`: This allows you to access any element using an index or index list.
* `__len__`: This returns the number of elements.

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every text once, at construction.

    Each stored item is a ``[features, label]`` pair, where ``features`` is
    the tokenizer output for one text with its leading axis squeezed away.
    """

    def __init__(self, X_data, y_data, tokenizer):
        """Tokenize ``X_data`` and pair each result with its ``y_data`` label.

        Args:
            X_data: iterable of texts, consumed in lockstep with ``y_data``.
            y_data: iterable of labels.
            tokenizer: callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=..., return_tensors=...)`` that
                returns a mapping of tensors.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.data = []
        for text, label in zip(X_data, y_data):
            encoded = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=512,
                return_tensors='pt',
            )
            # Drop axis 0 of every returned tensor so each item is stored
            # without the extra leading axis.
            features = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
            self.data.append([features, label])

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        entry = self.data[idx]
        return entry[0], entry[1]

    def __len__(self):
        """Return the number of stored items."""
        return len(self.data)

def collate_fn(batch):
    """Merge a list of ``(features, label)`` samples into one batch.

    Args:
        batch: sequence of ``(dict of tensors, label)`` pairs; the keys of
            the first sample's dict decide which fields are stacked.

    Returns:
        ``(stacked, label_tensor)``: a dict with one tensor per key, stacked
        along a new first axis, and a ``torch.long`` tensor of the labels.
    """
    features, targets = zip(*batch)

    # One stacked tensor per feature name.
    stacked = {}
    for key in features[0]:
        stacked[key] = torch.stack([sample[key] for sample in features])

    label_tensor = torch.tensor(targets, dtype=torch.long)
    return stacked, label_tensor

Load the BERT model. In our case, the distilBERT architecture.

In [None]:
# Checkpoint used for both the tokenizer and the classifier:
# https://huggingface.co/distilbert/distilbert-base-cased
MODEL_NAME = 'distilbert/distilbert-base-cased'
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the three splits in train / dev / test order with the same tokenizer.
train_set, dev_set, test_set = (
    Dataset(texts, labels, tokenizer)
    for texts, labels in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test))
)

In [None]:
# The tokenized datasets built from these have already been created, so
# unbind the raw splits and the tokenizer name.
del X_dev, X_test, X_train, tokenizer, y_dev, y_train, y_test

EXPERIMENT 3 - EXPERIMENTAL SETTING + WEIGHT DECAY + SCHEDULER

In [None]:
#Hyperparameters
BATCH_SIZE = 64  # samples per batch for the train, dev and test loaders
LR = 5e-5  # initial learning rate passed to the optimizer
EPOCHS = 5  # number of training epochs; also the scheduler's T_max
WEIGHT_DECAY = 1e-2  # weight-decay coefficient passed to AdamW

In [None]:
# Settings shared by the three loaders.
shared_loader_args = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Only the training data is reshuffled; dev and test keep a fixed order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set, shuffle=True, **shared_loader_args
)
dev_loader = torch.utils.data.DataLoader(
    dataset=dev_set, shuffle=False, **shared_loader_args
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_set, shuffle=False, **shared_loader_args
)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``dataloader``.

    The scheduler is stepped exactly once, after the last batch.

    Args:
        dataloader: yields ``(dict of tensors, label tensor)`` batches.
        model: called as ``model(**features)``; its result must expose
            ``.logits``.
        loss_fn: called as ``loss_fn(logits, labels)``.
        optimizer: optimizer updating the model parameters.
        device: device every batch tensor is moved to.
        scheduler: learning-rate scheduler.

    Returns:
        ``(mean loss per batch, accuracy over the dataset, macro F1)``.
    """
    model.train()
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    running_loss = 0
    n_correct = 0
    epoch_preds = []
    epoch_labels = []

    progress = tqdm(enumerate(dataloader), total=n_batches)
    for step, (features, targets) in progress:
        features = {name: tensor.to(device) for name, tensor in features.items()}
        targets = targets.to(device)

        # Forward pass
        logits = model(**features).logits
        loss = loss_fn(logits, targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics for the epoch summary
        loss_value = loss.item()
        running_loss += loss_value
        predicted = logits.argmax(1)
        n_correct += (predicted == targets).type(torch.float).sum().item()

        epoch_preds.extend(predicted.cpu().numpy())
        epoch_labels.extend(targets.cpu().numpy())

        # Progress line every 50 batches
        if step % 50 == 0:
            seen = step * len(targets)
            print(f"loss: {loss_value:>7f}  [{seen:>5d}/{n_samples:>5d}]")

    macro_f1 = f1_score(epoch_labels, epoch_preds, average="macro")

    scheduler.step()  # one learning-rate update per call, i.e. per epoch
    return running_loss / n_batches, n_correct / n_samples, macro_f1

Training and dev evaluation loops.

In [None]:
def eval_loop(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Prints a one-block summary and returns the same numbers.

    Args:
        dataloader: yields ``(dict of tensors, label tensor)`` batches.
        model: called as ``model(**features)``; its result must expose
            ``.logits``.
        loss_fn: called as ``loss_fn(logits, labels)``.
        device: device every batch tensor is moved to.

    Returns:
        ``(mean loss per batch, accuracy over the dataset, macro F1)``.
    """
    model.eval()
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    total_loss = 0
    n_correct = 0
    collected_preds = []
    collected_labels = []

    with torch.no_grad():
        for features, targets in tqdm(dataloader):
            features = {name: tensor.to(device) for name, tensor in features.items()}
            targets = targets.to(device)

            logits = model(**features).logits
            total_loss += loss_fn(logits, targets).item()

            predicted = logits.argmax(1)
            n_correct += (predicted == targets).type(torch.float).sum().item()
            collected_preds.extend(predicted.cpu().numpy())
            collected_labels.extend(targets.cpu().numpy())

    macro_f1 = f1_score(collected_labels, collected_preds, average="macro")

    mean_loss = total_loss / n_batches
    accuracy = n_correct / n_samples
    print(f"Eval Error: \n F1-macro: {macro_f1} Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {mean_loss:>8f} \n")
    return mean_loss, accuracy, macro_f1

In [None]:
# Start below any attainable F1 so the first epoch always writes a
# checkpoint, even if its dev F1 is exactly 0; the test cell that loads
# 'model.pth' then always finds a file.
best_dev_f1 = float("-inf")

model = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(enc.classes_)
).to(device)

# Experiment 3 = weight decay + scheduler. The optimizer must stay AdamW:
# re-creating it as plain Adam afterwards (as this cell previously did)
# silently drops WEIGHT_DECAY, and the scheduler would then wrap the
# optimizer without weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# train_loop steps the scheduler once per epoch, so T_max counts epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
loss_fn = torch.nn.CrossEntropyLoss()

for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss, train_acc, train_f1 = train_loop(train_loader, model, loss_fn, optimizer, device, scheduler)
    print(f'train loss: {train_loss}, train_acc: {train_acc}, train_f1-macro: {train_f1}')
    dev_loss, dev_acc, dev_f1 = eval_loop(dev_loader, model, loss_fn, device)
    print(f'dev loss: {dev_loss}, dev_acc: {dev_acc}, dev_f1-macro: {dev_f1}')
    if dev_f1 > best_dev_f1:  # model selection is on dev macro-F1, not accuracy
        best_dev_f1 = dev_f1  # remember the best score so far
        torch.save(model.state_dict(), 'model.pth')  # checkpoint the best weights

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
-------------------------------


  0%|          | 1/329 [00:03<19:42,  3.61s/it]

loss: 1.955739  [    0/21029]


 16%|█▌        | 51/329 [02:23<13:09,  2.84s/it]

loss: 0.945845  [ 3200/21029]


 31%|███       | 101/329 [04:47<10:51,  2.86s/it]

loss: 0.675935  [ 6400/21029]


 46%|████▌     | 151/329 [07:10<08:31,  2.87s/it]

loss: 0.710889  [ 9600/21029]


 61%|██████    | 201/329 [09:34<06:06,  2.86s/it]

loss: 0.454678  [12800/21029]


 76%|███████▋  | 251/329 [11:57<03:43,  2.87s/it]

loss: 0.469410  [16000/21029]


 91%|█████████▏| 301/329 [14:21<01:20,  2.87s/it]

loss: 0.590506  [19200/21029]


100%|██████████| 329/329 [15:40<00:00,  2.86s/it]


train loss: 0.7376373473634111, train_acc: 0.7186742118027486, train_f1-macro: 0.6737088976990047


100%|██████████| 132/132 [02:09<00:00,  1.02it/s]


Eval Error: 
 F1-macro: 0.7601134399227664 Accuracy: 80.7%, Avg loss: 0.481289 

dev loss: 0.4812888911727703, dev_acc: 0.8073318305848856, dev_f1-macro: 0.7601134399227664
Epoch 2
-------------------------------


  0%|          | 1/329 [00:02<15:35,  2.85s/it]

loss: 0.347230  [    0/21029]


 16%|█▌        | 51/329 [02:26<13:14,  2.86s/it]

loss: 0.509007  [ 3200/21029]


 31%|███       | 101/329 [04:50<10:54,  2.87s/it]

loss: 0.342529  [ 6400/21029]


 46%|████▌     | 151/329 [07:13<08:31,  2.87s/it]

loss: 0.431751  [ 9600/21029]


 61%|██████    | 201/329 [09:36<06:06,  2.86s/it]

loss: 0.478671  [12800/21029]


 76%|███████▋  | 251/329 [12:00<03:42,  2.86s/it]

loss: 0.397747  [16000/21029]


 91%|█████████▏| 301/329 [14:23<01:20,  2.87s/it]

loss: 0.493945  [19200/21029]


100%|██████████| 329/329 [15:42<00:00,  2.87s/it]


train loss: 0.43014155572852103, train_acc: 0.8359883969756051, train_f1-macro: 0.8265191668928779


100%|██████████| 132/132 [02:10<00:00,  1.01it/s]


Eval Error: 
 F1-macro: 0.7926728758927161 Accuracy: 82.2%, Avg loss: 0.446052 

dev loss: 0.44605237685821275, dev_acc: 0.82180567089809, dev_f1-macro: 0.7926728758927161
Epoch 3
-------------------------------


  0%|          | 1/329 [00:02<15:27,  2.83s/it]

loss: 0.424692  [    0/21029]


 16%|█▌        | 51/329 [02:26<13:13,  2.85s/it]

loss: 0.249789  [ 3200/21029]


 31%|███       | 101/329 [04:50<10:55,  2.87s/it]

loss: 0.209470  [ 6400/21029]


 46%|████▌     | 151/329 [07:13<08:29,  2.86s/it]

loss: 0.154601  [ 9600/21029]


 61%|██████    | 201/329 [09:37<06:07,  2.87s/it]

loss: 0.333394  [12800/21029]


 76%|███████▋  | 251/329 [12:00<03:43,  2.87s/it]

loss: 0.164642  [16000/21029]


 91%|█████████▏| 301/329 [14:24<01:20,  2.87s/it]

loss: 0.328582  [19200/21029]


100%|██████████| 329/329 [15:43<00:00,  2.87s/it]


train loss: 0.2778209178145171, train_acc: 0.8930524513766703, train_f1-macro: 0.8985298099956595


100%|██████████| 132/132 [02:10<00:00,  1.01it/s]


Eval Error: 
 F1-macro: 0.788402649516408 Accuracy: 82.2%, Avg loss: 0.466572 

dev loss: 0.46657221515973407, dev_acc: 0.8221615850041524, dev_f1-macro: 0.788402649516408
Epoch 4
-------------------------------


  0%|          | 1/329 [00:02<15:38,  2.86s/it]

loss: 0.132309  [    0/21029]


 16%|█▌        | 51/329 [02:26<13:18,  2.87s/it]

loss: 0.179246  [ 3200/21029]


 31%|███       | 101/329 [04:49<10:52,  2.86s/it]

loss: 0.241026  [ 6400/21029]


 46%|████▌     | 151/329 [07:13<08:30,  2.87s/it]

loss: 0.135611  [ 9600/21029]


 61%|██████    | 201/329 [09:36<06:05,  2.86s/it]

loss: 0.136766  [12800/21029]


 76%|███████▋  | 251/329 [11:59<03:42,  2.86s/it]

loss: 0.270681  [16000/21029]


 91%|█████████▏| 301/329 [14:23<01:20,  2.87s/it]

loss: 0.118557  [19200/21029]


100%|██████████| 329/329 [15:42<00:00,  2.86s/it]


train loss: 0.16677204091498193, train_acc: 0.9416044509962432, train_f1-macro: 0.9489501979470206


100%|██████████| 132/132 [02:09<00:00,  1.02it/s]


Eval Error: 
 F1-macro: 0.7997737943588249 Accuracy: 82.1%, Avg loss: 0.520414 

dev loss: 0.5204135826365515, dev_acc: 0.8213311187566734, dev_f1-macro: 0.7997737943588249
Epoch 5
-------------------------------


  0%|          | 1/329 [00:02<15:32,  2.84s/it]

loss: 0.076597  [    0/21029]


 16%|█▌        | 51/329 [02:26<13:14,  2.86s/it]

loss: 0.075208  [ 3200/21029]


 31%|███       | 101/329 [04:50<10:55,  2.88s/it]

loss: 0.063540  [ 6400/21029]


 46%|████▌     | 151/329 [07:13<08:29,  2.86s/it]

loss: 0.160256  [ 9600/21029]


 61%|██████    | 201/329 [09:37<06:06,  2.86s/it]

loss: 0.069967  [12800/21029]


 76%|███████▋  | 251/329 [12:00<03:43,  2.87s/it]

loss: 0.058142  [16000/21029]


 91%|█████████▏| 301/329 [14:24<01:20,  2.87s/it]

loss: 0.115504  [19200/21029]


100%|██████████| 329/329 [15:43<00:00,  2.87s/it]


train loss: 0.10214090670563711, train_acc: 0.9666650815540444, train_f1-macro: 0.9720584739788466


100%|██████████| 132/132 [02:10<00:00,  1.01it/s]


Eval Error: 
 F1-macro: 0.8106873491967906 Accuracy: 83.0%, Avg loss: 0.538846 

dev loss: 0.5388461945183349, dev_acc: 0.8296357812314628, dev_f1-macro: 0.8106873491967906


Now evaluate the best checkpoint on the test set.

In [None]:
# Rebuild the architecture, then overwrite every weight with the checkpoint
# that scored the best dev macro-F1 during training.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(enc.classes_)
)
# map_location lets a checkpoint saved from GPU tensors load on whatever
# device this session has; weights_only=True restricts unpickling to plain
# tensors/containers, which is all a state_dict holds, and avoids the
# warning torch.load emits when the argument is omitted.
state_dict = torch.load("model.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model = model.to(device)
test_loss, test_acc, test_f1 = eval_loop(test_loader, model, loss_fn, device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load("model.pth")
100%|██████████| 165/165 [02:43<00:00,  1.01it/s]

Eval Error: 
 F1-macro: 0.7992194086909281 Accuracy: 82.0%, Avg loss: 0.572918 




