Class Balance in Training Set - Experiment 2 - distilBERT 

In [None]:
# Single seed reused for every random number generator seeded in this cell.
R_SEED = 123

import transformers
from transformers import set_seed
# Seed through the transformers helper first ...
set_seed(R_SEED)

# ... then seed Python's built-in RNG explicitly,
import random
random.seed(R_SEED)

# NumPy's global RNG,
import numpy as np
np.random.seed(R_SEED)

# and PyTorch (CPU generator plus the current and all CUDA devices).
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
torch.manual_seed(R_SEED)
torch.cuda.manual_seed(R_SEED)
torch.cuda.manual_seed_all(R_SEED)
# cuDNN flags set for reproducibility: no benchmark-based algorithm
# selection, deterministic algorithms requested.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

cuda


In [None]:
# Import all required packages
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score

Get the dataset from Kaggle and split the data into training, dev and test sets.

In [None]:
!gdown 17-D6ZC8k2XFh3XbjMkbeoSVSsf8vGie6

Downloading...
From: https://drive.google.com/uc?id=17-D6ZC8k2XFh3XbjMkbeoSVSsf8vGie6
To: /content/Combined Data.csv
100% 31.5M/31.5M [00:00<00:00, 88.4MB/s]


In [None]:
!head -n 5 Combined\ Data.csv

,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless heart. All out of tune",Anxiety
2,"All wrong, back off dear, forward doubt. Stay in a restless and restless place",Anxiety
3,I've shifted my focus to something else but I'm still worried,Anxiety


Drop the rows that contain NaN values.

In [None]:
# Load the CSV, then discard every row that has a missing value.
data = pd.read_csv('/content/Combined Data.csv')
data = data.dropna()
# X holds the text column, y the label column.
X, y = data['statement'], data['status']

Map each label to a distinct integer (from 0 to 6, since there are 7 different labels).

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
enc = preprocessing.LabelEncoder()
encoded_y = enc.fit_transform(y)
print(list(enc.classes_))
# decoded_y = enc.inverse_transform(encoded_y)

['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal']


In [None]:
# Fraction held out at each of the two split stages below.
dev_pcrt = 0.2

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# Stage 1: hold out the test set (dev_pcrt of all rows), stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
  X, encoded_y,
  test_size=dev_pcrt, random_state=R_SEED, shuffle=True, stratify=encoded_y,
)

# Stage 2: hold out the dev set from what is left. The same fraction is
# applied to the remaining rows, i.e. 0.2 * 0.8 = 16% of the full dataset.
X_train, X_dev, y_train, y_dev = train_test_split(
  X_train, y_train,
  test_size=dev_pcrt, random_state=R_SEED, shuffle=True, stratify=y_train,
)

# Class balance only in the training set: every class is capped at
# len(X_train) // n_classes samples. Classes smaller than the cap are kept
# whole, so the result is less imbalanced rather than perfectly uniform.
n_classes = len(enc.classes_)
train_max_per_class = len(X_train) // n_classes
train_df = pd.DataFrame({"X": list(X_train), "y": y_train})

# Sample each class explicitly instead of groupby("y").apply(...): apply()
# operating on the grouping column triggers a deprecation warning in recent
# pandas, and the "y" column would be missing from the result once that
# behaviour is removed. Groups are visited in sorted label order and each one
# is sampled with the same seed, so the selected rows and their order match
# what apply() produced.
train_filtered_df = pd.concat(
    [
        group.sample(min(len(group), train_max_per_class), random_state=R_SEED)
        for _, group in train_df.groupby("y")
    ]
).reset_index(drop=True)
X_train = np.array(train_filtered_df["X"].tolist())
y_train = train_filtered_df["y"].values

# Report the per-class sample count after capping (0 for an absent class).
label_counts = train_filtered_df["y"].value_counts()
for label in range(n_classes):
    print(f"Label {label}: {label_counts.get(label, 0)}")


Label 0: 2458
Label 1: 1777
Label 2: 4816
Label 3: 4816
Label 4: 690
Label 5: 1656
Label 6: 4816


  train_filtered_df = train_df.groupby("y").apply(lambda x: x.sample(min(len(x), train_max_per_class), random_state=R_SEED)).reset_index(drop=True)


In [None]:
# Remove names that the cells below no longer reference (this includes
# R_SEED, so nothing after this point can re-seed with it).
del encoded_y, R_SEED, X, data, dev_pcrt, y

Define the 3 necessary methods of the dataset class:
* `__init__`: Executed whenever an instance of the class is created.
* `__getitem__`: Allows you to access any element using an index.
* `__len__`: Returns the number of elements.

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Dataset that tokenizes every text once, at construction time.

    Each stored item is an ``(encoding, label)`` pair, where ``encoding`` is
    the mapping returned by the tokenizer with the leading batch dimension
    squeezed away.

    Args:
        X_data: Iterable of texts.
        y_data: Iterable of labels aligned with ``X_data`` (pairs are built
            with ``zip``, so extra elements of the longer one are ignored).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` and
            returning a mapping of tensors.
        max_length: Length every text is padded/truncated to. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, X_data, y_data, tokenizer, max_length=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        for text, label in zip(X_data, y_data):
            encoded = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            # The tokenizer returns tensors with a batch dimension of 1; drop
            # it so that collate_fn can stack items into a batch later.
            encoded = {key: value.squeeze(0) for key, value in encoded.items()}
            self.data.append((encoded, label))

    def __getitem__(self, idx):
        """Return the ``(encoding, label)`` pair stored at position ``idx``."""
        sent, lbl = self.data[idx]
        return sent, lbl

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

def collate_fn(batch):
    """Merge a list of ``(encoding, label)`` samples into one batch.

    Returns a pair ``(batch_inputs, batch_labels)``: a dict holding, for each
    key of the first sample's encoding, the stacked tensors of all samples,
    and a ``torch.long`` tensor with the labels.
    """
    # Split the samples into parallel lists of encodings and labels.
    encodings, targets = map(list, zip(*batch))

    # One stacked tensor per key (keys are taken from the first sample).
    stacked = {}
    for name in encodings[0].keys():
        stacked[name] = torch.stack([enc[name] for enc in encodings])

    return stacked, torch.tensor(targets, dtype=torch.long)

Load the pretrained tokenizer (in our case, for the DistilBERT architecture) and build the tokenized datasets.

In [None]:
# https://huggingface.co/distilbert/distilbert-base-cased
MODEL_NAME = 'distilbert/distilbert-base-cased'
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

# Build one tokenized Dataset per split (train, dev, test - in that order),
# all sharing the same tokenizer.
train_set, dev_set, test_set = (
    Dataset(texts, labels, tokenizer)
    for texts, labels in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test))
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Drop the raw splits and the tokenizer: from here on only the tokenized
# train_set / dev_set / test_set objects are used.
del X_dev, X_test, X_train, tokenizer, y_dev, y_train, y_test

EXPERIMENT 2 - EXPERIMENTAL SETTING + WEIGHT DECAY

In [None]:
# Hyperparameters
BATCH_SIZE = 64  # samples per batch for the train/dev/test DataLoaders
LR = 1e-5  # learning rate handed to the optimizer
EPOCHS = 5  # number of passes over the training set
WEIGHT_DECAY = 1e-2  # weight-decay coefficient passed to torch.optim.AdamW

In [None]:
# Only the training data is reshuffled; dev and test keep a fixed order.
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
dev_loader = torch.utils.data.DataLoader(
    dev_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields ``(inputs, labels)`` batches, where ``inputs`` is a
            dict of tensors forwarded to the model as keyword arguments.
        model: Model whose output exposes a ``.logits`` attribute.
        loss_fn: Callable computing the loss from ``(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        ``(train_loss, train_acc, f1)``: the loss averaged over batches, the
        accuracy over all samples, and the macro-averaged F1 score.
    """
    model.train()
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    running_loss = 0
    n_correct = 0
    predicted, expected = [], []

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, (inputs, targets) in progress:
        inputs = {name: value.to(device) for name, value in inputs.items()}
        targets = targets.to(device)

        # Forward pass and loss.
        logits = model(**inputs).logits
        batch_loss = loss_fn(logits, targets)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Accumulate the running statistics for this batch.
        loss_value = batch_loss.item()
        running_loss += loss_value
        batch_preds = logits.argmax(1)
        n_correct += (batch_preds == targets).type(torch.float).sum().item()

        predicted.extend(batch_preds.cpu().numpy())
        expected.extend(targets.cpu().numpy())

        # Progress report every 50 batches.
        if step % 50 == 0:
            seen = step * len(targets)
            print(f"loss: {loss_value:>7f}  [{seen:>5d}/{n_samples:>5d}]")

    macro_f1 = f1_score(expected, predicted, average="macro")

    return running_loss / n_batches, n_correct / n_samples, macro_f1

In [None]:
def eval_loop(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        dataloader: Yields ``(inputs, labels)`` batches, where ``inputs`` is a
            dict of tensors forwarded to the model as keyword arguments.
        model: Model whose output exposes a ``.logits`` attribute.
        loss_fn: Callable computing the loss from ``(logits, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(test_loss, test_acc, f1)``: the loss averaged over batches, the
        accuracy over all samples, and the macro-averaged F1 score. The same
        figures are also printed.
    """
    model.eval()
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    total_loss = 0
    n_correct = 0
    predicted, expected = [], []

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader):
            inputs = {name: value.to(device) for name, value in inputs.items()}
            targets = targets.to(device)

            logits = model(**inputs).logits
            total_loss += loss_fn(logits, targets).item()

            batch_preds = logits.argmax(1)
            n_correct += (batch_preds == targets).type(torch.float).sum().item()
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    macro_f1 = f1_score(expected, predicted, average="macro")

    avg_loss = total_loss / n_batches
    accuracy = n_correct / n_samples
    print(f"Eval Error: \n F1-macro: {macro_f1} Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")
    return avg_loss, accuracy, macro_f1

In [None]:
# Fine-tune the classifier and keep the checkpoint with the best dev macro-F1.
best_dev_f1 = 0.0

model = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(enc.classes_)
).to(device)
# AdamW is the only optimizer created here. It was previously overwritten on
# the next line by a plain Adam without weight decay, so WEIGHT_DECAY was
# never applied although this experiment is about weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
loss_fn = torch.nn.CrossEntropyLoss()

for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss, train_acc, train_f1 = train_loop(train_loader, model, loss_fn, optimizer, device)
    print(f'train loss: {train_loss}, train_acc: {train_acc}, train_f1-macro: {train_f1}')
    dev_loss, dev_acc, dev_f1 = eval_loop(dev_loader, model, loss_fn, device)
    print(f'dev loss: {dev_loss}, dev_acc: {dev_acc}, dev_f1-macro: {dev_f1}')
    if dev_f1 > best_dev_f1:  # current dev macro-F1 beats the best value so far
        best_dev_f1 = dev_f1  # remember it
        torch.save(model.state_dict(), 'model.pth')  # checkpoint the weights to model.pth

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
-------------------------------


  0%|          | 1/329 [00:03<18:57,  3.47s/it]

loss: 1.955739  [    0/21029]


 16%|█▌        | 51/329 [02:14<12:35,  2.72s/it]

loss: 1.505585  [ 3200/21029]


 31%|███       | 101/329 [04:33<10:41,  2.81s/it]

loss: 1.050437  [ 6400/21029]


 46%|████▌     | 151/329 [06:55<08:25,  2.84s/it]

loss: 0.967217  [ 9600/21029]


 61%|██████    | 201/329 [09:17<06:04,  2.85s/it]

loss: 0.736712  [12800/21029]


 76%|███████▋  | 251/329 [11:39<03:42,  2.86s/it]

loss: 0.674690  [16000/21029]


 91%|█████████▏| 301/329 [14:02<01:19,  2.85s/it]

loss: 0.751257  [19200/21029]


100%|██████████| 329/329 [15:21<00:00,  2.80s/it]


train loss: 1.0128487194562756, train_acc: 0.6274668315183793, train_f1-macro: 0.5264375485713746


100%|██████████| 132/132 [02:05<00:00,  1.05it/s]


Eval Error: 
 F1-macro: 0.6913016250405538 Accuracy: 77.1%, Avg loss: 0.584032 

dev loss: 0.5840324432109342, dev_acc: 0.770791315695812, dev_f1-macro: 0.6913016250405538
Epoch 2
-------------------------------


  0%|          | 1/329 [00:02<15:38,  2.86s/it]

loss: 0.520125  [    0/21029]


 16%|█▌        | 51/329 [02:25<13:13,  2.85s/it]

loss: 0.605971  [ 3200/21029]


 31%|███       | 101/329 [04:47<10:49,  2.85s/it]

loss: 0.522854  [ 6400/21029]


 46%|████▌     | 151/329 [07:10<08:26,  2.85s/it]

loss: 0.464663  [ 9600/21029]


 61%|██████    | 201/329 [09:32<06:04,  2.85s/it]

loss: 0.716294  [12800/21029]


 76%|███████▋  | 251/329 [11:55<03:42,  2.85s/it]

loss: 0.494416  [16000/21029]


 91%|█████████▏| 301/329 [14:17<01:19,  2.85s/it]

loss: 0.537180  [19200/21029]


100%|██████████| 329/329 [15:36<00:00,  2.85s/it]


train loss: 0.5789898316005081, train_acc: 0.7866279899186838, train_f1-macro: 0.7475072291311492


100%|██████████| 132/132 [02:05<00:00,  1.05it/s]


Eval Error: 
 F1-macro: 0.7539378014624345 Accuracy: 80.3%, Avg loss: 0.496830 

dev loss: 0.49682977479515655, dev_acc: 0.8028235852414284, dev_f1-macro: 0.7539378014624345
Epoch 3
-------------------------------


  0%|          | 1/329 [00:02<15:35,  2.85s/it]

loss: 0.667715  [    0/21029]


 16%|█▌        | 51/329 [02:25<13:12,  2.85s/it]

loss: 0.519354  [ 3200/21029]


 31%|███       | 101/329 [04:47<10:50,  2.85s/it]

loss: 0.359502  [ 6400/21029]


 46%|████▌     | 151/329 [07:10<08:26,  2.84s/it]

loss: 0.341721  [ 9600/21029]


 61%|██████    | 201/329 [09:32<06:05,  2.85s/it]

loss: 0.414674  [12800/21029]


 76%|███████▋  | 251/329 [11:55<03:41,  2.85s/it]

loss: 0.251545  [16000/21029]


 91%|█████████▏| 301/329 [14:17<01:19,  2.85s/it]

loss: 0.453994  [19200/21029]


100%|██████████| 329/329 [15:36<00:00,  2.85s/it]


train loss: 0.46181387144019176, train_acc: 0.8273812354367778, train_f1-macro: 0.8089827195360855


100%|██████████| 132/132 [02:05<00:00,  1.05it/s]


Eval Error: 
 F1-macro: 0.7707806701620157 Accuracy: 80.9%, Avg loss: 0.482527 

dev loss: 0.48252686734000844, dev_acc: 0.8094673152212599, dev_f1-macro: 0.7707806701620157
Epoch 4
-------------------------------


  0%|          | 1/329 [00:02<15:34,  2.85s/it]

loss: 0.413559  [    0/21029]


 16%|█▌        | 51/329 [02:24<13:11,  2.85s/it]

loss: 0.535982  [ 3200/21029]


 31%|███       | 101/329 [04:47<10:50,  2.85s/it]

loss: 0.459278  [ 6400/21029]


 46%|████▌     | 151/329 [07:09<08:28,  2.86s/it]

loss: 0.378865  [ 9600/21029]


 61%|██████    | 201/329 [09:32<06:04,  2.85s/it]

loss: 0.420320  [12800/21029]


 76%|███████▋  | 251/329 [11:55<03:42,  2.85s/it]

loss: 0.372520  [16000/21029]


 91%|█████████▏| 301/329 [14:17<01:19,  2.85s/it]

loss: 0.480846  [19200/21029]


100%|██████████| 329/329 [15:36<00:00,  2.85s/it]


train loss: 0.3830320587817659, train_acc: 0.8563887964239859, train_f1-macro: 0.8497116616528079


100%|██████████| 132/132 [02:05<00:00,  1.05it/s]


Eval Error: 
 F1-macro: 0.7793236477952014 Accuracy: 81.2%, Avg loss: 0.490625 

dev loss: 0.4906249310482632, dev_acc: 0.8116027998576344, dev_f1-macro: 0.7793236477952014
Epoch 5
-------------------------------


  0%|          | 1/329 [00:02<15:27,  2.83s/it]

loss: 0.295419  [    0/21029]


 16%|█▌        | 51/329 [02:24<13:11,  2.85s/it]

loss: 0.190315  [ 3200/21029]


 31%|███       | 101/329 [04:47<10:49,  2.85s/it]

loss: 0.309047  [ 6400/21029]


 46%|████▌     | 151/329 [07:09<08:27,  2.85s/it]

loss: 0.412834  [ 9600/21029]


 61%|██████    | 201/329 [09:32<06:03,  2.84s/it]

loss: 0.250641  [12800/21029]


 76%|███████▋  | 251/329 [11:54<03:42,  2.85s/it]

loss: 0.298265  [16000/21029]


 91%|█████████▏| 301/329 [14:17<01:19,  2.84s/it]

loss: 0.331336  [19200/21029]


100%|██████████| 329/329 [15:35<00:00,  2.84s/it]


train loss: 0.3230405119233581, train_acc: 0.8784060107470636, train_f1-macro: 0.8788506090793632


100%|██████████| 132/132 [02:05<00:00,  1.05it/s]


Eval Error: 
 F1-macro: 0.7880847088744707 Accuracy: 81.8%, Avg loss: 0.472963 

dev loss: 0.47296314709114307, dev_acc: 0.817534701625341, dev_f1-macro: 0.7880847088744707


In [None]:
# Rebuild the architecture, then restore the best checkpoint saved during
# training and score it on the held-out test set.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(enc.classes_)
)
# map_location lets the checkpoint load on whatever device is available now
# (e.g. CPU-only after training on GPU); weights_only=True restricts
# unpickling to tensors/containers, which is all a state_dict holds, and
# avoids the warning torch.load emits when the argument is left unset.
state_dict = torch.load("model.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model = model.to(device)
test_loss, test_acc, test_f1 = eval_loop(test_loader, model, loss_fn, device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load("model.pth")
100%|██████████| 165/165 [02:26<00:00,  1.13it/s]

Eval Error: 
 F1-macro: 0.7873050998803223 Accuracy: 81.9%, Avg loss: 0.478841 




