In [17]:
from datetime import datetime
import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AdamW, get_scheduler
)
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch
from torch import cuda
from tqdm.auto import tqdm

In [18]:
""" Label formatting
"""
df = pd.read_csv("./data/train.csv")

# Label names are the CSV header entries after the first two columns.
labels = list(df.columns[2:])
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Gather each row's label columns into one list of floats.
df["labels"] = df.iloc[:, 2:].to_numpy().astype("float").tolist()
# Only the first 100000 rows are kept; select df[["comment_text", "labels"]]
# without the head() call to keep every row.
df = df[["comment_text", "labels"]].head(100000).reset_index(drop=True)
df.head()

Unnamed: 0,comment_text,labels
0,Explanation\nWhy the edits made under my usern...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,D'aww! He matches this background colour I'm s...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,"Hey man, I'm really not trying to edit war. It...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,"You, sir, are my hero. Any chance you remember...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [19]:
""" Load Huggingface dataset from in-memory data
"""
# Wrap the in-memory pandas frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df=df)

In [20]:
""" Encoding dataset (TOKENIZATION) for later use with BERT model
 - input_ids
 - attention_mask
 - token_type_ids
"""
# Token length that preprocess() pads/truncates every comment to.
MAX_LEN = 200
tokenizer_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=tokenizer_checkpoint
)

def preprocess(samples):
    """Tokenize a batch of comments with the module-level pretrained tokenizer.

    Args:
        samples: Batch mapping handed over by ``Dataset.map``; only its
            ``"comment_text"`` entry is read.

    Returns:
        The tokenizer output for the batch, padded/truncated to ``MAX_LEN``
        tokens and including token type ids. Labels are not touched here.
    """
    encoding_options = dict(
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_token_type_ids=True,
    )
    return tokenizer(text=samples["comment_text"], **encoding_options)

In [21]:
""" Tokenize texts in dataset
"""
# Run preprocess() over the dataset in batches, spread over 8 processes.
map_options = {"batched": True, "num_proc": 8}
encoded_dataset = dataset.map(function=preprocess, **map_options)

In [22]:
""" Train/Test split
"""
TEST_SIZE = 0.2
SEED = 42
# Seeded split; note that this re-binds encoded_dataset to the split result,
# so the cell cannot simply be re-run without re-running the tokenization cell.
encoded_dataset = encoded_dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)

# The raw text is no longer needed once it has been tokenized.
raw_text_columns = ["comment_text"]
train_dataset = encoded_dataset["train"].remove_columns(raw_text_columns)
val_dataset = encoded_dataset["test"].remove_columns(raw_text_columns)

# Set output format
for split in (train_dataset, val_dataset):
    split.set_format("torch")

In [23]:
""" Create Train/Test Dataloader (use in training loop)
"""
BATCH_SIZE = 16
# Settings shared by both loaders; only the training loader shuffles.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=4)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(dataset=val_dataset, **loader_options)

In [24]:
""" Model Definition
"""
LR = 5e-5 # learning rate
N_EPOCHS = 3
# The scheduler is stepped once per training batch, so the schedule length is
# the total number of batches over all epochs.
N_TRAIN_STEPS = N_EPOCHS * len(train_dataloader)

# Define model (with multi-label configuration)
model_ckpt = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_ckpt,
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(labels)
)

# Optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the defaults of the transformers class
# (torch.optim.AdamW defaults to eps=1e-8, weight_decay=1e-2) so that the
# optimisation setup stays the same.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LR,
    eps=1e-6,
    weight_decay=0.0)

# LR scheduler: linear decay over N_TRAIN_STEPS with no warmup
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=N_TRAIN_STEPS)

# CPU/GPU switching
device = torch.device("cuda:0" if cuda.is_available() else "cpu")
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [25]:
""" Training Loop
"""
# Number of training batches averaged into each logged "Train Loss" point.
n_batch = 50

training_bar = tqdm(range(N_TRAIN_STEPS))
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Each run gets its own timestamped sub-directory so that successive runs do
# not pile their event files into the same TensorBoard run.
tb_writer = SummaryWriter(log_dir=f"./tensorboard/runs/{timestamp}")

try:
    for epoch in range(N_EPOCHS):
        print(f"EPOCH {epoch + 1}")
        # Phase 1 (train)
        accumulate_loss = 0.
        # Average of the most recent n_batch-sized window; stays 0.0 when the
        # epoch has fewer than n_batch batches.
        train_loss = 0.
        model.train()
        for idx, batch in enumerate(train_dataloader):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss # compute loss
            accumulate_loss += loss.item() # accumulate loss for visualizing later

            if (idx + 1) % n_batch == 0:
                train_loss = accumulate_loss / n_batch
                print(f"Batch {idx + 1} | loss = {train_loss:.3f}")
                # Batch counter that keeps increasing across epochs
                global_step = epoch * len(train_dataloader) + (idx + 1)
                tb_writer.add_scalar(tag="Train Loss",
                                     scalar_value=train_loss,
                                     global_step=global_step)
                accumulate_loss = 0.

            # compute loss grads w.r.t the model's params
            loss.backward()
            # update params + optimizer
            optimizer.step()
            lr_scheduler.step()
            # zero out prev gradients
            optimizer.zero_grad()
            training_bar.update(1)

        # Phase 2 (eval)
        validation_loss = 0.
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)
                validation_loss += outputs.loss.item()
        # Divide by the loader's own batch count rather than a leaked loop
        # index, and leave the loss at 0.0 if there are no validation batches.
        n_val_batches = len(val_dataloader)
        if n_val_batches:
            validation_loss = validation_loss / n_val_batches
        # A non-empty main_tag gives the combined train/validation chart a name.
        tb_writer.add_scalars(main_tag="Loss",
                              tag_scalar_dict={"Training": train_loss,
                                               "Validation": validation_loss},
                              global_step=epoch+1)
        tb_writer.flush()
finally:
    # Release the event-file writer and the progress bar even if training
    # is interrupted or raises.
    tb_writer.close()
    training_bar.close()

  0%|          | 0/15000 [00:00<?, ?it/s]

EPOCH 1
Batch 50 | loss = 0.249
Batch 100 | loss = 0.101
Batch 150 | loss = 0.078
Batch 200 | loss = 0.073
Batch 250 | loss = 0.074
Batch 300 | loss = 0.065
Batch 350 | loss = 0.058
Batch 400 | loss = 0.056
Batch 450 | loss = 0.063
Batch 500 | loss = 0.072
Batch 550 | loss = 0.074
Batch 600 | loss = 0.062
Batch 650 | loss = 0.056
Batch 700 | loss = 0.055
Batch 750 | loss = 0.053
Batch 800 | loss = 0.048
Batch 850 | loss = 0.052
Batch 900 | loss = 0.056
Batch 950 | loss = 0.053
Batch 1000 | loss = 0.059
Batch 1050 | loss = 0.050
Batch 1100 | loss = 0.061
Batch 1150 | loss = 0.048
Batch 1200 | loss = 0.053
Batch 1250 | loss = 0.053
Batch 1300 | loss = 0.055
Batch 1350 | loss = 0.051
Batch 1400 | loss = 0.052
Batch 1450 | loss = 0.054
Batch 1500 | loss = 0.054
Batch 1550 | loss = 0.052
Batch 1600 | loss = 0.057
Batch 1650 | loss = 0.052
Batch 1700 | loss = 0.045
Batch 1750 | loss = 0.037
Batch 1800 | loss = 0.055
Batch 1850 | loss = 0.059
Batch 1900 | loss = 0.054
Batch 1950 | loss = 0.04