# Preprocessing for BERT 

In [1]:
import datasets
import pickle
import random
import re
import torch
import transformers
import wandb
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict

from functools import partial

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, \
                            ConfusionMatrixDisplay, matthews_corrcoef, average_precision_score
from sklearn.model_selection import train_test_split

from torch.nn import BCEWithLogitsLoss
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

from transformers import BigBirdForSequenceClassification, RobertaForSequenceClassification, \
                            Trainer, TrainingArguments, \
                            BatchEncoding, EvalPrediction, AutoTokenizer, BigBirdForMaskedLM

%cd /

/


## Preprocessing

In [4]:
%cd /data-imperial
train_data = pd.read_pickle("saved/multilabel_text_with_preds.pickle")

/data-imperial


In [None]:
eval_data = pd.read_pickle("saved/eval_text_with_labels.pickle")

In [8]:
## truncating longer sequences 
max_token_length = 2048
train_data['text'] = train_data.text.apply(lambda x: x if len(x.split()) < max_token_length else " ".join(x.split()[:max_token_length]) )

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bigbird")

In [10]:
## padding to minimum length
min_token_length = 704
train_data['text'] = train_data.text.apply(lambda x: x if len(x.split()) >= min_token_length 
                                           else x + ((" " + tokenizer.pad_token)*(min_token_length - len(x.split()))))

In [11]:
## sorting the dataset by length
train_data['length'] = train_data.text.apply(lambda x: len(x.split()))
train_data = train_data.sort_values('length', axis=0)

In [13]:
## creating the dataset
labels = 'substance'
train_data = pd.DataFrame(train_data.set_index("text").loc[:,labels])
train_data = train_data.astype({labels: 'int32'})
train_data.columns = ['labels']

In [31]:
pickle.dump(train_data, open("saved/untokenized.pickle", "wb"))

## Tokenizing

In [4]:
%cd /data-imperial/

train_data = pickle.load(open('saved/untokenized.pickle', 'rb'))

/data-imperial


In [14]:
train_data, test_data = train_test_split(train_data, test_size=0.1)

train_data = datasets.Dataset.from_pandas(train_data)
test_data = datasets.Dataset.from_pandas(test_data)

In [16]:
tokenizer = AutoTokenizer.from_pretrained("bigbird")

In [17]:
def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = False, truncation=True, max_length = 2048)

train_data = train_data.map(tokenization, batched = True, batch_size = len(train_data))
test_data = test_data.map(tokenization, batched = True, batch_size = len(test_data))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [3]:
# %cd /
# tokenizer = AutoTokenizer.from_pretrained('google/bigbird-roberta-base', 
#                                           max_length = 2048,
#                                           cache_dir="/data-imperial/cache")
# tokenizer.save_pretrained("data-imperial/bigbird")

('data-imperial/bigbird/tokenizer_config.json',
 'data-imperial/bigbird/special_tokens_map.json',
 'data-imperial/bigbird/spiece.model',
 'data-imperial/bigbird/added_tokens.json',
 'data-imperial/bigbird/tokenizer.json')

In [18]:
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [19]:
pickle.dump(train_data, open("saved/substance/single_train_ds.pickle", "wb"))
pickle.dump(test_data, open("saved/substance/single_test_ds.pickle", "wb"))

## Processing and Tokenizing for MLM Finetuning

In [2]:
%cd /data-imperial
train_data = pd.read_pickle("saved/working_text_with_labels.pickle")

/data-imperial


In [3]:
tokenizer = AutoTokenizer.from_pretrained("bigbird")

In [4]:
## truncating longer sequences 
max_token_length = 2048
train_data['text'] = train_data.text.apply(lambda x: x if len(x.split()) < max_token_length else " ".join(x.split()[:max_token_length]) )

In [5]:
## padding to minimum length
min_token_length = 704
train_data['text'] = train_data.text.apply(lambda x: x if len(x.split()) >= min_token_length 
                                           else x + ((" " + tokenizer.pad_token)*(min_token_length - len(x.split()))))

In [6]:
train, test = train_test_split(train_data, test_size=0.1)
_, extra_short = train_test_split(train_data, test_size=0.0001)

In [7]:
def tokenize(text):
    return tokenizer(text, 
                   return_tensors='pt', 
                   max_length=2048, 
                   truncation=True, 
                   padding="max_length")

train = tokenize(list(train.text))
test = tokenize(list(test.text))
extra_short = tokenize(list(extra_short.text))

In [8]:
train['labels'] = train.input_ids.detach().clone()
test['labels'] = test.input_ids.detach().clone()
extra_short['labels'] = extra_short.input_ids.detach().clone()

In [9]:
def make_mask(inputs): 
    rand = torch.rand(inputs.input_ids.shape)
    mask_arr = (rand < 0.15) * (inputs.input_ids != 1) * (inputs.input_ids != 2) * (inputs.input_ids != 0)

    selection = []
    for i in range(inputs.input_ids.shape[0]):
        selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())

    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 67
        
    return inputs
        
train = make_mask(train)
test = make_mask(test)
extra_short = make_mask(extra_short)

In [4]:
class ShoutDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [11]:
train = ShoutDataset(train)
test = ShoutDataset(test)
extra_short = ShoutDataset(extra_short)

In [12]:
pickle.dump(train, open("saved/mlm/train.pickle", "wb"))
pickle.dump(test, open("saved/mlm/test.pickle", "wb"))
pickle.dump(extra_short, open("saved/mlm/short.pickle", "wb"))

## Finetuning with MLM

In [None]:
%cd /data-imperial
train = pickle.load(open("saved/mlm/train.pickle", "rb"))
test = pickle.load(open("saved/mlm/test.pickle", "rb"))
extra_short = pickle.load(open("saved/mlm/short.pickle", "rb"))

In [3]:
model = BigBirdForMaskedLM.from_pretrained("bigbird")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

Some weights of the model checkpoint at bigbird were not used when initializing BigBirdForMaskedLM: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
- This IS expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForMaskedLM were not initialized from the model checkpoint at bigbird and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNor

device(type='cuda')

In [7]:
args = TrainingArguments(
    output_dir = 'bigbird/output/finetune',
    num_train_epochs = 5,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 32,    
    per_device_eval_batch_size= 2,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=160,
    weight_decay=0.01,
    logging_steps = 4,
    learning_rate = 1e-5,
    log_level = 'warning', 
    fp16 = True,
    logging_dir='bigbird/logs/finetune',
    dataloader_num_workers = 0,
    run_name = 'bigbird_classification_test'
)

In [8]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # argmax(pred.predictions, axis=1)
    #pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    mcc = matthews_corrcoef(labels, preds)
    cm = confusion_matrix(labels, preds, labels=[0,1])
    auprc = average_precision_score(labels, preds)
    print(cm)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc,
        'auprc': auprc
    }

In [9]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=test
)

In [10]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Epoch,Training Loss,Validation Loss
0,0.1615,0.143269
1,0.138,0.122171
2,0.1188,0.114133
3,0.1142,0.11101
4,0.117,0.1098


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  nn.utils.clip_grad_norm_(
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  nn.utils.clip_grad_norm_(
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2410, training_loss=0.14594357851506268, metrics={'train_runtime': 96583.7383, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.025, 'total_flos': 1.6361164666935706e+17, 'train_loss': 0.14594357851506268, 'epoch': 5.0})

## Training

In [2]:
%cd /data-imperial/

train_data = pickle.load(open('saved/substance/single_train_ds.pickle', 'rb'))
test_data = pickle.load(open('saved/substance/single_test_ds.pickle', 'rb'))

/data-imperial


In [3]:
model = BigBirdForSequenceClassification.from_pretrained('bigbird/output/substance/epoch-5',
                                                         gradient_checkpointing=False,
                                                         num_labels=2,
                                                         cache_dir='/data-imperial/cache',
                                                         return_dict=True)

# # model.save_pretrained("data-imperial/bigbird")

In [4]:
## from checkpoint
# checkpoint = "bigbird/output/ml/epoch-5" ## edit accordingly
# model = BigBirdForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained('bigbird')

In [5]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # argmax(pred.predictions, axis=1)
    #pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    mcc = matthews_corrcoef(labels, preds)
    cm = confusion_matrix(labels, preds, labels=[0,1])
    auprc = average_precision_score(labels, preds)
    print(cm)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc,
        'auprc': auprc
    }

In [6]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = 'bigbird/output/substance',
    num_train_epochs = 5,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 32,    
    per_device_eval_batch_size= 2,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=160,
    weight_decay=0.01,
    logging_steps = 4,
    learning_rate = 1e-5,
    log_level = 'warning', 
    fp16 = True,
    logging_dir='bigbird/logs/substance',
    dataloader_num_workers = 0,
    run_name = 'bigbird_classification_test'
)

In [7]:
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [8]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [9]:
# train the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc,Auprc
0,0.0706,0.032949,0.994713,0.0,0.0,0.0,0.0,0.005287
1,0.0031,0.018281,0.994713,0.0,0.0,0.0,0.0,0.005287
2,0.0013,0.018122,0.994713,0.0,0.0,0.0,0.0,0.005287
3,0.0024,0.019613,0.994348,0.340426,0.444444,0.275862,0.347474,0.126434
4,0.0015,0.022832,0.994348,0.340426,0.444444,0.275862,0.347474,0.126434


  nn.utils.clip_grad_norm_(


[[5456    0]
 [  29    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
wandb: Network error (ReadTimeout), entering retry loop.
[34m[1mwandb[0m: Network error resolved after 0:00:37.840137, resuming normal operation.


[[5456    0]
 [  29    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  nn.utils.clip_grad_norm_(


[[5456    0]
 [  29    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  nn.utils.clip_grad_norm_(


[[5446   10]
 [  21    8]]
[[5446   10]
 [  21    8]]


TrainOutput(global_step=3855, training_loss=0.023155309161323516, metrics={'train_runtime': 93375.4023, 'train_samples_per_second': 2.643, 'train_steps_per_second': 0.041, 'total_flos': 2.0114757042812214e+17, 'train_loss': 0.023155309161323516, 'epoch': 5.0})

### BigBird Evaluation

In [7]:
final_test_data = pickle.load(open("saved/working_eval_ds_final.pickle", "rb"))

In [8]:
trainer.evaluate(eval_dataset=final_test_data) ## need to compute final_test_data

The following columns in the evaluation set  don't have a corresponding argument in `BigBirdForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3608
  Batch size = 32
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


[[2028  340]
 [ 149 1091]]


[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.11.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{'eval_loss': 0.3115261495113373,
 'eval_accuracy': 0.8644678492239468,
 'eval_f1': 0.8169225009359791,
 'eval_precision': 0.7624039133473096,
 'eval_recall': 0.8798387096774194,
 'eval_mcc': 0.7147965832312904,
 'eval_auprc': 0.7120895928891416,
 'eval_runtime': 787.0118,
 'eval_samples_per_second': 4.584,
 'eval_steps_per_second': 0.144}

## Substance Use Big Bird

### Epoch 4

In [9]:
trainer.evaluate()

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


[[5446   10]
 [  21    8]]


[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{'eval_loss': 0.019612709060311317,
 'eval_accuracy': 0.9943482224247949,
 'eval_f1': 0.3404255319148936,
 'eval_precision': 0.4444444444444444,
 'eval_recall': 0.27586206896551724,
 'eval_mcc': 0.34747396641953576,
 'eval_auprc': 0.12643398750336166,
 'eval_runtime': 820.9527,
 'eval_samples_per_second': 6.681,
 'eval_steps_per_second': 3.341}

### Epoch 5

In [9]:
trainer.evaluate()

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


[[5446   10]
 [  21    8]]


[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{'eval_loss': 0.022831875830888748,
 'eval_accuracy': 0.9943482224247949,
 'eval_f1': 0.3404255319148936,
 'eval_precision': 0.4444444444444444,
 'eval_recall': 0.27586206896551724,
 'eval_mcc': 0.34747396641953576,
 'eval_auprc': 0.12643398750336166,
 'eval_runtime': 820.4853,
 'eval_samples_per_second': 6.685,
 'eval_steps_per_second': 3.343}

## Model Summary for BigBird and RoBERTa-base

In [7]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                            Param #
BigBirdForSequenceClassification                                  --
├─BigBirdModel: 1-1                                               --
│    └─BigBirdEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        38,674,944
│    │    └─Embedding: 3-2                                        3,145,728
│    │    └─Embedding: 3-3                                        1,536
│    │    └─LayerNorm: 3-4                                        1,536
│    │    └─Dropout: 3-5                                          --
│    └─BigBirdEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,054,464
│    └─Linear: 2-3                                                590,592
│    └─Tanh: 2-4                                                  --
├─BigBirdClassificationHead: 1-2                                

In [10]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                       Param #
RobertaForSequenceClassification                             --
├─RobertaModel: 1-1                                          --
│    └─RobertaEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   38,603,520
│    │    └─Embedding: 3-2                                   394,752
│    │    └─Embedding: 3-3                                   768
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─RobertaEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,054,464
├─RobertaClassificationHead: 1-2                             --
│    └─Linear: 2-3                                           590,592
│    └─Dropout: 2-4                                          --
│    └─Linear: 2-5                                           1,538
To