# Setup

In [1]:
%%capture
! pip install datasets metrics transformers wandb

In [2]:
import gc
import time

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch import nn
from transformers import BertModel, BertTokenizer
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.file_utils import ModelOutput

In [3]:
# Start from a clean slate: run a Python garbage-collection pass, then ask
# PyTorch to release the cached CUDA memory it is no longer using.
gc.collect()
torch.cuda.empty_cache()

# Custom BERT model

In [4]:
class BERTCNN(nn.Module):
    """BERT encoder followed by a small convolutional classification head.

    Every hidden state returned by BERT is stacked as one channel of an
    image-like tensor ``(batch, states, seq_len, hidden)``. A convolution
    spanning the full hidden width, a max-pool along the sequence axis and a
    linear layer reduce it to per-class log-probabilities.

    Args:
        model_name: checkpoint name passed to ``BertModel.from_pretrained``.
        device: device every sub-module is moved to.
        num_labels: number of output classes.
        max_length: padded sequence length of the inputs. The linear layer is
            sized from it; the default of 50 with ``bert-base-uncased``
            gives the 598 input features that used to be hard-coded.

    Raises:
        ValueError: if ``max_length`` is too short for the 3-wide convolution
            followed by the 3-wide pooling window.
    """

    def __init__(self, model_name='bert-base-uncased', device='cuda',
                 num_labels=3, max_length=50):
        super(BERTCNN, self).__init__()
        self.bert = BertModel.from_pretrained(model_name).to(device)

        # One channel per returned hidden state: the embedding output plus
        # one per encoder layer (13 channels of width 768 for bert-base).
        hidden_size = self.bert.config.hidden_size
        num_states = self.bert.config.num_hidden_layers + 1

        # 'valid' convolution with a 3-row kernel and stride-1 pooling with a
        # 3-row window each shorten the sequence axis by 2.
        pooled_length = max_length - 4
        if pooled_length < 1:
            raise ValueError(
                "max_length must be at least 5, got {}".format(max_length)
            )
        self.max_length = max_length

        self.conv = nn.Conv2d(in_channels=num_states, out_channels=num_states,
                              kernel_size=(3, hidden_size), padding='valid').to(device)
        self.relu = nn.ReLU()
        # 1D max pooling along the sequence axis.
        self.pool = nn.MaxPool2d(kernel_size=(3, 1), stride=1).to(device)
        self.dropout = nn.Dropout(0.1)
        # The flattened feature count follows from the conv/pool arithmetic
        # above, so it stays correct when max_length or the checkpoint changes.
        self.fc = nn.Linear(num_states * pooled_length, num_labels).to(device)
        self.flat = nn.Flatten()
        self.softmax = nn.LogSoftmax(dim=1).to(device)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run BERT and the CNN head.

        Args:
            input_ids: token ids padded to ``max_length``.
            attention_mask: attention mask matching ``input_ids``.
            labels: optional class indices; when given, a loss is computed.

        Returns:
            ModelOutput with ``loss`` (``None`` when ``labels`` is ``None``)
            and ``last_hidden_state``, which holds the per-class
            log-probabilities produced by the head (the key name is kept for
            compatibility with existing callers).

        Raises:
            ValueError: if the sequence length differs from ``max_length``.
        """
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            output_hidden_states=True)
        # outputs[2] is the tuple of hidden states; stack them on a new
        # channel axis -> (batch, states, seq_len, hidden).
        x = torch.stack(tuple(outputs[2]), dim=1)
        if x.size(2) != self.max_length:
            raise ValueError(
                "expected sequences of length {}, got {}".format(
                    self.max_length, x.size(2))
            )

        x = self.dropout(x)
        x = self.conv(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.pool(x)

        x = self.dropout(x)
        x = self.flat(x)
        x = self.dropout(x)
        x = self.fc(x)
        log_probs = self.softmax(x)

        # The head already outputs log-probabilities, so the matching loss is
        # NLLLoss; CrossEntropyLoss would apply log-softmax a second time
        # (same value, redundant work).
        loss = None
        if labels is not None:
            loss = nn.NLLLoss()(log_probs, labels)

        return ModelOutput({
            'loss': loss,
            'last_hidden_state': log_probs
        })

# Checkpoint shared by the tokenizer and the classifier.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)
# Alternative model: the custom BERT + CNN head defined in this cell.
# model = BERTCNN(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
model = model.to("cuda")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Pre-processing

In [5]:
def process_data_to_model_inputs(batch):
    """Tokenize one batch of rows and attach integer labels.

    Column "1" is passed to the module-level ``tokenizer`` and is padded or
    truncated to 50 tokens. Column "0" holds strings containing
    ``__label__<n>``; the text after that marker is parsed as the integer
    label.

    Args:
        batch: mapping of column name to a list of values, as supplied by a
            batched ``Dataset.map`` call.

    Returns:
        The same ``batch`` with ``input_ids``, ``attention_mask`` and
        ``label`` added.
    """
    encoded = tokenizer(
        batch["1"],
        truncation=True,
        padding="max_length",
        max_length=50,
    )

    batch["input_ids"] = encoded.input_ids
    batch["attention_mask"] = encoded.attention_mask
    # Keep what follows the "__label__" marker and parse it as an int.
    batch["label"] = [int(raw.split('__label__')[1]) for raw in batch['0']]

    return batch

def compute_metrics(pred):
    """Score predictions for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as
            the predicted class).

    Returns:
        Dict with ``f1_score``, ``accuracy``, ``precision`` and ``recall``.
        F1, precision and recall are binary scores with class 0 as the
        positive label.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Shared settings for the three binary scores computed with sklearn.
    binary_kwargs = dict(pos_label=0, average='binary')
    return {
        'f1_score': f1_score(y_true, y_pred, **binary_kwargs),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, **binary_kwargs),
        'recall': recall_score(y_true, y_pred, **binary_kwargs),
    }
    
# Number of rows handed to the tokenizer per `map` call.
tokenize_batch_size = 2048
# Tab-separated file without a header row, so pandas numbers the columns.
df = pd.read_csv(
    '/content/labeledEligibilitySample1000000.csv',
    header=None,
    sep='\t',
)

FileNotFoundError: ignored

In [None]:
# Convert the DataFrame to a Dataset and tokenize it batch by batch; the raw
# "0"/"1" columns are dropped once they have been processed.
train_ds = Dataset.from_pandas(df).map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=tokenize_batch_size,
    remove_columns=["0", "1"],
)

# Expose only the model-facing columns, as torch tensors.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

  0%|          | 0/489 [00:00<?, ?ba/s]

Exception ignored in: <function tqdm.__del__ at 0x7ff3d30f35f0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 1147, in __del__
    self.close()
  File "/usr/local/lib/python3.7/dist-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


In [None]:
# Fixed seed so the shuffled splits are identical on every run; without it
# the train/validation/test membership changes each time the cell executes.
SPLIT_SEED = 42

# 90% of the rows for training, 10% held out.
train_testvalid = train_ds.train_test_split(train_size=0.9, seed=SPLIT_SEED)
# Halve the held-out part: test_valid['train'] is used as the validation set
# and test_valid['test'] as the final test set further down.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

In [None]:
# Display the first two rows of the validation split to sanity-check the
# tokenized tensors.
test_valid['train'][:2]

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]]),
 'input_ids': tensor([[  101,  2817, 19388,  2024,  6970,  7512,  5644,  1012,  4167,  1998,
           2430,  6091,  2291, 21434, 11616,  1998,  2002, 24952,  2278,   102,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [  101,  2817, 19388,  2024,  3160, 20589,  3447,  1012,  2754,  4921,
           2278,  5490,  6692, 27711,  3526,  2482, 21081,  2863,  1997,  1996,
           2474, 18143,  2595, 11616,  1998,

# Training and evaluation

In [None]:
training_args = TrainingArguments(
    # Where things are written / reported.
    output_dir='./results',           # checkpoints
    logging_dir='./logs',             # logs
    report_to="wandb",
    # Optimisation schedule.
    num_train_epochs=2,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    fp16=True,                        # mixed-precision (fp16) training
    warmup_steps=500,                 # learning-rate warm-up steps
    weight_decay=0.01,
    # Log, evaluate and checkpoint on the same 400-step cadence.
    logging_steps=400,
    save_steps=400,
    evaluation_strategy="steps",
    # Reload the best checkpoint when training ends (default metric is loss;
    # set `metric_for_best_model` to rank by accuracy or another metric).
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_testvalid['train'],
    eval_dataset=test_valid['train'],   # validation split
    compute_metrics=compute_metrics,
)

trainer.train()

# Score the held-out test split and display its metrics.
preds = trainer.predict(test_valid['test'])

preds.metrics

Using amp half precision backend
***** Running training *****
  Num examples = 900000
  Num Epochs = 2
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 14064
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,F1 Score,Accuracy,Precision,Recall
400,0.4482,0.31842,0.864598,0.87068,0.900777,0.831213
800,0.2878,0.252861,0.898856,0.90082,0.910801,0.88722
1200,0.2481,0.239305,0.908999,0.9103,0.916193,0.901917
1600,0.2302,0.220884,0.911858,0.91476,0.93741,0.887663
2000,0.2219,0.208922,0.92073,0.92132,0.921547,0.919915
2400,0.2116,0.200189,0.922324,0.92442,0.942095,0.903366
2800,0.207,0.195159,0.926758,0.9272,0.926273,0.927243
3200,0.2007,0.192771,0.929723,0.9306,0.93533,0.924183
3600,0.193,0.182599,0.92966,0.93102,0.941935,0.9177
4000,0.1863,0.183521,0.932336,0.93324,0.938806,0.925954


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_

{'test_accuracy': 0.95446,
 'test_f1_score': 0.9540417801998183,
 'test_loss': 0.12664352357387543,
 'test_precision': 0.9609660892900708,
 'test_recall': 0.947216544427077,
 'test_runtime': 93.0455,
 'test_samples_per_second': 537.372,
 'test_steps_per_second': 4.202}

In [None]:
# Persist the trained model (torch is already imported in the imports cell).
# NOTE: saving the whole module pickles the object, so loading it later needs
# the same class definitions to be importable, and the file should only be
# loaded from a trusted source.
torch.save(trainer.model, '/content/model.model')

In [None]:
# Presumably a keep-alive so the runtime is not reclaimed after training -
# confirm. Sleep between wake-ups instead of spinning so the cell blocks
# without pinning a CPU core; interrupt the kernel to stop it.
while True:
    time.sleep(60)

KeyboardInterrupt: ignored