In [1]:
# from huggingface_hub import hf_hub_download 
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_from_disk
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [1]:
# Find a binary classification dataset - Done
# Fine-tune the last layer - When loading the pre-trained model, the last layer is randomly initialized
# What is LoRA?
# Fine-tuning vs. no fine-tuning
# Number of trainable layers on a small dataset
# Why does LoRA still take a long time (autograd?)
# Why does freezing the encoders still leave millions of parameters? - The embedding layer was also trained
# Whether to use class = 2 with softmax or class = 1 with sigmoid for binary classification? - Need to use class = 2 with softmax, cannot run class = 1
# How to deal with unbalanced data
# What is the role of RAM?
# What is weight decay?
# What is gradient accumulation?
# Hyperparameter tuning (hyperopt - Sagar's stuff, Bayesian grid search)

### Questions:
# What are the intermediate and output layers in a BERT layer? - Low priority

In [2]:
# Load the Yelp polarity dataset; later cells use its "train" and "test" splits.
dataset = load_dataset("yelp_polarity")

Downloading data: 100%|██████████| 256M/256M [00:06<00:00, 41.0MB/s] 
Downloading data: 100%|██████████| 17.7M/17.7M [00:00<00:00, 25.1MB/s]


Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

In [3]:
# Peek at a single training record (index 100) to see the available fields.
dataset["train"][100]

{'text': "In general I do like Shake N' Steak, but this location is a hit or miss location!  You never know what kind of quality or service you're going to find here.  A friend and myself went a few weeks back after a movie and it had to be one of the worst trips there EVER!  You can't entirely blame the waitress since she was the only one there for the entire place...poor scheduling on the manager's part. However, while she can't be accountable for the slooooow service, she was accountable for both orders being incorrect.  The burgers were over cooked and the fries were soggie and the milkshake was runny at best...\\n\\nBy far my worst visit to Steak n' Shake!",
 'label': 0}

In [4]:
def tokenize_function(examples):
    """Tokenize a batch of examples; intended for `dataset.map(..., batched=True)`.

    The notebook-level `tokenizer` is looked up when the function is called,
    so it must be defined before the mapping cell runs.

    Args:
        examples: Batch mapping whose "text" entry holds the raw review strings.

    Returns:
        The tokenizer output for the batch, padded with padding="max_length"
        and truncated.
    """
    # No `return_tensors` here: the rest of the notebook is PyTorch-based, and
    # asking for 'tf' tensors would require TensorFlow to be installed.
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def compute_metrics(eval_pred):
    """Compute classification accuracy from a `(logits, labels)` pair.

    Args:
        eval_pred: Pair `(logits, labels)`. `logits` holds one row of class
            scores per example; `labels` holds the reference class ids.

    Returns:
        dict with a single "accuracy" entry: the fraction of examples whose
        highest-scoring class equals the label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # No `metric` object is defined anywhere in this notebook, so accuracy is
    # computed directly with numpy instead of calling `metric.compute(...)`.
    is_correct = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(is_correct))}

In [5]:
# Tokenizer for the same "google-bert/bert-base-cased" checkpoint that the model
# cell loads; `tokenize_function` reads this global when it is called.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenizing every split is expensive, and the next cell reloads the result
# from './dataset' anyway, so only do the work when no saved copy exists.
# Delete './dataset' to force re-tokenization (e.g. after changing
# `tokenize_function` or the tokenizer).
if not Path('./dataset').exists():
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk('./dataset')

In [6]:
# Reload the tokenized dataset from './dataset', the directory the tokenization
# cell saves to.
tokenized_datasets = load_from_disk('./dataset')

In [28]:
# Work on small subsets to keep the experiments quick: shuffle each split
# with a fixed seed, then keep the first 5000 train / 1000 test examples.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test = tokenized_datasets["test"].shuffle(seed=42)

small_train_dataset = shuffled_train.select(range(5000))
small_eval_dataset = shuffled_test.select(range(1000))

In [8]:
# Load bert-base-cased with a 2-label sequence-classification head. The
# checkpoint does not contain the classifier weights, so they are newly
# initialized (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook does not crash on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Object exposing `named_parameters()`; every yielded parameter
            must provide `numel()` and a `requires_grad` attribute.

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without any parameters is reported as 0.0
    percent instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division: a parameter-free model has all_param == 0.
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_percent}"
    )

In [11]:
# Freeze every parameter of the `bert` submodule so that gradient updates
# only reach what is left outside it (the classification head). The public
# `requires_grad_` API replaces reaching into the private `_modules` dict.
model.bert.requires_grad_(False)

print('Only train classifier')
print_trainable_parameters(model)

Only train classifier
trainable params: 1538 || all params: 108311810 || trainable%: 0.0014199744238416845


In [12]:
# Trainer configuration for the baseline run: evaluate once per epoch,
# batches of 8 for both training and evaluation, 3 epochs, small learning
# rate with weight decay. Outputs go to "finetune-bert-base-cased".
args = TrainingArguments(
    output_dir="finetune-bert-base-cased",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Without any training

In [13]:
# Trainer for the "without any training" baseline: no train() call is made
# before the predict() cell that follows, so it is used for inference only.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
# Baseline predictions on the evaluation subset, taken before any training;
# `.predictions` holds the raw scores that a later cell passes through softmax.
raw_output = trainer.predict(small_eval_dataset).predictions



In [15]:
# Display the raw baseline scores (two columns, matching num_labels=2).
raw_output

array([[ 0.2001286 , -0.23433197],
       [ 0.22055243, -0.23368587],
       [ 0.20732889, -0.25856608],
       ...,
       [ 0.1554417 , -0.21593827],
       [ 0.1378938 , -0.22907202],
       [ 0.18179026, -0.18574706]], dtype=float32)

In [17]:
# Softmax over dim=1, i.e. across the class scores within each row.
sm = torch.nn.Softmax(dim=1)

In [18]:
# Turn the baseline scores into per-class probabilities (numpy -> tensor -> softmax).
sm(torch.from_numpy(raw_output))

tensor([[0.6069, 0.3931],
        [0.6116, 0.3884],
        [0.6144, 0.3856],
        ...,
        [0.5918, 0.4082],
        [0.5907, 0.4093],
        [0.5909, 0.4091]])

# Fine tune the last layer

In [None]:
# model.encoder.layer[-1].apply(model._init_weights)

In [36]:
# scheduler
# Configuration for training the classification head: compared with the
# baseline arguments, the learning rate is raised to 0.001 and the run is
# extended to 10 epochs; everything else is unchanged.
args = TrainingArguments(
    output_dir="finetune-bert-base-cased",
    evaluation_strategy="epoch",
    learning_rate=0.001,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
)

In [37]:
# Rebuild the Trainer so it uses the fine-tuning `args` defined in the
# preceding cell (learning rate 0.001, 10 epochs) on the same model and data.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [38]:
# WIP: try larger batch size or smaller learning rate
# Run training. The `bert` parameters had requires_grad set to False in an
# earlier cell, so only the remaining trainable parameters are updated.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.388652
2,0.513600,0.457801
3,0.513600,0.399943
4,0.499800,0.387046
5,0.466800,0.401865
6,0.466800,0.395675
7,0.456800,0.388866
8,0.455000,0.383218
9,0.455000,0.383786
10,0.461900,0.383451




TrainOutput(global_step=3130, training_loss=0.4755889100388597, metrics={'train_runtime': 1389.0914, 'train_samples_per_second': 35.995, 'train_steps_per_second': 2.253, 'total_flos': 1.3155552768e+16, 'train_loss': 0.4755889100388597, 'epoch': 10.0})

In [39]:
# Predictions on the same evaluation subset after training; note that this
# overwrites the baseline `raw_output` from the earlier cell.
raw_output = trainer.predict(small_eval_dataset).predictions

In [40]:
# Per-class probabilities after training; `sm` is the Softmax(dim=1) created
# in an earlier cell.
sm(torch.from_numpy(raw_output))

tensor([[0.4987, 0.5013],
        [0.4895, 0.5105],
        [0.8584, 0.1416],
        ...,
        [0.8269, 0.1731],
        [0.9246, 0.0754],
        [0.8795, 0.1205]])

In [41]:
# Display the raw post-training scores behind the probabilities shown above.
raw_output

array([[-0.01528161, -0.00999662],
       [-0.02600306,  0.01591472],
       [ 0.8821367 , -0.91965866],
       ...,
       [ 0.7222047 , -0.8412853 ],
       [ 1.2529233 , -1.2529893 ],
       [ 0.95214224, -1.0359869 ]], dtype=float32)

In [42]:
# Persist the trained model through the Trainer, using "last_layer" as the
# output location.
trainer.save_model("last_layer")