# Quantize huggingface transformers

In [1]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [2]:
import os
import re
import pandas as pd
import numpy as np
from sklearn import preprocessing
import wandb

from transformers import AutoTokenizer
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import torch
from torch.utils.checkpoint import checkpoint
import torch.nn as nn

# Environment flags for the Hugging Face libraries: turn on tokenizers
# parallelism and turn off the transformers advisory warnings.
# NOTE: W&B logging is not controlled here; it is requested through
# report_to="wandb" in the TrainingArguments built by trainaug().
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Stage switches: TRAIN_NOQUANT gates the baseline (non-quantized) training cell,
# QUANT_TRAIN gates the quantized training loop over bit-widths.
TRAIN_NOQUANT = True
QUANT_TRAIN = True

## Define configuration

In [3]:
# Checkpoint name used for both the tokenizer and the classification model below.
# You can change the model name here; look up model names in the Hugging Face docs.
model_name = "distilbert-base-uncased"

## Prepare Data

### Get Data - And apply simple normalization

In [4]:
from datasets import load_dataset

# Load the "emotion" configuration of the tweet_eval dataset
dataset = load_dataset("tweet_eval", "emotion") # use tweet_eval dataset
# Display the available splits
dataset

Found cached dataset tweet_eval (/home/ken/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})

In [5]:
# Get the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the tokenizer's maximum length to 512; tokenize_function uses truncation=True,
# which presumably truncates to this limit (confirm against the tokenizer docs).
tokenizer.model_max_length = 512

### Create a label column and input column
* Input is the tweet `text` column
* Label is the integer-encoded emotion class (4 classes), already provided by the dataset in the `label` column

## Create tokenized dataset

In [6]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Padding is disabled here (batches are padded later by the data collator);
    truncation is enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=False)
    return encoded

# Shuffle each split with a fixed seed, then tokenize it in batches
tokenized_train_dataset = dataset["train"].shuffle(seed=42).map(tokenize_function, batched=True)
tokenized_test_dataset = dataset["test"].shuffle(seed=42).map(tokenize_function, batched=True)

Loading cached shuffled indices for dataset at /home/ken/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-e832d994c97c4069.arrow
Loading cached processed dataset at /home/ken/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-f063a764c8cd7bed.arrow
Loading cached shuffled indices for dataset at /home/ken/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-509d1d00e4b6d1ab.arrow
Loading cached processed dataset at /home/ken/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-1249873f574e44b3.arrow


In [7]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3257
})

## Define Dynamic padding

In [8]:
# Collator built from the tokenizer; it pads each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define model

In [9]:
# Sequence-classification model with a 4-way output head (num_labels=4)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

## Define Training Arguments

In [10]:
def trainaug(runname):
    """Build the ``TrainingArguments`` used by the training runs in this notebook.

    Args:
        runname: Value passed as ``run_name`` (the name of the logged run).

    Returns:
        A ``TrainingArguments`` instance with the settings listed below.
    """
    settings = dict(
        # Logging: report to W&B under the given run name
        report_to="wandb",
        run_name=runname,
        # Checkpoints go here; evaluate and save once per epoch
        output_dir="./results",
        num_train_epochs=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Learning-rate schedule
        warmup_ratio=0.1,
        lr_scheduler_type='cosine',
        # Optimising
        auto_find_batch_size=True,
        # The number of workers may vary between machines; if unsure, remove this entry
        dataloader_num_workers=8,
        gradient_accumulation_steps=4,
        fp16=True,
    )
    return TrainingArguments(**settings)

# Arguments for the baseline run; the run is named after the model checkpoint
training_args = trainaug(model_name)

In [11]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_compute_metrics(res: EvalPrediction) -> Dict:
    """Compute macro-averaged precision, recall and F1 for one evaluation pass.

    Args:
        res: Evaluation result; ``res.predictions`` holds the per-class scores and
            ``res.label_ids`` the reference labels.

    Returns:
        A dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    # res.predictions and res.label_ids are numpy arrays
    predicted = res.predictions.argmax(axis=1)
    expected = res.label_ids
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    return {
        metric: scorer(expected, predicted, average='macro')
        for metric, scorer in scorers
    }

class CustomTrainer(Trainer):
    """Trainer whose loss is computed explicitly with plain cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the cross-entropy loss.

        Args:
            model: The model to run; it is called as ``model(**inputs)``.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted (and ignored) so the override stays
                call-compatible with Trainer versions that pass this keyword.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Unweighted cross-entropy; no class weighting is applied here
        # (pass weight=... to CrossEntropyLoss to add it).
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## Define Trainer

In [12]:
# Baseline trainer: trains on the train split and evaluates on the test split
# (the validation split is not used in the cells shown here).
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=custom_compute_metrics,
)

Using cuda_amp half precision backend


## Train the model

In [None]:
# Baseline (non-quantized) training, skipped when TRAIN_NOQUANT is False
if TRAIN_NOQUANT:
    trainer.train()
    # Save the trained weights; the quantization cells below load this file
    torch.save(model.state_dict(), 'models/model.pth')
    # Close the W&B run before the next one is started
    wandb.finish()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3257
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 1020
  Number of trainable parameters = 66956548
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33marutema47[0m ([33mkeio-csg[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.623633,0.796054,0.689655,0.709487
2,No log,0.582134,0.74898,0.75677,0.752208


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1421
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-102
Configuration saved in ./results/checkpoint-102/config.json
Model weights saved in ./results/checkpoint-102/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-102/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-102/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1421
  B

# Swap layers

In [None]:
from pact_utils import QuantizedLinear

def replace_layer(module, name, noise=0.01, k=8):
    '''
    Recursively replace every ``torch.nn.Linear`` below ``module`` with a ``QuantizedLinear``.

    Args:
        module: Root ``nn.Module``; it is modified in place.
        name: Label of ``module``, used only in the progress print-out.
        noise: Forwarded to ``QuantizedLinear`` as ``noise``.
        k: Bit-width, forwarded as both ``wbits`` and ``abits``.

    Returns:
        None.

    Note:
        The new layers are freshly constructed; the weights of the replaced
        layers are not copied by this function.
    '''
    # Walk the registered child modules only. Snapshot them first because
    # setattr below rewrites the module's children while we iterate.
    for child_name, child in list(module.named_children()):
        # Exact type match, so subclasses of nn.Linear are left untouched
        if type(child) == torch.nn.Linear:
            print('replaced: ', name, child_name)
            # Third positional argument kept as True, as in the original call
            # (presumably the bias flag -- confirm against pact_utils).
            new = QuantizedLinear(child.in_features, child.out_features, True,
                                  wbits=k, abits=k, noise=noise)
            setattr(module, child_name, new)
        else:
            # Recurse into containers and other non-Linear children
            replace_layer(child, child_name, noise, k)


In [None]:
# Rebuild the architecture, swap its Linear layers for 6-bit quantized ones,
# move it to the GPU, then load the saved baseline parameters.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
replace_layer(model, 'model', noise=0.01, k=6)
model.cuda()
# NOTE(review): assumes the keys saved from the baseline model match the quantized
# model's state dict -- confirm against QuantizedLinear's parameter names.
model.load_state_dict(torch.load('models/model.pth'))
# Display the resulting module tree
model

# Train quantized network

In [None]:
# Trainer for the quantized model built in the previous cell.
# Note: the QUANT_TRAIN loop below builds its own trainer per bit-width and rebinds this name.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=custom_compute_metrics,
)

In [None]:
# Quantized training: one run per bit-width, skipped when QUANT_TRAIN is False
if QUANT_TRAIN:
    for k in range(4, 9):
        # Each bit-width gets its own run name
        training_args = trainaug("tweet_distilbert-base-k{}".format(k))

        # Fresh architecture -> k-bit quantized layers -> GPU -> saved baseline parameters
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
        replace_layer(model, 'model', noise=0.01, k=k)
        model.cuda()
        model.load_state_dict(torch.load('models/model.pth'))

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_test_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=custom_compute_metrics,
        )

        # Train, store the weights for this bit-width, then close the W&B run
        trainer.train()
        torch.save(model.state_dict(), 'models/model_quant_k{}.pth'.format(k))
        wandb.finish()