# HW4 Auto Optimization (ONNX)

ДЗ: Применить изученные подходы к своим моделям и замерить производительность.

В качестве модели взяли **BERT base model (uncased)**

Будем решать задачу классификации отзывов(токсичный/не токсичный) для "виртуального интернет магазина". То есть бинарная классификация.


**Описание данных**

Данные находятся в файле `toxic_comments.csv`. Столбец *text* в нём содержит текст комментария, а *toxic* — целевой признак.


**ПРИМЕЧАНИЕ:** так как работа носит учебный характер, то все манипуляции с данными будут проводится в усеченном варианте, то есть будет браться часть датасета для обучения и тестирования (2 000 записей).

# Подготовка

Установим и загрузим необходимые библиотеки для работы

In [1]:
!pip -q install transformers optimum[exporters]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.0/301.0 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m43.4 MB/s[0

In [2]:
import pandas as pd
import numpy as np
import re
import os
import time
import copy
from pathlib import Path

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords
from tqdm import notebook

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans

import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import (BertTokenizer,
                          BertForSequenceClassification,
                          get_linear_schedule_with_warmup)

from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.pipelines import pipeline

In [3]:
!gdown 1ltdf-wuq52y6JLPzPaEywLYgRz5gfMsB #for colab

Downloading...
From: https://drive.google.com/uc?id=1ltdf-wuq52y6JLPzPaEywLYgRz5gfMsB
To: /content/toxic_comments.csv
100% 64.1M/64.1M [00:01<00:00, 63.9MB/s]


In [4]:
df_orig = pd.read_csv('/content/toxic_comments.csv') #for colab
# df_orig = pd.read_csv('toxic_comments.csv') #for local

Посмотрим на данные

In [5]:
df_orig.head(10)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word 'nonsense' was offensive to ...,0
9,alignment on this subject and which are contra...,0


Сначала сделаем предпреобразование наших данных

In [6]:
def lemmatize(text):
    lem = WordNetLemmatizer()
    clear_text = ' '.join(re.sub(r'[^a-zA-Z\']', ' ', text).split())
    lemm_list = lem.lemmatize(clear_text)
    ready_text = "".join(lemm_list)

    return ready_text

In [7]:
nltk.download('wordnet')
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Возьмем часть данных, как писалось выше. И дополнительно сбалансируем классы, для корректной работы алгоритмов.

Если выборку не сбалансировать, то в нее может попасть очень малое количество положительных таргетов. И некоторые модели не сумеют найти закономерности, показав 0 метрику.

In [8]:
df_sample = df_orig.sample(n=2000, weights=1./df_orig.groupby('toxic')['toxic'].transform('count'), random_state=101).reset_index(drop=True)
df_sample['text'] = df_sample['text'].apply(lambda x: lemmatize(x))

df = df_sample.copy()

Проверим баланс классов

In [9]:
df_sample['toxic'].value_counts(normalize=True)

0    0.5125
1    0.4875
Name: toxic, dtype: float64

Создадим таблицу для занесения результатов тестирования

In [10]:
result_df = pd.DataFrame(columns=['Name', 'F1_test', 'Size(mb)', 'Time for 1 predict(s)'])

# ONNX

## Исходный вариант

В данном варианте обучим модель и проведем замер интересующих характеристик. Это необходимо для сравнения результатов квантизации и прунинга.

In [11]:
# создается класс для загрузки данных и их подготовки
class CustomDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len=512):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

Разобьем выборку на две части: тренировочную (60%), валидационную (20%) и тестовую (20%)

In [12]:
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=101)
df_valid, df_test = train_test_split(df_temp, test_size=0.5, random_state=101)

Проверка

In [13]:
print(df_train.shape, df_valid.shape, df_test.shape)

(1200, 2) (400, 2) (400, 2)


Данные для обучения, валидации и тестирования

In [14]:
features_train = df_train.drop(['toxic'], axis=1)
target_train = df_train['toxic']

features_valid = df_valid.drop(['toxic'], axis=1)
target_valid = df_valid['toxic']

features_test = df_test.drop(['toxic'], axis=1)
target_test = df_test['toxic']

Задаем параметры и модель

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_save_path='/content/bert.pt'
n_classes = 2
max_len = 512
batch_size = 2
epochs = 3

In [16]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

out_features = model.bert.encoder.layer[1].output.dense.out_features
model.classifier = torch.nn.Linear(out_features, n_classes)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(features_train) * epochs
    )
loss_fn = torch.nn.CrossEntropyLoss().to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# создание датасетов
train_set = CustomDataset(list(features_train['text']), list(target_train), tokenizer)
valid_set = CustomDataset(list(features_valid['text']), list(target_valid), tokenizer)

# создание дата лоудеров
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)

In [18]:
def train(model, train_loader, valid_loader, features_train, features_valid, loss_fn, optimizer):

    for epoch in range(epochs):

        print(f'---------------Epoch:{epoch+1}/{epochs}----------------')
        train_losses = []
        val_losses = []
        train_correct_predictions = 0
        val_correct_predictions = 0
        best_accuracy = 0

        ###Train###
        model.train()

        for data in train_loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            targets = data["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, targets)

            train_correct_predictions += torch.sum(preds == targets)

            train_losses.append(loss.item())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        train_acc = train_correct_predictions.double() / len(features_train)
        train_loss = np.mean(train_losses)

        ###Eval###
        model.eval()

        with torch.no_grad():
            for data in valid_loader:
                input_ids = data["input_ids"].to(device)
                attention_mask = data["attention_mask"].to(device)
                targets = data["targets"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                    )

                preds = torch.argmax(outputs.logits, dim=1)
                loss = loss_fn(outputs.logits, targets)
                val_correct_predictions += torch.sum(preds == targets)
                val_losses.append(loss.item())

        val_acc = val_correct_predictions.double() / len(features_valid)
        val_loss = np.mean(val_losses)

        print(f'Train loss=   {train_loss:.4f},   accuracy= {train_acc:.4f}')
        print(f'Val   loss=   {val_loss:.4f},   accuracy= {val_acc:.4f}')

        if val_acc > best_accuracy:
            torch.save(model, model_save_path)
            best_accuracy = val_acc

        model = torch.load(model_save_path)

def predict(model, text):

    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    out = {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten()
      }

    input_ids = out["input_ids"].to(device)
    attention_mask = out["attention_mask"].to(device)

    outputs = model(
        input_ids=input_ids.unsqueeze(0),
        attention_mask=attention_mask.unsqueeze(0)
    )

    prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

    return prediction

Обучаем нашу сетку

In [19]:
# на cpu считается больше часа для одной эпохи, не дождался окончания
# на gpu 2 минуты на одну эпоху, но тогда квантизация не работает
train(model, train_loader, valid_loader, features_train, features_valid, loss_fn, optimizer)

---------------Epoch:1/3----------------
Train loss=   0.5519,   accuracy= 0.8292
Val   loss=   0.2725,   accuracy= 0.9325
---------------Epoch:2/3----------------
Train loss=   0.1692,   accuracy= 0.9517
Val   loss=   0.2725,   accuracy= 0.9325
---------------Epoch:3/3----------------
Train loss=   0.1856,   accuracy= 0.9508
Val   loss=   0.2725,   accuracy= 0.9325


Делаем предсказания и считаем метрики

In [20]:
def predict_and_metrics(model, name, number,
                        features_test=features_test,
                        target_test=target_test,
                        df_test=df_test,
                        result_df=result_df):
    # делаем предсказания и замеряем скорость
    texts = list(features_test['text'])
    start_time = time.time()
    target_pred = [predict(model, t) for t in texts]
    total_time = round((time.time() - start_time)/len(df_test), 4)

    # высчитываем размер модели
    torch.save(model.state_dict(), "temp.p")
    size = round(os.path.getsize("temp.p")/1e6, 3)
    os.remove('temp.p')

    # считаем метрику F-1
    bert_report = classification_report(target_test, target_pred, output_dict=True)
    result_df.loc[number]=[name, round(bert_report['1']['f1-score'], 3), size, total_time]

    return result_df

In [21]:
result_df = predict_and_metrics(model, 'BERT_orig', 0)
result_df

Unnamed: 0,Name,F1_test,Size(mb),Time for 1 predict(s)
0,BERT_orig,0.916,438.003,0.0386


## ONNX


Сохраняем нашу обученную модель

In [22]:
save_directory = Path("/content/temp")
save_directory.mkdir(exist_ok=True)

In [23]:
model.save_pretrained(save_directory)

In [25]:
# Load a model from path and export it to ONNX
ort_model = ORTModelForSequenceClassification.from_pretrained(save_directory, export=True)

# Save the onnx model and tokenizer
ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.0.1+cu118
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR


('/content/temp/tokenizer_config.json',
 '/content/temp/special_tokens_map.json',
 '/content/temp/vocab.txt',
 '/content/temp/added_tokens.json')

Делаем предсказания и считаем метрики

In [26]:
onnx_classifier = pipeline("text-classification", model=ort_model, tokenizer=tokenizer)

In [28]:
# делаем предсказания и замеряем скорость
texts = list(features_test['text'])
start_time = time.time()
full_pred = [onnx_classifier(t[:max_len]) for t in texts]
total_time = round((time.time() - start_time)/len(df_test), 4)

# высчитываем размер модели
size = round(os.path.getsize("/content/temp/model.onnx")/1e6, 3)

# считаем метрику F-1
target_pred = [int(pred[0].get('label')[-1:]) for pred in full_pred]
bert_report = classification_report(target_test, target_pred, output_dict=True)
result_df.loc[1]=['BERT_onnx', round(bert_report['1']['f1-score'], 3), size, total_time]

In [29]:
result_df

Unnamed: 0,Name,F1_test,Size(mb),Time for 1 predict(s)
0,BERT_orig,0.916,438.003,0.0386
1,BERT_onnx,0.921,438.202,0.1309


# Выводы

In [30]:
result_df

Unnamed: 0,Name,F1_test,Size(mb),Time for 1 predict(s)
0,BERT_orig,0.916,438.003,0.0386
1,BERT_onnx,0.921,438.202,0.1309


Значение метрики **F-1** увеличилось о_О, а размер не изменился, так как кол-во весов осталось таким же. Скорость заметно упала, так как расчет производился на другом устройстве.

# OpenVINO

In [None]:
!pip install openvino

In [None]:
import openvino as ov

ov_model = ov.convert_model("/content/temp/model.onnx")
ov.runtime.serialize(ov_model, "/content/temp/model.xml")

compiled_model = ov.compile_model(ov_model)

In [None]:
# делаем предсказания и замеряем скорость
texts = list(features_test['text'])
start_time = time.time()
inp_data = [tokenizer(t[:max_len]) for t in texts]
full_pred = [compiled_model(item['input_ids'], item['attention_mask'], item['token_type_ids']) for item in inp_data]
total_time = round((time.time() - start_time)/len(df_test), 4)

# высчитываем размер модели
size = round(os.path.getsize("/content/temp/model.xml")/1e6, 3)

# считаем метрику F-1
target_pred = [int(pred[0].get('label')[-1:]) for pred in full_pred]
bert_report = classification_report(target_test, target_pred, output_dict=True)
result_df.loc[1]=['BERT_openvino', round(bert_report['1']['f1-score'], 3), size, total_time]