# Загрузка данных

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset and the model
# checkpoints used below are read from / written to it.
from google.colab import drive
drive.mount('/content/drive/')

# Project folder on Drive. The string has no trailing separator, so child
# paths must be built with os.path.join(data_dir, ...) or data_dir + '/...'.
# NOTE(review): cells that use data_dir+'label_df.csv' resolve to
# '/content/drive/My Drive/ProductBertlabel_df.csv', i.e. a file NEXT TO this
# folder rather than inside it - confirm that is where the file really lives.
data_dir = '/content/drive/My Drive/ProductBert'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
import string
import re
from tqdm import tqdm
import os
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw product table from <data_dir>/data.csv. The cells below use its
# 'Наименование' (product name) and 'Подкатегория_текст' (subcategory) columns.
data = pd.read_csv(os.path.join(data_dir, 'data.csv'))

In [None]:
def preprocess_text(x):
    """Normalise a product name before tokenisation.

    Deletes every ASCII punctuation character, every ASCII digit and every
    character matching ``[A-Z]`` case-insensitively (Latin letters), then
    trims surrounding whitespace and lower-cases the remainder. Characters
    outside those sets (Cyrillic letters, inner spaces, symbols such as '№'
    or '«»') are kept.

    Parameters
    ----------
    x : str
        Raw product name.

    Returns
    -------
    str
        The cleaned, lower-cased name.
    """
    # A single translation table removes punctuation and digits in one pass.
    symbols_removed = x.translate(str.maketrans('', '', string.punctuation + string.digits))
    # Same per-character pattern as a character-by-character filter would
    # use, applied as one substitution over the whole string.
    latin_removed = re.sub(r'[A-Z]', '', symbols_removed, flags=re.I)

    return latin_removed.strip().lower()

In [None]:
# Normalise every product name in place (overwrites the raw column).
data['Наименование'] = data['Наименование'].apply(preprocess_text)

In [None]:
# Remove rows that are exact duplicates across all columns. This runs after
# name normalisation, so rows whose names differed only in stripped characters
# can collapse too. The index is rebuilt as 0..n-1.
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
import numpy as np

In [None]:
# Class balance check: number of product names per subcategory, rarest first.
# The recorded output starts with subcategories that have a single example.
data.groupby('Подкатегория_текст')['Наименование'].count().sort_values()

Подкатегория_текст
 Холодное оружие: луки спортивные массовые                                                                                                                                                                                                                                                                               1
Башмаки тормозных колодок подвижного состава магистральных железных дорог                                                                                                                                                                                                                                                1
Сыворотки прочие                                                                                                                                                                                                                                                                                                         1
Статические преобразователи для устр

In [None]:
# data_sampled = pd.DataFrame()

# for code in data['Подкатегория_текст'].unique():
    
#     temp = data[data['Подкатегория_текст'] == code]
    
#     data_sampled = pd.concat([
#         data_sampled, 
#         temp.sample(min(temp.shape[0], 50))
#     ], ignore_index=True)

In [None]:
# data_sampled.shape

In [None]:
# data_sampled.groupby('Подкатегория_текст')['Наименование'].count().sort_values().describe()

In [None]:
# Model inputs: the normalised product names, in row order.
texts = list(data['Наименование'].values)

# Encode each subcategory as an integer class id. pd.Categorical uses the
# sorted unique values as its categories - the same ordering pd.get_dummies
# gives its columns - so the ids and label_names are unchanged, but no dense
# rows x classes one-hot matrix has to be built and scanned row by row.
subcategory = pd.Categorical(data['Подкатегория_текст'])
if (subcategory.codes < 0).any():
    # Code -1 marks a missing subcategory; such a row has no class to learn.
    raise ValueError("column 'Подкатегория_текст' contains missing values")

# label_names[i] is the subcategory text for class id i.
label_names = [str(x) for x in subcategory.categories]

# Categorical codes use the smallest integer dtype that fits; widen to int64
# so torch.tensor(label) produces the long dtype used for class targets.
labels = list(subcategory.codes.astype(np.int64))

In [None]:
# label_df = pd.DataFrame()
# label_df['name'] = label_names
# label_df = label_df.reset_index().rename(columns={'index': 'id'})
# label_df.to_csv(data_dir+'label_df.csv', index=False)

# Обучение модели

In [None]:
# !pip install transformers

Successfully installed huggingface-hub-0.1.2 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.12.5


In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizerFast, BertForSequenceClassification
  
# Hugging Face Hub id of the base checkpoint; its tokenizer is used for all
# text encoding in this notebook.
model_version = 'sberbank-ai/ruBert-base'

tokenizer = BertTokenizerFast.from_pretrained(model_version)

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/521 [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The fixed random_state makes the
# split reproducible across kernel restarts; without it every session draws a
# new split, so a model resumed from an earlier checkpoint may already have
# been trained on rows that now land in the validation set.
# No stratification: the class counts show subcategories with a single
# example, which train_test_split(stratify=...) rejects.
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.1, random_state=42)

In [None]:
# Tokenise both splits up front. truncation=True with max_length=100 caps each
# sequence at 100 tokens; padding=True pads every sequence of a call to the
# longest one in that call, so all items within a split share one length.
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=100)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=100)

In [None]:
## PYTORCH CODE
import torch

class ProdDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()`` whose values can be indexed by example
    position); ``labels`` is an indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including 'labels'."""
        sample = {}
        for field, per_example in self.encodings.items():
            sample[field] = torch.tensor(per_example[idx])
        # Added last, so it overrides any encoding field named 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the pre-tokenised splits so the Trainer can index them example by example.
train_dataset = ProdDataset(train_encodings, y_train)
val_dataset = ProdDataset(val_encodings, y_val)

In [None]:
# Alternative (disabled): start from the base checkpoint with a classification
# head sized to the number of subcategories.
# model = BertForSequenceClassification.from_pretrained(model_version,
#                                                       num_labels=len(label_names))

# Active path: continue from the 'model5' directory on Drive - presumably a
# checkpoint saved by an earlier run of this notebook.
# NOTE(review): the head size then comes from the saved config; verify that it
# still equals len(label_names) if the dataset has changed since.
model = BertForSequenceClassification.from_pretrained(data_dir+'/model5')

In [None]:
## PYTORCH CODE
from transformers import Trainer, TrainingArguments

# Configuration for a single fine-tuning epoch. Checkpoints are written to
# ./results (relative to the working directory, i.e. not under data_dir),
# while logs go to <data_dir>/logs.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size per device during evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=data_dir+'/logs',            # directory for storing logs
    # logging_steps=100,
    evaluation_strategy='epoch',     # evaluate on eval_dataset after each epoch
    save_strategy='epoch',           # checkpoint after each epoch as well
    load_best_model_at_end=True,     # reload the best checkpoint when training ends
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Assemble the Trainer. Passing the tokenizer makes it get saved next to every
# checkpoint (the recorded save log lists tokenizer_config.json).
trainer = Trainer(
    model=model,                         # the model to fine-tune
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    tokenizer=tokenizer
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 70620
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1104


Epoch,Training Loss,Validation Loss
1,0.6939,0.77578


***** Running Evaluation *****
  Num examples = 7847
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1104
Configuration saved in ./results/checkpoint-1104/config.json
Model weights saved in ./results/checkpoint-1104/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1104/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1104/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-1104 (score: 0.7757799625396729).


TrainOutput(global_step=1104, training_loss=0.6986201465993688, metrics={'train_runtime': 2268.9622, 'train_samples_per_second': 31.124, 'train_steps_per_second': 0.487, 'total_flos': 3641562262404000.0, 'train_loss': 0.6986201465993688, 'epoch': 1.0})

In [None]:
def get_prediction(text):
    """Return the predicted class id for a single product name.

    Uses the module-level ``tokenizer`` and ``model``. The text is tokenised
    (truncated to 100 tokens), run through the model on the device the model
    currently lives on, and the index of the most probable class is returned
    as a NumPy integer. On ties the lowest index wins.

    NOTE(review): dropout is only disabled when the model is in eval mode -
    call ``model.eval()`` once before predicting if that is not guaranteed.
    """
    # Follow the model's device instead of hard-coding "cuda", so the function
    # also works when the model was loaded on the CPU.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=100, return_tensors="pt").to(model.device)
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs[0] are the logits; softmax over the class axis, keep the only row
    probs = outputs[0].softmax(1).cpu().detach().numpy()[0]
    # argmax yields the first index of the maximum probability
    return probs.argmax()

In [None]:
# Predict a class id for every validation text, one at a time, with a progress bar.
val_preds = [get_prediction(x_val) for x_val in tqdm(X_val)]

100%|██████████| 7847/7847 [02:17<00:00, 57.19it/s]


In [None]:
# Share of validation examples whose predicted class id equals the true one.
accuracy_score(y_val, val_preds)

0.8006881610806678

In [None]:
# Persist the fine-tuned model to Drive. The recorded log shows config, weights
# and tokenizer files being written to <data_dir>/model6.
trainer.save_model(data_dir+'/model6')

Saving model checkpoint to /content/drive/My Drive/ProductBert/model6
Configuration saved in /content/drive/My Drive/ProductBert/model6/config.json
Model weights saved in /content/drive/My Drive/ProductBert/model6/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/ProductBert/model6/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/ProductBert/model6/special_tokens_map.json


# Тестируем на сырой выборке

In [1]:
#!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
Collecting filelock
  Downloading filelock-3.4.0-py3-none-any.whl (9.8 kB)
Collecting sacremoses
  Using cached sacremoses-0.0.46-py3-none-any.whl (895 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-win_amd64.whl (2.0 MB)
Installing collected packages: filelock, tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed filelock-3.4.0 huggingface-hub-0.1.2 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.12.5


In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Reload the fine-tuned classifier from Drive and move it to the GPU.
model = BertForSequenceClassification.from_pretrained(data_dir+'/model6').to('cuda')

# The tokenizer is taken from the base checkpoint on the Hub.
# NOTE(review): the save log shows tokenizer files inside model6 as well, so it
# could presumably be loaded from there to keep model and tokenizer together.
tokenizer = BertTokenizerFast.from_pretrained('sberbank-ai/ruBert-base')

In [None]:
# Table mapping class ids to subcategory names (columns 'id' and 'name' are
# used below).
# NOTE(review): data_dir has no trailing separator, so this reads
# '/content/drive/My Drive/ProductBertlabel_df.csv' - next to the project
# folder, not inside it. The commented-out writer uses the same concatenation,
# so the file presumably does live there; confirm before "fixing" the path.
label_df = pd.read_csv(data_dir+'label_df.csv')

In [None]:
# Write a copy of the label table to the working directory. index=False keeps
# the file to the table's own columns; without it the row index is written as
# an extra unnamed first column that reappears as 'Unnamed: 0' when re-read.
label_df.to_csv('label_df.csv', index=False)

In [None]:
# "Raw" evaluation sample: 50 000 rows drawn from the full data.csv, without
# the de-duplication applied to the training data. random_state pins the draw
# so the reported metrics can be reproduced.
# NOTE(review): this is the same file the training split was built from, so
# the sample presumably overlaps the training rows - treat the scores below as
# optimistic rather than as held-out performance.
test_data = pd.read_csv(os.path.join(data_dir, 'data.csv')).sample(50000, random_state=42)
test_data['Наименование'] = test_data['Наименование'].apply(preprocess_text)
test_texts = list(test_data['Наименование'].values)

# Translate subcategory names into the integer ids listed in label_df. The
# left merge keeps the rows in test_data order; a name that is absent from
# label_df comes back as a NaN id.
test_labels = pd.merge(pd.DataFrame(test_data['Подкатегория_текст']).rename(columns={'Подкатегория_текст': 'name'}),
                       label_df, on=['name'], how='left').id.values

In [None]:
# Predict every test text one by one and report the running accuracy.
test_preds = []
accuracys = []  # running accuracy, one entry per 1000 processed samples

for i, x_test in enumerate(tqdm(test_texts)):

    # i samples have been predicted at this point. Step 0 is skipped: with no
    # predictions yet accuracy_score returns nan and emits runtime warnings.
    if i > 0 and i % 1000 == 0:

        acc = round(accuracy_score(test_labels[:i], test_preds), 3)
        accuracys.append(acc)
        print('Accuracy on {} step – {}'.format(i, acc))

    test_preds.append(get_prediction(x_test))

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  0%|          | 4/50000 [00:00<25:13, 33.04it/s]

Accuracy on 0 step – nan


  2%|▏         | 1005/50000 [00:20<17:02, 47.90it/s]

Accuracy on 1000 step – 0.918


  4%|▍         | 2004/50000 [00:40<15:50, 50.47it/s]

Accuracy on 2000 step – 0.924


  6%|▌         | 3007/50000 [01:01<16:16, 48.13it/s]

Accuracy on 3000 step – 0.921


  8%|▊         | 4004/50000 [01:22<15:55, 48.12it/s]

Accuracy on 4000 step – 0.926


 10%|█         | 5007/50000 [01:42<14:57, 50.16it/s]

Accuracy on 5000 step – 0.926


 12%|█▏        | 6006/50000 [02:03<16:01, 45.74it/s]

Accuracy on 6000 step – 0.926


 14%|█▍        | 7008/50000 [02:24<15:01, 47.68it/s]

Accuracy on 7000 step – 0.926


 16%|█▌        | 8007/50000 [02:45<15:38, 44.72it/s]

Accuracy on 8000 step – 0.925


 18%|█▊        | 9005/50000 [03:05<14:37, 46.73it/s]

Accuracy on 9000 step – 0.925


 20%|██        | 10007/50000 [03:26<13:33, 49.14it/s]

Accuracy on 10000 step – 0.925


 22%|██▏       | 11006/50000 [03:46<12:40, 51.24it/s]

Accuracy on 11000 step – 0.925


 24%|██▍       | 12007/50000 [04:07<12:10, 52.00it/s]

Accuracy on 12000 step – 0.925


 26%|██▌       | 13004/50000 [04:27<13:03, 47.22it/s]

Accuracy on 13000 step – 0.925


 28%|██▊       | 14010/50000 [04:48<12:21, 48.53it/s]

Accuracy on 14000 step – 0.926


 30%|███       | 15005/50000 [05:08<11:54, 48.98it/s]

Accuracy on 15000 step – 0.926


 32%|███▏      | 16009/50000 [05:29<12:26, 45.56it/s]

Accuracy on 16000 step – 0.926


 34%|███▍      | 17008/50000 [05:49<12:01, 45.70it/s]

Accuracy on 17000 step – 0.926


 36%|███▌      | 18011/50000 [06:10<10:51, 49.13it/s]

Accuracy on 18000 step – 0.927


 38%|███▊      | 19008/50000 [06:32<11:07, 46.46it/s]

Accuracy on 19000 step – 0.928


 40%|████      | 20006/50000 [06:53<10:17, 48.61it/s]

Accuracy on 20000 step – 0.927


 42%|████▏     | 21009/50000 [07:13<09:55, 48.71it/s]

Accuracy on 21000 step – 0.927


 44%|████▍     | 22006/50000 [07:34<10:00, 46.60it/s]

Accuracy on 22000 step – 0.928


 46%|████▌     | 23005/50000 [07:54<09:29, 47.42it/s]

Accuracy on 23000 step – 0.929


 48%|████▊     | 24007/50000 [08:15<08:53, 48.73it/s]

Accuracy on 24000 step – 0.929


 50%|█████     | 25006/50000 [08:36<08:23, 49.66it/s]

Accuracy on 25000 step – 0.928


 52%|█████▏    | 26006/50000 [08:57<08:32, 46.83it/s]

Accuracy on 26000 step – 0.928


 54%|█████▍    | 27007/50000 [09:17<08:02, 47.65it/s]

Accuracy on 27000 step – 0.929


 56%|█████▌    | 28007/50000 [09:38<07:42, 47.54it/s]

Accuracy on 28000 step – 0.929


 58%|█████▊    | 29010/50000 [09:59<07:09, 48.85it/s]

Accuracy on 29000 step – 0.929


 60%|██████    | 30006/50000 [10:20<06:43, 49.49it/s]

Accuracy on 30000 step – 0.929


 62%|██████▏   | 31006/50000 [10:41<06:48, 46.53it/s]

Accuracy on 31000 step – 0.929


 64%|██████▍   | 32005/50000 [11:01<06:46, 44.29it/s]

Accuracy on 32000 step – 0.929


 66%|██████▌   | 33005/50000 [11:22<05:51, 48.37it/s]

Accuracy on 33000 step – 0.929


 68%|██████▊   | 34006/50000 [11:43<05:36, 47.60it/s]

Accuracy on 34000 step – 0.928


 70%|███████   | 35009/50000 [12:04<05:34, 44.88it/s]

Accuracy on 35000 step – 0.928


 72%|███████▏  | 36007/50000 [12:25<05:12, 44.71it/s]

Accuracy on 36000 step – 0.929


 74%|███████▍  | 37005/50000 [12:46<04:42, 46.07it/s]

Accuracy on 37000 step – 0.929


 76%|███████▌  | 38006/50000 [13:07<04:15, 46.90it/s]

Accuracy on 38000 step – 0.928


 78%|███████▊  | 39005/50000 [13:29<04:07, 44.50it/s]

Accuracy on 39000 step – 0.928


 80%|████████  | 40008/50000 [13:50<03:41, 45.04it/s]

Accuracy on 40000 step – 0.928


 82%|████████▏ | 41008/50000 [14:11<03:03, 49.12it/s]

Accuracy on 41000 step – 0.928


 84%|████████▍ | 42006/50000 [14:31<02:47, 47.86it/s]

Accuracy on 42000 step – 0.929


 86%|████████▌ | 43009/50000 [14:52<02:33, 45.50it/s]

Accuracy on 43000 step – 0.929


 88%|████████▊ | 44006/50000 [15:13<02:07, 47.14it/s]

Accuracy on 44000 step – 0.929


 90%|█████████ | 45008/50000 [15:33<01:46, 46.99it/s]

Accuracy on 45000 step – 0.929


 92%|█████████▏| 46006/50000 [15:53<01:26, 46.02it/s]

Accuracy on 46000 step – 0.928


 94%|█████████▍| 47007/50000 [16:14<01:01, 48.88it/s]

Accuracy on 47000 step – 0.929


 96%|█████████▌| 48010/50000 [16:35<00:40, 49.34it/s]

Accuracy on 48000 step – 0.929


 98%|█████████▊| 49008/50000 [16:56<00:19, 50.06it/s]

Accuracy on 49000 step – 0.929


100%|██████████| 50000/50000 [17:16<00:00, 48.23it/s]


In [36]:
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Final metrics over the whole test sample.
print('accuracy_score on 50 000 samples:', accuracy_score(test_labels, test_preds), '\n')

# F1 averaged over classes, weighted by the number of true examples per class.
print(f1_score(test_labels, test_preds, average='weighted'))

# Precision with the same weighting.
# NOTE(review): the recorded output carries a _warn_prf warning - presumably
# some classes are never predicted, which leaves their precision undefined.
print(precision_score(test_labels, test_preds, average='weighted'))

accuracy_score on 50 000 samples: 0.92878 

0.9240579045440929
0.927017869699829


  _warn_prf(average, modifier, msg_start, len(result))


# Функция для инференса

In [None]:
import string
import re
import pandas as pd


def preprocess_text(x):
    """Normalise a product name before tokenisation.

    Deletes every ASCII punctuation character, every ASCII digit and every
    character matching ``[A-Z]`` case-insensitively (Latin letters), then
    trims surrounding whitespace and lower-cases the remainder. Characters
    outside those sets (Cyrillic letters, inner spaces, symbols such as '№'
    or '«»') are kept.
    """
    # A single translation table removes punctuation and digits in one pass.
    symbols_removed = x.translate(str.maketrans('', '', string.punctuation + string.digits))
    # Same per-character pattern, applied as one substitution over the string.
    latin_removed = re.sub(r'[A-Z]', '', symbols_removed, flags=re.I)
    return latin_removed.strip().lower()


def get_prediction(text):
    """Return ``(class_id, probability)`` for a single product name.

    Uses the module-level ``tokenizer`` and ``model``. The text is tokenised
    (truncated to 100 tokens) and run through the model on the device the
    model currently lives on. ``class_id`` is the index of the most probable
    class (lowest index on ties) and ``probability`` is its softmax score,
    both as NumPy scalars.

    NOTE: this redefines the earlier ``get_prediction``, which returns only
    the class id. Code written for that version (the evaluation loops above)
    has to unpack the tuple once this cell has been run.
    """
    # Follow the model's device instead of hard-coding 'cpu', so the same
    # function works if the model has been moved to a GPU.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=100, return_tensors='pt').to(model.device)
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs[0] are the logits; softmax over the class axis, keep the only row
    probs = outputs[0].softmax(1).cpu().detach().numpy()[0]
    best = probs.argmax()
    return best, probs[best]


def bert_inference(data):
    """Classify every product name in ``data`` and attach the predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Наименование' column with raw product names (strings).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in - it is modified in place - with
        two added columns: 'предсказанная_Подкатегория_текст' (predicted
        subcategory name, looked up in the module-level ``label_df``) and
        'Вероятность' (probability of that class, rounded to 3 decimals).
    """
    texts = list(data['Наименование'].apply(preprocess_text).values)

    # Build the id -> name table once instead of filtering label_df for every
    # row. Feeding the pairs in reverse order keeps the FIRST name for a
    # repeated id, matching a per-row `.iloc[0]` lookup.
    id_to_name = dict(zip(label_df.id.values[::-1], label_df.name.values[::-1]))

    pred_names = []
    pred_probs = []
    for text in tqdm(texts):
        pred_label, pred_prob = get_prediction(text)
        pred_names.append(id_to_name[pred_label])
        pred_probs.append(round(pred_prob, 3))

    # Assign the columns one at a time: this also works for an empty frame and
    # lets each column get its own dtype.
    data['предсказанная_Подкатегория_текст'] = pred_names
    data['Вероятность'] = pred_probs
    return data


from transformers import BertTokenizerFast, BertForSequenceClassification

# Fine-tuned checkpoint used for inference.
path_to_pretrained_model = data_dir+'/model6'

# The model is not moved to a GPU here, so it stays on the default (CPU) device.
model = BertForSequenceClassification.from_pretrained(path_to_pretrained_model)
tokenizer = BertTokenizerFast.from_pretrained('sberbank-ai/ruBert-base')

# id -> subcategory name table.
# NOTE(review): data_dir has no trailing separator, so this path resolves to
# '/content/drive/My Drive/ProductBertlabel_df.csv' - confirm the location.
label_df = pd.read_csv(data_dir+'label_df.csv')

In [None]:
# Demo: classify a random 1000-row sample of the test frame and display it with
# the predicted subcategory and its probability appended.
bert_inference(test_data.sample(1000))

100%|██████████| 1000/1000 [02:35<00:00,  6.43it/s]


Unnamed: 0,Наименование,Код_ЕП_РФ_подкатегория,Подкатегория_текст,Код_подкатегория,предсказанная_Подкатегория_текст,Вероятность
273214,посуда из пластмасс для взрослых в том числе с...,2293.0,Изделия культурно-бытового назначения и хозяйс...,71,"Посуда, в том числе одноразового применения (к...",0.588
24082,микардис таблетки мг шт упаковки ячейковые к...,9300.1,"Лекарственные средства, зарегистрированные в у...",156,"Лекарственные средства, зарегистрированные в у...",0.999
7918,цитрамонлект таблетки шт упаковки ячейковые к...,9300.1,"Лекарственные средства, зарегистрированные в у...",156,"Лекарственные средства, зарегистрированные в у...",0.999
44548,алфлутоп раствор для инъекций мл ампулы темно...,9300.1,"Лекарственные средства, зарегистрированные в у...",156,"Лекарственные средства, зарегистрированные в у...",0.998
241,карбамазепин таблетки мг шт упаковки ячейков...,9300.0,"Медикаменты, химико-фармацевтическая продукция...",189,"Лекарственные средства, зарегистрированные в у...",0.833
...,...,...,...,...,...,...
230988,сухие гранулированные корма для непродуктивных...,9219.1,Корма животного происхождения (включая корма д...,145,Корма животного происхождения (включая корма д...,0.558
209613,зажигалки кроме питаемых от сети марка «» « » ...,9692.2,Зажигалки (кроме питаемых от сети),62,Зажигалки (кроме питаемых от сети),0.990
101803,диклофенак гель для наружного применения г ...,9300.1,"Лекарственные средства, зарегистрированные в у...",156,"Лекарственные средства, зарегистрированные в у...",0.982
61317,сенаде таблетки мг шт упаковки ячейковые кон...,9300.1,"Лекарственные средства, зарегистрированные в у...",156,"Лекарственные средства, зарегистрированные в у...",0.999


# Интерпретация модели

In [None]:
!pip install transformers-interpret

Installing collected packages: captum, transformers-interpret
Successfully installed captum-0.4.1 transformers-interpret-0.5.2


In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer

# Separate model instance for the explainer, kept apart from the global `model`.
# NOTE(review): this loads 'model2', not the 'model6' checkpoint evaluated
# above - confirm which checkpoint the interpretation is meant to describe.
model_ = BertForSequenceClassification.from_pretrained(data_dir+'/model2')

# Rebinds the global tokenizer to the base-checkpoint tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('sberbank-ai/ruBert-base')

In [None]:
# Compute per-token attributions for the first validation example; the result
# is a list of (token, score) pairs, as displayed in the next cell.
cls_explainer = SequenceClassificationExplainer(model_, tokenizer)
word_attributions = cls_explainer(X_val[0])

In [None]:
# Show the (token, attribution score) pairs for the explained example.
word_attributions

[('[CLS]', 0.0),
 ('диф', -0.2746449512509223),
 ('##люка', -0.007256275827461156),
 ('##н', -0.01630014240946843),
 ('порошок', 0.058419181658606885),
 ('для', -0.08696114882494556),
 ('приготовления', -0.12324108376767691),
 ('су', -0.007056759491409794),
 ('##спен', -0.03700200667123896),
 ('##зии', 0.0277030457814121),
 ('для', 0.0071234709107578995),
 ('приема', -0.024686156975558548),
 ('внутрь', 0.033667412934518644),
 ('мг', 0.26660069249261786),
 ('мл', 0.07303240504601471),
 ('флакон', 0.22518619195498454),
 ('##ы', -0.012670780749270454),
 ('в', -0.07372753425691753),
 ('комплекте', -0.010989733368204942),
 ('с', -0.03141239640383053),
 ('лож', 0.09436570691010775),
 ('##ко', -0.0066625076987905365),
 ('##и', -0.004980702077321382),
 ('мерно', -0.049673472264284035),
 ('##и', 0.04792532995109837),
 ('пачки', 0.2715367688769194),
 ('картон', 0.2534528970195248),
 ('##ные', 0.07552983380568992),
 ('год', 0.5431859901912499),
 ('##ен', 0.09807008631661074),
 ('до', 0.3620264841

In [None]:
# Name of the class the explainer's model predicted for that example.
cls_explainer.predicted_class_name

'LABEL_156'

In [None]:
# Render the attribution visualisation; the argument names an HTML output file.
cls_explainer.visualize("ruBert.html")

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
156.0,LABEL_156 (1.00),LABEL_156,2.46,[CLS] диф ##люка ##н порошок для приготовления су ##спен ##зии для приема внутрь мг мл флакон ##ы в комплекте с лож ##ко ##и мерно ##и пачки картон ##ные год ##ен до код т ##н в ##эд контракт № от ин ##вои ##с № от [SEP]
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
156.0,LABEL_156 (1.00),LABEL_156,2.46,[CLS] диф ##люка ##н порошок для приготовления су ##спен ##зии для приема внутрь мг мл флакон ##ы в комплекте с лож ##ко ##и мерно ##и пачки картон ##ные год ##ен до код т ##н в ##эд контракт № от ин ##вои ##с № от [SEP]
,,,,


In [None]:
# Human-readable subcategory behind class id 156 (the predicted 'LABEL_156').
label_names[156]

'Лекарственные средства, зарегистрированные в установленном порядке и внесенные в государственный реестр, состоящие из смешанных и несмешанных продуктов для использования в терапевтических или профилактических целях, расфасованные в виде дозированных лекарственных форм или в упаковки для розничной продажи'

In [None]:
# True class id of the explained validation example, for comparison.
y_val[0]

156