In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

from transformers import (
    BertTokenizer, 
    BertForSequenceClassification,
    AdamW, 
    get_linear_schedule_with_warmup
)

from torch.utils.data import (
    TensorDataset,
    DataLoader, 
    RandomSampler, 
    SequentialSampler
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
RANDOM_STATE = 42
EPOCHS = 3
DATA_PATH = 'data/train.csv'
MODEL = 'bert-base-multilingual-cased'
CHECKPOINTS = 'models'

In [4]:
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,text,class
0,проблема: не открывается АКТ сверки по СБС № П...,1-46
1,В случае выбора неверного шаблона просьба указ...,1-70
2,Вопрос/информация Добрый день! В ЗНВ 300001962...,4-11
3,Вопрос/проблема: После выхода из отпуска с 01....,6-1
4,В УВХД необходимо снять резервирование с ЗНС *...,2-28
...,...,...
15444,Вопрос/проблема: Добрый день. Направила догово...,1-64
15445,"Член комиссии в документе не меняется, в чем п...",2-10
15446,нет доступа к документам сотрудников для управ...,5-17
15447,Добрый день! В июне выставляли запрос на оплат...,2-18


In [6]:
possible_labels = df['class'].unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'1-46': 0,
 '1-70': 1,
 '4-11': 2,
 '6-1': 3,
 '2-28': 4,
 '2-1': 5,
 '1-45': 6,
 '1-51': 7,
 '5-19': 8,
 '5-2': 9,
 '5-21': 10,
 '5-29': 11,
 '3-27': 12,
 '2-30': 13,
 '5-4': 14,
 '111': 15,
 '5-17': 16,
 '6-2': 17,
 '4-5': 18,
 '2-19': 19,
 '1-44': 20,
 '1-38': 21,
 '999': 22,
 '1-43': 23,
 '1-72': 24,
 '2-10': 25,
 '2-14': 26,
 '1-75': 27,
 '6-5': 28,
 '2-18': 29,
 '2-29': 30,
 '0': 31,
 '3-1': 32,
 '5-26': 33,
 '2-6': 34,
 '3-9': 35,
 '2-23': 36,
 '3-5': 37,
 '1-22': 38,
 '1-37': 39,
 '5-30': 40,
 '1-1': 41,
 '1-64': 42,
 '1-74': 43,
 '4-17': 44,
 '2-16': 45,
 '3-8': 46,
 '2-31': 47,
 '5-8': 48,
 '1-67': 49,
 '5-12': 50,
 '2-25': 51,
 '4-7': 52,
 '3-13': 53,
 '1-71': 54,
 '3-11': 55,
 '3-15': 56,
 '3-21': 57,
 '2-26': 58,
 '3-25': 59,
 '3-2': 60,
 '1-16': 61,
 '2-43': 62,
 '2-44': 63,
 '5-1': 64,
 '4-23': 65,
 '5-7': 66,
 '1-14': 67,
 '6-4': 68,
 '1-6': 69,
 '1-4': 70,
 '1-29': 71,
 '2-2': 72,
 '5-27': 73,
 '2-12': 74,
 '1-23': 75,
 '2-22': 76}

In [7]:
df['label'] = df['class'].replace(label_dict)
df.head()

Unnamed: 0,text,class,label
0,проблема: не открывается АКТ сверки по СБС № П...,1-46,0
1,В случае выбора неверного шаблона просьба указ...,1-70,1
2,Вопрос/информация Добрый день! В ЗНВ 300001962...,4-11,2
3,Вопрос/проблема: После выхода из отпуска с 01....,6-1,3
4,В УВХД необходимо снять резервирование с ЗНС *...,2-28,4


In [8]:
X_train_idx, X_val_idx, y_train, y_val = train_test_split(
    df.index.values, 
    df['label'], 
    test_size=0.1, 
    random_state=RANDOM_STATE
)

In [9]:
tokenizer = BertTokenizer.from_pretrained(MODEL, do_lower_case=True)

model = BertForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [10]:
def data_to_tensor(df, index):
    encoded_data = tokenizer.batch_encode_plus(
        df.loc[index, 'text'].values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
    )
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(df.loc[index, 'label'].values)
    dataset = TensorDataset(input_ids, attention_masks, labels)
    return dataset

In [11]:
dataset_train = data_to_tensor(df, X_train_idx)
dataset_val = data_to_tensor(df, X_val_idx)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
batch_size = 3

dataloader_train = DataLoader(
    dataset_train, 
    sampler=RandomSampler(dataset_train), 
    batch_size=batch_size
)

dataloader_validation = DataLoader(
    dataset_val, 
    sampler=SequentialSampler(dataset_val), 
    batch_size=batch_size
)

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * EPOCHS)

In [14]:
def evaluate(dataloader_val):
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val: 
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total / len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [15]:
for epoch in tqdm(range(1, EPOCHS + 1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    torch.save(model.state_dict(), f'{CHECKPOINTS}/{MODEL}_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    tqdm.write(f'Validation loss: {val_loss}')

    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()
    tqdm.write(f'Accuracy (Weighted): {accuracy_score(labels_flat, preds_flat)}')
    tqdm.write(f'Precision (Weighted): {precision_score(labels_flat, preds_flat, average="weighted")}')
    tqdm.write(f'Recall (Weighted): {recall_score(labels_flat, preds_flat, average="weighted")}')
    tqdm.write(f'F1 Score (Weighted): {f1_score(labels_flat, preds_flat, average="weighted")}')

    

  0%|          | 0/3 [21:35<?, ?it/s]


Epoch 1
Training loss: 2.642741766848482


 33%|███▎      | 1/3 [22:05<44:11, 1325.78s/it]

Validation loss: 1.7814994244754894
Accuracy (Weighted): 0.6006472491909385
Precision (Weighted): 0.5152377775948012
Recall (Weighted): 0.6006472491909385
F1 Score (Weighted): 0.5328327973108835


 33%|███▎      | 1/3 [43:32<44:11, 1325.78s/it]


Epoch 2
Training loss: 1.432109464064819


 67%|██████▋   | 2/3 [44:02<22:00, 1320.49s/it]

Validation loss: 1.3198310524695418
Accuracy (Weighted): 0.6809061488673139
Precision (Weighted): 0.6473155356786207
Recall (Weighted): 0.6809061488673139
F1 Score (Weighted): 0.640635620687231


 67%|██████▋   | 2/3 [1:05:05<22:00, 1320.49s/it]


Epoch 3
Training loss: 1.0195263704972215


100%|██████████| 3/3 [1:05:35<00:00, 1311.78s/it]

Validation loss: 1.2045164129362234
Accuracy (Weighted): 0.7249190938511327
Precision (Weighted): 0.7141110051482213
Recall (Weighted): 0.7249190938511327
F1 Score (Weighted): 0.6967257714846804



