Training was done in kuggle

### Preparation

In [1]:
import torch
from tqdm.notebook import tqdm

import pandas as pd

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          AutoTokenizer,
                          GPT2Tokenizer,
                          AdamW, 
                          get_cosine_schedule_with_warmup,
                          GPT2ForSequenceClassification, 
                          T5ForConditionalGeneration)

epochs = 4
batch_size = 20
max_length = 612

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name_or_path = 'ai-forever/rugpt3small_based_on_gpt2'



In [2]:
# renaming columns
df = pd.read_csv("/home/jupyter/datasphere/project/train_dataset_train.csv", sep=';')
df = df[['Текст инцидента', 'Группа тем']]
df.columns = ['text', 'label']

In [3]:
df.head()

Unnamed: 0,text,label
0,"'Добрый день. Сегодня, 20.08.22, моя мать шла ...",Благоустройство
1,"'Пермь г, +79194692145. В Перми с ноября 2021 ...",Социальное обслуживание и защита
2,'Добрый день ! Скажите пожалуйста если подовал...,Социальное обслуживание и защита
3,'Каждая из них не о чем. Люди на остановках хо...,Общественный транспорт
4,'В Березниках у сына привитого откоронавируса ...,Здравоохранение/Медицина


In [4]:
labels = list(df.label.unique())
label2id = dict(zip(labels, list(range(len(labels)))))
n_labels = len(label2id)

In [5]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.n_examples = df.shape[0]
        self.texts = df['text'].to_list()
        self.labels = df['label'].to_list()
  
    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        return {'text':self.texts[item], 'label':self.labels[item]}

In [6]:
class Gpt2ClassificationCollator(object):
    def __init__(self, tokenizer, labels_encoder, max_sequence_len=None):
            self.use_tokenizer = tokenizer
            self.max_sequence_len = tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
            self.labels_encoder = labels_encoder


    def __call__(self, sequences):
        # Get all texts from sequences list.
        texts = [sequence['text'] for sequence in sequences]
        # Get all labels from sequences list.
        labels = [sequence['label'] for sequence in sequences]
        # Encode all labels using label encoder.
        labels = [self.labels_encoder[label] for label in labels]
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        # Update the inputs with the associated encoded labels as tensor.
        inputs.update({'labels':torch.tensor(labels)})

        return inputs


In [7]:
from torch.nn import CrossEntropyLoss

def train(dataloader, optimizer_, scheduler_, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0
    
    loss_fn = CrossEntropyLoss()

    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):

        true_labels += batch['labels'].numpy().flatten().tolist()

        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        model.zero_grad()

        outputs = model(**batch)

        loss, logits = outputs[:2]

        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer_.step()

        scheduler_.step()

        logits = logits.detach().cpu().numpy()

        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

In [8]:
def validation(dataloader, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):

        true_labels += batch['labels'].numpy().flatten().tolist()

        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        with torch.no_grad():        

            outputs = model(**batch)

            loss, logits = outputs[:2]

            logits = logits.detach().cpu().numpy()

            total_loss += loss.item()

            predict_content = logits.argmax(axis=-1).flatten().tolist()

            predictions_labels += predict_content

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

### Training

In [9]:
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id

model.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Loading tokenizer...


vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Loading model...


pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ai-forever/rugpt3small_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cuda`


#### Training with original data¶

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42, 
                                                    stratify=df["label"], shuffle=True)

In [11]:
df_train = pd.DataFrame(data={
    "text": X_train,
    "label": y_train
})

df_val = pd.DataFrame(data={
    "text": X_val,
    "label": y_val
})

In [12]:
gpt2_classificaiton_collator = Gpt2ClassificationCollator(tokenizer=tokenizer, 
                                                          labels_encoder=label2id, 
                                                          max_sequence_len=max_length)


print('Dealing with Train...')
train_dataset = MyDataset(df=df_train, 
                               tokenizer=tokenizer)
print('Created `train_dataset` with %d examples!'%len(train_dataset))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

Dealing with Train...
Created `train_dataset` with 18502 examples!
Created `train_dataloader` with 926 batches!



In [13]:
print('Dealing with Validation...')
valid_dataset =  MyDataset(df=df_val, 
                               tokenizer=tokenizer)
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))

Dealing with Validation...
Created `valid_dataset` with 4626 examples!
Created `eval_dataloader` with 232 batches!


In [14]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import Adafactor

optimizer = Adafactor(model.parameters(),
                  lr = 2e-3,
                  relative_step=False)

total_steps = len(train_dataloader) * epochs

scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')

    train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_f1 = f1_score(train_labels, train_predict, average='weighted')


    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_f1 = f1_score(valid_labels, valid_predict, average='weighted')

    print("train_loss: %.5f - val_loss: %.5f - train_f1: %.5f - val_f1: %.5f"%(train_loss, val_loss, train_f1, val_f1))
    print()

Epoch


  0%|          | 0/4 [00:00<?, ?it/s]


Training on batches...


  0%|          | 0/926 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_loss: 1.00315 - val_loss: 0.84139 - train_f1: 0.31363 - val_f1: 0.36394


Training on batches...


  0%|          | 0/926 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_loss: 0.61726 - val_loss: 0.74319 - train_f1: 0.45799 - val_f1: 0.43419


Training on batches...


  0%|          | 0/926 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_loss: 0.36380 - val_loss: 0.79848 - train_f1: 0.66044 - val_f1: 0.48903


Training on batches...


  0%|          | 0/926 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_loss: 0.20252 - val_loss: 0.96696 - train_f1: 0.86784 - val_f1: 0.49852



In [15]:
valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
report = classification_report(valid_labels, valid_predict)

  0%|          | 0/232 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(report)

              precision    recall  f1-score   support

           0       0.66      0.62      0.64       506
           1       0.87      0.90      0.88       925
           2       0.85      0.86      0.85       174
           3       0.86      0.88      0.87       935
           4       0.80      0.74      0.77       347
           5       0.67      0.67      0.67       140
           6       0.72      0.81      0.76       626
           7       0.80      0.82      0.81       541
           8       0.72      0.56      0.63       162
           9       0.00      0.00      0.00         6
          10       0.33      0.21      0.26        14
          11       0.23      0.26      0.24        47
          12       0.79      0.88      0.83        17
          13       0.78      0.63      0.70        60
          14       0.25      0.12      0.16        17
          15       0.75      0.78      0.76        27
          16       0.00      0.00      0.00         3
          17       0.61    

#### Training with augmented data

In [18]:
df_aug = pd.read_csv('augmented.csv').sample(frac=1).reset_index()

In [19]:
df_train_aug = pd.DataFrame(data={
    "text": df_aug.text,
    "label": df_aug.big_labels
})

In [20]:
print('Dealing with Train...')
train_aug_dataset = MyDataset(df=df_train_aug, 
                               tokenizer=tokenizer)
print('Created `train_dataset` with %d examples!'%len(train_aug_dataset))

train_aug_dataloader = DataLoader(train_aug_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=gpt2_classificaiton_collator)
print('Created `train_dataloader` with %d batches!'%len(train_aug_dataset))

print()

Dealing with Train...
Created `train_dataset` with 23594 examples!
Created `train_dataloader` with 23594 batches!



In [21]:
optimizer = Adafactor(model.parameters(),
                  lr = 2e-3,
                  relative_step=False)

total_steps = len(train_dataloader) * epochs

scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')

    train_aug_labels, train_aug_predict, train_aug_loss = train(train_aug_dataloader, optimizer, scheduler, device)
    train_aug_f1 = f1_score(train_aug_labels, train_aug_predict, average='weighted')


    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_f1 = f1_score(valid_labels, valid_predict, average='weighted')

    print("train_aug_loss: %.5f - val_loss: %.5f - train_aug_f1: %.5f - val_f1: %.5f"%(train_aug_loss, val_loss, train_aug_f1, val_f1))
    print()

Epoch


  0%|          | 0/4 [00:00<?, ?it/s]


Training on batches...


  0%|          | 0/1180 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_aug_loss: 0.76822 - val_loss: 0.68392 - train_aug_f1: 0.76498 - val_f1: 0.79696


Training on batches...


  0%|          | 0/1180 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_aug_loss: 0.46078 - val_loss: 0.59715 - train_aug_f1: 0.85592 - val_f1: 0.82308


Training on batches...


  0%|          | 0/1180 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_aug_loss: 0.22483 - val_loss: 0.63751 - train_aug_f1: 0.92962 - val_f1: 0.84148


Training on batches...


  0%|          | 0/1180 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/232 [00:00<?, ?it/s]

train_aug_loss: 0.16153 - val_loss: 0.68024 - train_aug_f1: 0.95157 - val_f1: 0.83846



In [22]:
valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
report = classification_report(valid_labels, valid_predict)

  0%|          | 0/232 [00:00<?, ?it/s]

In [23]:
print(report)

              precision    recall  f1-score   support

           0       0.78      0.69      0.73       506
           1       0.94      0.89      0.91       925
           2       0.94      0.86      0.90       174
           3       0.89      0.90      0.90       935
           4       0.73      0.88      0.80       347
           5       0.94      0.76      0.84       140
           6       0.83      0.83      0.83       626
           7       0.91      0.81      0.85       541
           8       0.87      0.64      0.74       162
           9       0.20      1.00      0.33         6
          10       0.86      0.43      0.57        14
          11       0.74      0.49      0.59        47
          12       0.52      0.94      0.67        17
          13       0.91      0.70      0.79        60
          14       0.10      0.76      0.18        17
          15       0.83      0.70      0.76        27
          16       0.12      1.00      0.21         3
          17       0.83    

Due to the augmentation data there is can be a leak in the validation part (because we augmented all data and trained with it). Later we checked F1-score with the correct validation and the actual score was about 0.81

### Saving model's weights

In [24]:
torch.save(model.state_dict(), f"rugpt_large_labels_84.pt")