Training was done in **kuggle**

### Preparation

In [1]:
import torch
from tqdm.notebook import tqdm

import pandas as pd

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          AutoTokenizer,
                          GPT2Tokenizer,
                          AdamW, 
                          get_cosine_schedule_with_warmup,
                          GPT2ForSequenceClassification, 
                          T5ForConditionalGeneration)

epochs = 4
batch_size = 8
max_length = 612

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name_or_path = 'ai-forever/rugpt3small_based_on_gpt2'



In [2]:
df = pd.read_csv('/kaggle/input/hackathon/train_dataset_train.csv', sep=';')
df.head()

Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема
0,Лысьвенский городской округ,Благоустройство,"'Добрый день. Сегодня, 20.08.22, моя мать шла ...",★ Ямы во дворах
1,Министерство социального развития ПК,Социальное обслуживание и защита,"'Пермь г, +79194692145. В Перми с ноября 2021 ...",Оказание гос. соц. помощи
2,Министерство социального развития ПК,Социальное обслуживание и защита,'Добрый день ! Скажите пожалуйста если подовал...,Дети и многодетные семьи
3,Город Пермь,Общественный транспорт,'Каждая из них не о чем. Люди на остановках хо...,Содержание остановок
4,Министерство здравоохранения,Здравоохранение/Медицина,'В Березниках у сына привитого откоронавируса ...,Технические проблемы с записью на прием к врачу


In [3]:
# renaming columns
df["text"] = df["Группа тем"] + ': ' + df["Текст инцидента"]
df = df[['text', 'Тема']]
df = df.dropna()
df.columns = ['text', 'label']
df.head()

Unnamed: 0,text,label
0,"Благоустройство: 'Добрый день. Сегодня, 20.08....",★ Ямы во дворах
1,"Социальное обслуживание и защита: 'Пермь г, +7...",Оказание гос. соц. помощи
2,Социальное обслуживание и защита: 'Добрый день...,Дети и многодетные семьи
3,Общественный транспорт: 'Каждая из них не о че...,Содержание остановок
4,Здравоохранение/Медицина: 'В Березниках у сына...,Технические проблемы с записью на прием к врачу


In [4]:
labels = list(df.label.unique())
label2id = dict(zip(labels, list(range(len(labels)))))
n_labels = len(label2id)

In [5]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.n_examples = df.shape[0]
        self.texts = df['text'].to_list()
        self.labels = df['label'].to_list()
  
    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        return {'text':self.texts[item], 'label':self.labels[item]}

In [6]:
class Gpt2ClassificationCollator(object):
    def __init__(self, tokenizer, labels_encoder, max_sequence_len=None):
            self.use_tokenizer = tokenizer
            self.max_sequence_len = tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
            self.labels_encoder = labels_encoder


    def __call__(self, sequences):
        # Get all texts from sequences list.
        texts = [sequence['text'] for sequence in sequences]
        # Get all labels from sequences list.
        labels = [sequence['label'] for sequence in sequences]
        # Encode all labels using label encoder.
        labels = [self.labels_encoder[label] for label in labels]
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        # Update the inputs with the associated encoded labels as tensor.
        inputs.update({'labels':torch.tensor(labels)})

        return inputs

In [7]:
from torch.nn import CrossEntropyLoss

def train(dataloader, optimizer_, scheduler_, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0
    
    loss_fn = CrossEntropyLoss()

    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):

        true_labels += batch['labels'].numpy().flatten().tolist()

        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        model.zero_grad()

        outputs = model(**batch)

        loss, logits = outputs[:2]

        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer_.step()

        scheduler_.step()

        logits = logits.detach().cpu().numpy()

        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

In [8]:
def validation(dataloader, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):

        true_labels += batch['labels'].numpy().flatten().tolist()

        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        with torch.no_grad():        

            outputs = model(**batch)

            loss, logits = outputs[:2]

            logits = logits.detach().cpu().numpy()

            total_loss += loss.item()

            predict_content = logits.argmax(axis=-1).flatten().tolist()

            predictions_labels += predict_content

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

### Training

In [9]:
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id

model.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...


Downloading config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Loading tokenizer...


Downloading vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Loading model...


Downloading pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ai-forever/rugpt3small_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cuda`


#### Training with original data

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42, 
                                                    stratify=df["label"], shuffle=True)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [11]:
df_train = pd.DataFrame(data={
    "text": X_train,
    "label": y_train
})

df_val = pd.DataFrame(data={
    "text": X_val,
    "label": y_val
})

In [12]:
gpt2_classificaiton_collator = Gpt2ClassificationCollator(tokenizer=tokenizer, 
                                                          labels_encoder=label2id, 
                                                          max_sequence_len=max_length)


print('Dealing with Train...')
train_dataset = MyDataset(df=df_train, 
                               tokenizer=tokenizer)
print('Created `train_dataset` with %d examples!'%len(train_dataset))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

Dealing with Train...
Created `train_dataset` with 18502 examples!
Created `train_dataloader` with 2313 batches!



In [13]:
print('Dealing with Validation...')
valid_dataset =  MyDataset(df=df_val, 
                               tokenizer=tokenizer)
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))

Dealing with Validation...
Created `valid_dataset` with 4626 examples!
Created `eval_dataloader` with 579 batches!


In [14]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import Adafactor

optimizer = Adafactor(model.parameters(),
                  lr = 2e-3,
                  relative_step=False)

total_steps = len(train_dataloader) * epochs

scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')

    train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_f1 = f1_score(train_labels, train_predict, average='weighted')


    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_f1 = f1_score(valid_labels, valid_predict, average='weighted')

    print("train_loss: %.5f - val_loss: %.5f - train_f1: %.5f - val_f1: %.5f"%(train_loss, val_loss, train_f1, val_f1))
    print()

Epoch


  0%|          | 0/4 [00:00<?, ?it/s]


Training on batches...


  0%|          | 0/2313 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 1.96687 - val_loss: 1.47415 - train_f1: 0.41666 - val_f1: 0.50372


Training on batches...


  0%|          | 0/2313 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 1.19662 - val_loss: 1.19754 - train_f1: 0.60193 - val_f1: 0.59954


Training on batches...


  0%|          | 0/2313 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 0.72810 - val_loss: 1.26275 - train_f1: 0.74881 - val_f1: 0.62772


Training on batches...


  0%|          | 0/2313 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 0.38769 - val_loss: 1.38034 - train_f1: 0.86921 - val_f1: 0.63143



In [15]:
valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
report = classification_report(valid_labels, valid_predict)

  0%|          | 0/579 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.65      0.58      0.61       377
           2       0.68      0.82      0.74       402
           3       0.69      0.77      0.73        26
           4       0.67      0.69      0.68       188
           5       0.87      0.87      0.87       158
           6       0.61      0.78      0.68        36
           7       0.86      0.75      0.80        16
           8       0.00      0.00      0.00        23
           9       0.89      0.93      0.91       254
          10       0.00      0.00      0.00        17
          11       0.00      0.00      0.00        23
          12       0.95      0.77      0.85        47
          13       0.74      0.79      0.76        76
          14       0.57      0.67      0.62       115
          15       0.84      0.94      0.89        17
          16       1.00      0.75      0.86         4
          17       0.66    

#### Training with augmented data

In [33]:
df_aug = pd.read_csv('/kaggle/input/augmented/augmented.csv').sample(frac=1).reset_index()
df_aug.head(3)

Unnamed: 0,index,text,big_labels,smol_labels
0,8646,Как можно узнать расписание автобуса и маршрут...,Общественный транспорт,График движения общественного транспорта
1,11581,"Врач пульмонолога и его коллега. Вот и боимся,...",Здравоохранение/Медицина,"Ошибки врачей, халатность"
2,22970,"Где найти авторитеты? Им не стыдно, что в горо...",Роспотребнадзор,Санитарно-эпидемиологическое благополучие


In [34]:
# renaming columns
df_aug["text"] = df_aug["big_labels"] + ': ' + df_aug["text"]
df_aug = df_aug[['text', 'smol_labels']]
df_aug = df_aug.dropna()
df_aug.columns = ['text', 'label']
df_aug.head()

Unnamed: 0,text,label
0,Общественный транспорт: Как можно узнать распи...,График движения общественного транспорта
1,Здравоохранение/Медицина: Врач пульмонолога и ...,"Ошибки врачей, халатность"
2,Роспотребнадзор: Где найти авторитеты? Им не с...,Санитарно-эпидемиологическое благополучие
3,Коронавирус: Мне дали второй компонент ревакци...,Доступность вакцин
4,"Благоустройство: Я хочу рассказать о том, как ...",★ Нарушение правил уборки от снега и наледи вн...


In [35]:
aug_val = pd.DataFrame(data={
    "text": df_aug.text,
    "label": df_aug.label
})

In [36]:
print('Dealing with Validation...')
aug_dataset =  MyDataset(df=aug_val, 
                               tokenizer=tokenizer)
print('Created `valid_dataset` with %d examples!'%len(aug_dataset))

aug_dataloader = DataLoader(aug_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `eval_dataloader` with %d batches!'%len(aug_dataset))

Dealing with Validation...
Created `valid_dataset` with 23594 examples!
Created `eval_dataloader` with 23594 batches!


In [37]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import Adafactor

optimizer = Adafactor(model.parameters(),
                  lr = 2e-3, # default is 5e-5, our notebook had 2e-5
                  relative_step=False)

total_steps = len(train_dataloader) * epochs

scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')

    train_labels, train_predict, train_loss = train(aug_dataloader, optimizer, scheduler, device)
    train_f1 = f1_score(train_labels, train_predict, average='weighted')


    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_f1 = f1_score(valid_labels, valid_predict, average='weighted')

    print("train_loss: %.5f - val_loss: %.5f - train_f1: %.5f - val_f1: %.5f"%(train_loss, val_loss, train_f1, val_f1))
    print()

Epoch


  0%|          | 0/4 [00:00<?, ?it/s]


Training on batches...


  0%|          | 0/2950 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 1.24849 - val_loss: 1.07032 - train_f1: 0.60384 - val_f1: 0.63302


Training on batches...


  0%|          | 0/2950 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 0.77093 - val_loss: 0.85364 - train_f1: 0.74519 - val_f1: 0.71778


Training on batches...


  0%|          | 0/2950 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 0.36716 - val_loss: 0.89804 - train_f1: 0.88189 - val_f1: 0.73797


Training on batches...


  0%|          | 0/2950 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/579 [00:00<?, ?it/s]

train_loss: 0.24401 - val_loss: 0.92434 - train_f1: 0.92596 - val_f1: 0.73730



In [38]:
valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
report = classification_report(valid_labels, valid_predict)

  0%|          | 0/579 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
print(report)

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.75      0.69      0.72       377
           2       0.74      0.84      0.79       402
           3       0.78      0.81      0.79        26
           4       0.72      0.77      0.74       188
           5       0.92      0.91      0.91       158
           6       0.74      0.89      0.81        36
           7       0.85      0.69      0.76        16
           8       0.38      0.39      0.38        23
           9       0.89      0.96      0.93       254
          10       0.27      0.24      0.25        17
          11       0.20      0.04      0.07        23
          12       0.93      0.89      0.91        47
          13       0.87      0.78      0.82        76
          14       0.70      0.81      0.75       115
          15       0.76      0.94      0.84        17
          16       1.00      0.75      0.86         4
          17       0.72    

Due to the augmentation data there is can be a leak in the validation part (because we augmented all data and trained with it). Later we checked F1-score with the correct validation and the actual score was about 0.7

### Saving model's weights

In [49]:
torch.save(model.state_dict(), f"rugpt_small_labels_74.pt")