## Importing the libraries needed

In [5]:
import pandas as pd
import numpy as np
import stopwords
import nltk
from nltk.corpus import stopwords
import re
from transformers import AutoTokenizer, AutoModel
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

In [6]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## preproc and EDA

In [8]:
train = pd.read_csv('../data/raw/train.csv')

In [9]:
train['sentiment'].unique()

array(['+', '−', '?'], dtype=object)

In [10]:
train.describe()

Unnamed: 0.1,Unnamed: 0
count,19361.0
mean,10755.525283
std,6217.076236
min,0.0
25%,5368.0
50%,10757.0
75%,16152.0
max,21512.0


In [11]:
sw = stopwords.words('russian')

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^а-яА-Я?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [12]:
set(train['1category'])

{'?', 'Communication', 'Price', 'Quality', 'Safety'}

In [13]:
train = train.loc[train['1category'] != '?'].reset_index()

In [14]:
new_df = train[['sentence', '1category', '2category']]
new_df['sentence'] = new_df['sentence'].apply(lambda x: clean_text(x))

In [15]:
dict_lavel = {'Communication':0, 'Price':1, 'Quality':2, 'Safety':3}
new_df['1category'] = new_df['1category'].apply(lambda x: dict_lavel[x])
new_df

Unnamed: 0,sentence,1category,2category
0,получал качественные услуги,0,
1,отвратительное отношение клиентам,0,
2,"любое время дня ночи помогут, ответят, решат",0,
3,"время согласовывалось, вс делалось быстро",0,
4,абсолютное бездействие нежелание банка работат...,2,
...,...,...,...
13430,руководитель ф рин крайне неквалифицирован воп...,0,
13431,коем случае открывайте счет недостойном довери...,0,
13432,ти откровенно забили качество развивают свои м...,2,
13433,"писал мужчина очень доходчиво, финансовым язык...",0,


In [17]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 24
VALID_BATCH_SIZE = 48
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('ai-forever/ruRoberta-large', truncation=True, do_lower_case=True)

In [18]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.sentence
        self.targets = self.data['1category']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=42)

test_data=new_df.drop(train_data.index)
test_data_index = test_data.index.tolist()
test_data=test_data.reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)
all_data_set = SentimentData(new_df, tokenizer, MAX_LEN)

In [20]:
new_df

Unnamed: 0,sentence,1category,2category
0,получал качественные услуги,0,
1,отвратительное отношение клиентам,0,
2,"любое время дня ночи помогут, ответят, решат",0,
3,"время согласовывалось, вс делалось быстро",0,
4,абсолютное бездействие нежелание банка работат...,2,
...,...,...,...
13430,руководитель ф рин крайне неквалифицирован воп...,0,
13431,коем случае открывайте счет недостойном довери...,0,
13432,ти откровенно забили качество развивают свои м...,2,
13433,"писал мужчина очень доходчиво, финансовым язык...",0,


In [26]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
all_df_params = {'batch_size': 24,
                'shuffle': False,
                'num_workers': 0
                }



training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
all_data_loader = DataLoader(all_data_set, **all_df_params)

## training

In [23]:
class MTnluClass(torch.nn.Module):
    def __init__(self):
        super(MTnluClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained('ai-forever/ruRoberta-large')
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(1024, 4) # 4 classes for category

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
    def get_embed(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        return pooler
    
model = MTnluClass()
model.to(device)
print('ok')

Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

ok


In [18]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
LEARNING_RATE = 1e-05
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [19]:
def calc_roc_auc(preds, targets):
    # proba_dict = {0:np.array([1.,0.,0.]),
    #               1:np.array([0.,1.,0.]),
    #               2:np.array([0.,0.,1.])}
    # targets = np.array(list(map(lambda x: proba_dict[x], targets.cpu().numpy())))
    # preds = np.array(list(map(lambda x: proba_dict[x], preds.cpu().numpy())))
    # print(preds)
    # print(targets)
    y_preds = label_binarize(preds.cpu().numpy(), classes=[0,1,2])
    return roc_auc_score(y_preds, targets.cpu().numpy(), multi_class='ovr')

## ФАЙНТЮЮЮн]

In [20]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model
epochs = 5
for epoch in range(0, epochs):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(all_data_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        y_preds = label_binarize(big_idx.cpu().numpy(), classes=[0,1,2,3])
        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training Loss per 5000 steps: {loss_step}")
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()
        
    y_preds_test = []
    targets_test = []
    for _,data in tqdm(enumerate(testing_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        with torch.no_grad():
            outputs = model(ids, mask, token_type_ids)
            big_val, big_idx = torch.max(outputs.data, dim=1)
            y_preds = label_binarize(big_idx.cpu().numpy(), classes=[0,1,2,3])
            y_preds_test += y_preds.tolist()
            targets_test += targets.cpu().tolist()
            
    epoch_loss = tr_loss/nb_tr_steps     
    print(f"Training Loss Epoch: {epoch_loss}") 
    print(f"valid Roc Auc Epoch: {roc_auc_score(np.array(targets_test), np.array(y_preds_test), multi_class='ovr')}")
    #print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    
    # epoch_accu = (n_correct*100)/nb_tr_examples

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Loss per 5000 steps: 1.3883835077285767


560it [10:39,  1.14s/it]
56it [00:44,  1.25it/s]
0it [00:00, ?it/s]

Training Loss Epoch: 0.7049157687063728
valid Roc Auc Epoch: 0.697187171633992
Training Loss per 5000 steps: 0.6880572438240051


383it [07:18,  1.14s/it]


KeyboardInterrupt: 

In [21]:
torch.save(model.state_dict(), '/app/hsehack_2023/models/rorubert69_CAT.pth')

## Creating embeddings for training Catboost

In [25]:
model = MTnluClass()
model.load_state_dict(torch.load('/app/hsehack_2023/models/rorubert69_CAT.pth', map_location=device))
model.to(device)


Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

MTnluClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm

In [27]:
outputs_all = []
targets_all = []
with torch.no_grad():
    for _,data in tqdm(enumerate(all_data_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        embed = model.get_embed(ids, mask, token_type_ids)
        outputs_all += embed.cpu().numpy().tolist()
        targets_all += targets.cpu().numpy().tolist()

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
560it [03:42,  2.52it/s]


In [28]:
df_rtyuio = pd.DataFrame(outputs_all)
df_rtyuio.shape

(13435, 1024)

In [31]:
set(df_rtyuio['target'])

{0, 1, 2, 3}

In [None]:
df_rtyuio['target'] = targets_all


In [32]:
df_rtyuio.to_csv('categ_emb_train_final_ok.csv')