## Importing the libraries needed

In [23]:
import pandas as pd
import numpy as np
import stopwords
import nltk
from nltk.corpus import stopwords
import re


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

In [24]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## preproc and EDA

In [26]:
train = pd.read_csv('../data/raw/1sentencenewtest.csv')

In [27]:
train.describe()

Unnamed: 0.1,Unnamed: 0
count,949.0
mean,474.0
std,274.097002
min,0.0
25%,237.0
50%,474.0
75%,711.0
max,948.0


In [28]:
sw = stopwords.words('russian')

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^а-яА-Я?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [29]:
train

Unnamed: 0.1,Unnamed: 0,0
0,0,15.03.2022 обратился на горячую линию для закр...
1,1,"Уже который год в ТКБ не решается ""глобальная ..."
2,2,Добрый день
3,3,"Добрый день Сегодня, зайдя в свой личный кабин..."
4,4,"Обслуживаюсь в Тинькофф пару лет, возникла жес..."
...,...,...
944,944,Отвратительный сервис и отношение к клиентам! ...
945,945,28.04.2022 обратилась в банк о возможности пер...
946,946,В начале 2021 года была акция по выплате 8% ке...
947,947,Бездействие банка и некомпетентность сотрудников


In [30]:
train = train[['0']]
train.columns = ['texts']
new_df = train[['texts']]
new_df['texts'] = new_df['texts'].apply(lambda x: clean_text(x))

In [31]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 24
VALID_BATCH_SIZE = 48
VALID_BATCH_SIZE
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('ai-forever/ruRoberta-large', truncation=True, do_lower_case=True)

In [33]:
class TestData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.texts
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [None]:
all_data_set = TestData(new_df, tokenizer, MAX_LEN)

In [36]:
test_emb_params = {'batch_size': 1,
                'shuffle': False,
                'num_workers': 0
                }

In [37]:
all_data_loader = DataLoader(all_data_set, **test_emb_params)

## training

In [38]:
class MTnluClass(torch.nn.Module):
    def __init__(self):
        super(MTnluClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("ai-forever/ruRoberta-large")
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(1024, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
    def get_embed(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        return pooler
    
model = MTnluClass()
model.load_state_dict(torch.load('/app/hsehack_2023/models/rorubert69_CAT.pth', map_location='cpu'))
model.to(device)
print('ok')

Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

ok


In [39]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
LEARNING_RATE = 1e-05
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [40]:
def calc_roc_auc(preds, targets):
    # proba_dict = {0:np.array([1.,0.,0.]),
    #               1:np.array([0.,1.,0.]),
    #               2:np.array([0.,0.,1.])}
    # targets = np.array(list(map(lambda x: proba_dict[x], targets.cpu().numpy())))
    # preds = np.array(list(map(lambda x: proba_dict[x], preds.cpu().numpy())))
    # print(preds)
    # print(targets)
    y_preds = label_binarize(preds.cpu().numpy(), classes=[0,1,2])
    return roc_auc_score(y_preds, targets.cpu().numpy(), multi_class='ovr')

## get embed

In [43]:
outputs_all = []
targets_all = []
with torch.no_grad():
    for _,data in tqdm(enumerate(all_data_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        embed = model.get_embed(ids, mask, token_type_ids)
        outputs_all += embed.cpu().numpy().tolist()

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
949it [00:24, 38.58it/s]


In [44]:
test_emb = pd.DataFrame(outputs_all)

In [45]:
test_emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.105424,-0.029983,1.804730,0.464423,-0.169759,0.758372,0.648669,1.030470,0.312744,-0.562729,...,0.460382,-0.230731,-0.253761,1.402969,0.640821,0.191719,0.217602,-0.054679,-0.569984,-0.195901
1,0.021426,0.097600,0.824718,0.305604,0.727099,1.256473,0.539974,1.645575,-0.322572,-0.148913,...,0.033642,0.592735,-0.066000,0.577592,0.541111,0.470499,-0.566479,0.434932,0.427349,-0.008582
2,0.324814,0.035358,2.206734,1.183492,-0.400563,0.018054,0.161001,0.089501,0.916523,-0.431724,...,0.048207,-0.079259,0.851949,1.619860,1.704702,0.129969,0.121547,-0.124396,-0.071353,-0.024287
3,-0.626395,0.252436,1.275309,0.042013,0.921104,0.421728,-0.036197,1.101363,0.866821,-0.201999,...,-0.728560,0.064300,0.526804,0.907070,0.436407,0.244496,0.154220,-0.450410,-0.661164,-0.126389
4,-0.411529,0.153146,0.719210,-0.219197,0.055934,0.576629,0.351966,1.051119,0.997267,-0.194857,...,-0.211627,0.061256,0.550845,0.460794,-0.345221,0.602286,-0.506242,1.024468,1.414754,0.059959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,0.022085,-0.134628,0.899856,0.473601,-0.058273,0.502134,0.053399,0.867759,0.758415,-0.190501,...,-0.310579,0.657822,0.248714,0.548124,0.794550,0.610424,0.038128,-0.080167,0.643243,-0.129744
945,0.377225,0.457951,1.539207,0.436627,0.772954,1.523339,0.307820,0.926620,0.326569,-0.045936,...,0.176585,-1.293137,-0.180391,0.747121,-0.035019,0.192736,-0.775966,0.113963,0.554274,-0.390376
946,0.278895,-0.677258,1.518457,1.458838,1.204773,0.896946,0.451350,0.319609,0.630101,-0.010940,...,0.031515,-1.128981,-0.103006,0.022389,-1.477037,-0.204174,-1.104696,1.513376,1.225346,0.363850
947,-0.089570,0.231045,0.439491,0.759646,0.201833,0.459167,0.739456,0.605382,0.020604,-0.026076,...,0.102963,0.880153,0.285767,0.732744,0.982480,0.100637,0.417203,-0.265152,0.309159,0.016138


In [46]:
test_emb.to_csv('test_emb_cat_final.csv')