## Importing the libraries needed

In [15]:
import pandas as pd
import numpy as np
import stopwords
import nltk
from nltk.corpus import stopwords
import re


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

In [16]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## preproc and EDA

In [18]:
train = pd.read_csv('../data/raw/1sentencenewtest.csv')

In [19]:
train.describe()

Unnamed: 0.1,Unnamed: 0
count,949.0
mean,474.0
std,274.097002
min,0.0
25%,237.0
50%,474.0
75%,711.0
max,948.0


In [20]:
sw = stopwords.words('russian')

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^а-яА-Я?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [22]:
train = train[['0']]

In [23]:
train.columns = ['texts']

In [24]:
train

Unnamed: 0,texts
0,15.03.2022 обратился на горячую линию для закр...
1,"Уже который год в ТКБ не решается ""глобальная ..."
2,Добрый день
3,"Добрый день Сегодня, зайдя в свой личный кабин..."
4,"Обслуживаюсь в Тинькофф пару лет, возникла жес..."
...,...
944,Отвратительный сервис и отношение к клиентам! ...
945,28.04.2022 обратилась в банк о возможности пер...
946,В начале 2021 года была акция по выплате 8% ке...
947,Бездействие банка и некомпетентность сотрудников


In [25]:
new_df = train[['texts']]
new_df['texts'] = new_df['texts'].apply(lambda x: clean_text(x))

In [26]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 24
VALID_BATCH_SIZE = 48
VALID_BATCH_SIZE
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('ai-forever/ruRoberta-large', truncation=True, do_lower_case=True)

In [28]:
class TestData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.texts
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [None]:
all_data_set = TestData(new_df, tokenizer, MAX_LEN)

In [31]:
test_emb_params = {'batch_size': 1,
                'shuffle': True,
                'num_workers': 0
                }
all_data_loader = DataLoader(all_data_set, **test_emb_params)

In [33]:
class MTnluClass(torch.nn.Module):
    def __init__(self):
        super(MTnluClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("ai-forever/ruRoberta-large")
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(1024, 3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
    def get_embed(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        return pooler
    
model = MTnluClass()
model.load_state_dict(torch.load('/app/hsehack_2023/models/rorubert88_n.pth', map_location='cpu'))
model.to(device)
print('ok')

Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

ok


In [63]:
one_test_data_set = TestData(new_df, tokenizer, MAX_LEN)
test_emb_params = {'batch_size': 1,
                'shuffle': False,
                'num_workers': 0
                }
one_test_data_set_loader = DataLoader(one_test_data_set, **test_emb_params)

## Creating embeddings for training Catboost (test)

In [64]:
outputs_all = []
targets_all = []
with torch.no_grad():
    for _,data in tqdm(enumerate(one_test_data_set_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        embed = model.get_embed(ids, mask, token_type_ids)
        outputs_all += embed.cpu().numpy().tolist()

949it [00:15, 61.95it/s]


In [65]:
pd.DataFrame(outputs_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.902784,1.297987,1.097821,0.473421,1.317624,-0.386039,-0.650004,0.352344,-1.403260,-0.897125,...,0.226840,1.308397,0.352750,1.283166,0.345595,-1.013453,-0.267477,1.366219,0.117605,-0.537010
1,0.944837,0.068766,0.572766,1.070573,0.882211,0.149938,-0.545597,-0.353828,-0.057254,0.895224,...,0.475763,-0.490487,-0.670113,1.174313,-0.529262,-0.447935,0.400307,0.223325,-0.919455,0.124954
2,0.796639,1.319802,1.337188,1.457283,-0.419512,-1.532776,-1.176540,-0.290054,-0.172236,-0.272150,...,-0.403010,0.322864,0.691736,1.032278,0.096567,-0.375766,0.334493,1.567537,-0.823011,-0.217747
3,1.275302,0.918977,1.185814,1.794785,0.934858,-0.178377,-1.075327,-0.810617,-0.653230,0.612582,...,0.195291,0.156671,0.038190,0.962776,-0.394968,-0.453509,0.663603,0.394088,-1.177859,0.548877
4,-0.793053,0.351079,0.368790,-0.409059,0.174641,-2.175166,0.631875,0.475518,-0.590075,-0.857128,...,0.135209,0.281186,0.527828,1.498899,0.140249,-0.097825,-0.085177,1.868248,0.258400,-0.696183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,0.369200,0.948618,0.673172,0.967206,0.485682,0.090520,-1.017609,-0.299503,-0.744506,0.155051,...,0.279977,-1.034256,-0.305458,1.236149,-0.485411,-0.403664,1.283632,0.009503,-0.377079,0.035251
945,-1.247719,0.991658,0.270783,0.428659,0.860768,-1.100553,-0.312232,1.169824,-0.932231,-0.079898,...,-0.082377,0.539750,0.937748,0.697156,0.158279,-0.720043,0.056121,1.879779,-0.028113,-0.078348
946,-1.820407,0.242353,0.623281,0.288891,0.490508,-1.459233,-0.269929,-0.344785,-0.662301,-0.942734,...,1.397328,-0.040501,0.674187,0.562148,0.199344,-0.023411,-0.075049,1.775784,-0.020173,-0.500412
947,0.800777,0.432450,0.663991,1.563629,0.178798,-0.084471,-0.484259,-0.209525,-0.471602,0.368284,...,0.387483,-0.284803,0.038228,1.723029,-0.493063,-1.122302,0.487029,0.663240,-0.910323,0.367088


In [34]:
outputs_all = []
targets_all = []
with torch.no_grad():
    for _,data in tqdm(enumerate(all_data_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        embed = model.get_embed(ids, mask, token_type_ids)
        outputs_all += embed.cpu().numpy().tolist()

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
949it [00:16, 58.35it/s]


In [35]:
test_emb = pd.DataFrame(outputs_all)

In [66]:
test_emb.to_csv('result_test_sent_true.csv')