In [4]:
import numpy as np
import pandas as pd

/kaggle/input/aaaaaaaaaaaaaaaaaaaaaaaaaaa/resampled_data.csv


In [102]:
# Read the resampled dataset; its `text` and `Segment_num` columns are used below.
df_resampled = pd.read_csv(
    filepath_or_buffer='/kaggle/input/aaaaaaaaaaaaaaaaaaaaaaaaaaa/resampled_data.csv',
)

In [103]:
# Keep only the two columns the model needs, exposing the segment id as `label`.
df_new = (
    df_resampled[['text', 'Segment_num']]
    .rename(columns={'Segment_num': 'label'})
)

In [104]:
# Drop rows whose text contains any of the marker words (case-insensitive).
# `na=False` makes missing texts count as "no match", so those rows are kept.
df_new = df_new.loc[
    lambda frame: ~frame['text'].str.contains(
        'субтитры|динамичная|позитивная', case=False, na=False
    )
]

In [105]:
from sklearn.model_selection import train_test_split

# 70 % of the rows go to training; the held-out 30 % is then halved into
# validation and test. Both splits are stratified by label and use a fixed seed.
train_data, test_val_df = train_test_split(
    df_new,
    test_size=0.3,
    random_state=42,
    stratify=df_new['label'],
)
valid_data, test_data = train_test_split(
    test_val_df,
    test_size=0.5,
    random_state=42,
    stratify=test_val_df['label'],
)

In [106]:
import torch
from torch.utils.data import Dataset

In [107]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw texts (list, tuple, numpy array, pandas Series, ...).
        targets: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Hugging Face-style tokenizer; it is called directly with the
            text and must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_len)``.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len=512):
        # Materialize as plain lists so that __getitem__ is strictly positional.
        # A pandas Series keeps its (possibly shuffled) index after a split, and
        # label-based `series[idx]` would then raise KeyError or pick another row.
        self.texts = list(texts)
        self.targets = list(targets)
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f'texts and targets must have the same length, '
                f'got {len(self.texts)} and {len(self.targets)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its token ids, attention mask and target for ``idx``."""
        text = str(self.texts[idx])
        target = self.targets[idx]

        # Calling the tokenizer is the supported replacement for the deprecated
        # `encode_plus`; the arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; drop it so the
            # DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

In [108]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

In [109]:
class BertClassifier:
    """Fine-tuning and inference wrapper around ``BertForSequenceClassification``.

    Workflow: construct, call :meth:`preparation` with the train / validation
    data, call :meth:`train`, then :meth:`predict` on raw texts.
    """

    def __init__(self, model_path, tokenizer_path, n_classes=2, epochs=1, model_save_path='/content/bert.pt'):
        """Load the pretrained encoder and attach a fresh classification head.

        Args:
            model_path: Name or path passed to ``BertForSequenceClassification.from_pretrained``.
            tokenizer_path: Name or path passed to ``BertTokenizer.from_pretrained``.
            n_classes: Number of output classes of the new linear head.
            epochs: Number of passes over the training data in :meth:`train`.
            model_save_path: File the best model (by validation accuracy) is saved to.
        """
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path = model_save_path
        self.max_len = 512
        self.epochs = epochs
        # Input width of the classification head. Read it from the config rather
        # than from encoder layer index 1, which does not exist in 1-layer models.
        self.out_features = self.model.config.hidden_size
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        # Keep the model's own label count in sync with the replaced head.
        self.model.num_labels = n_classes
        self.model.config.num_labels = n_classes
        self.model.to(self.device)

    def preparation(self, X_train, y_train, X_valid, y_valid):
        """Build datasets, data loaders, optimizer, LR scheduler and loss.

        Args:
            X_train: Training texts.
            y_train: Training class ids, aligned with ``X_train``.
            X_valid: Validation texts.
            y_valid: Validation class ids, aligned with ``X_valid``.
        """
        # create datasets (same sequence length as used by predict)
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer, max_len=self.max_len)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer, max_len=self.max_len)

        # create data loaders; validation is not shuffled so that the reported
        # mean-of-batch-means loss is reproducible between runs
        self.train_loader = DataLoader(self.train_set, batch_size=2, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=2, shuffle=False)

        # helpers initialization
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=len(self.train_loader) * self.epochs,
        )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def _run_epoch(self, loader, dataset_size, training):
        """Run one pass over ``loader``; optimize after each batch when ``training``.

        Returns:
            ``(accuracy, mean_loss)`` where accuracy is a 0-dim float64 tensor
            (correct predictions / ``dataset_size``) and mean_loss is the mean
            of the per-batch losses.
        """
        losses = []
        # Tensor accumulator so the result type does not depend on the loader
        # yielding at least one batch.
        correct_predictions = torch.zeros((), dtype=torch.long, device=self.device)

        for data in loader:
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

        accuracy = correct_predictions.double() / dataset_size
        mean_loss = np.mean(losses)
        return accuracy, mean_loss

    def fit(self):
        """Train for one epoch and return ``(train_acc, train_loss)``."""
        self.model = self.model.train()
        return self._run_epoch(self.train_loader, len(self.train_set), training=True)

    def eval(self):
        """Evaluate on the validation loader and return ``(val_acc, val_loss)``."""
        self.model = self.model.eval()
        with torch.no_grad():
            return self._run_epoch(self.valid_loader, len(self.valid_set), training=False)

    def train(self):
        """Run ``self.epochs`` epochs, keep the best model by validation accuracy.

        The whole model object is saved to ``self.model_save_path`` whenever the
        validation accuracy improves, and that best checkpoint is loaded back
        into ``self.model`` at the end.
        """
        # Start below any reachable accuracy so the first epoch is always
        # checkpointed; starting at 0 skipped saving when accuracy stayed at 0
        # and then loaded a stale or missing file.
        best_accuracy = -1.0
        saved_this_run = False

        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')

            val_acc, val_loss = self.eval()
            print(f'Val loss {val_loss} accuracy {val_acc}')
            print('-' * 10)

            if val_acc > best_accuracy:
                torch.save(self.model, self.model_save_path)
                best_accuracy = val_acc
                saved_this_run = True

        if saved_this_run:
            # The checkpoint is a fully pickled module written just above, hence
            # weights_only=False (the default became True in PyTorch 2.6).
            # Unpickling executes code: only load files this class produced.
            self.model = torch.load(
                self.model_save_path,
                map_location=self.device,
                weights_only=False,
            )

    def predict(self, text):
        """Return the predicted class id for a single text.

        Args:
            text: Raw input text; it is converted with ``str`` before tokenizing.

        Returns:
            The arg-max class index as a numpy integer scalar.
        """
        encoding = self.tokenizer(
            str(text),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The model expects a batch axis: (1, max_len).
        input_ids = encoding['input_ids'].reshape(1, -1).to(self.device)
        attention_mask = encoding['attention_mask'].reshape(1, -1).to(self.device)

        # Inference must not depend on whichever mode the model was left in,
        # and needs no autograd graph.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

In [110]:
# Classifier that gets fine-tuned below: rubert-tiny weights and tokenizer,
# a 19-way head, 5 epochs, best checkpoint written to the Kaggle working dir.
classifier = BertClassifier(
    'cointegrated/rubert-tiny',
    'cointegrated/rubert-tiny',
    n_classes=19,
    epochs=5,
    model_save_path='/kaggle/working/bert.pt',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [111]:
# Hand plain Python lists to the classifier so samples are indexed by position.
classifier.preparation(
    X_train=train_data['text'].tolist(),
    y_train=train_data['label'].tolist(),
    X_valid=valid_data['text'].tolist(),
    y_valid=valid_data['label'].tolist(),
)



In [112]:
# Fine-tune for the configured number of epochs; the model is checkpointed
# whenever validation accuracy improves and the best checkpoint is reloaded at the end.
classifier.train()

Epoch 1/5
Train loss 2.3148873379103936 accuracy 0.3338119167264896
Val loss 1.8421301969745816 accuracy 0.5503355704697986
----------
Epoch 2/5
Train loss 1.6223812102843904 accuracy 0.5886575735821967
Val loss 1.5213026276370822 accuracy 0.587248322147651
----------
Epoch 3/5
Train loss 1.3085937014269178 accuracy 0.6884422110552764
Val loss 1.3556793525114956 accuracy 0.6442953020134228
----------
Epoch 4/5
Train loss 1.1321229236668118 accuracy 0.7401292175161522
Val loss 1.2814050991863213 accuracy 0.6644295302013423
----------
Epoch 5/5
Train loss 1.0503085999398183 accuracy 0.7559224694903087
Val loss 1.260870841025506 accuracy 0.6644295302013423
----------


In [113]:
# Second wrapper with the same configuration; its model is replaced by the
# saved checkpoint in the next cell.
classifier_new = BertClassifier(
    'cointegrated/rubert-tiny',
    'cointegrated/rubert-tiny',
    n_classes=19,
    epochs=5,
    model_save_path='/kaggle/working/bert.pt',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [114]:
# Swap in the fine-tuned checkpoint. The file holds the whole pickled module
# (it was written with torch.save(model, path)), so weights_only=False is
# needed on PyTorch >= 2.6. Unpickling executes code: only load trusted files.
# The device comes from `classifier_new` itself, so this cell does not depend
# on the separately trained `classifier` object being alive.
classifier_new.model = torch.load(
    '/kaggle/working/bert.pt',
    map_location=classifier_new.device,
    weights_only=False,
)
classifier_new.model.to(classifier_new.device)
classifier_new.model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, e

In [115]:
# Predict the held-out test texts one at a time with the reloaded model.
texts = test_data['text'].tolist()
labels = test_data['label'].tolist()

predictions = list(map(classifier_new.predict, texts))

In [116]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged scores over the test set. Classes that are never predicted
# have an undefined precision; zero_division=0 scores them as 0 (the value the
# default already used) without emitting an UndefinedMetricWarning.
precision, recall, f1score = precision_recall_fscore_support(
    labels,
    predictions,
    average='macro',
    zero_division=0,
)[:3]

print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: 0.66815799360089, recall: 0.6829534662867995, f1score: 0.6594732140551114


  _warn_prf(average, modifier, msg_start, len(result))


In [117]:
# Segment dictionary; its `Segment_num` and `Segment` columns are used below.
df_info = pd.read_excel(
    io='/kaggle/input/hack-chunk-desc/segment_dict.xlsx',
)

In [118]:
# Build the segment id -> segment name lookup in this cell so it also runs on a
# fresh kernel: it was originally executed out of order, relying on a `dd`
# defined further down. Id 6 is left out, matching the mapping shown below.
dd = {
    segment_id: segment_name
    for segment_id, segment_name in zip(df_info.Segment_num, df_info.Segment)
    if segment_id != 6
}
dd

{0: 'Промо/Нет/Нет',
 1: 'Имидж/Нет/Нет',
 2: 'Имидж/Нет/Да',
 3: 'Промо/Доставка/Нет',
 4: 'Промо/Нет/Да',
 5: 'Имидж/Доставка/Нет',
 7: 'Имидж',
 8: 'Кредитование',
 9: 'Range',
 10: 'Дебетовые карты',
 11: 'Услуги бизнесу',
 12: 'Кредитные карты',
 13: 'Инвестиционные продукты',
 14: 'Экосистемные сервисы',
 15: 'Музыка',
 16: 'Колонки+Голосовой помощник',
 17: 'Клипы',
 18: 'Соц сети'}

In [119]:
# Segment ids and their names, in row order.
# NOTE(review): `labels` re-binds the name that held the test-set labels above;
# re-running the metrics cell after this one would use the wrong list.
labels = df_info['Segment_num'].tolist()
names = df_info['Segment'].tolist()

In [120]:
# Segment id -> segment name lookup; on duplicate ids the last name wins.
dd = dict(zip(labels, names))

In [121]:
# Remove segment id 6 from the lookup. `pop` with a default keeps the cell
# idempotent: re-running it no longer raises KeyError.
dd.pop(6, None)

In [129]:
# Spot-check on a single sample text (it advertises a bank credit card).
text = '''
Привет! Это снова я, Борис Петров. На Восьмое марта я сделал подарки всем девушкам в отделе. 
Маше, Оле, Лене, (особенно выделяя голосом) Юлечке Беловой. Теперь все шутят, что у меня зарплата больше, чем у шефа. 
А мне это было совсем нетрудно. Ведь с кредитной картой «Абсолют Банка» я могу купить подарки прямо сейчас, а оплатить потом.
Кредитная карта «Абсолют Банка». Подарки сейчас, а оплата потом. Условия и порядок оформления на сайте.
'''
# Uses the `classifier` instance that was trained above (not `classifier_new`).
prediction = classifier.predict(text)

# Translate the numeric class id into its segment name via the `dd` lookup.
print(f"Предсказанный класс: {dd[prediction]}")

Предсказанный класс: Кредитные карты
