<a href="https://www.kaggle.com/code/akscent/feature-extraction-classifer-txt?scriptVersionId=149040261" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:

!pip install pymorphy2 cleantext -U nlp_profiler textblob pymystem3
import os
import sys
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sys.path.insert(1, '/kaggle/input/ods-huawei/nlp_huawei_new2_task-master/nlp_huawei_new2_task-master/baseline_transformers')
# from dataset import *
# from model import *
# from trainer import Trainer

import torch
from torch.utils.data import Dataset
from typing import Dict
import json
from numpy import asarray
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW
from tqdm.notebook import tqdm
from textblob import TextBlob

torch.manual_seed(42)

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cleantext
  Downloading cleantext-1.1.4-py3-none-any.whl (4.9 kB)
Collecting nlp_profiler
  Downloading nlp_profiler-0.0.3-py2.py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pymystem3
  Downloading pymystem3-0.2.0-py3-none-any.whl (10 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Collecting nltk (from cleantext)
  Downloading nltk-3.8



<torch._C.Generator at 0x7dd1dd013310>

In [2]:
class FiveDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column of a dataframe lazily.

    Each item is a dict with ``ids`` (token ids) and ``mask`` (attention mask),
    both ``torch.long`` tensors padded/truncated to ``max_seq_len``. When the
    dataframe has a ``rate`` column, the item also carries ``targets``.

    Args:
        dataframe: pandas DataFrame with a ``text`` column and, optionally,
            a ``rate`` column holding integer class labels.
        tokenizer: object exposing a Hugging Face style ``encode_plus``.
        max_seq_len: fixed sequence length produced for every item.
    """

    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        self.text = dataframe['text'].tolist()
        # Targets are optional so the same class also serves unlabeled data.
        self.targets = dataframe['rate'].tolist() if 'rate' in dataframe else None
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        # Collapse every run of whitespace (including newlines) to one space.
        text = ' '.join(str(self.text[index]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[index], dtype=torch.long)
        return item

    def __len__(self) -> int:
        return len(self.text)
    

class ModelForClassification(torch.nn.Module):
    """Pretrained encoder with a two-layer classification head.

    The head takes the hidden state of the first token, projects it to 768
    units, applies ReLU and dropout, and maps it to ``num_classes``
    log-probabilities (``LogSoftmax`` over the class dimension).

    Args:
        model_path: name or path passed to ``AutoModel.from_pretrained``.
        config: dict with ``num_classes`` and ``dropout_rate``.
    """

    def __init__(self, model_path: str, config: Dict):
        super().__init__()
        self.model_name = model_path
        self.config = config
        self.n_classes = config['num_classes']
        self.dropout_rate = config['dropout_rate']
        self.bert = AutoModel.from_pretrained(self.model_name)
        # Take the encoder width from the loaded model so backbones other than
        # the 312-wide one also work; 312 stays as the fallback value.
        hidden_size = getattr(self.bert.config, 'hidden_size', 312)
        self.pre_classifier = torch.nn.Linear(hidden_size, 768)
        self.dropout = torch.nn.Dropout(self.dropout_rate)
        self.classifier = torch.nn.Linear(768, self.n_classes)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask,):
        """Return per-class log-probabilities for a batch of token ids."""
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # output[0] is the encoder's first output; index 0 along dim 1 selects
        # the first token's vector for every sequence in the batch.
        hidden_state = output[0][:, 0]
        hidden_state = self.pre_classifier(hidden_state)
        hidden_state = torch.relu(hidden_state)
        hidden_state = self.dropout(hidden_state)
        logits = self.classifier(hidden_state)
        return self.softmax(logits)


class Trainer:
    """Training, validation and inference loop for a classification model.

    The model is called as ``model(ids, mask)`` and must return per-class
    scores with classes along dimension 1. ``predict`` exponentiates these
    scores, i.e. it treats them as log-probabilities.

    Config keys read: ``device``, ``n_epochs``, ``lr`` and, optionally,
    ``weight_decay`` and ``verbose``.

    Args:
        config: trainer configuration dict (see keys above).
        class_weights: optional tensor of per-class weights for the loss.
    """

    def __init__(self, config: Dict, class_weights=None):
        self.config = config
        self.device = config['device']
        self.n_epochs = config['n_epochs']
        self.optimizer = None
        # Honour the configured weight decay; 1e-2 is AdamW's own default and
        # is what was effectively used when the key is absent.
        self.opt_fn = lambda model: AdamW(
            model.parameters(),
            lr=config['lr'],
            weight_decay=config.get('weight_decay', 1e-2),
        )
        self.model = None
        self.history = None
        if class_weights is not None:
            class_weights = class_weights.to(self.device)
            self.loss_fn = CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fn = CrossEntropyLoss()
        self.verbose = config.get('verbose', True)

    def save_history(self, path: str):
        """Dump the history to *path* as JSON and print the mean validation accuracy.

        Raises:
            RuntimeError: if no history has been recorded or loaded yet.
        """
        if self.history is None:
            raise RuntimeError("History is not available. Train the model first.")
        history = {
            'train_loss': self.history['train_loss'],
            'val_loss': self.history['val_loss'],
            'val_acc': self.history['val_acc']
        }
        val_accs = history['val_acc']
        if val_accs:  # avoid dividing by zero on an empty history
            val_acc = sum(val_accs) / len(val_accs)
            print("All ACCURACY = ", val_acc)
        with open(path, 'w') as file:
            json.dump(history, file)

    def load_history(self, path: str):
        """Load a history previously written by ``save_history``."""
        with open(path, 'r') as file:
            history = json.load(file)
        self.history = {
            'train_loss': history['train_loss'],
            'val_loss': history['val_loss'],
            'val_acc': history['val_acc']
        }

    def fit(self, model, train_dataloader, val_dataloader):
        """Train *model* for ``n_epochs`` epochs and return it in eval mode.

        After every epoch the history is written to ``history.json``; whenever
        the validation loss improves, the weights are written to
        ``best_model_weights.ckpt``. The returned model holds the weights of
        the last epoch, not necessarily the best ones.
        """
        self.model = model.to(self.device)
        self.optimizer = self.opt_fn(model)
        self.history = {
            'train_loss': [],  # one entry per training batch
            'val_loss': [],    # one entry per epoch
            'val_acc': []      # one entry per epoch
        }
        best_val_loss = float('inf')

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}")
            train_info = self.train_epoch(train_dataloader)
            val_info = self.val_epoch(val_dataloader)
            self.history['train_loss'].extend(train_info['loss'])
            self.history['val_loss'].append(val_info['loss'])
            self.history['val_acc'].append(val_info['acc'])

            if val_info['loss'] < best_val_loss:
                best_val_loss = val_info['loss']
                self.save_model_weights('best_model_weights.ckpt')

            self.save_history('history.json')

        return self.model.eval()

    def save_model_weights(self, path: str):
        """Save only the model's ``state_dict`` to *path*."""
        torch.save(self.model.state_dict(), path)

    def train_epoch(self, train_dataloader):
        """Run one optimisation pass; return ``{'loss': [per-batch losses]}``."""
        self.model.train()
        losses = []
        if self.verbose:
            train_dataloader = tqdm(train_dataloader)
        for batch in train_dataloader:
            ids = batch['ids'].to(self.device, dtype=torch.long)
            mask = batch['mask'].to(self.device, dtype=torch.long)
            targets = batch['targets'].to(self.device, dtype=torch.long)

            outputs = self.model(ids, mask)
            loss = self.loss_fn(outputs, targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_val = loss.item()
            if self.verbose:
                train_dataloader.set_description(f"Loss={loss_val:.3}")
            losses.append(loss_val)
        # Average over the batches actually seen; NaN for an empty loader
        # instead of a ZeroDivisionError.
        avg_loss = sum(losses) / len(losses) if losses else float('nan')
        print("AVG LOSS = ", avg_loss)
        return {'loss': losses}

    def val_epoch(self, val_dataloader):
        """Evaluate on *val_dataloader*; return ``{'acc': float, 'loss': float}``.

        Loss and accuracy are computed once over the concatenated outputs of
        all batches.
        """
        self.model.eval()
        all_logits = []
        all_labels = []
        if self.verbose:
            val_dataloader = tqdm(val_dataloader)
        with torch.no_grad():
            for batch in val_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                targets = batch['targets'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                all_logits.append(outputs)
                all_labels.append(targets)
        all_labels = torch.cat(all_labels).to(self.device)
        all_logits = torch.cat(all_logits).to(self.device)
        loss = self.loss_fn(all_logits, all_labels).item()
        acc = (all_logits.argmax(1) == all_labels).float().mean().item()
        print("ACCURACY for EPOCH = ", acc)
        if self.verbose:
            val_dataloader.set_description(f"Loss={loss:.3}; Acc:{acc:.3}")
        return {
            'acc': acc,
            'loss': loss
        }

    def predict(self, test_dataloader):
        """Return an array with one row of class probabilities per sample.

        Raises:
            RuntimeError: if no model has been fitted or loaded.
        """
        if self.model is None:
            raise RuntimeError("You should train the model first")
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                # The model output is exponentiated to obtain probabilities.
                preds = torch.exp(outputs)
                predictions.extend(preds.tolist())
        return asarray(predictions)

    def save(self, path: str):
        """Save model config, trainer config, model name and weights to *path*.

        Raises:
            RuntimeError: if no model has been fitted or loaded.
        """
        if self.model is None:
            raise RuntimeError("You should train the model first")
        checkpoint = {
            "config": self.model.config,
            "trainer_config": self.config,
            "model_name": self.model.model_name,
            "model_state_dict": self.model.state_dict()
        }
        torch.save(checkpoint, path)

    def plot_history(self):
        """Plot training/validation loss and validation accuracy.

        Raises:
            RuntimeError: if no history has been recorded or loaded yet.
        """
        import matplotlib.pyplot as plt

        if self.history is None:
            raise RuntimeError("History is not available. Train the model first.")

        train_loss = self.history['train_loss']
        val_loss = self.history['val_loss']
        val_acc = self.history['val_acc']

        # train_loss has one point per batch while val_loss / val_acc have one
        # point per epoch, so they need separate x-axes. Each validation point
        # is placed at the training step that ends its epoch.
        steps = range(1, len(train_loss) + 1)
        n_epochs = len(val_loss)
        steps_per_epoch = len(train_loss) / n_epochs if n_epochs else 0
        val_steps = [steps_per_epoch * (i + 1) for i in range(n_epochs)]
        epochs = range(1, len(val_acc) + 1)

        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(steps, train_loss, 'bo', label='Training loss')
        plt.plot(val_steps, val_loss, 'ro-', label='Validation loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Training steps')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(epochs, val_acc, 'g', label='Validation accuracy')
        plt.title('Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.show()

    @classmethod
    def load(cls, path: str):
        """Rebuild a trainer and its model from a checkpoint written by ``save``.

        Class weights are not part of the checkpoint, so the returned trainer
        uses an unweighted loss.

        Raises:
            RuntimeError: if a required key is missing from the checkpoint.
        """
        # Load onto CPU first so a checkpoint saved on GPU can be opened on a
        # CPU-only host; the model is moved to the configured device below.
        ckpt = torch.load(path, map_location='cpu')
        keys = ["config", "trainer_config", "model_name", "model_state_dict"]
        for key in keys:
            if key not in ckpt:
                raise RuntimeError(f"Missing key {key} in checkpoint")
        new_model = ModelForClassification(
            ckpt['model_name'],
            ckpt["config"]
        )
        new_model.load_state_dict(ckpt["model_state_dict"])
        new_trainer = cls(ckpt["trainer_config"])
        new_trainer.model = new_model
        new_trainer.model.to(new_trainer.device)
        return new_trainer

In [3]:
from textblob import TextBlob

# Create a TextBlob object from a Russian-language text
text = "Привет, как дела?"
blob = TextBlob(text)


# Lemmatize each word
# NOTE(review): the recorded output shows the words unchanged and a 0.0
# polarity — presumably TextBlob's default models do not cover Russian; confirm
# before using these values as features.
print("Основы слов:", [word.lemmatize() for word in blob.words])

# Sentiment analysis
print("Тональность текста:", blob.sentiment)


Основы слов: ['Привет', 'как', 'дела']
Тональность текста: Sentiment(polarity=0.0, subjectivity=0.0)


# Data load

In [62]:
# Directory with the competition CSV files.
PATH = "/kaggle/input/ods-huawei/"
train_data = pd.read_csv(os.path.join(PATH, "train.csv"))
test_data = pd.read_csv(os.path.join(PATH, "test.csv"))
# Encode the raw `rate` labels as integer codes; `le` is kept so the codes can
# be mapped back later with `le.inverse_transform`.
le = LabelEncoder()
train_data.rate = le.fit_transform(train_data.rate)
train_data.head()

Unnamed: 0,rate,text
0,3,Очень понравилось. Были в начале марта с соба...
1,4,В целом магазин устраивает.\nАссортимент позво...
2,4,"Очень хорошо что открылась 5 ка, теперь не над..."
3,2,Пятёрочка громко объявила о том как она заботи...
4,2,"Тесно, вечная сутолока, между рядами трудно ра..."


# Pre-clean text

In [8]:
# import re
# TOKEN_RE = re.compile(r'[а-яё]+')

# def tokenize_text(text, min_length_token=1):
#     text = text.lower()
#     tokens = TOKEN_RE.findall(text)
#     return [token for token in tokens if len(token) >= min_length_token]

# def text_cleaning(text):
#     tokens = tokenize_text(text)
#     return ' '.join(tokens)

# tqdm.pandas()
# train_data['text'] = train_data['text'].progress_apply(text_cleaning)
# test_data['text'] = test_data['text'].progress_apply(text_cleaning)

import re
import pymorphy2

from nltk.corpus import stopwords

ru_stopwords = stopwords.words('russian')
digits = [str(i) for i in range(10)]

TOKEN_RE = re.compile(r'[а-яё!.,?%]+')
lemmatizer = pymorphy2.MorphAnalyzer()

def is_valid_word(word):
    """Return the lemma of *word* if the word should be kept, else ``False``.

    A word is rejected when it is empty, starts with a digit, or is a Russian
    stop word. The result is meant to be used as a truth value; on success it
    is the first normal form reported by the morphological analyzer.
    """
    if not word or word[0].isdigit():
        return False
    # Compare case-insensitively: with the exact-case check, capitalized stop
    # words (e.g. a sentence-initial "В") were not filtered out.
    if word.lower() in ru_stopwords:
        return False
    return lemmatizer.normal_forms(word)[0]

def text_cleaning(text):
    """Strip unwanted characters and filter the words of a review.

    Keeps Latin and Cyrillic letters, digits, whitespace and ``. , ! ?``;
    collapses whitespace; then keeps, among the first 512 words, those shorter
    than 15 characters that pass ``is_valid_word``.

    Args:
        text: raw review text (``str``).

    Returns:
        The cleaned text with the kept words joined by single spaces.
    """
    # `а-яА-Я` does not include `ё`/`Ё`, so they are listed explicitly;
    # without them words such as "Пятёрочка" lost a letter.
    text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9\s.,!?]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    words = text.split()
    # Cheap length check first so over-long tokens are never lemmatized.
    cleaned_words = [word for word in words[:512] if len(word) < 15 and is_valid_word(word)]
    return ' '.join(cleaned_words)

# Enable `progress_apply` on pandas objects, then clean both splits in place.
tqdm.pandas()
train_data['text'] = train_data['text'].progress_apply(text_cleaning)
test_data['text'] = test_data['text'].progress_apply(text_cleaning)

# Word count of the cleaned text (whitespace-separated tokens).
train_data["num_words"] = train_data["text"].apply(
    lambda x: len(str(x).split()))
test_data["num_words"] = test_data["text"].apply(
    lambda x: len(str(x).split()))

In [51]:
# train_data.to_csv("cleaned_train.csv", index=False)
# test_data.to_csv("cleaned_test.csv", index=False)

In [12]:
# del zero
train_data = train_data[train_data['num_words'] != 0]
test_data = test_data[test_data['num_words'] != 0]

In [65]:
from collections import Counter

def remove_infrequent_words(dataset, min_count=3):
    """Drop words that occur fewer than *min_count* times across *dataset*.

    Args:
        dataset: sequence of strings; it is iterated twice (once to count,
            once to filter).
        min_count: minimum number of occurrences, counted over the whole
            dataset, for a word to be kept.

    Returns:
        A list with one cleaned string per input text; kept words are joined
        by single spaces.
    """
    word_counter = Counter()
    for text in dataset:
        word_counter.update(text.split())

    def remove_infrequent(text):
        # Direct O(1) count lookup per word instead of scanning a list of all
        # infrequent words.
        return ' '.join(
            word for word in text.split() if word_counter[word] >= min_count
        )

    return [remove_infrequent(text) for text in tqdm(dataset, desc="Cleaning text")]

# NOTE(review): word frequencies are counted separately in each call, so the
# test texts are filtered by test-set counts rather than by the training
# vocabulary — confirm this is intended.
cleaned_train = remove_infrequent_words(train_data['text'].tolist())
cleaned_test = remove_infrequent_words(test_data['text'].tolist())


Cleaning text:   0%|          | 0/48640 [00:00<?, ?it/s]

Cleaning text:   0%|          | 0/12167 [00:00<?, ?it/s]

In [66]:
train_data['cleaned_text'] = cleaned_train
test_data['cleaned_text'] = cleaned_test
train_data.to_csv("cleaned_train.csv", index=False)
test_data.to_csv("cleaned_test.csv", index=False)

# New data

In [4]:
PATH = "/kaggle/input/cleaned-text"
train_data = pd.read_csv(os.path.join(PATH, "cleaned_train (1).csv"))
test_data = pd.read_csv(os.path.join(PATH, "cleaned_test (1).csv"))
# del zero
train_data = train_data[train_data['num_words'] != 0]
test_data = test_data[test_data['num_words'] != 0]
train_data.head()

Unnamed: 0,rate,text,num_words,cleaned_text
0,3,Очень понравилось. Были начале марта собакой. ...,29,Очень понравилось. Были начале марта собакой. ...
1,4,В целом магазин устраивает. Ассортимент позвол...,39,В целом магазин устраивает. Ассортимент позвол...
2,4,"Очень открылась ка, далеко ехать рядом!",6,"Очень открылась ка, далеко ехать рядом!"
3,2,Пятрочка громко объявила заботится пенсионерах...,26,Пятрочка громко заботится часы посещения магаз...
4,2,"Тесно, вечная сутолока, рядами трудно разойтис...",12,"Тесно, вечная рядами трудно разойтись, грязно...."


In [25]:
# replace nan

def replace_nan_with_text(row):
    """Return ``row['cleaned_text']``, or ``row['text']`` when the former is missing."""
    cleaned = row['cleaned_text']
    return row['text'] if pd.isna(cleaned) else cleaned

train_data['cleaned_text'] = train_data.progress_apply(replace_nan_with_text, axis=1)
test_data['cleaned_text'] = test_data.progress_apply(replace_nan_with_text, axis=1)

  0%|          | 0/48640 [00:00<?, ?it/s]

  0%|          | 0/12165 [00:00<?, ?it/s]

In [26]:
def truncate_text(text, max_words=512):
    """Limit *text* to its first *max_words* whitespace-separated words.

    Text that is already short enough is returned unchanged; truncated text is
    re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) <= max_words:
        return text
    return ' '.join(tokens[:max_words])

tqdm.pandas()
train_data['cleaned_text'] = train_data['cleaned_text'].progress_apply(truncate_text)
test_data['cleaned_text'] = test_data['cleaned_text'].progress_apply(truncate_text)

  0%|          | 0/48640 [00:00<?, ?it/s]

  0%|          | 0/12165 [00:00<?, ?it/s]

In [34]:
# Idea: summarize long texts into shorter ones


from transformers import MBartTokenizer, MBartForConditionalGeneration

model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def summary_rows(article_text):
    """Generate a summary of *article_text* with the module-level seq2seq model.

    The text is tokenized to exactly 512 tokens (padded or truncated) and the
    first generated sequence is decoded without special tokens.
    """
    encoded = tokenizer(
        [article_text],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    generated = model.generate(
        input_ids=encoded["input_ids"],
        no_repeat_ngram_size=4
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def text_summary(text):
    """Summarize *text* when it is a string of more than 150 words.

    Any other value (short strings, blank strings, non-strings) is returned
    unchanged.
    """
    needs_summary = isinstance(text, str) and text.strip() and len(str(text).split()) > 150
    if not needs_summary:
        return text
    return summary_rows(text)
    

train_data['summary'] = train_data['cleaned_text'].progress_apply(text_summary)
test_data['summary'] = test_data['cleaned_text'].progress_apply(text_summary)

  0%|          | 0/48640 [00:00<?, ?it/s]

  0%|          | 0/12165 [00:00<?, ?it/s]

In [35]:
train_data

Unnamed: 0,rate,text,num_words,cleaned_text,summary
0,3,Очень понравилось. Были начале марта собакой. ...,29,Очень понравилось. Были начале марта собакой. ...,Очень понравилось. Были начале марта собакой. ...
1,4,В целом магазин устраивает. Ассортимент позвол...,39,В целом магазин устраивает. Ассортимент позвол...,В целом магазин устраивает. Ассортимент позвол...
2,4,"Очень открылась ка, далеко ехать рядом!",6,"Очень открылась ка, далеко ехать рядом!","Очень открылась ка, далеко ехать рядом!"
3,2,Пятрочка громко объявила заботится пенсионерах...,26,Пятрочка громко заботится часы посещения магаз...,Пятрочка громко заботится часы посещения магаз...
4,2,"Тесно, вечная сутолока, рядами трудно разойтис...",12,"Тесно, вечная рядами трудно разойтись, грязно....","Тесно, вечная рядами трудно разойтись, грязно...."
...,...,...,...,...,...
48635,4,"Удобный, маленький ещ обновили другие пятрки",6,"Удобный, маленький ещ обновили другие пятрки","Удобный, маленький ещ обновили другие пятрки"
48636,1,"Постоянно обман цене,написанна сумма акции ито...",26,Постоянно обман сумма акции итогу пробивают бо...,Постоянно обман сумма акции итогу пробивают бо...
48637,1,Очень хочется пожелать этому магазину стать та...,9,Очень хочется пожелать этому магазину стать таким,Очень хочется пожелать этому магазину стать таким
48638,4,"Нравится ваш магазин, персонал одекватный, пор...",6,"Нравится ваш магазин, персонал порядок.","Нравится ваш магазин, персонал порядок."


In [46]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from tqdm.notebook import tqdm

# Sequence-classification model whose per-label scores are later expanded into
# the LABEL_* feature columns.
model = AutoModelForSequenceClassification.from_pretrained("apanc/russian-sensitive-topics")
tokenizer = AutoTokenizer.from_pretrained("apanc/russian-sensitive-topics")
# NOTE(review): these lines only set attributes on the tokenizer object;
# presumably padding/truncation still have to be passed as arguments when the
# tokenizer or pipeline is called — confirm.
tokenizer.padding = True
tokenizer.truncation = True
tokenizer.max_length = 512
# NOTE(review): the device is hard-coded to the first CUDA GPU.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=torch.device("cuda:0"))

def make_pipe(text):
    """Run the current module-level ``pipe`` on *text* and return all label scores.

    Truncation is requested explicitly at call time so that inputs longer than
    512 tokens are cut by the tokenizer instead of being passed to the model
    at full length.
    """
    return pipe(text, return_all_scores=True, truncation=True, max_length=512)

tqdm.pandas()
train_data['theme_labels'] = train_data['summary'].progress_apply(make_pipe)


  0%|          | 0/48640 [00:00<?, ?it/s]



In [53]:
def extract_label_probs(row):
    """Collect the ``score`` of every label entry in ``row[0]``, in order."""
    scores = []
    for entry in row[0]:
        scores.append(entry['score'])
    return scores

train_data['label_probs'] = train_data['theme_labels'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].apply(pd.Series).add_prefix('LABEL_')], axis=1)

del train_data['label_probs']
del train_data['theme_labels']

In [59]:
# feature data
train_data.to_csv("feature_train.csv", index=False)

In [None]:
# # добавление переменных о чувствах
# from transformers import BertTokenizer, BertForSequenceClassification
# model_name = 'Skoltech/russian-sensitive-topics'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name);

# tokenized = tokenizer.batch_encode_plus(train_data[train_data["num_words"] > 80]['text'][370],
#                                         max_length = 512,
#                                         pad_to_max_length=True,
#                                         truncation=True,
#                                         return_token_type_ids=False)

# tokens_ids,mask = torch.tensor(tokenized['input_ids']),torch.tensor(tokenized['attention_mask']) 

# with torch.no_grad():
#     model_output = model(tokens_ids,mask)

# def adjust_multilabel(y, is_pred = False):
#     y_adjusted = []
#     for y_c in y:
#         y_test_curr = [0]*19
#         index = str(int(np.argmax(y_c)))
#         y_c = target_vaiables_id2topic_dict[index]
#     return y_c

# model_output

In [61]:
# text sentiment
pipe = pipeline(model="seara/rubert-tiny2-russian-sentiment", device=torch.device("cuda:0"))

tqdm.pandas()
train_data['mood'] = train_data['summary'].progress_apply(make_pipe)

  0%|          | 0/48640 [00:00<?, ?it/s]



In [62]:
train_data['label_probs'] = train_data['mood'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('MOOD_')], axis=1)

del train_data['label_probs']
del train_data['mood']

  0%|          | 0/48640 [00:00<?, ?it/s]

In [63]:
# feature data
train_data.to_csv("feature_train.csv", index=False)

Unnamed: 0,rate,text,num_words,cleaned_text,summary,LABEL_0,LABEL_1,LABEL_2,LABEL_3,LABEL_4,...,LABEL_386,LABEL_387,LABEL_388,LABEL_389,LABEL_390,LABEL_391,LABEL_392,MOOD_0,MOOD_1,MOOD_2
0,3,Очень понравилось. Были начале марта собакой. ...,29,Очень понравилось. Были начале марта собакой. ...,Очень понравилось. Были начале марта собакой. ...,0.996648,0.000086,0.000343,0.000294,0.000066,...,0.000003,0.000003,0.000002,0.000003,0.000001,0.000001,9.850864e-07,0.010388,0.986475,0.003136
1,4,В целом магазин устраивает. Ассортимент позвол...,39,В целом магазин устраивает. Ассортимент позвол...,В целом магазин устраивает. Ассортимент позвол...,0.997240,0.000076,0.000275,0.000140,0.000051,...,0.000003,0.000003,0.000002,0.000003,0.000001,0.000001,8.909273e-07,0.680389,0.243738,0.075873
2,4,"Очень открылась ка, далеко ехать рядом!",6,"Очень открылась ка, далеко ехать рядом!","Очень открылась ка, далеко ехать рядом!",0.982682,0.000592,0.001711,0.001077,0.000292,...,0.000008,0.000006,0.000004,0.000007,0.000003,0.000004,3.131172e-06,0.697352,0.186863,0.115785
3,2,Пятрочка громко объявила заботится пенсионерах...,26,Пятрочка громко заботится часы посещения магаз...,Пятрочка громко заботится часы посещения магаз...,0.391624,0.005887,0.000527,0.003289,0.001155,...,0.000053,0.000047,0.000041,0.000060,0.000027,0.000032,2.536447e-05,0.779933,0.021010,0.199057
4,2,"Тесно, вечная сутолока, рядами трудно разойтис...",12,"Тесно, вечная рядами трудно разойтись, грязно....","Тесно, вечная рядами трудно разойтись, грязно....",0.996155,0.000161,0.000396,0.000172,0.000082,...,0.000004,0.000003,0.000002,0.000003,0.000001,0.000001,1.052022e-06,0.545592,0.025131,0.429277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48635,4,"Удобный, маленький ещ обновили другие пятрки",6,"Удобный, маленький ещ обновили другие пятрки","Удобный, маленький ещ обновили другие пятрки",0.993928,0.000115,0.000334,0.000135,0.000090,...,0.000004,0.000003,0.000003,0.000005,0.000002,0.000002,1.358161e-06,0.157865,0.816660,0.025475
48636,1,"Постоянно обман цене,написанна сумма акции ито...",26,Постоянно обман сумма акции итогу пробивают бо...,Постоянно обман сумма акции итогу пробивают бо...,0.005893,0.732756,0.000896,0.002507,0.004022,...,0.000358,0.000367,0.000286,0.000251,0.000170,0.000438,2.403251e-04,0.120826,0.003329,0.875844
48637,1,Очень хочется пожелать этому магазину стать та...,9,Очень хочется пожелать этому магазину стать таким,Очень хочется пожелать этому магазину стать таким,0.976297,0.000925,0.000675,0.000224,0.000425,...,0.000010,0.000008,0.000007,0.000009,0.000005,0.000005,3.461709e-06,0.212692,0.730556,0.056752
48638,4,"Нравится ваш магазин, персонал одекватный, пор...",6,"Нравится ваш магазин, персонал порядок.","Нравится ваш магазин, персонал порядок.",0.996822,0.000127,0.000203,0.000077,0.000039,...,0.000003,0.000003,0.000002,0.000003,0.000001,0.000001,9.963611e-07,0.623831,0.361995,0.014174


In [64]:
# toxicity
from transformers import BertTokenizer, BertForSequenceClassification

# tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# batch = tokenizer.encode(train_data[train_data["num_words"] > 80]['text'][48421], return_tensors='pt')
# model(batch)

pipe = pipeline(model="SkolkovoInstitute/russian_toxicity_classifier", device=torch.device("cuda:0"))

tqdm.pandas()
train_data['toxic'] = train_data['summary'].progress_apply(make_pipe)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/48640 [00:00<?, ?it/s]



In [66]:
train_data['label_probs'] = train_data['toxic'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('TOXIC_')], axis=1)

del train_data['label_probs']
del train_data['toxic']

  0%|          | 0/48640 [00:00<?, ?it/s]

In [69]:
# emotions
import torch
from transformers import BertForSequenceClassification, AutoTokenizer

LABELS = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']
tokenizer = AutoTokenizer.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')
model = BertForSequenceClassification.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')

@torch.no_grad()
def predict_emotion(text: str) -> str:
    """
        Tokenize the text, run it through the model and return the name of the
        highest-scoring emotion
        :param text: The text to be classified
        :type text: str
        :return: The predicted emotion
    """
    encoded = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    logits = model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    best_index = torch.argmax(probabilities, dim=1).numpy()[0]

    return LABELS[best_index]

@torch.no_grad()
def predict_emotions(text: str) -> dict:
    """
        Tokenize the text, run it through the model and return the probability
        of every emotion in LABELS
        :param text: The text you want to classify
        :type text: str
        :return: A dictionary mapping each emotion name to its probability.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    # Convert the first row of the softmax output to Python floats once,
    # rather than re-converting the tensor for every label.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)[0].tolist()
    return dict(zip(LABELS, probabilities))

train_data['toxic'] = train_data['summary'].progress_apply(predict_emotions)

  0%|          | 0/48640 [00:00<?, ?it/s]

In [79]:
train_data['toxic'][0]

{'neutral': 0.0004063371161464602,
 'happiness': 0.9980689883232117,
 'sadness': 0.000311726878862828,
 'enthusiasm': 0.00048426855937577784,
 'fear': 0.0002681941259652376,
 'anger': 0.0003283586702309549,
 'disgust': 0.0001321935123996809}

In [80]:
def extract_label_probs(row):
    """Return the values of *row* in LABELS order, using 0.0 for absent labels."""
    probs = []
    for name in LABELS:
        probs.append(row.get(name, 0.0))
    return probs

train_data['label_probs'] = train_data['toxic'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('EMOTION_')], axis=1)

del train_data['label_probs']
del train_data['toxic']

  0%|          | 0/48640 [00:00<?, ?it/s]

In [82]:
# feature data
train_data.to_csv("feature_train.csv", index=False)

# The end (for now)

In [None]:
# textblob - text processing, feature generation https://textblob.readthedocs.io/en/dev/quickstart.html
# another library for text classification https://small-text.readthedocs.io/en/latest/
# word polarity https://polyglot.readthedocs.io/en/latest/
# feature processing https://github.com/jbesomi/texthero
# feature generation https://github.com/neomatrix369/nlp_profiler#Notebooks
# classification with the other pretrained models listed by Aleron https://github.com/a-milenkin/Competitive_Data_Science/blob/main/notebooks/9.2.1%20-%20Text_Embeddings.ipynb
# use these notebooks for classification https://github.com/e0xextazy/vkcup2022-first-stage/blob/main/inference.ipynb

In [None]:
train_data["num_words"] = train_data["text"].apply(
    lambda x: len(str(x).split()))
test_data["num_words"] = test_data["text"].apply(
    lambda x: len(str(x).split()))

In [None]:
train_data[train_data["num_words"] > 80]['text']

# Import Libraries

# Classes

In [None]:

MAX_LEN = 50
BATCH_SIZE = 64

# Loading data

# Label encoding

# Cleaning

In [None]:
%aimport nltk.corpus.reader.bracket_parse
%autoreload 0


In [None]:
import plotly.express as px
def show_count_by_rate(data, rate_name = None, name = "Data"):
    """Show a histogram of the ``num_words`` column, optionally coloured by *rate_name*.

    *name* is only used in the figure title.
    """
    histogram = px.histogram(
        data,
        x="num_words",
        color=rate_name,
        title=f"Number of words in {name} by rate",
    )
    histogram.update_layout(bargap=0.2)
    histogram.show()
    
show_count_by_rate(train_data, rate_name = "rate", name = 'Train_data')
show_count_by_rate(test_data, name = 'Test_data')

In [None]:
# remove all rows with 0 words and those with a word count below 30 (the second filter is commented out)
train_data = train_data[train_data['num_words'] != 0]
# train_data = train_data[train_data['num_words'] < 30]

# Train Test split

In [None]:
# train_split, val_split = train_test_split(train_data[train_data['rate'] != 4], test_size=0.15, random_state=42, 
#                                           shuffle = True, stratify=train_data[train_data['rate'] != 4]['rate'])

train_split, val_split = train_test_split(train_data, test_size=0.15, random_state=42, 
                                          shuffle = True, stratify=train_data['rate'])

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.hist(train_split['rate'], bins=10, alpha=0.5, label='Train Split')
plt.hist(val_split['rate'], bins=10, alpha=0.5, label='Validation Split')

plt.xlabel('Rate')
plt.ylabel('Frequency')
plt.legend()
plt.title('Histogram of Rates for Train and Validation Splits')

plt.show()

In [None]:
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_split['rate']), y=train_split['rate'])
weight_tensor = torch.FloatTensor(weights)
print(weights)

# Loading tokenizer from pretrained

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "cointegrated/rubert-tiny2", truncation=True, do_lower_case=True)

# Creating datasets and dataloaders

In [None]:
train_dataset = FiveDataset(train_split, tokenizer, MAX_LEN)
val_dataset = FiveDataset(val_split, tokenizer, MAX_LEN)
test_dataset = FiveDataset(test_data, tokenizer, MAX_LEN)

In [None]:
# NOTE(review): in this listing the FiveDataset objects are created before
# this cell, so they were built with the earlier MAX_LEN value; only
# BATCH_SIZE is read afterwards (DataLoader parameters and trainer config).
MAX_LEN = 100
BATCH_SIZE = 400

In [None]:
train_params = {"batch_size": BATCH_SIZE,
                "shuffle": True,
                "num_workers": 0
                }

test_params = {"batch_size": BATCH_SIZE,
               "shuffle": False,
               "num_workers": 0
               }

train_dataloader = DataLoader(train_dataset, **train_params)
val_dataloader = DataLoader(val_dataset, **test_params)
test_dataloader = DataLoader(test_dataset, **test_params)

# Loading pretrained model from Huggingface

In [None]:
config = {
    "num_classes": len(np.unique(train_split['rate'])),
    "dropout_rate": 0.1
}
model = ModelForClassification(
    "cointegrated/rubert-tiny2",
    config=config
)

# Creating Trainer object and fitting the model

In [None]:
trainer_config = {
    "lr": 3e-4,
    "n_epochs": 3,
    "weight_decay": 1e-6,
    "batch_size": BATCH_SIZE,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seed": 42,
}
t = Trainer(trainer_config, class_weights=weight_tensor)
# ,class_weights=weight_tensor

In [None]:
t.fit(
    model,
    train_dataloader,
    val_dataloader
)

# Save model

In [None]:
t.save("best_baseline_model.ckpt")

# Load pretrained Model

In [None]:
t = Trainer.load("best_baseline_model.ckpt")

# Get testset predictions


In [None]:
predictions = t.predict(test_dataloader)

In [None]:
# +1 turns the 0-based class index into a label; assumes the original ratings
# are the consecutive integers 1..N — TODO confirm against the label encoder.
predicted_classes = [np.argmax(probabilities) + 1 for probabilities in predictions]

# Create submission


In [None]:
# NOTE(review): PATH is reassigned to "/kaggle/input/cleaned-text" in the
# "New data" cell, so this read depends on which PATH value is current —
# confirm that sample_submission.csv exists there.
# NOTE(review): zero-word rows were filtered out of test_data earlier; if the
# sample submission has one row per original test review, the assignment below
# would fail on a length mismatch — confirm.
sample_submission = pd.read_csv(os.path.join(PATH, "sample_submission.csv"))
sample_submission["rate"] = predicted_classes
# sample_submission.rate = le.inverse_transform(sample_submission.rate)
sample_submission.head()

In [None]:
sample_submission.to_csv("submission.csv", index=False)

# Validation confusion matrix

In [None]:
predictions_val = t.predict(val_dataloader)

In [None]:
# +1 turns the 0-based class index into a label; assumes the original ratings
# are the consecutive integers 1..N — TODO confirm against the label encoder.
predicted_classes_val = [np.argmax(probabilities) + 1 for probabilities in predictions_val]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_classification_metrics(y_true, y_pred, model_name):
    """Print a classification report and draw the confusion matrix as a heatmap.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        model_name: label used in the printed report header.
    """
    matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    print(f"Classification Report for {model_name}:\n", report)

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Ground truth goes first (y_true), predictions second (y_pred); with the
# arguments swapped the confusion matrix is transposed and precision/recall
# are exchanged in the report.
evaluate_classification_metrics(le.inverse_transform(val_split['rate']), predicted_classes_val, "val dataset")