In [18]:
# !pip install pymorphy2 cleantext -U nlp_profiler textblob pymystem3
import os
import sys
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sys.path.insert(1, '/kaggle/input/ods-huawei/nlp_huawei_new2_task-master/nlp_huawei_new2_task-master/baseline_transformers')
# from dataset import *
# from model import *
# from trainer import Trainer

import torch
from torch.utils.data import Dataset
from typing import Dict
import json
from numpy import asarray
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW
from tqdm.notebook import tqdm
from textblob import TextBlob

torch.manual_seed(42)

<torch._C.Generator at 0x7ae1709ead10>

In [19]:
class FiveDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        self.text = dataframe['text'].tolist()
        self.targets = None
        if 'rate' in dataframe:
            self.targets = dataframe['rate'].tolist()
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        text = str(self.text[index])
        text = ' '.join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        if self.targets is not None:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
            }

    def __len__(self) -> int:
        return len(self.text)
    

class ModelForClassification(torch.nn.Module):

    def __init__(self, model_path: str, config: Dict):
        super(ModelForClassification, self).__init__()
        self.model_name = model_path
        self.config = config
        self.n_classes = config['num_classes']
        self.dropout_rate = config['dropout_rate']
        self.bert = AutoModel.from_pretrained(self.model_name)
        self.pre_classifier = torch.nn.Linear(312, 768)
        self.dropout = torch.nn.Dropout(self.dropout_rate)
        self.classifier = torch.nn.Linear(768, self.n_classes)
        self.softmax = torch.nn.LogSoftmax(dim = 1)

    def forward(self, input_ids, attention_mask,):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_state = output[0]
        hidden_state = hidden_state[:, 0]
        hidden_state = self.pre_classifier(hidden_state)
        hidden_state = torch.nn.ReLU()(hidden_state)
        hidden_state = self.dropout(hidden_state)
        output = self.classifier(hidden_state)
        output = self.softmax(output)
        return output


class Trainer:
    def __init__(self, config: Dict, class_weights=None):
        self.config = config
        self.device = config['device']
        self.n_epochs = config['n_epochs']
        self.optimizer = None
        self.opt_fn = lambda model: AdamW(model.parameters(), config['lr'])
        self.model = None
        self.history = None
        if class_weights is not None:
            class_weights = class_weights.to(self.device)
            self.loss_fn = CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fn = CrossEntropyLoss()
        self.device = config['device']
        self.verbose = config.get('verbose', True)
        
    def save_history(self, path: str):
        history = {
            'train_loss': self.history['train_loss'],
            'val_loss': self.history['val_loss'],
            'val_acc': self.history['val_acc']
        }
        val_acc = sum(self.history['val_acc']) / len(self.history['val_acc'])
        print("All ACCURACY = ", val_acc)
        with open(path, 'w') as file:
            json.dump(history, file)
        
    def load_history(self, path: str):
        with open(path, 'r') as file:
            history = json.load(file)
        self.history = {
            'train_loss': history['train_loss'],
            'val_loss': history['val_loss'],
            'val_acc': history['val_acc']
        }

    def fit(self, model, train_dataloader, val_dataloader):
        self.model = model.to(self.device)
        self.optimizer = self.opt_fn(model)
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_acc': []
        }
        best_val_loss = float('inf')

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}")
            train_info = self.train_epoch(train_dataloader)
            val_info = self.val_epoch(val_dataloader)
            self.history['train_loss'].extend(train_info['loss'])
            self.history['val_loss'].extend([val_info['loss']])
            self.history['val_acc'].extend([val_info['acc']])

            if val_info['loss'] < best_val_loss:
                best_val_loss = val_info['loss']
                self.save_model_weights('best_model_weights.ckpt')

            self.save_history('history.json')

        return self.model.eval()

    def save_model_weights(self, path: str):
        torch.save(self.model.state_dict(), path)



    def train_epoch(self, train_dataloader):
        self.model.train()
        losses = []
        total_loss = 0
        if self.verbose:
            train_dataloader = tqdm(train_dataloader)
        for batch in train_dataloader:
            ids = batch['ids'].to(self.device, dtype=torch.long)
            mask = batch['mask'].to(self.device, dtype=torch.long)
            targets = batch['targets'].to(self.device, dtype=torch.long)

            outputs = self.model(ids, mask)
            loss = self.loss_fn(outputs, targets)
            total_loss += loss.item()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_val = loss.item()
            if self.verbose:
                train_dataloader.set_description(f"Loss={loss_val:.3}")
            losses.append(loss_val)
        avg_loss = total_loss / len(train_dataloader)
        print("AVG LOSS = ", avg_loss)
        return {'loss': losses}

    def val_epoch(self, val_dataloader):
        self.model.eval()
        all_logits = []
        all_labels = []
        if self.verbose:
            val_dataloader = tqdm(val_dataloader)
        with torch.no_grad():
            for batch in val_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                targets = batch['targets'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                all_logits.append(outputs)
                all_labels.append(targets)
        all_labels = torch.cat(all_labels).to(self.device)
        all_logits = torch.cat(all_logits).to(self.device)
        loss = self.loss_fn(all_logits, all_labels).item()
        acc = (all_logits.argmax(1) == all_labels).float().mean().item()
        print("ACCURACY for EPOCH = ", acc)
        if self.verbose:
            val_dataloader.set_description(f"Loss={loss:.3}; Acc:{acc:.3}")
        return {
            'acc': acc,
            'loss': loss
        }

    def predict(self, test_dataloader):
        if not self.model:
            raise RuntimeError("You should train the model first")
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                preds = torch.exp(outputs)
                predictions.extend(preds.tolist())
        return asarray(predictions)

    def save(self, path: str):
        if self.model is None:
            raise RuntimeError("You should train the model first")
        checkpoint = {
            "config": self.model.config,
            "trainer_config": self.config,
            "model_name": self.model.model_name,
            "model_state_dict": self.model.state_dict()
        }
        torch.save(checkpoint, path)

    def plot_history(self):
        import matplotlib.pyplot as plt
        
        if self.history is None:
            raise RuntimeError("History is not available. Train the model first.")

        train_loss = self.history['train_loss']
        val_loss = self.history['val_loss']
        val_acc = self.history['val_acc']

        epochs = range(1, len(train_loss) + 1)

        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(epochs, train_loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'r', label='Validation loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(epochs, val_acc, 'g', label='Validation accuracy')
        plt.title('Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.show()


    @classmethod
    def load(cls, path: str):
        ckpt = torch.load(path)
        keys = ["config", "trainer_config", "model_state_dict"]
        for key in keys:
            if key not in ckpt:
                raise RuntimeError(f"Missing key {key} in checkpoint")
        new_model = ModelForClassification(
            ckpt['model_name'],
            ckpt["config"]
        )
        new_model.load_state_dict(ckpt["model_state_dict"])
        new_trainer = cls(ckpt["trainer_config"])
        new_trainer.model = new_model
        new_trainer.model.to(new_trainer.device)
        return new_trainer

# Data load

In [3]:
PATH = "/kaggle/input/ods-huawei/"
train_data = pd.read_csv(os.path.join(PATH, "train.csv"))
test_data = pd.read_csv(os.path.join(PATH, "test.csv"))
le = LabelEncoder()
train_data.rate = le.fit_transform(train_data.rate)
train_data.head()

Unnamed: 0,rate,text
0,3,Очень понравилось. Были в начале марта с соба...
1,4,В целом магазин устраивает.\nАссортимент позво...
2,4,"Очень хорошо что открылась 5 ка, теперь не над..."
3,2,Пятёрочка громко объявила о том как она заботи...
4,2,"Тесно, вечная сутолока, между рядами трудно ра..."


# Pre-clean text

In [None]:
# import re
# TOKEN_RE = re.compile(r'[а-яё]+')

# def tokenize_text(text, min_length_token=1):
#     text = text.lower()
#     tokens = TOKEN_RE.findall(text)
#     return [token for token in tokens if len(token) >= min_length_token]

# def text_cleaning(text):
#     tokens = tokenize_text(text)
#     return ' '.join(tokens)

# tqdm.pandas()
# train_data['text'] = train_data['text'].progress_apply(text_cleaning)
# test_data['text'] = test_data['text'].progress_apply(text_cleaning)

import re
import pymorphy2

from nltk.corpus import stopwords

ru_stopwords = stopwords.words('russian')
digits = [str(i) for i in range(10)]

TOKEN_RE = re.compile(r'[а-яё!.,?%]+')
lemmatizer = pymorphy2.MorphAnalyzer()

def is_valid_word(word):
    if not word[0].isdigit() and word not in ru_stopwords:
        parsed_word = lemmatizer.normal_forms(word)[0]
        return parsed_word
    return False

def text_cleaning(text):
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s.,!?]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    words = text.split()
    cleaned_words = [word for word in words[:512] if is_valid_word(word) and len(word) < 15]
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

tqdm.pandas()
train_data['text'] = train_data['text'].progress_apply(text_cleaning)
test_data['text'] = test_data['text'].progress_apply(text_cleaning)

train_data["num_words"] = train_data["text"].apply(
    lambda x: len(str(x).split()))
test_data["num_words"] = test_data["text"].apply(
    lambda x: len(str(x).split()))

In [None]:
# train_data.to_csv("cleaned_train.csv", index=False)
# test_data.to_csv("cleaned_test.csv", index=False)

In [None]:
# del zero
train_data = train_data[train_data['num_words'] != 0]
test_data = test_data[test_data['num_words'] != 0]

In [None]:
from collections import Counter

def remove_infrequent_words(dataset, min_count=3):
    word_counter = Counter()
    for text in dataset:
        words = text.split()
        word_counter.update(words)
    infrequent_words = [word for word, count in word_counter.items() if count < min_count]
    def remove_infrequent(text):
        words = text.split()
        cleaned_words = [word for word in words if word not in infrequent_words]
        cleaned_text = ' '.join(cleaned_words)
        return cleaned_text
    cleaned_dataset = [remove_infrequent(text) for text in tqdm(dataset, desc="Cleaning text")]

    return cleaned_dataset

cleaned_train = remove_infrequent_words(train_data['text'].tolist())
cleaned_test = remove_infrequent_words(test_data['text'].tolist())


In [None]:
train_data['cleaned_text'] = cleaned_train
test_data['cleaned_text'] = cleaned_test
train_data.to_csv("cleaned_train.csv", index=False)
test_data.to_csv("cleaned_test.csv", index=False)

# New data

In [None]:
PATH = "/kaggle/input/cleaned-text"
train_data = pd.read_csv(os.path.join(PATH, "cleaned_train (1).csv"))
test_data = pd.read_csv(os.path.join(PATH, "cleaned_test (1).csv"))
# del zero
train_data = train_data[train_data['num_words'] != 0]
test_data = test_data[test_data['num_words'] != 0]
train_data.head()

In [None]:
# replace nan

def replace_nan_with_text(row):
    if pd.isna(row['cleaned_text']):
        return row['text']
    return row['cleaned_text']

train_data['cleaned_text'] = train_data.progress_apply(replace_nan_with_text, axis=1)
test_data['cleaned_text'] = test_data.progress_apply(replace_nan_with_text, axis=1)

In [None]:
def truncate_text(text, max_words=512):
    words = text.split()
    if len(words) > max_words:
        truncated_text = ' '.join(words[:max_words])
    else:
        truncated_text = text
    return truncated_text

tqdm.pandas()
train_data['cleaned_text'] = train_data['cleaned_text'].progress_apply(truncate_text)
test_data['cleaned_text'] = test_data['cleaned_text'].progress_apply(truncate_text)

In [None]:
# идея суммирования текста в более короткий текст


from transformers import MBartTokenizer, MBartForConditionalGeneration

model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def summary_rows(article_text):
    input_ids = tokenizer(
        [article_text],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    output_ids = model.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4
    )[0]

    summary = tokenizer.decode(output_ids, skip_special_tokens=True)
    return summary

def text_summary(text):
    if isinstance(text, str) and text.strip() and len(str(text).split()) > 150:
        return summary_rows(text)
    else:
        return text
    

train_data['summary'] = train_data['cleaned_text'].progress_apply(text_summary)
test_data['summary'] = test_data['cleaned_text'].progress_apply(text_summary)

In [None]:
train_data

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from tqdm.notebook import tqdm

model = AutoModelForSequenceClassification.from_pretrained("apanc/russian-sensitive-topics")
tokenizer = AutoTokenizer.from_pretrained("apanc/russian-sensitive-topics")
tokenizer.padding = True
tokenizer.truncation = True
tokenizer.max_length = 512
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=torch.device("cuda:0"))

def make_pipe(text):
    return pipe(text, return_all_scores=True)

tqdm.pandas()
train_data['theme_labels'] = train_data['summary'].progress_apply(make_pipe)


In [None]:
def extract_label_probs(row):
    label_probs = [label['score'] for label in row[0]]
    return label_probs

train_data['label_probs'] = train_data['theme_labels'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].apply(pd.Series).add_prefix('LABEL_')], axis=1)

del train_data['label_probs']
del train_data['theme_labels']

In [None]:
# feature data
train_data.to_csv("feature_train.csv", index=False)

In [None]:
# # добавление переменных о чувствах
# from transformers import BertTokenizer, BertForSequenceClassification
# model_name = 'Skoltech/russian-sensitive-topics'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name);

# tokenized = tokenizer.batch_encode_plus(train_data[train_data["num_words"] > 80]['text'][370],
#                                         max_length = 512,
#                                         pad_to_max_length=True,
#                                         truncation=True,
#                                         return_token_type_ids=False)

# tokens_ids,mask = torch.tensor(tokenized['input_ids']),torch.tensor(tokenized['attention_mask']) 

# with torch.no_grad():
#     model_output = model(tokens_ids,mask)

# def adjust_multilabel(y, is_pred = False):
#     y_adjusted = []
#     for y_c in y:
#         y_test_curr = [0]*19
#         index = str(int(np.argmax(y_c)))
#         y_c = target_vaiables_id2topic_dict[index]
#     return y_c

# model_output

In [None]:
# тональность текста
pipe = pipeline(model="seara/rubert-tiny2-russian-sentiment", device=torch.device("cuda:0"))

tqdm.pandas()
train_data['mood'] = train_data['summary'].progress_apply(make_pipe)

In [None]:
train_data['label_probs'] = train_data['mood'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('MOOD_')], axis=1)

del train_data['label_probs']
del train_data['mood']

In [None]:
# feature data
train_data.to_csv("feature_train.csv", index=False)

In [None]:
# токичность
from transformers import BertTokenizer, BertForSequenceClassification

# tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# batch = tokenizer.encode(train_data[train_data["num_words"] > 80]['text'][48421], return_tensors='pt')
# model(batch)

pipe = pipeline(model="SkolkovoInstitute/russian_toxicity_classifier", device=torch.device("cuda:0"))

tqdm.pandas()
train_data['toxic'] = train_data['summary'].progress_apply(make_pipe)

In [None]:
train_data['label_probs'] = train_data['toxic'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('TOXIC_')], axis=1)

del train_data['label_probs']
del train_data['toxic']

In [None]:
# эмоции
import torch
from transformers import BertForSequenceClassification, AutoTokenizer

LABELS = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']
tokenizer = AutoTokenizer.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')
model = BertForSequenceClassification.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')

@torch.no_grad()
def predict_emotion(text: str) -> str:
    """
        We take the input text, tokenize it, pass it through the model, and then return the predicted label
        :param text: The text to be classified
        :type text: str
        :return: The predicted emotion
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted = torch.argmax(predicted, dim=1).numpy()
        
    return LABELS[predicted[0]]

@torch.no_grad()    
def predict_emotions(text: str) -> list:
    """
        It takes a string of text, tokenizes it, feeds it to the model, and returns a dictionary of emotions and their
        probabilities
        :param text: The text you want to classify
        :type text: str
        :return: A dictionary of emotions and their probabilities.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    emotions_list = {}
    for i in range(len(predicted.numpy()[0].tolist())):
        emotions_list[LABELS[i]] = predicted.numpy()[0].tolist()[i]
    return emotions_list

train_data['toxic'] = train_data['summary'].progress_apply(predict_emotions)

In [None]:
train_data['toxic'][0]

In [None]:
def extract_label_probs(row):
    label_probs = [row.get(label, 0.0) for label in LABELS]
    return label_probs

train_data['label_probs'] = train_data['toxic'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('EMOTION_')], axis=1)

del train_data['label_probs']
del train_data['toxic']

In [None]:
# feature data
train_data.to_csv("feature_train.csv", index=False)

# the end poka

In [None]:
# textblob - обработка текста, генерация фич https://textblob.readthedocs.io/en/dev/quickstart.html
# еще одна библиотека для классификации текстов https://small-text.readthedocs.io/en/latest/
# полярность слов https://polyglot.readthedocs.io/en/latest/
# обработка фич https://github.com/jbesomi/texthero
# фичегенерация https://github.com/neomatrix369/nlp_profiler#Notebooks
# классификация на других предобученных моделях, перечисленных у Алерона https://github.com/a-milenkin/Competitive_Data_Science/blob/main/notebooks/9.2.1%20-%20Text_Embeddings.ipynb
# использовать эти ноутбуки для классификации https://github.com/e0xextazy/vkcup2022-first-stage/blob/main/inference.ipynb

In [4]:
PATH = "/kaggle/input/ods-huawei/"
train_data = pd.read_csv(os.path.join(PATH, "feature_train.csv"))

In [5]:
train_data["num_words"] = train_data["text"].apply(
    lambda x: len(str(x).split()))
test_data["num_words"] = test_data["text"].apply(
    lambda x: len(str(x).split()))

In [7]:
train_data.head(10)

Unnamed: 0,rate,text,num_words,cleaned_text,summary,LABEL_0,LABEL_1,LABEL_2,LABEL_3,LABEL_4,...,MOOD_2,TOXIC_0,TOXIC_1,EMOTION_0,EMOTION_1,EMOTION_2,EMOTION_3,EMOTION_4,EMOTION_5,EMOTION_6
0,3,Очень понравилось. Были начале марта собакой. ...,29,Очень понравилось. Были начале марта собакой. ...,Очень понравилось. Были начале марта собакой. ...,0.996648,8.6e-05,0.000343,0.000294,6.6e-05,...,0.003136,0.998674,0.001326,0.000406,0.998069,0.000312,0.000484,0.000268,0.000328,0.000132
1,4,В целом магазин устраивает. Ассортимент позвол...,39,В целом магазин устраивает. Ассортимент позвол...,В целом магазин устраивает. Ассортимент позвол...,0.99724,7.6e-05,0.000275,0.00014,5.1e-05,...,0.075873,0.998687,0.001313,0.999554,7.6e-05,0.000105,8.1e-05,6.9e-05,7.2e-05,4.3e-05
2,4,"Очень открылась ка, далеко ехать рядом!",6,"Очень открылась ка, далеко ехать рядом!","Очень открылась ка, далеко ехать рядом!",0.982682,0.000592,0.001711,0.001077,0.000292,...,0.115785,0.997499,0.002501,0.001433,0.941779,0.050075,0.001406,0.002641,0.001579,0.001087
3,2,Пятрочка громко объявила заботится пенсионерах...,26,Пятрочка громко заботится часы посещения магаз...,Пятрочка громко заботится часы посещения магаз...,0.391624,0.005887,0.000527,0.003289,0.001155,...,0.199057,0.996688,0.003312,0.999452,9.1e-05,0.000123,0.000112,7.6e-05,9.7e-05,4.9e-05
4,2,"Тесно, вечная сутолока, рядами трудно разойтис...",12,"Тесно, вечная рядами трудно разойтись, грязно....","Тесно, вечная рядами трудно разойтись, грязно....",0.996155,0.000161,0.000396,0.000172,8.2e-05,...,0.429277,0.99525,0.00475,0.000524,0.000424,0.996605,0.000305,0.000807,0.000924,0.000411
5,3,Магазин пешей доступности. После ремонта рекон...,13,Магазин пешей доступности. После ремонта рекон...,Магазин пешей доступности. После ремонта рекон...,0.769079,0.001755,0.000365,0.002494,0.001035,...,0.004294,0.99861,0.00139,0.999216,0.00023,0.000144,0.000119,9.3e-05,0.000132,6.6e-05
6,4,Магазин хороший цены скидки нормальные токо вр...,13,Магазин хороший цены скидки нормальные токо вр...,Магазин хороший цены скидки нормальные токо вр...,0.994967,0.000159,0.00113,0.000186,0.00024,...,0.028344,0.998878,0.001122,0.999532,0.0001,9e-05,8.6e-05,6.7e-05,7.8e-05,4.7e-05
7,2,"Редко сюда забегаю. Маленький магазинчик, это ...",22,"Редко сюда забегаю. Маленький магазинчик, это ...","Редко сюда забегаю. Маленький магазинчик, это ...",0.948751,0.000679,0.000612,0.019587,0.00126,...,0.232759,0.995381,0.004619,0.998663,0.000173,0.000143,0.0005,0.000154,0.000271,9.6e-05
8,4,Сложно найти торговом центре. А магазин норм,7,Сложно найти торговом центре. А магазин норм,Сложно найти торговом центре. А магазин норм,0.565123,0.006904,0.000616,0.007276,0.004125,...,0.053535,0.998884,0.001117,0.99954,7.8e-05,0.000102,9e-05,7.3e-05,7.4e-05,4.3e-05
9,3,После ремонта магазин нутри стал ещ лучше. Бол...,18,После ремонта магазин нутри стал ещ лучше. Бол...,После ремонта магазин нутри стал ещ лучше. Бол...,0.993944,0.000154,0.000255,0.000182,0.000107,...,0.120662,0.999014,0.000986,0.999063,0.000261,0.000107,0.00022,9.2e-05,0.000182,7.5e-05


# TFIDF

In [23]:
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm

# from sklearn.cluster import MiniBatchKMeans
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE

# tfidf = TfidfVectorizer(
#     min_df = 5,
#     max_df = 0.85,
#     max_features = 8000,
# )
# tfidf.fit(train_data.summary)
# text = tfidf.transform(train_data.summary)

# def find_optimal_clusters(data, max_k):
#     iters = range(2, max_k+1, 2)
    
#     sse = []
#     for k in iters:
#         sse.append(MiniBatchKMeans(n_clusters=k, init_size=1024, batch_size=2048, random_state=20).fit(data).inertia_)
#         print('Fit {} clusters'.format(k))
        
#     f, ax = plt.subplots(1, 1)
#     ax.plot(iters, sse, marker='o')
#     ax.set_xlabel('Cluster Centers')
#     ax.set_xticks(iters)
#     ax.set_xticklabels(iters)
#     ax.set_ylabel('SSE')
#     ax.set_title('SSE by Cluster Center Plot')

# def plot_tsne_pca(data, labels):
#     max_label = max(labels)
#     max_items = np.random.choice(range(data.shape[0]), size=3000, replace=False)
    
#     pca = PCA(n_components=2).fit_transform(np.asarray(data[max_items,:].todense()))
#     tsne = TSNE().fit_transform(PCA(n_components=50).fit_transform(np.asarray(data[max_items,:].todense())))
    
    
#     idx = np.random.choice(range(pca.shape[0]), size=300, replace=False)
#     label_subset = labels[max_items]
#     label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]
    
#     f, ax = plt.subplots(1, 2, figsize=(14, 6))
    
#     ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
#     ax[0].set_title('PCA Cluster Plot')
    
#     ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
#     ax[1].set_title('TSNE Cluster Plot')

# find_optimal_clusters(text, 20)

# plot_tsne_pca(text, clusters)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000,
                                 min_df=0.01,
                                 use_idf=True, tokenizer=token_and_stem, ngram_range=(1,3))
get_ipython().magic('time tfidf_matrix = tfidf_vectorizer.fit_transform(titles)')
print(tfidf_matrix.shape)

In [None]:
num_clusters = 5

# Метод к-средних - KMeans
from sklearn.cluster import KMeans

km = KMeans(n_clusters=num_clusters)
get_ipython().magic('time km.fit(tfidf_matrix)')
idx = km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

print(clusters)
print (km.labels_)

# MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans

mbk  = MiniBatchKMeans(init='random', n_clusters=num_clusters) #(init='k-means++', ‘random’ or an ndarray)
mbk.fit_transform(tfidf_matrix)
%time mbk.fit(tfidf_matrix)
miniclusters = mbk.labels_.tolist()
print (mbk.labels_)


# DBSCAN
from sklearn.cluster import DBSCAN
get_ipython().magic('time db = DBSCAN(eps=0.3, min_samples=10).fit(tfidf_matrix)')
labels = db.labels_
labels.shape
print(labels)

# Аггломеративная класстеризация
from sklearn.cluster import AgglomerativeClustering

agglo1 = AgglomerativeClustering(n_clusters=num_clusters, affinity='euclidean') #affinity можно выбрать любое или попробовать все по очереди: cosine, l1, l2, manhattan
get_ipython().magic('time answer = agglo1.fit_predict(tfidf_matrix.toarray())')
answer.shape

# NLP profiler

In [37]:
!pip install -U nlp-profiler > installer_log.txt

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scattertext 0.1.19 requires gensim>=4.0.0, but you have gensim 3.8.1 which is incompatible.
texthero 1.1.0 requires spacy<3.0.0, but you have spacy 3.6.1 which is incompatible.[0m[31m
[0m

In [39]:
from nlp_profiler.core import apply_text_profiling

profiled_text_dataframe = apply_text_profiling(train_data, 'text')

final params: {'high_level': True, 'granular': True, 'grammar_check': False, 'spelling_check': True, 'parallelisation_method': 'default'}


  0%|                                                                                                         …



  0%|                                                                                                         …

  0%|                                                                                                         …

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_p

  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …

  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …



  0%|                                                                                                         …

  0%|                                                                                                         …



  0%|                                                                                                         …



In [40]:
profiled_text_dataframe

Unnamed: 0,text,sentences_count,characters_count,spaces_count,count_words,duplicates_count,chars_excl_spaces_count,emoji_count,whole_numbers_count,alpha_numeric_count,...,noun_phase_count,sentiment_polarity_score,sentiment_polarity,sentiment_polarity_summarised,sentiment_subjectivity_score,sentiment_subjectivity,sentiment_subjectivity_summarised,spelling_quality_score,spelling_quality,spelling_quality_summarised
0,Очень понравилось. Были начале марта собакой. ...,8,203,28,27,1,175,0,0,0,...,22,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.277778,Pretty bad,Bad
1,В целом магазин устраивает. Ассортимент позвол...,5,327,38,45,4,289,0,0,0,...,40,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.207547,Pretty bad,Bad
2,"Очень открылась ка, далеко ехать рядом!",1,39,5,6,0,34,0,0,0,...,6,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.250000,Pretty bad,Bad
3,Пятрочка громко объявила заботится пенсионерах...,3,204,25,27,3,179,0,0,0,...,25,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.235294,Pretty bad,Bad
4,"Тесно, вечная сутолока, рядами трудно разойтис...",3,85,11,12,2,74,0,0,0,...,11,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.333333,Pretty bad,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48635,"Удобный, маленький ещ обновили другие пятрки",1,44,5,6,0,39,0,0,0,...,6,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.142857,Pretty bad,Bad
48636,"Постоянно обман цене,написанна сумма акции ито...",2,226,25,30,3,201,0,0,0,...,30,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.230769,Pretty bad,Bad
48637,Очень хочется пожелать этому магазину стать та...,1,67,8,9,0,59,0,0,0,...,9,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.100000,Quite bad,Bad
48638,"Нравится ваш магазин, персонал одекватный, пор...",1,51,5,6,1,46,0,0,0,...,6,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.333333,Pretty bad,Bad


In [41]:
profiled_text_dataframe.to_csv("feature_profiler_train.csv", index=False)

# Import Libraries

# CLasses

In [None]:

MAX_LEN = 50
BATCH_SIZE = 64

# Loading data

# Label encoding

# Cleaning

In [None]:
%aimport nltk.corpus.reader.bracket_parse
%autoreload 0


In [None]:
import plotly.express as px
def show_count_by_rate(data, rate_name = None, name = "Data"):
    fig = px.histogram(data, x="num_words", color=rate_name, title=f"Number of words in {name} by rate")
    fig.update_layout(bargap=0.2)

    fig.show()
    
show_count_by_rate(train_data, rate_name = "rate", name = 'Train_data')
show_count_by_rate(test_data, name = 'Test_data')

In [None]:
# удалим все 0 и те, чье количество меньше 30
train_data = train_data[train_data['num_words'] != 0]
# train_data = train_data[train_data['num_words'] < 30]

# Train Test split

In [None]:
# train_split, val_split = train_test_split(train_data[train_data['rate'] != 4], test_size=0.15, random_state=42, 
#                                           shuffle = True, stratify=train_data[train_data['rate'] != 4]['rate'])

train_split, val_split = train_test_split(train_data, test_size=0.15, random_state=42, 
                                          shuffle = True, stratify=train_data['rate'])

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.hist(train_split['rate'], bins=10, alpha=0.5, label='Train Split')
plt.hist(val_split['rate'], bins=10, alpha=0.5, label='Validation Split')

plt.xlabel('Rate')
plt.ylabel('Frequency')
plt.legend()
plt.title('Histogram of Rates for Train and Validation Splits')

plt.show()

In [None]:
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_split['rate']), y=train_split['rate'])
weight_tensor = torch.FloatTensor(weights)
print(weights)

# Loading tokenizer from pretrained

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "cointegrated/rubert-tiny2", truncation=True, do_lower_case=True)

# Creating datasets and dataloaders

In [None]:
train_dataset = FiveDataset(train_split, tokenizer, MAX_LEN)
val_dataset = FiveDataset(val_split, tokenizer, MAX_LEN)
test_dataset = FiveDataset(test_data, tokenizer, MAX_LEN)

In [None]:
MAX_LEN = 100
BATCH_SIZE = 400

In [None]:
train_params = {"batch_size": BATCH_SIZE,
                "shuffle": True,
                "num_workers": 0
                }

test_params = {"batch_size": BATCH_SIZE,
               "shuffle": False,
               "num_workers": 0
               }

train_dataloader = DataLoader(train_dataset, **train_params)
val_dataloader = DataLoader(val_dataset, **test_params)
test_dataloader = DataLoader(test_dataset, **test_params)

# Loading pretrained model from Huggingface

In [None]:
config = {
    "num_classes": len(np.unique(train_split['rate'])),
    "dropout_rate": 0.1
}
model = ModelForClassification(
    "cointegrated/rubert-tiny2",
    config=config
)

# Creating Trainer object and fitting the model

In [None]:
trainer_config = {
    "lr": 3e-4,
    "n_epochs": 3,
    "weight_decay": 1e-6,
    "batch_size": BATCH_SIZE,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seed": 42,
}
t = Trainer(trainer_config, class_weights=weight_tensor)
# ,class_weights=weight_tensor

In [None]:
t.fit(
    model,
    train_dataloader,
    val_dataloader
)

# Save model

In [None]:
t.save("best_baseline_model.ckpt")

# Load pretrained Model

In [None]:
t = Trainer.load("best_baseline_model.ckpt")

# Get testset predictions


In [None]:
predictions = t.predict(test_dataloader)

In [None]:
predicted_classes = [np.argmax(probabilities) + 1 for probabilities in predictions]

# Create submission


In [None]:
sample_submission = pd.read_csv(os.path.join(PATH, "sample_submission.csv"))
sample_submission["rate"] = predicted_classes
# sample_submission.rate = le.inverse_transform(sample_submission.rate)
sample_submission.head()

In [None]:
sample_submission.to_csv("submission.csv", index=False)

# Train conf matrix

In [None]:
predictions_val = t.predict(val_dataloader)

In [None]:
predicted_classes_val = [np.argmax(probabilities) + 1 for probabilities in predictions_val]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_classification_metrics(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    print(f"Classification Report for {model_name}:\n", classification_report(y_true, y_pred))
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

evaluate_classification_metrics(predicted_classes_val, le.inverse_transform(val_split['rate']), "val dataset")