<a href="https://www.kaggle.com/code/akscent/feature-extraction-classifer-txt?scriptVersionId=149943051" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:

!pip install pymorphy2 cleantext -U pip setuptools wheel nlp_profiler textblob pymystem3 > installer_log.txt
!pip install spacy > installer_log.txt
import os
import sys
import torch
import json
import spacy
import io
import ru_core_news_md
import shap
shap.initjs()
import pandas as pd
import numpy as np

from numpy import asarray
from collections import Counter
from typing import Dict
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModel, MBartTokenizer, MBartForConditionalGeneration, BertTokenizer, BertForSequenceClassification
from textblob import TextBlob
from nlp_profiler.core import apply_text_profiling
from pymystem3 import Mystem
from nltk.corpus import stopwords
from catboost import CatBoostClassifier

# sys.path.insert(1, '/kaggle/input/ods-huawei/nlp_huawei_new2_task-master/nlp_huawei_new2_task-master/baseline_transformers')
# from dataset import *
# from model import *
# from trainer import Trainer

# Fix PyTorch's RNG seed so repeated runs draw the same random numbers.
torch.manual_seed(42)

In [None]:
class FiveDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column of a dataframe.

    Each item is a dict with ``ids`` and ``mask`` tensors of length
    ``max_seq_len``; when the dataframe has a ``rate`` column the item also
    carries the label under ``targets``.

    Args:
        dataframe: frame with a ``text`` column and, optionally, ``rate``.
        tokenizer: object exposing ``encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask``.
        max_seq_len: length every sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        self.text = dataframe['text'].tolist()
        # Labels are optional so the same class serves train and test data.
        self.targets = dataframe['rate'].tolist() if 'rate' in dataframe else None
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        # Collapse any run of whitespace into a single space before encoding.
        text = ' '.join(str(self.text[index]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            # `pad_to_max_length=True` is deprecated in transformers;
            # `padding='max_length'` is the documented replacement.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[index], dtype=torch.long)
        return item

    def __len__(self) -> int:
        return len(self.text)
    

class ModelForClassification(torch.nn.Module):
    """Pretrained encoder followed by a two-layer classification head.

    The head takes the hidden state of the first token, projects it to 768
    units, applies ReLU and dropout, and maps it to ``num_classes`` scores.
    ``forward`` returns log-probabilities (``LogSoftmax`` over classes).

    Args:
        model_path: name or path passed to ``AutoModel.from_pretrained``.
        config: dict with the keys ``num_classes`` and ``dropout_rate``.
    """

    def __init__(self, model_path: str, config: Dict):
        super().__init__()
        self.model_name = model_path
        self.config = config
        self.n_classes = config['num_classes']
        self.dropout_rate = config['dropout_rate']
        self.bert = AutoModel.from_pretrained(self.model_name)
        # Take the encoder width from its config instead of hard-coding 312,
        # so backbones with a different hidden size work as well.
        hidden_size = self.bert.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, 768)
        self.dropout = torch.nn.Dropout(self.dropout_rate)
        self.classifier = torch.nn.Linear(768, self.n_classes)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask,):
        """Return per-class log-probabilities for a batch of token ids."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # encoded[0] is the first encoder output; keep position 0 of each
        # sequence as the pooled representation.
        pooled = encoded[0][:, 0]
        features = torch.nn.functional.relu(self.pre_classifier(pooled))
        features = self.dropout(features)
        return self.softmax(self.classifier(features))


class Trainer:
    """Training, validation and inference loop for a classification model.

    Config keys read by this class: ``device``, ``n_epochs``, ``lr`` and the
    optional ``verbose`` flag (default ``True``), which wraps the data
    loaders in tqdm progress bars.

    Args:
        config: trainer settings (see above).
        class_weights: optional tensor of per-class weights for the loss.
    """

    def __init__(self, config: Dict, class_weights=None):
        self.config = config
        self.device = config['device']
        self.n_epochs = config['n_epochs']
        self.optimizer = None
        self.opt_fn = lambda model: AdamW(model.parameters(), config['lr'])
        self.model = None
        self.history = None
        if class_weights is not None:
            class_weights = class_weights.to(self.device)
            self.loss_fn = CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fn = CrossEntropyLoss()
        self.verbose = config.get('verbose', True)

    def save_history(self, path: str):
        """Write the training history to ``path`` as JSON.

        Also prints the mean of the recorded validation accuracies.
        """
        history = {
            'train_loss': self.history['train_loss'],
            'val_loss': self.history['val_loss'],
            'val_acc': self.history['val_acc']
        }
        recorded_acc = self.history['val_acc']
        # Nothing to average before the first validation pass.
        if recorded_acc:
            val_acc = sum(recorded_acc) / len(recorded_acc)
            print("All ACCURACY = ", val_acc)
        with open(path, 'w') as file:
            json.dump(history, file)

    def load_history(self, path: str):
        """Replace ``self.history`` with the JSON history stored at ``path``."""
        with open(path, 'r') as file:
            history = json.load(file)
        self.history = {
            'train_loss': history['train_loss'],
            'val_loss': history['val_loss'],
            'val_acc': history['val_acc']
        }

    def fit(self, model, train_dataloader, val_dataloader):
        """Train ``model`` for ``n_epochs`` and return it in eval mode.

        After every epoch the history is written to ``history.json``; the
        weights are written to ``best_model_weights.ckpt`` whenever the
        validation loss improves. The returned model holds the weights of
        the last epoch, not necessarily the best ones.
        """
        self.model = model.to(self.device)
        self.optimizer = self.opt_fn(self.model)
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_acc': []
        }
        best_val_loss = float('inf')

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}")
            train_info = self.train_epoch(train_dataloader)
            val_info = self.val_epoch(val_dataloader)
            # train_loss grows by one value per batch, the validation
            # metrics by one value per epoch.
            self.history['train_loss'].extend(train_info['loss'])
            self.history['val_loss'].append(val_info['loss'])
            self.history['val_acc'].append(val_info['acc'])

            if val_info['loss'] < best_val_loss:
                best_val_loss = val_info['loss']
                self.save_model_weights('best_model_weights.ckpt')

            self.save_history('history.json')

        return self.model.eval()

    def save_model_weights(self, path: str):
        """Save only the model ``state_dict`` to ``path``."""
        torch.save(self.model.state_dict(), path)

    def train_epoch(self, train_dataloader):
        """Run one optimisation pass; return ``{'loss': [per-batch losses]}``."""
        self.model.train()
        losses = []
        if self.verbose:
            train_dataloader = tqdm(train_dataloader)
        for batch in train_dataloader:
            ids = batch['ids'].to(self.device, dtype=torch.long)
            mask = batch['mask'].to(self.device, dtype=torch.long)
            targets = batch['targets'].to(self.device, dtype=torch.long)

            outputs = self.model(ids, mask)
            loss = self.loss_fn(outputs, targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_val = loss.item()
            if self.verbose:
                train_dataloader.set_description(f"Loss={loss_val:.3}")
            losses.append(loss_val)
        # Average over the batches actually seen; an empty loader no longer
        # divides by zero.
        if losses:
            avg_loss = sum(losses) / len(losses)
            print("AVG LOSS = ", avg_loss)
        return {'loss': losses}

    def val_epoch(self, val_dataloader):
        """Evaluate on ``val_dataloader``; return ``{'acc': ..., 'loss': ...}``.

        Loss and accuracy are computed once over the concatenated outputs of
        all batches.
        """
        self.model.eval()
        all_logits = []
        all_labels = []
        if self.verbose:
            val_dataloader = tqdm(val_dataloader)
        with torch.no_grad():
            for batch in val_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                targets = batch['targets'].to(self.device, dtype=torch.long)
                all_logits.append(self.model(ids, mask))
                all_labels.append(targets)
        all_labels = torch.cat(all_labels).to(self.device)
        all_logits = torch.cat(all_logits).to(self.device)
        loss = self.loss_fn(all_logits, all_labels).item()
        acc = (all_logits.argmax(1) == all_labels).float().mean().item()
        print("ACCURACY for EPOCH = ", acc)
        if self.verbose:
            val_dataloader.set_description(f"Loss={loss:.3}; Acc:{acc:.3}")
        return {
            'acc': acc,
            'loss': loss
        }

    def predict(self, test_dataloader):
        """Return an array with ``exp(model output)`` for every sample.

        Raises:
            RuntimeError: if no model has been trained or loaded.
        """
        if self.model is None:
            raise RuntimeError("You should train the model first")
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                # Exponentiate the model output before collecting it.
                predictions.extend(torch.exp(outputs).tolist())
        return asarray(predictions)

    def save(self, path: str):
        """Save model config, trainer config, model name and weights.

        Raises:
            RuntimeError: if no model has been trained or loaded.
        """
        if self.model is None:
            raise RuntimeError("You should train the model first")
        checkpoint = {
            "config": self.model.config,
            "trainer_config": self.config,
            "model_name": self.model.model_name,
            "model_state_dict": self.model.state_dict()
        }
        torch.save(checkpoint, path)

    @staticmethod
    def _step_positions(n_steps: int, n_epochs: int):
        """Spread ``n_steps`` batch indices evenly over ``n_epochs`` epochs."""
        if n_steps <= 0:
            return []
        return [(step + 1) * n_epochs / n_steps for step in range(n_steps)]

    def plot_history(self):
        """Plot training / validation loss and validation accuracy.

        Raises:
            RuntimeError: if there is no history to plot.
        """
        import matplotlib.pyplot as plt

        if self.history is None:
            raise RuntimeError("History is not available. Train the model first.")

        train_loss = self.history['train_loss']
        val_loss = self.history['val_loss']
        val_acc = self.history['val_acc']

        # Validation metrics are recorded once per epoch while train_loss
        # holds one value per batch, so the two series need different
        # x coordinates: batches are placed at fractional epoch positions.
        epochs = range(1, len(val_loss) + 1)
        train_x = self._step_positions(len(train_loss), len(val_loss))

        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(train_x, train_loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'r', label='Validation loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(epochs, val_acc, 'g', label='Validation accuracy')
        plt.title('Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.show()

    @classmethod
    def load(cls, path: str):
        """Rebuild a trainer and its model from a checkpoint made by ``save``.

        Raises:
            RuntimeError: if the checkpoint lacks a required key.
        """
        # Load onto the CPU first so a checkpoint written on a GPU machine
        # can be opened anywhere; the model is moved to the configured
        # device below.
        ckpt = torch.load(path, map_location='cpu')
        keys = ["config", "trainer_config", "model_name", "model_state_dict"]
        for key in keys:
            if key not in ckpt:
                raise RuntimeError(f"Missing key {key} in checkpoint")
        new_model = ModelForClassification(
            ckpt['model_name'],
            ckpt["config"]
        )
        new_model.load_state_dict(ckpt["model_state_dict"])
        new_trainer = cls(ckpt["trainer_config"])
        new_trainer.model = new_model
        new_trainer.model.to(new_trainer.device)
        return new_trainer

# Data load

In [None]:
# Read the competition CSVs and turn the `rate` labels into integer codes.
PATH = "/kaggle/input/ods-huawei/"
train_data, test_data = (
    pd.read_csv(os.path.join(PATH, file_name))
    for file_name in ("train.csv", "test.csv")
)
le = LabelEncoder()
train_data['rate'] = le.fit_transform(train_data['rate'])
train_data.head()

# Pre-clean text

In [None]:
# import re
# TOKEN_RE = re.compile(r'[а-яё]+')

# def tokenize_text(text, min_length_token=1):
#     text = text.lower()
#     tokens = TOKEN_RE.findall(text)
#     return [token for token in tokens if len(token) >= min_length_token]

# def text_cleaning(text):
#     tokens = tokenize_text(text)
#     return ' '.join(tokens)

# tqdm.pandas()
# train_data['text'] = train_data['text'].progress_apply(text_cleaning)
# test_data['text'] = test_data['text'].progress_apply(text_cleaning)


# `re` and `pymorphy2` are used by this cell but were never imported.
import re

import pymorphy2

# Stop words are only used for membership tests, so keep them in a set.
ru_stopwords = frozenset(stopwords.words('russian'))
digits = [str(i) for i in range(10)]

TOKEN_RE = re.compile(r'[а-яё!.,?%]+')
lemmatizer = pymorphy2.MorphAnalyzer()

def is_valid_word(word):
    """Return the lemma of ``word`` if it should be kept, otherwise ``False``.

    A word is rejected when it is empty, starts with a digit, or is a
    Russian stop word. The stop-word test lower-cases the word first so
    capitalised stop words (e.g. at the start of a sentence) are rejected
    too.
    """
    if not word or word[0].isdigit() or word.lower() in ru_stopwords:
        return False
    return lemmatizer.normal_forms(word)[0]

def text_cleaning(text):
    """Strip unwanted characters and words from ``text``.

    Keeps Latin and Cyrillic letters (including ``ё``/``Ё``, which the
    ``а-я`` range does not cover), digits, whitespace and ``. , ! ?``.
    Only the first 512 whitespace-separated words are considered; a word is
    kept when it is shorter than 15 characters and ``is_valid_word`` accepts
    it. Non-string input (for example a NaN cell) yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9\s.,!?]', '', text)
    # str.split() already collapses whitespace runs and trims the ends.
    words = text.split()[:512]
    # Cheap length check first, so the lemmatizer only sees candidates.
    cleaned_words = [word for word in words if len(word) < 15 and is_valid_word(word)]
    return ' '.join(cleaned_words)

# Clean both splits in place and record how many words survive in each row.
tqdm.pandas()
for frame in (train_data, test_data):
    frame['text'] = frame['text'].progress_apply(text_cleaning)
    frame["num_words"] = frame["text"].apply(lambda x: len(str(x).split()))

In [None]:
# train_data.to_csv("cleaned_train.csv", index=False)
# test_data.to_csv("cleaned_test.csv", index=False)

In [None]:
# Drop rows whose text has no words left after cleaning.
train_data = train_data.loc[train_data['num_words'].ne(0)]
test_data = test_data.loc[test_data['num_words'].ne(0)]

In [None]:

def remove_infrequent_words(dataset, min_count=3):
    """Drop words that occur fewer than ``min_count`` times in ``dataset``.

    Args:
        dataset: iterable of strings; it is iterated twice.
        min_count: minimum number of occurrences (over the whole dataset)
            a word needs in order to be kept.

    Returns:
        A list with one cleaned string per input text.
    """
    word_counter = Counter()
    for text in dataset:
        word_counter.update(text.split())
    # A set gives O(1) membership tests; the original list made every
    # lookup linear in the number of rare words.
    infrequent_words = {word for word, count in word_counter.items() if count < min_count}

    def remove_infrequent(text):
        return ' '.join(word for word in text.split() if word not in infrequent_words)

    return [remove_infrequent(text) for text in tqdm(dataset, desc="Cleaning text")]

# Each call builds its own frequency table, so the test texts are filtered by
# word counts taken from the test set itself, not from the training set.
cleaned_train = remove_infrequent_words(train_data['text'].tolist())
cleaned_test = remove_infrequent_words(test_data['text'].tolist())


In [None]:
# Attach the filtered texts to each split and write the split to disk.
for frame, texts, csv_name in (
    (train_data, cleaned_train, "cleaned_train.csv"),
    (test_data, cleaned_test, "cleaned_test.csv"),
):
    frame['cleaned_text'] = texts
    frame.to_csv(csv_name, index=False)

# New data

In [None]:
# Reload the cleaned splits and drop rows whose text has no words.
PATH = "/kaggle/input/cleaned-text"
train_data = pd.read_csv(os.path.join(PATH, "cleaned_train (1).csv")).loc[
    lambda frame: frame['num_words'].ne(0)
]
test_data = pd.read_csv(os.path.join(PATH, "cleaned_test (1).csv")).loc[
    lambda frame: frame['num_words'].ne(0)
]
train_data.head()

In [None]:
# Fall back to the unfiltered text where the filtered text is missing.

def replace_nan_with_text(row):
    """Return ``row['cleaned_text']``, or ``row['text']`` when it is NaN/None."""
    cleaned = row['cleaned_text']
    return row['text'] if pd.isna(cleaned) else cleaned

# progress_apply only exists after tqdm.pandas() has run; register it here so
# the notebook can be resumed from the "New data" section.
tqdm.pandas()
train_data['cleaned_text'] = train_data.progress_apply(replace_nan_with_text, axis=1)
test_data['cleaned_text'] = test_data.progress_apply(replace_nan_with_text, axis=1)

In [None]:
def truncate_text(text, max_words=512):
    """Limit ``text`` to its first ``max_words`` whitespace-separated words.

    Texts that are already short enough are returned unchanged (original
    spacing included); longer ones are re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) <= max_words:
        return text
    return ' '.join(tokens[:max_words])

# Cap the filtered text of both splits at the default word limit.
tqdm.pandas()
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame['cleaned_text'].progress_apply(truncate_text)

In [None]:
# Idea: condense long texts into a shorter summary.

model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def summary_rows(article_text):
    """Generate a summary of ``article_text`` with the mBART model above.

    The text is tokenized to exactly 512 positions (padded or truncated)
    and the first generated sequence is decoded without special tokens.
    """
    encoded = tokenizer(
        [article_text],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    generated = model.generate(
        input_ids=encoded["input_ids"],
        no_repeat_ngram_size=4
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def text_summary(text):
    """Summarise ``text`` when it is a string of more than 150 words.

    Anything else (short strings, empty strings, non-strings) is returned
    unchanged.
    """
    # More than 150 words already implies a non-blank string.
    if not isinstance(text, str) or len(text.split()) <= 150:
        return text
    return summary_rows(text)
    

# Store a summary (or the unchanged short text) next to each cleaned text.
for frame in (train_data, test_data):
    frame['summary'] = frame['cleaned_text'].progress_apply(text_summary)

In [None]:
# Show the frame as the cell output.
train_data

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("apanc/russian-sensitive-topics")
tokenizer = AutoTokenizer.from_pretrained("apanc/russian-sensitive-topics")
# Use the GPU when there is one, otherwise run on the CPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

def make_pipe(text):
    """Run the current global ``pipe`` on ``text`` and return all class scores.

    Truncation is requested per call: setting ``padding`` / ``truncation`` /
    ``max_length`` attributes on the tokenizer object has no effect, so
    over-long inputs were previously passed to the model untruncated.
    """
    return pipe(text, return_all_scores=True, truncation=True, max_length=512)

tqdm.pandas()
train_data['theme_labels'] = train_data['summary'].progress_apply(make_pipe)


In [None]:
def extract_label_probs(row):
    """Collect the ``score`` of every entry in ``row[0]``, in order."""
    scores = []
    for entry in row[0]:
        scores.append(entry['score'])
    return scores

# Turn the per-row score lists into LABEL_0, LABEL_1, ... columns. One
# DataFrame constructor call replaces the row-by-row apply(pd.Series),
# producing the same columns on the same index far faster.
label_probs = train_data['theme_labels'].apply(extract_label_probs)
label_frame = pd.DataFrame(label_probs.tolist(), index=label_probs.index).add_prefix('LABEL_')

train_data = pd.concat([train_data, label_frame], axis=1)

del train_data['theme_labels']

In [None]:
# Checkpoint the feature table built so far to disk.
train_data.to_csv("feature_train.csv", index=False)

In [None]:
# # добавление переменных о чувствах
# from transformers import BertTokenizer, BertForSequenceClassification
# model_name = 'Skoltech/russian-sensitive-topics'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name);

# tokenized = tokenizer.batch_encode_plus(train_data[train_data["num_words"] > 80]['text'][370],
#                                         max_length = 512,
#                                         pad_to_max_length=True,
#                                         truncation=True,
#                                         return_token_type_ids=False)

# tokens_ids,mask = torch.tensor(tokenized['input_ids']),torch.tensor(tokenized['attention_mask']) 

# with torch.no_grad():
#     model_output = model(tokens_ids,mask)

# def adjust_multilabel(y, is_pred = False):
#     y_adjusted = []
#     for y_c in y:
#         y_test_curr = [0]*19
#         index = str(int(np.argmax(y_c)))
#         y_c = target_vaiables_id2topic_dict[index]
#     return y_c

# model_output

In [None]:
# Text sentiment.
# Use the GPU when there is one, otherwise run on the CPU.
pipe = pipeline(
    model="seara/rubert-tiny2-russian-sentiment",
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

tqdm.pandas()
train_data['mood'] = train_data['summary'].progress_apply(make_pipe)

In [None]:
# Turn the per-row score lists into MOOD_0, MOOD_1, ... columns with one
# DataFrame constructor call instead of a row-by-row apply(pd.Series).
label_probs = train_data['mood'].apply(extract_label_probs)
mood_frame = pd.DataFrame(label_probs.tolist(), index=label_probs.index).add_prefix('MOOD_')

train_data = pd.concat([train_data, mood_frame], axis=1)

del train_data['mood']

In [None]:
# Checkpoint the feature table to disk again (same file as the earlier save).
train_data.to_csv("feature_train.csv", index=False)

In [None]:
# Toxicity

# tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# batch = tokenizer.encode(train_data[train_data["num_words"] > 80]['text'][48421], return_tensors='pt')
# model(batch)

# Use the GPU when there is one, otherwise run on the CPU.
pipe = pipeline(
    model="SkolkovoInstitute/russian_toxicity_classifier",
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

tqdm.pandas()
train_data['toxic'] = train_data['summary'].progress_apply(make_pipe)

In [None]:
# Turn the per-row score lists into TOXIC_0, TOXIC_1, ... columns with one
# DataFrame constructor call instead of a row-by-row apply(pd.Series).
label_probs = train_data['toxic'].apply(extract_label_probs)
toxic_frame = pd.DataFrame(label_probs.tolist(), index=label_probs.index).add_prefix('TOXIC_')

train_data = pd.concat([train_data, toxic_frame], axis=1)

del train_data['toxic']

In [None]:
# Emotions

# The functions below index LABELS by the model's output position.
# NOTE(review): confirm this order matches the checkpoint's id2label mapping.
LABELS = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']
tokenizer = AutoTokenizer.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')
model = BertForSequenceClassification.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')

@torch.no_grad()
def predict_emotion(text: str) -> str:
    """Return the name of the most probable emotion for ``text``.

    The text is tokenized (truncated to 512 tokens), run through the
    module-level ``model``, and the highest-scoring class of the first
    sequence is looked up in ``LABELS``.
    """
    encoded = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    logits = model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    best_index = torch.argmax(probabilities, dim=1).numpy()[0]
    return LABELS[best_index]

@torch.no_grad()
def predict_emotions(text: str) -> dict:
    """Return a mapping from each emotion in ``LABELS`` to its probability.

    The text is tokenized (truncated to 512 tokens), run through the
    module-level ``model``, and the softmax over the first sequence's logits
    is paired with ``LABELS`` by position.

    :param text: The text to classify
    :return: dict of emotion name -> probability
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    # Convert the first row to plain floats once instead of on every
    # loop iteration.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)[0].tolist()
    return dict(zip(LABELS, probabilities))

# NOTE: the emotion dicts are stored under the column name 'toxic'; the
# following cells read this column and then delete it.
train_data['toxic'] = train_data['summary'].progress_apply(predict_emotions)

In [None]:
# Inspect the first row by position; label-based [0] raises KeyError when
# the row with index label 0 was filtered out earlier.
train_data['toxic'].iloc[0]

In [None]:
def extract_label_probs(row):
    """List the value stored in ``row`` for each name in ``LABELS``.

    Names missing from ``row`` contribute ``0.0``.
    """
    probs = []
    for label in LABELS:
        probs.append(row.get(label, 0.0))
    return probs

# Turn the per-row probability lists into EMOTION_0, EMOTION_1, ... columns
# with one DataFrame constructor call instead of a row-by-row
# apply(pd.Series).
label_probs = train_data['toxic'].apply(extract_label_probs)
emotion_frame = pd.DataFrame(label_probs.tolist(), index=label_probs.index).add_prefix('EMOTION_')

train_data = pd.concat([train_data, emotion_frame], axis=1)

del train_data['toxic']

In [None]:
# Checkpoint the feature table to disk again (same file as the earlier saves).
train_data.to_csv("feature_train.csv", index=False)

# ______________________________________

In [None]:
# Reload the feature table from the input dataset directory.
PATH = "/kaggle/input/ods-huawei/"
train_data = pd.read_csv(os.path.join(PATH, "feature_train.csv"))

In [None]:
# Word count of every summary.
train_data["num_words_sum"] = [
    len(str(summary_text).split()) for summary_text in train_data["summary"]
]

In [None]:
# Length of the longest summary, in words.
train_data["num_words_sum"].max()

In [None]:
# ! textblob - обработка текста, генерация фич https://textblob.readthedocs.io/en/dev/quickstart.html - ничего интересного
# ! еще одна библиотека для классификации текстов https://small-text.readthedocs.io/en/latest/ не подходит? для малкеньких текстов
# ! полярность слов https://polyglot.readthedocs.io/en/latest/ - тоже? что уже получил из предобученных моделей
# ! обработка фич https://github.com/jbesomi/texthero - плохо поддерживается
# фичегенерация https://github.com/neomatrix369/nlp_profiler#Notebooks
# классификация на других предобученных моделях, перечисленных у Алерона https://github.com/a-milenkin/Competitive_Data_Science/blob/main/notebooks/9.2.1%20-%20Text_Embeddings.ipynb
# использовать эти ноутбуки для классификации https://github.com/e0xextazy/vkcup2022-first-stage/blob/main/inference.ipynb

# Small Text Classifier

In [None]:
# !pip install small_text > installer_log.txt

In [None]:
# from transformers import AutoTokenizer
# from small_text import TransformersDataset

# num_classes = np.unique(train_data['rate']).shape[0]
# target_labels = np.arange(num_classes)

# train_split, val_split = train_test_split(train_data, test_size=0.15, random_state=42, 
#                                           shuffle = True, stratify=train_data['rate'])
# train_split = train_split.reset_index()
# val_split = val_split.reset_index()

# transformer_model_name = 'ai-forever/ruBert-base'

# tokenizer = AutoTokenizer.from_pretrained(
#     transformer_model_name
# )

# train = TransformersDataset.from_arrays(train_split['text'],
#                                         train_split['rate'],
#                                         tokenizer,
#                                         max_length=150,
#                                         target_labels=target_labels)
# test = TransformersDataset.from_arrays(val_split['text'],
#                                        val_split['rate'],
#                                        tokenizer,
#                                        max_length=150,
#                                        target_labels=target_labels)

# Active Learning

In [None]:

# sentence_transformer_model_name = 'cointegrated/rubert-tiny2'

# from small_text import (
#     PoolBasedActiveLearner,
#     PredictionEntropy,
#     TransformerBasedClassificationFactory,
#     TransformerModelArguments,
#     random_initialization_balanced
# )


# def initialize_active_learner(active_learner, y_train):

#     indices_initial = random_initialization_balanced(y_train, n_samples=100)
#     active_learner.initialize_data(indices_initial, y_train[indices_initial])

#     return indices_initial

# transformer_model = TransformerModelArguments(transformer_model_name)
# clf_factory = TransformerBasedClassificationFactory(transformer_model, 
#                                                     num_classes, 
#                                                     kwargs=dict({'device': 'cuda', 
#                                                                  'mini_batch_size': 32,
#                                                                  'class_weight': 'balanced'
#                                                                 }))
# query_strategy = PredictionEntropy()
# active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)
# indices_labeled = initialize_active_learner(active_learner, train.y)

# Active Learning Loop

In [None]:
# from sklearn.metrics import accuracy_score
# from small_text import KappaAverage


# num_queries = 20
# stopping_criterion = KappaAverage(num_classes, kappa=0.99)


# def evaluate(active_learner, train, test):
#     y_pred = active_learner.classifier.predict(train)
#     y_pred_test = active_learner.classifier.predict(test)
#     train_acc = accuracy_score(y_pred, train.y)

#     print('Train accuracy: {:.2f}'.format(train_acc))
#     print('Test accuracy: {:.2f}'.format(accuracy_score(y_pred_test, test.y)))
    
#     return train_acc


# results = []
# stopping_history = []

# results.append(evaluate(active_learner, train[indices_labeled], test))
# stopping_history.append(stopping_criterion.stop(predictions=active_learner.classifier.predict(train)))


# for i in range(num_queries):
#     indices_queried = active_learner.query(num_samples=50)
#     y = train.y[indices_queried]
#     active_learner.update(y)
#     indices_labeled = np.concatenate([indices_queried, indices_labeled])
    
#     print('---------------')
#     print(f'Iteration #{i} ({len(indices_labeled)} samples)')
#     results.append(evaluate(active_learner, train[indices_labeled], test))
    
#     stopping_criterion_response = stopping_criterion.stop(predictions=active_learner.classifier.predict(train))
#     print(f'Stop: {stopping_criterion_response}')
#     stopping_history.append(stopping_criterion_response)

In [None]:
# %matplotlib inline
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig = plt.figure(figsize=(12, 8))
# ax = plt.axes()

# data = np.vstack((np.arange(num_queries+1), np.array(results)))
# sns.lineplot(x=0, y=1, data=data)

# plt.xlabel('number of queries', labelpad=15)
# plt.ylabel('train accuracy', labelpad=25)

# earliest_stopping_response = np.amin([i for i, _ in enumerate(stopping_history) if stopping_history[i] is True])
# plt.axvline(x=earliest_stopping_response, ymin=0, ymax=1, color='purple', ls='--')

# sns.despine()

In [None]:
# data = TransformersDataset.from_arrays(train['summary'],
#                                         train['rate'],
#                                         tokenizer,
#                                         max_length=150,
#                                         target_labels=target_labels)
# preds = active_learner.classifier.predict(data)


# NLP profiler

In [None]:
# !pip uninstall typing      # this can cause issues on Kaggle hence removing it helps
# !pip install -U nlp_profiler

In [None]:
# Run nlp_profiler's apply_text_profiling over the `text` column. The result
# is kept in its own variable and is not merged into train_data here.
profiled_text_dataframe = apply_text_profiling(train_data, 'text')

# SpaCy

In [None]:
# !pip install -U pip setuptools wheel > installer_log.text
# !pip install spacy > installer_log.text

In [None]:
# This cell rebinds the name `stopwords` from the nltk corpus reader to a
# plain list. Importing the reader under an alias keeps the cell re-runnable:
# on a second run `stopwords` is already a list and has no `.words`.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('russian')
print (stopwords)


In [None]:
tqdm.pandas()
# Set lookup instead of scanning the stop-word list for every word.
stopword_set = set(stopwords)
## Number of unique words in the text ##
train_data["num_unique_words"] = train_data["text"].progress_apply(lambda x: len(set(str(x).split())))
# test_data["num_unique_words"] = test["text"].apply(lambda x: len(set(str(x).split())))
## Number of stopwords in the text ##
train_data["num_stopwords"] = train_data["text"].progress_apply(
    lambda x: sum(w in stopword_set for w in str(x).lower().split()))
# test_data["num_stopwords"] = test_data["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))

In [None]:
# Part-of-speech features

class TextPOSAnalysis:
    """Part-of-speech helpers built on spaCy (Russian model) and Mystem.

    ``analyze_text`` / ``analyze_texts`` count adjectives, adverbs, nouns
    and verbs using the spaCy pipeline; ``get_universal_tag`` / ``add_tag``
    tag a single word with Mystem and map the tag through the table
    returned by ``load_pos_table``.
    """

    def __init__(self):
        self.nlp_ru = ru_core_news_md.load()
        self.df_pos = self.load_pos_table()
        self.m = Mystem()

    def load_pos_table(self):
        """Return the tag mapping as a frame with columns ``token`` and ``universal_pos``."""
        table = """
        A       ADJ
        ADV     ADV
        ADVPRO  ADV
        ANUM    ADJ
        APRO    DET
        COM     ADJ
        CONJ    SCONJ
        INTJ    INTJ
        NONLEX  X
        NUM     NUM
        PART    PART
        PR      ADP
        S       NOUN
        SPRO    PRON
        UNKN    X
        V       VERB
        """
        table_file = io.StringIO(table)
        # Raw string: "\s" in a normal string literal is an invalid escape.
        df = pd.read_csv(table_file, sep=r"\s+", header=None, names=["token", "universal_pos"])
        return df

    def get_universal_tag(self, word):
        """Return ``'<lemma>_<tag>'`` for ``word`` using the first Mystem analysis.

        When Mystem returns no analysis for the token, the lower-cased word
        itself is tagged ``UNKN`` instead of raising.
        """
        analyses = self.m.analyze(word)
        parses = analyses[0].get("analysis") if analyses else None
        if not parses:
            return word.lower().strip() + '_UNKN'
        best = parses[0]
        lemma = best["lex"].lower().strip()
        # The tag is the part of the grammar string before the first ','
        # and before any '='.
        pos = best["gr"].split(',')[0].split('=')[0].strip()
        return lemma + '_' + pos

    def add_tag(self, word):
        """Return ``'<lemma>_<tag>'`` with the tag mapped through the table.

        Tags that are not listed in the table are kept as they are.
        """
        # Split on the last underscore so a lemma containing '_' stays whole.
        lemma, _, tag = self.get_universal_tag(word).rpartition('_')
        mapped = self.df_pos.loc[self.df_pos['token'] == tag, 'universal_pos']
        if not mapped.empty:
            tag = mapped.values[0]
        return lemma + '_' + tag

    def analyze_text(self, text):
        """Return ``(num_adj, num_noun, num_verb, num_adv)`` for ``text``."""
        # One pass over the document instead of four.
        counts = Counter(tok.pos_ for tok in self.nlp_ru(text))
        return counts['ADJ'], counts['NOUN'], counts['VERB'], counts['ADV']

    def analyze_texts(self, texts):
        """Return a frame with the columns Num_ADJ, Num_ADV, Num_NOUN, Num_VERB.

        The row order follows ``texts``; when ``texts`` is a pandas Series
        its index is reused so the result lines up with the source frame.
        """
        results = [self.analyze_text(text) for text in texts]
        index = texts.index if isinstance(texts, pd.Series) else None
        # Label the values in the order analyze_text returns them; the
        # previous labels swapped the ADV, NOUN and VERB counts.
        frame = pd.DataFrame(
            results,
            columns=["Num_ADJ", "Num_NOUN", "Num_VERB", "Num_ADV"],
            index=index,
        )
        # Keep the column order callers have always received.
        return frame[["Num_ADJ", "Num_ADV", "Num_NOUN", "Num_VERB"]]


# Count POS tags for every training text and append the count columns.
# NOTE(review): concat(axis=1) aligns rows on the index - this assumes
# train_data and POS_results share the same index; confirm.
text_POS = TextPOSAnalysis()
POS_results = text_POS.analyze_texts(train_data['text'])
train_data = pd.concat([train_data, POS_results], axis=1)


In [None]:
# Save the seven columns at positions 410..416 to a separate file.
# NOTE(review): presumably these are the features added in this section; the
# positional slice depends on the exact column layout - confirm.
train_data.iloc[:, 410:417].to_csv("add_feature_train.csv", index=False)

# ______________________________________

In [None]:
# Load the three saved feature tables from the input dataset directory.
PATH = "/kaggle/input/ods-huawei/"
train_data = pd.read_csv(os.path.join(PATH, "feature_train.csv"))
add_feature = pd.read_csv(os.path.join(PATH, "add_feature_train.csv"))
profiler_add_feature = pd.read_csv(os.path.join(PATH, "feature_profiler_train.csv"))

In [None]:
# Join the three tables column-wise; rows are aligned on the index.
# NOTE(review): assumes all three files have the same rows in the same order.
train_data = pd.concat([train_data, add_feature, profiler_add_feature], axis=1)

# Feature Selection

# Shap

In [None]:
# Stratified 85/15 split, then cut out the feature columns for each part.
train_split, val_split = train_test_split(
    train_data,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=train_data['rate'],
)

# Columns inside the feature slice that must not reach the classifier.
del_columns = [
    'text',
    'sentiment_polarity',
    'sentiment_polarity_summarised',
    'sentiment_subjectivity',
    'sentiment_subjectivity_summarised',
    'spelling_quality',
    'spelling_quality_summarised',
]

train_X = train_split.iloc[:, 5:441].drop(columns=del_columns)
train_Y = train_split['rate']

val_X = val_split.iloc[:, 5:441].drop(columns=del_columns)
val_Y = val_split['rate']

In [None]:
# Fit a CatBoost classifier on the feature table, validating on the hold-out.
model_params = {
    'random_seed': 9,
    'thread_count': -1,
    'use_best_model': True,
    'bootstrap_type': 'Bernoulli',
}
fit_params = {
    'eval_set': (val_X, val_Y),
    'verbose': 100,
    'plot': True,
    'early_stopping_rounds': 1000,
}

clf = CatBoostClassifier(**model_params)
clf.fit(train_X, train_Y, **fit_params)

print(clf.get_best_score())

In [None]:
# First 100 rows of the prettified feature-importance table.
fi = clf.get_feature_importance(prettified=True).iloc[:100]

In [None]:
# explainer = shap.TreeExplainer(clf)

# val_dataset = Pool(data=val_split.iloc[:, 5:417], label=val_split['rate'])
# shap_values = explainer.shap_values(val_dataset)
# shap.summary_plot(shap_values, val_split.iloc[:, 5:417], max_display = 50, plot_size = (15, 5))

In [None]:
# for i in range(len(np.unique(val_split['rate'].values))):
#     print(f'Class {i}')
#     shap.summary_plot(shap_values[i], val_split.iloc[:, 5:417], max_display = 50, color_bar=True, plot_size = (15, 5))

In [None]:
# Restrict both feature matrices to the features listed in `fi`.
top_features = fi['Feature Id'].to_list()
train_X = train_X[top_features]
val_X = val_X[top_features]

# Рекурсивный feature_selection Catboost

In [None]:
# CatBoost feature selection: consider the features at positions 0-99 and
# keep 50 of them, using a single step; no final model is trained.
# NOTE(review): '0-99' assumes train_X has exactly the 100 columns picked in
# the previous cell.
summary = clf.select_features(train_X, train_Y, 
                      eval_set=(val_X, val_Y),
                      features_for_select='0-99',
                      num_features_to_select=50,
                      steps=1,
                      train_final_model=False,
                      logging_level='Silent')

# Save new_train

In [None]:
# Add the raw-text columns to the selection. Guarded so re-running the cell
# does not append the same names twice.
for extra_column in ('text', 'summary'):
    if extra_column not in summary['selected_features_names']:
        summary['selected_features_names'].append(extra_column)

In [None]:
# Keep only the columns named in the selection and save them.
# NOTE(review): 'rate' is only included if it was picked as a feature -
# confirm the target is joined back elsewhere.
new_train = train_data[summary['selected_features_names']]
new_train.to_csv("new_train.csv", index=False)