<a href="https://www.kaggle.com/code/akscent/ft-extraction-test?scriptVersionId=150292455" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
!pip install pymorphy2 cleantext -U pip setuptools wheel nlp_profiler textblob pymystem3 > installer_log.txt
!pip install spacy > installer_log.txt
import cudf
!load_ext cudf.pandas 
import os
import sys
import torch
import json
import spacy
import io
# import ru_core_news_md
import shap
shap.initjs()
import pandas as pd
import numpy as np

from numpy import asarray
from collections import Counter
from typing import Dict
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModel, MBartTokenizer, MBartForConditionalGeneration, BertTokenizer, BertForSequenceClassification
from textblob import TextBlob
from nlp_profiler.core import apply_text_profiling
from pymystem3 import Mystem
from nltk.corpus import stopwords
from catboost import CatBoostClassifier

# sys.path.insert(1, '/kaggle/input/ods-huawei/nlp_huawei_new2_task-master/nlp_huawei_new2_task-master/baseline_transformers')
# from dataset import *
# from model import *
# from trainer import Trainer

torch.manual_seed(42)

ModuleNotFoundError: No module named 'cudf'

In [None]:
class FiveDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        if 'summary' in dataframe:
            self.text = dataframe['summary'].tolist()
        else:
            self.text = dataframe['text'].tolist()
        self.targets = None
        if 'rate' in dataframe:
            self.targets = dataframe['rate'].tolist()
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        text = str(self.text[index])
        text = ' '.join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        if self.targets is not None:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
            }

    def __len__(self) -> int:
        return len(self.text)
    

class ModelForClassification(torch.nn.Module):

    def __init__(self, model_path: str, config: Dict):
        super(ModelForClassification, self).__init__()
        self.model_name = model_path
        self.config = config
        self.n_classes = config['num_classes']
        self.dropout_rate = config['dropout_rate']
        self.bert = AutoModel.from_pretrained(self.model_name)
        self.pre_classifier = torch.nn.Linear(312, 768)
        self.dropout = torch.nn.Dropout(self.dropout_rate)
        self.classifier = torch.nn.Linear(768, self.n_classes)
        self.softmax = torch.nn.LogSoftmax(dim = 1)

    def forward(self, input_ids, attention_mask,):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_state = output[0]
        hidden_state = hidden_state[:, 0]
        hidden_state = self.pre_classifier(hidden_state)
        hidden_state = torch.nn.ReLU()(hidden_state)
        hidden_state = self.dropout(hidden_state)
        output = self.classifier(hidden_state)
        output = self.softmax(output)
        return output


class Trainer:
    def __init__(self, config: Dict, class_weights=None):
        self.config = config
        self.device = config['device']
        self.n_epochs = config['n_epochs']
        self.optimizer = None
        self.opt_fn = lambda model: AdamW(model.parameters(), config['lr'])
        self.model = None
        self.history = None
        if class_weights is not None:
            class_weights = class_weights.to(self.device)
            self.loss_fn = CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fn = CrossEntropyLoss()
        self.device = config['device']
        self.verbose = config.get('verbose', True)
        
    def save_history(self, path: str):
        history = {
            'train_loss': self.history['train_loss'],
            'val_loss': self.history['val_loss'],
            'val_acc': self.history['val_acc']
        }
        val_acc = sum(self.history['val_acc']) / len(self.history['val_acc'])
        print("All ACCURACY = ", val_acc)
        with open(path, 'w') as file:
            json.dump(history, file)
        
    def load_history(self, path: str):
        with open(path, 'r') as file:
            history = json.load(file)
        self.history = {
            'train_loss': history['train_loss'],
            'val_loss': history['val_loss'],
            'val_acc': history['val_acc']
        }

    def fit(self, model, train_dataloader, val_dataloader):
        self.model = model.to(self.device)
        self.optimizer = self.opt_fn(model)
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_acc': []
        }
        best_val_loss = float('inf')

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}")
            train_info = self.train_epoch(train_dataloader)
            val_info = self.val_epoch(val_dataloader)
            self.history['train_loss'].extend(train_info['loss'])
            self.history['val_loss'].extend([val_info['loss']])
            self.history['val_acc'].extend([val_info['acc']])

            if val_info['loss'] < best_val_loss:
                best_val_loss = val_info['loss']
                self.save_model_weights('best_model_weights.ckpt')

            self.save_history('history.json')

        return self.model.eval()

    def save_model_weights(self, path: str):
        torch.save(self.model.state_dict(), path)



    def train_epoch(self, train_dataloader):
        self.model.train()
        losses = []
        total_loss = 0
        if self.verbose:
            train_dataloader = tqdm(train_dataloader)
        for batch in train_dataloader:
            ids = batch['ids'].to(self.device, dtype=torch.long)
            mask = batch['mask'].to(self.device, dtype=torch.long)
            targets = batch['targets'].to(self.device, dtype=torch.long)

            outputs = self.model(ids, mask)
            loss = self.loss_fn(outputs, targets)
            total_loss += loss.item()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_val = loss.item()
            if self.verbose:
                train_dataloader.set_description(f"Loss={loss_val:.3}")
            losses.append(loss_val)
        avg_loss = total_loss / len(train_dataloader)
        print("AVG LOSS = ", avg_loss)
        return {'loss': losses}

    def val_epoch(self, val_dataloader):
        self.model.eval()
        all_logits = []
        all_labels = []
        if self.verbose:
            val_dataloader = tqdm(val_dataloader)
        with torch.no_grad():
            for batch in val_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                targets = batch['targets'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                all_logits.append(outputs)
                all_labels.append(targets)
        all_labels = torch.cat(all_labels).to(self.device)
        all_logits = torch.cat(all_logits).to(self.device)
        loss = self.loss_fn(all_logits, all_labels).item()
        acc = (all_logits.argmax(1) == all_labels).float().mean().item()
        print("ACCURACY for EPOCH = ", acc)
        if self.verbose:
            val_dataloader.set_description(f"Loss={loss:.3}; Acc:{acc:.3}")
        return {
            'acc': acc,
            'loss': loss
        }

    def predict(self, test_dataloader):
        if not self.model:
            raise RuntimeError("You should train the model first")
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                preds = torch.exp(outputs)
                predictions.extend(preds.tolist())
        return asarray(predictions)

    def save(self, path: str):
        if self.model is None:
            raise RuntimeError("You should train the model first")
        checkpoint = {
            "config": self.model.config,
            "trainer_config": self.config,
            "model_name": self.model.model_name,
            "model_state_dict": self.model.state_dict()
        }
        torch.save(checkpoint, path)

    def plot_history(self):
        import matplotlib.pyplot as plt
        
        if self.history is None:
            raise RuntimeError("History is not available. Train the model first.")

        train_loss = self.history['train_loss']
        val_loss = self.history['val_loss']
        val_acc = self.history['val_acc']

        epochs = range(1, len(train_loss) + 1)

        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(epochs, train_loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'r', label='Validation loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(epochs, val_acc, 'g', label='Validation accuracy')
        plt.title('Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.show()


    @classmethod
    def load(cls, path: str):
        ckpt = torch.load(path)
        keys = ["config", "trainer_config", "model_state_dict"]
        for key in keys:
            if key not in ckpt:
                raise RuntimeError(f"Missing key {key} in checkpoint")
        new_model = ModelForClassification(
            ckpt['model_name'],
            ckpt["config"]
        )
        new_model.load_state_dict(ckpt["model_state_dict"])
        new_trainer = cls(ckpt["trainer_config"])
        new_trainer.model = new_model
        new_trainer.model.to(new_trainer.device)
        return new_trainer

In [None]:
test_data = pd.read_csv('/kaggle/input/ods-huawei/test.csv')
test_data
# test_dataset = FiveDataset(test_data, tokenizer, MAX_LEN)
# test_dataloader = DataLoader(test_dataset, **test_params)

In [None]:
#pre-cleaning
import re
import pymorphy2
ru_stopwords = stopwords.words('russian')
digits = [str(i) for i in range(10)]

TOKEN_RE = re.compile(r'[а-яё!.,?%]+')
lemmatizer = pymorphy2.MorphAnalyzer()

def is_valid_word(word):
    if not word[0].isdigit() and word not in ru_stopwords:
        parsed_word = lemmatizer.normal_forms(word)[0]
        return parsed_word
    return False

def text_cleaning(text):
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s.,!?]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    words = text.split()
    cleaned_words = [word for word in words[:512] if is_valid_word(word) and len(word) < 15]
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

tqdm.pandas()
test_data['text'] = test_data['text'].progress_apply(text_cleaning)

test_data["num_words"] = test_data["text"].apply(
    lambda x: len(str(x).split()))
test_data.loc[test_data["num_words"] == 0, "text"] = "нормально"

In [None]:
def remove_infrequent_words(dataset, min_count=3):
    word_counter = Counter()
    for text in dataset:
        words = text.split()
        word_counter.update(words)
    infrequent_words = [word for word, count in word_counter.items() if count < min_count]
    def remove_infrequent(text):
        words = text.split()
        cleaned_words = [word for word in words if word not in infrequent_words]
        cleaned_text = ' '.join(cleaned_words)
        return cleaned_text
    cleaned_dataset = [remove_infrequent(text) for text in tqdm(dataset, desc="Cleaning text")]

    return cleaned_dataset

cleaned_test = remove_infrequent_words(test_data['text'].tolist())

In [None]:
test_data['cleaned_text'] = cleaned_test

In [None]:
def replace_nan_with_text(row):
    if pd.isna(row['cleaned_text']):
        return row['text']
    return row['cleaned_text']

test_data['cleaned_text'] = test_data.progress_apply(replace_nan_with_text, axis=1)

In [None]:
def truncate_text(text, max_words=512):
    words = text.split()
    if len(words) > max_words:
        truncated_text = ' '.join(words[:max_words])
    else:
        truncated_text = text
    return truncated_text

test_data['cleaned_text'] = test_data['cleaned_text'].progress_apply(truncate_text)

In [None]:
model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def summary_rows(article_text):
    input_ids = tokenizer(
        [article_text],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    output_ids = model.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4
    )[0]

    summary = tokenizer.decode(output_ids, skip_special_tokens=True)
    return summary

def text_summary(text):
    if isinstance(text, str) and text.strip() and len(str(text).split()) > 150:
        return summary_rows(text)
    else:
        return text
    

test_data['summary'] = test_data['cleaned_text'].progress_apply(text_summary)

In [None]:
test_data

In [None]:
test_data.to_csv("test_cleaned.csv", index=False)

# Load Test Data

In [3]:
train_data = pd.read_csv("/kaggle/input/ods-huawei/test_cleaned.csv")
train_data

Unnamed: 0,index,text,num_words,cleaned_text,summary
0,0,Очень хороший магазин сотрудники приятный,5,Очень хороший магазин сотрудники приятный,Очень хороший магазин сотрудники приятный
1,1,"Самый обычный продуктовый магазин. Есть сыры, ...",9,"Самый обычный продуктовый магазин. Есть сыры, ...","Самый обычный продуктовый магазин. Есть сыры, ..."
2,2,Вс комфортно,2,Вс комфортно,Вс комфортно
3,3,"Маленький филиальчик, необходимое есть. Две ка...",9,"Маленький необходимое есть. Две кассы, народу ...","Маленький необходимое есть. Две кассы, народу ..."
4,4,Плохо относятся клиентам!!!!!,3,Плохо относятся,Плохо относятся
...,...,...,...,...,...
12162,12162,Персонал вежливый . Большой ассортимент,5,Персонал вежливый . Большой ассортимент,Персонал вежливый . Большой ассортимент
12163,12163,Скидки сыры. Скидки алкоголь. Приемлимые цены ...,21,Скидки сыры. Скидки алкоголь. цены фрукты . Ос...,Скидки сыры. Скидки алкоголь. цены фрукты . Ос...
12164,12164,"Рядом домом, неплохая пятерочка, персонал хоро...",12,"Рядом домом, неплохая пятерочка, персонал хоро...","Рядом домом, неплохая пятерочка, персонал хоро..."
12165,12165,Хороший магазин дома. Кассиры приветливые. Про...,7,Хороший магазин дома. Кассиры приветливые. Про...,Хороший магазин дома. Кассиры приветливые. Про...


In [5]:
def replace_nan_with_text(row):
    if pd.isna(row['summary']) :
        return 'text'
    return row['summary']

tqdm.pandas()
train_data['summary'] = train_data.progress_apply(replace_nan_with_text, axis=1)

  0%|          | 0/12167 [00:00<?, ?it/s]

In [None]:
# тональность текста
from tqdm import tqdm

def make_pipe(text):
    return pipe(text, return_all_scores=True)

def extract_label_probs(row):
    label_probs = [label['score'] for label in row[0]]
    return label_probs

pipe = pipeline(model="seara/rubert-tiny2-russian-sentiment", device=torch.device("cuda:0"))

tqdm.pandas()
train_data['mood'] = train_data['summary'].progress_apply(make_pipe)

train_data['label_probs'] = train_data['mood'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('MOOD_')], axis=1)

del train_data['label_probs']
del train_data['mood']

In [None]:
# токичность

# tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# batch = tokenizer.encode(train_data[train_data["num_words"] > 80]['text'][48421], return_tensors='pt')
# model(batch)

pipe = pipeline(model="SkolkovoInstitute/russian_toxicity_classifier", device=torch.device("cuda:0"))

tqdm.pandas()
train_data['toxic'] = train_data['summary'].progress_apply(make_pipe)
train_data['label_probs'] = train_data['toxic'].apply(extract_label_probs)

train_data = pd.concat([train_data, train_data['label_probs'].progress_apply(pd.Series).add_prefix('TOXIC_')], axis=1)

del train_data['label_probs']
del train_data['toxic']

In [8]:
params = {
        'HIGH_LEVEL_OPTION': False,
        'GRANULAR_OPTION': True,
        'GRAMMAR_CHECK_OPTION': False,  # default: False as slow process but can Enabled
        'SPELLING_CHECK_OPTION': False,  # default: True although slightly slow process but can Disabled
        'EASE_OF_READING_CHECK_OPTION': False,
        'PARALLELISATION_METHOD_OPTION': 'DEFAULT_PARALLEL_METHOD',
    }

profiled_text_dataframe = apply_text_profiling(train_data, 'text')

final params: {'high_level': True, 'granular': True, 'grammar_check': False, 'spelling_check': True, 'parallelisation_method': 'default'}


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

  0%|                                                                                                         …

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data] Downloading package stopwords to /usr/share/nltk_

  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


  0%|                                                                                                         …

Unable to cache to disk. Possibly a race condition in the creation of the directory. Exception: cannot pickle '_hashlib.HMAC' object.


In [9]:
profiled_text_dataframe

Unnamed: 0,text,sentences_count,characters_count,spaces_count,count_words,duplicates_count,chars_excl_spaces_count,emoji_count,whole_numbers_count,alpha_numeric_count,...,noun_phase_count,sentiment_polarity_score,sentiment_polarity,sentiment_polarity_summarised,sentiment_subjectivity_score,sentiment_subjectivity,sentiment_subjectivity_summarised,spelling_quality_score,spelling_quality,spelling_quality_summarised
0,Очень хороший магазин сотрудники приятный,1,41,4,5,0,37,0,0,0,...,5,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.000000,Very bad,Bad
1,"Самый обычный продуктовый магазин. Есть сыры, ...",2,67,8,9,1,59,0,0,0,...,10,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.230769,Pretty bad,Bad
2,Вс комфортно,1,12,1,2,0,11,0,0,0,...,2,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.000000,Very bad,Bad
3,"Маленький филиальчик, необходимое есть. Две ка...",2,74,8,9,2,66,0,0,0,...,9,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.357143,Pretty bad,Bad
4,Плохо относятся клиентам!!!!!,2,29,2,3,1,27,0,0,0,...,3,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.625000,Bad,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12162,Персонал вежливый . Большой ассортимент,2,39,4,4,0,35,0,0,0,...,4,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.200000,Pretty bad,Bad
12163,Скидки сыры. Скидки алкоголь. Приемлимые цены ...,7,146,20,20,2,126,0,0,0,...,18,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.296296,Pretty bad,Bad
12164,"Рядом домом, неплохая пятерочка, персонал хоро...",2,98,11,12,1,87,0,0,0,...,12,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.333333,Pretty bad,Bad
12165,Хороший магазин дома. Кассиры приветливые. Про...,3,58,6,7,1,52,0,0,0,...,6,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.222222,Pretty bad,Bad


In [10]:
profiled_text_dataframe.to_csv("profiled_text_data_test.csv", index=False)