# Подготовим все библиотеки и пропишем пути к файлам

In [None]:
!pip install imbalanced-learn

In [None]:
!pip install nltk

In [None]:
!pip install catboost

In [None]:
nltk.download("stopwords")

In [None]:
!pip install pymorphy2

In [None]:
!pip install shap

# Все необходимые библиотеки установлены, можно преступать к обработке данных

In [4]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel

import os

from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import Pool, CatBoostClassifier, cv
from catboost.text_processing import Tokenizer

from pymorphy2 import MorphAnalyzer
import string

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [5]:
dir_ = os.getcwd()

In [6]:
dir_

'C:\\Users\\krellQ\\PycharmProjects\\SOM\\sentiment_analysis'

In [7]:
file_name = '\\data'

In [8]:
folder_path = os.path.join(dir_ + file_name)
folder_path
os.listdir(folder_path)

['neg', 'neu', 'pos']

In [9]:
ru_stopwords = stopwords.words("russian")

tokenizer = Tokenizer(lowercasing=True,
                      separator_type='ByDelimiter',
                      token_types=['Word', 'Punctuation'],
                      languages =['russian'] )

# Рассмотрим идею работы с этими данными

Функции подготовки данных для градиентного бустинга стоит вынести отдельно т.к. библиотека Catboost предполагает использование встроенных инструментов векторизации и очистки текста.

In [10]:
def tokenize_texts(texts):
    return [tokenizer.tokenize(text) for text in texts]

def remove_stop_words(texts, words):
    texts_copy = []
    words_set = set(words)
    for text in tokenize_texts(texts):
        text_copy = []
        for token in text:
            if token not in words_set:
                text_copy.append(token)
        texts_copy.append(' '.join(text_copy))
    return texts_copy

def lemmatize_text(text):
    morph = MorphAnalyzer()
    words = text.split()
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    return ' '.join(lemmatized_words)

def remove_stopwords(text):
    stop_words = ru_stopwords
    filtered_text = ''.join([word for word in text if word.lower() not in stop_words])
    return filtered_text

def quantize_emotion(emotion):
    if emotion == 'neg':
        return -1
    elif emotion == 'neu':
        return 0
    elif emotion == 'pos':
        return 1
    else:
        return None

In [11]:
def prepare_data_for_catboost_model(folder_path):
    folder_path = folder_path + "/"
    df = pd.DataFrame(columns=['text', 'sentiment'])
    for directory in os.listdir(folder_path):
        if os.path.isdir(folder_path + directory):
            files = np.array(os.listdir(folder_path + directory))
            for file in files:
                with open(os.path.join(folder_path + directory + '/', file), encoding='utf-8') as f:
                    review = f.read()
                    review = remove_stopwords(review)
                    current_df = pd.DataFrame({'text': [review], 'sentiment': directory})
                    df = df._append(current_df, ignore_index=True)
    
    
    random_under_sampler = RandomUnderSampler(random_state = 0)
    #df_review_bal, df_review_bal['sentiment'] = random_under_sampler.fit_resample(df[['text']], df['sentiment'])
    #train, test = train_test_split(df_review_bal, test_size = 0.25, random_state = 42,)

    train, test = train_test_split(df, test_size = 0.3, random_state = 0)
    
    #train['text'] = tokenize_texts(train['text'])
    #test['text'] = tokenize_texts(test['text'])
    
    train_x, train_y = train['text'], train['sentiment']
    test_x, test_y = test['text'], test['sentiment']

    X_train = train_x.to_frame('text')
    X_test = test_x.to_frame('text')
    return X_train, train_y, X_test, test_y

In [12]:
train_x, train_y, test_x, test_y = prepare_data_for_catboost_model(folder_path)

In [13]:
def train_catboost(X_train, X_test, y_train, y_test, catboost_params = {}, verbose = 100):
    text_features = ['text']
    learning_pool = Pool(
        X_train,
        y_train,
        text_features = text_features,
        feature_names = list(X_train)
    )
    test_pool = Pool(
        X_test,
        y_test,
        text_features = text_features,
        feature_names = list(X_train)
    )
    
    catboost_default_params = {
        'iterations': 1000,
        'learning_rate': 0.03,
        'eval_metric': 'TotalF1'
    }
    
    catboost_default_params.update(catboost_params)
    model = CatBoostClassifier(**catboost_default_params)
    model.fit(learning_pool, eval_set=test_pool, verbose=verbose)
    
    return model

In [None]:
my_model = train_catboost(train_x,
               test_x,
               train_y,
               test_y,
               catboost_params={
                'text_processing': [
                'NaiveBayes+Word|BoW+Word,BiGram'
        ]
    }
)

Без удаления Stopwords получили F1:                                     
bestTest 0.6598218546= 
При равном распределении выборки, но уменьшении общего объёма данных    bestTest = 0.6992715462
После удаления:                                                         bestTest 0.672708907146

Получились низкие показатели по сравнению с даже куда более простыми алгоритмами, но это связано в первую очередь с тем, что у меня не получилось
имплементировать нормально векторизацию и алгоритм bag_of_words, который планировался в реализации, но первоочередной для меня было изучить 
принцип работы catboost и понять процесс работы алгоритмов обработки текста.


In [130]:
def train(folder_path):
    train_x, train_y, test_x, test_y = prepare_data_for_catboost_model(folder_path)
    model = train_catboost(train_x,
               test_x,
               train_y,
               test_y,
               catboost_params={
                'text_processing': [
                'NaiveBayes+Word|BoW+Word,BiGram'
        ]
    }
    )
    return model

In [132]:
def test_model(model, folder_path):
    result = []
    for file in os.listdir(folder_path):
         with open(os.path.join(folder_path + '/', file), encoding='utf-8') as f:
             text = f.read()
             ans = model.predict()
             result.append(quantize_emotion(ans))
    return result

In [3]:
print(claccification_report(my_model))

NameError: name 'claccification_report' is not defined