In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
from nltk.corpus import stopwords
import pymorphy2
import re
from string import punctuation


class WordEncoder:
    ru_stopwords = set(stopwords.words("russian"))
    morph = pymorphy2.MorphAnalyzer()
    
    @staticmethod
    def fit_transform(sentences: list):
        one_hot_words_columns = set()
        for sentence in tqdm(sentences):
            sentence = WordEncoder.preprocess_sentence(sentence)
            for word in sentence:
                parse_word = WordEncoder.morph.parse(word)
                normal_form = parse_word[0].normal_form
                if not ((normal_form in WordEncoder.ru_stopwords) or normal_form.isnumeric() or WordEncoder.is_name(parse_word) or any(x.isdigit() for x in normal_form)):
                    one_hot_words_columns.add(normal_form)
        one_hot_words_columns = list(one_hot_words_columns)
        data = [[None] * len(one_hot_words_columns) for _ in range(len(sentences))]
        for i in tqdm(range(len(data))):
            sentence = WordEncoder.preprocess_sentence(sentences[i])
            for j in range(len(data[i])):
                data[i][j] = 1 if one_hot_words_columns[j] in sentence else 0
        return pd.DataFrame(data, columns=one_hot_words_columns)
    
    @staticmethod
    def remove_symbols_from_text(text: str, symbols: str) -> str:
        return "".join([ch for ch in text if ch not in symbols])
    
    @staticmethod
    def preprocess_sentence(sentence: str) -> list[str]:
        threshold = 0.5
        sentence = sentence.lower()
        spec_chars = punctuation + '\n\t…—«»'
        sentence = WordEncoder.remove_symbols_from_text(sentence, spec_chars)
        words = sentence.split()
        words = list(filter(lambda word: not re.match(r'[a-z]+', word), words)) # remove english words
        return words
    
    @staticmethod
    def is_name(parse_word, threshold_prob = 0.5) -> bool:
        for p in parse_word:
            if 'Name' in p.tag and p.score >= threshold_prob:
                return True
        return False

In [3]:
data = pd.read_csv('../static/datasets/modified/bin_classification/train_data.csv', sep=',')

In [4]:
encoder = WordEncoder()

def get_encoded_df_label(file_path: str):
    data = pd.read_csv(file_path, sep=',')
    data.dropna(inplace=True)
    sentences = data['sentence'].astype(str).tolist()
    return encoder.fit_transform(sentences), data['label']

In [5]:
X_train, y_train = get_encoded_df_label('../static/datasets/modified/bin_classification/train_data.csv')

X_train.to_csv(path_or_buf='../static/datasets/modified/bin_classification/bayes_train.csv', index=False)
y_train.to_csv(path_or_buf='../static/datasets/modified/bin_classification/bayes_test.csv', index=False)

100%|██████████| 1862/1862 [00:05<00:00, 358.76it/s]
100%|██████████| 1862/1862 [00:01<00:00, 1100.42it/s]


In [6]:
X_train, y_train = get_encoded_df_label('../static/datasets/modified/multi_classification/train_data.csv')

X_train.to_csv(path_or_buf='../static/datasets/modified/multi_classification/bayes_train.csv', index=False)
y_train.to_csv(path_or_buf='../static/datasets/modified/multi_classification/bayes_test.csv', index=False)

100%|██████████| 6634/6634 [00:17<00:00, 378.49it/s]
100%|██████████| 6634/6634 [00:09<00:00, 672.58it/s]
