In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import pymorphy2
import re
import string


class WordEncoder:
    ru_stopwords = set(stopwords.words("russian"))
    morph = pymorphy2.MorphAnalyzer()
    
    @staticmethod
    def fit_transform(sentences: list):
        one_hot_words_columns = set()
        for sentence in tqdm(sentences):
            sentence = WordEncoder.preprocess_sentence(sentence)
            for word in sentence:
                parse_word = WordEncoder.morph.parse(word)
                normal_form = parse_word[0].normal_form
                if not (normal_form in WordEncoder.ru_stopwords or normal_form.isnumeric() or WordEncoder.is_name(parse_word) or any(x.isdigit() for x in normal_form)):
                    one_hot_words_columns.add(normal_form)
        one_hot_words_columns = list(one_hot_words_columns)
        data = [[0] * len(one_hot_words_columns)] * len(sentences)
        for i in tqdm(range(len(data))):
            sentence = WordEncoder.preprocess_sentence(sentences[i])
            for j in range(len(data[i])):
                data[i][j] = 1 if one_hot_words_columns[j] in sentence else 0
        return pd.DataFrame(data, columns=one_hot_words_columns)
    
    @staticmethod
    def remove_symbols_from_text(text: str, symbols: str) -> list[str]:
        return "".join([ch for ch in text if ch not in symbols])
    
    @staticmethod
    def preprocess_sentence(sentence: str) -> list[str]:
        threshold = 0.5
        sentence = sentence.lower()
        spec_chars = string.punctuation + '\n\t…—«»'
        sentence = WordEncoder.remove_symbols_from_text(sentence, spec_chars)
        words = sentence.split()
        words = list(filter(lambda word: not re.match(r'[a-z]+', word), words)) # remove english words
        return words
    
    @staticmethod
    def is_name(parse_word, threshold_prob = 0.5) -> bool:
        for p in parse_word:
            if 'Name' in p.tag and p.score >= threshold_prob:
                return True
        return False

In [10]:
encoder = WordEncoder()

def get_encoded_df_label(file_path: str):
    data = pd.read_csv(file_path, sep='\t')
    sentences = data['sentence'].astype(str).tolist()
    return encoder.fit_transform(sentences), data['label']
    

X_train, y_train = get_encoded_df_label('../static/datasets/modified/bin_classification/train_data.csv')
X_test, y_test = get_encoded_df_label('../static/datasets/modified/bin_classification/validation_data.csv')

100%|██████████████████████████████████████| 1863/1863 [00:07<00:00, 260.36it/s]
100%|██████████████████████████████████████| 1863/1863 [00:02<00:00, 707.42it/s]
100%|██████████████████████████████████████| 2845/2845 [00:09<00:00, 286.96it/s]
100%|██████████████████████████████████████| 2845/2845 [00:05<00:00, 566.78it/s]


In [30]:
sys.path.append('..')
import src.metrics.binary_classification as metrics
from src.models.baselines import NaiveBayesClassifier

In [31]:
naive_bayes = NaiveBayesClassifier()
naive_bayes.fit(X_train, y_train)

KeyError: 0

In [25]:
col = X_train.shape[1]
X_train.iloc[:, 0]

0       0
1       0
2       0
3       0
4       0
       ..
1858    0
1859    0
1860    0
1861    0
1862    0
Name: подконтрольный, Length: 1863, dtype: int64