In [1]:
import json
from pathlib import Path

import numpy as np

In [2]:
# Fixed seed passed to train_test_split so the train/validation split is reproducible.
RANDOM_STATE = 42

In [3]:
# Load the labelled documents from the `dataset` directory located in the parent
# of the current working directory.
train_path = Path.cwd().parent / 'dataset/train.json'
# JSON text is UTF-8 by specification; stating it explicitly keeps the read from
# depending on the platform's locale encoding.
with open(train_path, encoding='utf-8') as f:
    train_documents = json.load(f)

Разделим выборку на обучение и валидацию

In [4]:
# Label of every document, in the same order as train_documents.
labels = [doc['label'] for doc in train_documents]

In [5]:
from sklearn.model_selection import train_test_split


# Split document positions (not the documents themselves) so the resulting indices
# can be used to pick the matching entries out of train_documents.
indices = np.arange(len(labels))
# Only the index array is split: the partition is determined by the number of samples
# and random_state, so also passing `labels` just to discard the result into `_`
# (which shadows IPython's last-output variable) is unnecessary.
train_indices, test_indices = train_test_split(
    indices,
    test_size=0.25,
    random_state=RANDOM_STATE,
)
# TODO: add stratification

In [6]:
# Materialise both splits by looking the selected positions up in train_documents.
train_split = [train_documents[position] for position in train_indices]
test_split = [train_documents[position] for position in test_indices]

In [7]:
# Persist both splits to disk as JSON.
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing.
Path('data').mkdir(parents=True, exist_ok=True)
with open('data/train_split.json', 'w') as train_f, open('data/test_split.json', 'w') as test_f:
    json.dump(train_split, train_f)
    json.dump(test_split, test_f)

Построим словарь токенов

In [8]:
from collections import defaultdict

from nltk.tokenize import wordpunct_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [19]:
# Hand-collected Latin-script tokens that filter_words keeps out of the vocabulary
# (they look like roman numerals, file extensions, URL fragments and site names).
collected_garbage = [
    'E', 'I', 'II', 'III', 'IV', 'N', 'PAGE', 'V', 'VII', 'VIII',
    'Word', 'Zi', 'footnoteRef', 'https', 'rts', 'ast',
    'doc', 'docx', 'e', 'gov', 'http', 'mail', 'mailto',
    'rosatom', 'roseltorg', 'ru', 'sberbank', 'tender',
    'unknown', 'www', 'yandex', 'zakupki',
]
# NLTK's Russian stop-word list, stored as a set for fast membership checks
# (the NLTK 'stopwords' corpus has to be available locally for this call).
stop_words = set(stopwords.words('russian'))


def filter_words(token_counter, min_count, excluded=None):
    """Return the sorted vocabulary candidates from a token -> count mapping.

    A word is kept when it occurs at least ``min_count`` times, consists only of
    alphabetic characters (``str.isalpha``) and is not in the exclusion set.

    Args:
        token_counter: mapping from token to its number of occurrences.
        min_count: minimum number of occurrences required to keep a token.
        excluded: optional container of tokens to drop. When omitted, the
            module-level ``stop_words`` and ``collected_garbage`` are combined,
            which is the original behaviour.

    Returns:
        A sorted list of the tokens that passed every check.
    """
    if excluded is None:
        # Merge both exclusion sources into one set so each lookup is O(1)
        # instead of a linear scan over the garbage list for every word.
        excluded = stop_words.union(collected_garbage)

    return sorted(
        word
        for word, count in token_counter.items()
        if count >= min_count and word.isalpha() and word not in excluded
    )


def _make_word_idx_map(documents: list, min_count: int = 10) -> tuple:
    """Build the stem vocabulary and the raw stem counts for a set of documents.

    Every document's ``'text'`` is split with ``wordpunct_tokenize``, each token is
    stemmed with the Russian Snowball stemmer, and stem occurrences are counted
    across all documents. Stems accepted by ``filter_words`` receive consecutive
    indices right after the special tokens.

    Args:
        documents: sequence of mappings that each have a ``'text'`` string. (The
            previous ``dict`` annotation was inaccurate: the argument is iterated
            entry by entry.)
        min_count: minimum number of occurrences for a stem to enter the vocabulary.

    Returns:
        A ``(word_idx, token_counter)`` tuple: the stem -> index mapping, in which
        ``'<pad>'``, ``'<unk>'`` and ``'<end>'`` take indices 0, 1 and 2, and the
        ``defaultdict`` with the occurrence count of every stem.
    """
    token_counter = defaultdict(int)
    stemmer = SnowballStemmer('russian')
    # The same surface form shows up many times; remember its stem so the
    # stemmer runs once per distinct token instead of once per occurrence.
    stem_cache = {}
    for entry in documents:
        for token in wordpunct_tokenize(entry['text']):
            stem = stem_cache.get(token)
            if stem is None:
                stem = stem_cache[token] = stemmer.stem(token)
            token_counter[stem] += 1

    filtered_words = filter_words(token_counter, min_count)

    # Special tokens take the first indices; real stems follow in sorted order.
    word_idx = {
        '<pad>': 0,
        '<unk>': 1,
        '<end>': 2,
    }
    first_free_idx = len(word_idx)
    word_idx.update(
        (word, idx) for idx, word in enumerate(filtered_words, start=first_free_idx)
    )

    return word_idx, token_counter

In [20]:
# Build the vocabulary with the default min_count.
# NOTE(review): this runs on all of train_documents, which includes the documents
# held out in test_split — confirm that building the vocabulary on the full set is intended.
word_idx, token_counter = _make_word_idx_map(train_documents)

In [21]:
# Inspect the words that pass the filter (10 equals the default min_count of _make_word_idx_map).
filter_words(token_counter, 10)

['абзац',
 'аванс',
 'авансирован',
 'авансов',
 'август',
 'автоматизирова',
 'автоматическ',
 'автомобил',
 'автомобильн',
 'автономн',
 'автотранспорт',
 'агентств',
 'административн',
 'администрац',
 'адрес',
 'ак',
 'академ',
 'акт',
 'акционерн',
 'александр',
 'алтайск',
 'альтернативн',
 'анализ',
 'анализатор',
 'аналогичн',
 'аномальн',
 'антидемпингов',
 'аппарат',
 'аппаратн',
 'апрел',
 'аптек',
 'арбитражн',
 'арест',
 'архангельск',
 'астрахан',
 'атомн',
 'аукцион',
 'аэс',
 'б',
 'бакале',
 'балаковск',
 'балл',
 'банк',
 'банковск',
 'барнаул',
 'башкортоста',
 'безвозмездн',
 'бездейств',
 'безналичн',
 'безопасн',
 'безотзывн',
 'белгород',
 'белгородск',
 'белоярск',
 'бенефициар',
 'бензин',
 'бесспорн',
 'бик',
 'биологическ',
 'благоустройств',
 'блок',
 'блокирован',
 'бол',
 'больниц',
 'больш',
 'брянск',
 'будут',
 'бумаг',
 'бумажн',
 'бурят',
 'бухгалтер',
 'бухгалтерск',
 'быт',
 'бытов',
 'бюджет',
 'бюджетн',
 'важн',
 'валют',
 'вариант',
 'введ',
 'в

In [22]:
# Serialise the token -> index vocabulary to JSON on disk.
vocab_path = 'data/word_idx_snowball.json'
with open(vocab_path, 'w') as vocab_file:
    vocab_file.write(json.dumps(word_idx))

In [23]:
# Vocabulary size, counting the three special tokens ('<pad>', '<unk>', '<end>').
len(word_idx)

1570

Разобьём документы по названию пункта (поле `label`)

In [5]:
# Reload the train/test splits. Earlier in this notebook they are written under
# 'data/', so read them from that same location; the bare file names pointed at
# the working directory and did not match the saved paths.
with open('data/train_split.json') as train_f, open('data/test_split.json') as test_f:
    train_split = json.load(train_f)
    test_split = json.load(test_f)

In [16]:
# Partition the training split in one pass: documents labelled
# 'обеспечение исполнения контракта' go to the first list, all others to the second.
first_label_train, second_label_train = [], []
for train_doc in train_split:
    if train_doc['label'] == 'обеспечение исполнения контракта':
        first_label_train.append(train_doc)
    else:
        second_label_train.append(train_doc)

In [20]:
# Sizes of the two training partitions (target label vs. every other label).
len(first_label_train), len(second_label_train)

(730, 619)

In [19]:
# Flag every test document that carries the target label, then split on the flag.
has_first_label = [doc['label'] == 'обеспечение исполнения контракта' for doc in test_split]
first_label_test = [doc for doc, flagged in zip(test_split, has_first_label) if flagged]
second_label_test = [doc for doc, flagged in zip(test_split, has_first_label) if not flagged]

In [21]:
# Sizes of the two test partitions (target label vs. every other label).
len(first_label_test), len(second_label_test)

(258, 192)

In [27]:
# Write each label partition to its own JSON file in the working directory.
label_partitions = {
    'first_label_train.json': first_label_train,
    'second_label_train.json': second_label_train,
    'first_label_test.json': first_label_test,
    'second_label_test.json': second_label_test,
}
for file_name, partition in label_partitions.items():
    with open(file_name, 'w') as partition_f:
        json.dump(partition, partition_f)
