In [2]:
import json
from pathlib import Path

import numpy as np

In [3]:
RANDOM_STATE = 42

In [4]:
with open(Path.cwd().parent/'dataset/train.json') as f:
    train_documents = json.load(f)

Разделим выборку на обучение и валидацию

In [4]:
labels = [doc['label'] for doc in train_documents]

In [5]:
from sklearn.model_selection import train_test_split


indices = np.arange(len(labels))
_, _, train_indices, test_indices = train_test_split(labels, indices, test_size=0.25, random_state=RANDOM_STATE)
# TODO: add stratification

In [6]:
train_split = []
test_split = []

for train_idx in train_indices:
    train_split.append(train_documents[train_idx])

for test_idx in test_indices:
    test_split.append(train_documents[test_idx])

In [7]:
with open('train_split.json', 'w') as train_f, open('test_split.json', 'w') as test_f:
    json.dump(train_split, train_f)
    json.dump(test_split, test_f)

Построим словарь токенов

In [7]:
from collections import defaultdict

from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [80]:
collected_garbage = [
    'arbitr',
    'ast',
    'c',
    'doc',
    'docx',
    'e',
    'footnoteref',
    'gov',
    'http',
    'i',
    'ii',
    'iii',
    'iv',
    'mail',
    'mailto',
    'n',
    'page',
    'rosatom',
    'roseltorg',
    'rt',
    'ru',
    'sberbank',
    'tender',
    'unknown',
    'v',
    'vii',
    'viii',
    'word',
    'www',
    'yandex',
    'zakupki',
    'zi'
]
stop_words = set(stopwords.words('russian'))


def filter_words(token_counter, min_count):
    filtered_words = sorted([
        word
        for word in token_counter
        if (token_counter[word] >= min_count and word.isalpha() and word not in stop_words and word not in collected_garbage)
    ])

    return filtered_words


def _make_word_idx_map(documents: dict, min_count=11) -> dict:
    token_counter = defaultdict(int)
    stemmer = PorterStemmer()
    for entry in documents:
        for token in wordpunct_tokenize(entry['text']):
            token = stemmer.stem(token)
            token_counter[token] += 1

    filtered_words = filter_words(token_counter, min_count)

    word_idx = dict(zip(filtered_words, range(2, len(filtered_words) + 2)))
    word_idx['PAD'] = 0
    word_idx['UNK'] = 1

    return word_idx, token_counter

In [81]:
word_idx, token_counter = _make_word_idx_map(train_documents)

In [88]:
with open('word_idx.json', 'w') as f:
    json.dump(word_idx, f)

In [85]:
len(word_idx)

2432

Сделаем разбиение по названиям пункта

In [5]:
with open('train_split.json') as train_f, open('test_split.json') as test_f:
    train_split = json.load(train_f)
    test_split = json.load(test_f)

In [16]:
first_label_train = [doc for doc in train_split if doc['label'] == 'обеспечение исполнения контракта']
second_label_train = [doc for doc in train_split if doc['label'] != 'обеспечение исполнения контракта']

In [20]:
len(first_label_train), len(second_label_train)

(730, 619)

In [19]:
first_label_test = [doc for doc in test_split if doc['label'] == 'обеспечение исполнения контракта']
second_label_test = [doc for doc in test_split if doc['label'] != 'обеспечение исполнения контракта'] 

In [21]:
len(first_label_test), len(second_label_test)

(258, 192)

In [27]:
with open('first_label_train.json', 'w') as first_train, \
    open('second_label_train.json', 'w') as second_train, \
    open('first_label_test.json', 'w') as first_test, \
    open('second_label_test.json', 'w') as second_test:
    json.dump(first_label_train, first_train)
    json.dump(second_label_train, second_train)
    json.dump(first_label_test, first_test)
    json.dump(second_label_test, second_test)
