In [None]:
!pip install transformers

In [None]:
!pip install nltk

In [None]:
!pip install catboost

In [1]:
!pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py): started
  Building wheel for docx2txt (setup.py): finished with status 'done'
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3980 sha256=415e10a6a36e976327442a40476b437b52cc9c9ca065d0cb14b048ddd93a1afd
  Stored in directory: c:\users\mensh\appdata\local\pip\cache\wheels\40\75\01\e6c444034338bde9c7947d3467807f889123465c2371e77418
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [1]:
import pandas as pd
import os
import json
from transformers import AutoTokenizer, AutoModel
import torch
from collections import Counter
import math
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from IPython.display import clear_output
import numpy as np
from itertools import groupby
import ast
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import docx2txt
import re

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mensh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Глобальные параметры

In [2]:
TAG_RANGE = list(range(1,40))

TOKENIZER_DIR = ''
TOKENIZER = TOKENIZER_DIR + "DeepPavlov/rubert-base-cased"

MODEL_DIR = './datasets/'
DATASET_NAME = MODEL_DIR + 'dataset4.csv'
MODEL_NAME = MODEL_DIR + 'pretrained_multiclass_model'

# Модели для обучения

In [3]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

class TextVectorizerBERT:
    ''' Класс векторизации текстовых данных.'''

  # Загружаем предобученныею модель токенизации текста
    def __init__(self, tokenizer_path, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model = AutoModel.from_pretrained(model_path)

  # Приводим текст в его векторное представление
    def vectorize(self, text):
        encoded_input = self.tokenizer(
            [text],
            padding=True,
            truncation=True,
            max_length=24,
            return_tensors='pt')
        
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        return sentence_embeddings[0].tolist()

In [4]:
class ClassificationModel:
    '''
    Модель классификации векторных представлений текста.
    Общий объём тегов классификации: 39
    '''
    
    dtset_col_txt_name = 'content'
    dtset_col_tag_name = 'tag'
    dtset_col_vec_name = 'vec_content'

    #Инициализируем векторизатор и классификатор
    def __init__(self, vectorizer):
        self.vectorizer = vectorizer
        self.model = CatBoostClassifier(
          iterations=30,
          loss_function='MultiClass',
          learning_rate=0.1,
          depth=8,
          eval_metric='TotalF1:average=Macro')

    #Загружаем датасет для обучения модели-классификатора
    def load_dataset(self, path):
        self.dataset = pd.read_csv(path, sep=',')
        print(f"Drop nan. shape before {self.dataset.shape}.")
        self.dataset = self.dataset.dropna()
        print(f"Drop nan. shape after {self.dataset.shape}")

        self.dataset[self.dtset_col_tag_name] = self.dataset[self.dtset_col_tag_name].astype('int32')

        if self.dtset_col_vec_name in self.dataset.columns:
            self.dataset[self.dtset_col_vec_name] = self.dataset[self.dtset_col_vec_name].apply(lambda vec: ast.literal_eval(vec))

        counter = Counter(self.dataset[model.dtset_col_tag_name])
        print("Частота классов:")
        for k,v in counter.items():
            print(f"{k}: {v}")

    #Получение векторных представлений текста 
    def vectorize_dataset(self, dataset_name):
        if self.dtset_col_vec_name not in self.dataset.columns:
            self.vectorized_content = list()
            for i, text in enumerate(self.dataset[self.dtset_col_txt_name]):
                clear_output(wait=True)
                print(f"{i}/{self.dataset.shape[0]}")
                self.vectorized_content.append(self.vectorizer.vectorize(text))
            
            print(f"Сохраняем векторные представления в {dataset_name}")
            self.dataset[self.dtset_col_vec_name] = self.vectorized_content
            self.dataset.to_csv(dataset_name, sep=',', index=False)
        else:
            self.vectorized_content = self.dataset[self.dtset_col_vec_name]
    

    #Разбиение датасета на тренировочную и тестовую выборки
    def split_dataset(self, d_split=0.4):
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.vectorized_content,
            self.dataset[self.dtset_col_tag_name],
            test_size=d_split,
            random_state=0,
            stratify=self.dataset[self.dtset_col_tag_name])

    #Обучение модели-классификатора и сохранение её в файл
    def train_model(self, model_name):
        self.model.fit(
            self.X_train,
            self.y_train,
            eval_set=(self.X_val, self.y_val),
            plot=True)
    
        self.model.save_model(model_name)

    #Загрузка предобученной модели классификатора
    def load_model(self, model_path):
        self.model = CatBoostClassifier()
        self.model.load_model(model_path)

    #Получение предсказания для заданного векторного представления текста.
    #Формат: (предсказанный класс, точность предсказания)
    def predict_tag(self, vector):
        prediction = self.model.predict_proba(vector)
        highest_score = max(zip(TAG_RANGE,prediction), key=lambda pair: pair[1])
        return highest_score

# Парсинг документа

In [10]:
def parse_doc(doc_path):
    raw_text = docx2txt.process(doc_path)
  
    # Удаляем шапку документа
    ignorecase_head_ptr = re.compile('\n{2,}утверждены\n{2,}постановлением правительства\n{2,}Российской Федерации\n{2,}', re.IGNORECASE)
    pos = ignorecase_head_ptr.search(raw_text).span()[1]
    raw_text = raw_text[pos:]

    print('here1')

    # Удаляем дополнительные Приложения в хвосте документа
    tail_ptr1 = re.compile('\n{3,}приложение( \w+){,2}\n{2}', re.IGNORECASE)
    ptr = tail_ptr1.search(raw_text)
    if ptr:
        pos = ptr.span()[0]
        raw_text = raw_text[:pos]

    print('here2')
  
    # Удаляем новые заголовки с текстом в хвосте документа
    tail_ptr2 = re.compile('(([А-Я,"]+[ ]*)+\n{2,}){3,}')
    title_span = tail_ptr2.search(raw_text).span()
  
    print('here3')

    ptr = tail_ptr2.search(raw_text[title_span[1]:])
    if ptr:
        pos = ptr.span()[0]
        raw_text = raw_text[pos+title_span[1]:]
    raw_text = raw_text[title_span[1]:]

    # Удаляем мусор из анализируемой части документа
    parts = list(filter(lambda v: v and 'www.consultant.ru' not in v 
                        and 'Список изменяющих документов\n\n(в ред. Постановлений Правительства РФ' not in v
                        and v not in [' ', '  '],
                        re.split('\n{5,}', raw_text)))
  
    raw_text = '\n\n'.join(map(lambda v: v.strip('\n'), parts))
    raw_text = re.sub('\n{4} \n{4}', ' ', raw_text)

    # Разбиваем текст на абзацы
    parts = list(filter(lambda v: v != ' ' and v, re.split('\n\n', raw_text)))
   
    # Разбиваем абзацы на предложения
    sents = []
    for part in parts:
        sents += sent_tokenize(part)
    sents = list(filter(lambda v: len(v) > 5,sents))

    return sents

Инициализация 

In [5]:
vectorizer = TextVectorizerBERT(TOKENIZER, TOKENIZER)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
model = ClassificationModel(vectorizer)

# Обучение модели

In [33]:
#Разбиваем датасет на части (чтобы всё влезло в память при векторизации)
PART_SIZE = 10000 
PARTS_DIR = MODEL_DIR + 'parts_dataset_dir/'

full_dataset = pd.read_csv(DATASET_NAME, sep=',')
part_dataset_names = []

start = 0
end = 0
rows_count = full_dataset.shape[0]
parts = math.ceil(rows_count / PART_SIZE)
for i in range(parts):
    if start + PART_SIZE <= rows_count:
        start = end
        end = start + PART_SIZE
    else:
        start = end
        end = rows_count
        
    new_dataset_name = f"{PARTS_DIR}{DATASET_NAME}_part{i}.csv"
    full_dataset.iloc[start:end,:].to_csv(new_dataset_name, index=False)
    part_dataset_names.append(new_dataset_name)

In [34]:
#Векторизуем каждый датасет
for part_name in part_dataset_names:
    model.load_dataset(part_name)
    model.vectorize_dataset(part_name)

3762/3763
Сохраняем векторные представления в ./parts_dataset_dir/./dataset4.csv_part6.csv


In [38]:
#Объединяем векторизированные датасеты в один
full_dataset = None
for part_name in part_dataset_names:
    tmp_df = pd.read_csv(part_name,sep=',')
    if full_dataset is None:
        full_dataset = tmp_df
    else:
        full_dataset = pd.concat([full_dataset, tmp_df])
full_dataset.to_csv(DATASET_NAME, index=False)

In [7]:
model.load_dataset(DATASET_NAME)

Drop nan. shape before (63748, 4).
Drop nan. shape after (63748, 4)
Частота классов:
2: 1635
1: 1635
35: 1635
24: 1635
32: 1635
27: 1635
38: 1635
25: 1635
31: 1635
26: 1635
18: 1635
11: 1635
19: 1635
20: 1635
21: 1635
14: 1635
7: 1635
29: 1635
28: 1635
36: 1635
37: 1635
33: 1635
34: 1635
4: 1635
3: 1623
23: 1635
39: 1635
22: 1630
8: 1635
10: 1635
13: 1635
15: 1635
6: 1635
12: 1635
30: 1635
17: 1635
5: 1635
16: 1635
9: 1635


In [8]:
model.split_dataset()

AttributeError: 'ClassificationModel' object has no attribute 'vectorized_content'

model.train_model(MODEL_NAME)

## Получение предсказаний модели

In [None]:
model.load_model(MODEL_NAME)

In [None]:
def make_predictions(texts):
  predictions = []
  l_texts = len(texts)

  # Векторизуем и классифицируем извлечённый абзац документа
  for i, text in enumerate(texts):
    print(f"{i}/{l_texts}")
    clear_output(wait=True)
    
    text_embedding = model.vectorizer.vectorize(text)
    predict = model.predict_tag(text_embedding)
    # Сохраняем тег, полученный при классификации
    predictions.append(predict[0])

  return predictions

104/105


Формирование статистики

In [35]:
def create_statistics_json(predictions):
    tags_list = list(map(lambda v: v[0], predictions))
    
    # присутствующие теги
    present_tags = list(set(tags_list))
    
    # отсутствующие теги
    missed_tags = set(TAG_RANGE).difference(set(tags_list))

    # частота присутствующих признаков
    tag_frequency = dict(Counter(tags_list))

    tag_accur_median = dict()
    tag_accur_var = dict()
    for key, group_items in groupby(predictions, key=lambda p: p[0]):
        accurs = list(map(lambda p: p[1],group_items))
        # медиана точности по каждому тегу
        tag_accur_median[key] = np.median(accurs)
        
    # дисперсия точности по каждому тегу    
    tag_accur_var[key] = np.var(accurs)
  


    # формируем единый json-файл со всей статистикой
    data = dict()
    data['present tags'] = present_tags
    data['missed tags'] = missed_tags
    data['tags freq'] = tag_frequency
    data['accur median'] = tag_accur_median
    data['accur var'] = tag_accur_var
    data['pred list'] = predictions

    with open('statistics_document.json', 'w') as f:
        json.dump(data, f)

Формируем единый датасет для тестирования модели

In [None]:
TEST_DATASET_DIR = MODEL_DIR + './test_dataset/'
FULL_DATASET_NAME = MODEL_DIR + 'test_dataset.csv'
INFO_DATASET_NAME = MODEL_DIR + 'test_dataset_predictions.csv'

FIELD_PRED_TAG = 'class'
FIELD_FILE_I = 'file_i'
FILED_ID = 'id'
FIELD_TEXT = 'content'

doc_names = list(map(lambda v: TEST_DATASET_DIR + v,os.listdir(TEST_DATASET_DIR)))

In [None]:
full_dataset = pd.DataFrame(columns=[FIELD_FILE_I,FILED_ID,FIELD_TEXT])

doc_file_i_s = []
doc_ids = []
doc_part_txt = []

for i, name in enumerate(doc_names):
  doc_id = int(name.split('.')[-2].split('/')[-1])

  raw_text = docx2txt.process(name)
  raw_text = re.sub('\n+', ' ', raw_text)

  patterns = re.findall(r'\{\d+\}.*?\{\d+\}', raw_text)

  for j, ptr in enumerate(patterns):
    clear_output(wait=True)
    print(f"{i}/{len(doc_names)} : {j}/{len(patterns)}")

    text = re.findall('\}(.*?)\{',ptr)[0]
    num = int(re.findall('\{(.*?)\}',ptr)[0])

    doc_file_i_s.append(doc_id)
    doc_ids.append(num)
    doc_part_txt.append(text)

full_dataset[FIELD_FILE_I] = doc_file_i_s
full_dataset[FILED_ID] = doc_ids
full_dataset[FIELD_TEXT] = doc_part_txt

full_dataset.to_csv(FULL_DATASET_NAME,index=False,sep=',')

In [None]:
vectorizer = TextVectorizerBERT(TOKENIZER, TOKENIZER)
model = ClassificationModel(vectorizer)

In [None]:
model.load_dataset(FULL_DATASET_NAME)

In [None]:
predictions = make_predictions(model.dataset[model.dtset_col_txt_name])
model.dataset[FIELD_PRED_TAG] = predictions

In [None]:
df_info = model.dataset
df_info =df_info.drop(columns=['vec_content','content'])
df_info.to_csv(INFO_DATASET_NAME, index=False,sep=',')