# Preparing dataset

## Module importing

In [165]:
import os
import re
import string

import numpy as np
import pandas as pd
import tokenize_uk
import yaml

In [167]:
from pathlib import Path
from typing import Dict, List

from tqdm import tqdm

## Settings and useful functions

In [32]:
def get_config(filename: Path) -> Dict:
    with open(filename, 'r') as file:
        config = yaml.load(file, Loader=yaml.FullLoader)
    return config

In [33]:
PATH2ROOT = Path('..')
PATH2CONFIG = Path(PATH2ROOT / 'configs')

In [34]:
CONFIG = get_config(PATH2CONFIG / 'config.yml')

In [35]:
PATH2COURPUS = Path(PATH2ROOT / CONFIG['data']['path_to_corpus_folder'])

In [144]:
PUNCTUATION = string.punctuation + '«' + '»' + '–'

## Preparing dataset

In [36]:
def get_filenames() -> Dict[str, Dict[str, str]]:
    result = dict()
    for filename in os.listdir(PATH2COURPUS):
        file_tokenize = filename.split('.')

        if file_tokenize[-2] != 'tok':
            continue
        elif file_tokenize[-1] == 'ann':
            t = ' '.join(file_tokenize[:-2])
            if t in result:
                result[t]['ann'] = filename
            else:
                result[t] = {'ann': filename}
            pass
        elif file_tokenize[-1] == 'txt':
            t = ' '.join(file_tokenize[:-2])
            if t in result:
                result[t]['txt'] = filename
            else:
                result[t] = {'txt': filename}
    return result

In [240]:
def process_annotation(annotation: str) -> List[str]:
    types_dict = {
        'ПЕРС': 'PERS',
        'ЛОК': 'LOC',
        'ОРГ': 'ORG',
        'РІЗН': 'MISC',
        'PERS': 'PERS',
        'LOC': 'LOC',
        'ORG': 'ORG',
        'MISC': 'MISC',
    }
    ann_list = annotation.split('\t')

    type_annotation = types_dict[ann_list[1].split(' ')[0]]
    tokens = tokenize_uk.tokenize_words(ann_list[2])
    return list(map(lambda x: (x, f'{type_annotation}'), tokens))

In [241]:
def raw_text_to_parallel() -> pd.DataFrame:
    df = pd.DataFrame()
    for filename, paths in tqdm(get_filenames().items()):
        with open(PATH2COURPUS / paths['txt'], 'r') as file:
            txt = file.read()

        with open(PATH2COURPUS / paths['ann'], 'r') as file:
            ann = file.read()

        tokens = list()
        list(
            map(
                lambda x: tokens.extend(process_annotation(x)) if x else x,
                ann.split('\n'),
            )
        )
        txt = map(
            lambda sentence: tokenize_uk.tokenize_words(sentence),
            tokenize_uk.tokenize_sents(' '.join(tokenize_uk.tokenize_words(txt))),
        )

        for sentence in txt:
            words = []
            tags = []
            for word in sentence:
                if tokens and word == tokens[0][0]:
                    word, tag = tokens.pop(0)
                    words.append(word)
                    tags.append(tag)
                else:
                    words.append(word)
                    tags.append('O')

        df = df.append([{'filename': filename, 'text': words, 'tags': tags}])
    return df.reset_index(drop=True)

In [242]:
# result = raw_text_to_tabular()
result = raw_text_to_parallel()

100%|██████████| 264/264 [00:01<00:00, 211.46it/s]


## Saving results

In [243]:
result.to_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

## Read it!

In [244]:
result = pd.read_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

In [245]:
result.iloc[1:2]['tags'].values

array([list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'O'])],
      dtype=object)