# Preparing dataset

## Imports

In [20]:
import os
import re
import string

import joblib
import numpy as np
import pandas as pd
import tokenize_uk
import yaml

In [21]:
from pathlib import Path
from typing import Dict, List

from tqdm import tqdm

## Settings and useful functions

In [22]:
def get_config(filename: Path) -> Dict:
    """Read a YAML configuration file and return its parsed content.

    Args:
        filename: Path to the YAML file.

    Returns:
        Whatever ``yaml.load`` produces for the file (a dict for a
        mapping-style config).
    """
    # Explicit UTF-8 so parsing does not depend on the platform's default
    # encoding (e.g. cp1251/cp1252 on Windows).
    with open(filename, 'r', encoding='utf-8') as file:
        # NOTE(review): FullLoader accepts more than plain data; if the config
        # only holds scalars/lists/mappings, yaml.safe_load would be enough.
        config = yaml.load(file, Loader=yaml.FullLoader)
    return config

In [23]:
# Root folder, one level above the working directory; other paths hang off it.
PATH2ROOT = Path('..')
# Folder that holds the configuration files.
PATH2CONFIG = PATH2ROOT / 'configs'

In [24]:
# Parsed project configuration; the cells below read their paths from CONFIG['data'].
CONFIG = get_config(PATH2CONFIG / 'config.yml')

In [25]:
# Corpus folder: the `data.path_to_corpus_folder` config entry resolved against PATH2ROOT.
# NOTE(review): the name is misspelled ("COURPUS"); kept because the functions below use it.
PATH2COURPUS = Path(PATH2ROOT / CONFIG['data']['path_to_corpus_folder'])

In [26]:
# ASCII punctuation extended with the characters «, » and –.
PUNCTUATION = ''.join((string.punctuation, '«', '»', '–'))

## Building the dataset

In [27]:
def get_filenames() -> Dict[str, Dict[str, str]]:
    """Group the tokenized corpus files by document name.

    Scans ``PATH2COURPUS`` for files whose name ends in ``.tok.txt`` or
    ``.tok.ann`` and collects them per document.

    Returns:
        Mapping ``document name -> {'txt': filename, 'ann': filename}``.
        An entry only contains the keys whose file was found, so a document
        with a missing counterpart has a single key.
    """
    result: Dict[str, Dict[str, str]] = {}
    # Sorted so the result does not depend on the OS directory listing order.
    for filename in sorted(os.listdir(PATH2COURPUS)):
        parts = filename.split('.')

        # Names without a dot (e.g. "README") have no parts[-2]; skip them
        # instead of raising IndexError.
        if len(parts) < 2 or parts[-2] != 'tok':
            continue

        extension = parts[-1]
        if extension not in ('ann', 'txt'):
            continue

        # Dots inside the document name become spaces; kept as-is so existing
        # keys (and the `filename` column built from them) do not change.
        name = ' '.join(parts[:-2])
        result.setdefault(name, {})[extension] = filename
    return result

In [28]:
def process_annotation(annotation: str) -> List[tuple]:
    """Convert one line of an ``.ann`` file into ``(token, tag)`` pairs.

    The line is split on TAB characters; the second field starts with the
    entity type (Ukrainian or English label) and the third field is the
    annotated text, which is split into words with ``tokenize_uk``.

    Args:
        annotation: A single line of the annotation file.

    Returns:
        One ``(token, tag)`` tuple per word of the annotated text, where
        ``tag`` is one of ``PERS``, ``LOC``, ``ORG``, ``MISC``. Lines with
        fewer than three TAB-separated fields yield an empty list.

    Raises:
        KeyError: If the entity type is not one of the known labels.
    """
    # Both the Ukrainian and the English labels map to the English tag set.
    types_dict = {
        'ПЕРС': 'PERS',
        'ЛОК': 'LOC',
        'ОРГ': 'ORG',
        'РІЗН': 'MISC',
        'PERS': 'PERS',
        'LOC': 'LOC',
        'ORG': 'ORG',
        'MISC': 'MISC',
    }
    fields = annotation.split('\t')

    # Whitespace-only or otherwise short lines carry no "type + text" payload;
    # return nothing instead of failing with IndexError.
    if len(fields) < 3:
        return []

    # The second field looks like "<type> <rest...>"; only the type is needed.
    type_annotation = types_dict[fields[1].split(' ')[0]]
    return [
        (token, type_annotation)
        for token in tokenize_uk.tokenize_words(fields[2])
    ]

In [29]:
def raw_text_to_parallel() -> pd.DataFrame:
    """Build a sentence-level dataset with parallel word and tag lists.

    For every document returned by ``get_filenames()`` the text file is split
    into sentences and words with ``tokenize_uk``. Each word is tagged with
    the entity type taken from the document's ``.ann`` file, or with ``'O'``
    when it does not match the next pending annotated token.

    Returns:
        DataFrame with one row per sentence and the columns ``filename``,
        ``text`` (list of words) and ``tags`` (list of tags, same length as
        ``text``). Documents that lack either the text or the annotation
        file are skipped.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append copied the whole frame per row and no longer exists
    # in pandas 2.x.
    records = []
    for filename, paths in tqdm(get_filenames().items()):
        # A document can only be labelled when both of its files are present.
        if 'txt' not in paths or 'ann' not in paths:
            continue

        with open(PATH2COURPUS / paths['txt'], 'r', encoding='utf-8') as file:
            txt = file.read()

        with open(PATH2COURPUS / paths['ann'], 'r', encoding='utf-8') as file:
            ann = file.read()

        # Flat list of (token, tag) pairs in the order they occur in the file.
        entity_tokens = []
        for line in ann.split('\n'):
            if line:
                entity_tokens.extend(process_annotation(line))

        # Re-join the word tokens with single spaces before sentence
        # splitting, so both tokenization passes see the same spacing.
        sentences = tokenize_uk.tokenize_sents(
            ' '.join(tokenize_uk.tokenize_words(txt))
        )

        # Index of the next annotated token that has not been matched yet.
        # NOTE(review): matching is greedy and by surface form only, so it
        # relies on annotations being listed in text order — confirm.
        cursor = 0
        for sentence in sentences:
            words = list(tokenize_uk.tokenize_words(sentence))
            tags = []
            for word in words:
                if cursor < len(entity_tokens) and word == entity_tokens[cursor][0]:
                    tags.append(entity_tokens[cursor][1])
                    cursor += 1
                else:
                    tags.append('O')

            records.append({'filename': filename, 'text': words, 'tags': tags})

    return pd.DataFrame(records, columns=['filename', 'text', 'tags'])

In [30]:
# Build the sentence-level frame (columns: filename, text, tags) from the corpus.
result = raw_text_to_parallel()

100%|██████████| 264/264 [00:17<00:00, 15.51it/s]


In [31]:
# Preview of the resulting frame.
result

Unnamed: 0,filename,text,tags
0,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Не, встигла, новостворена, П’ядицька, ОТГ, за...","[O, O, O, LOC, LOC, O, O, O, O, O, O, O, O, O,..."
1,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Як, наслідок, ,, на, коломийському, стадіоні,...","[O, O, O, O, O, O, O, ORG, O, O, O, O, O, O, O..."
2,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Турці, .]","[LOC, O]"
3,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Мовляв, ,, тренерам, повідомили, про, звільне...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Присутні, створили, ініціативну, групу, ,, об...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
12009,I_Ivanychuk_1_Torhovytsia_2013(5),"[На, початку, Личаківської, звернув, у, Круняр...","[O, O, LOC, O, O, LOC, LOC, O, O, O, O, O, O, ..."
12010,I_Ivanychuk_1_Torhovytsia_2013(5),"[Юрків, родич, ,, член, ОУН, ,, день, відо, дн...","[PERS, O, O, O, ORG, O, O, O, O, O, O, O, O, O..."
12011,I_Ivanychuk_1_Torhovytsia_2013(5),"[Примістив, його, в, потаємній, комірці, ,, де...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
12012,I_Ivanychuk_1_Torhovytsia_2013(5),"[Професор, Сербин, приносив, щораз, тривожніші...","[O, PERS, O, O, O, O, O]"


In [32]:
# Word list of the first row, as a quick sanity check of the tokenization.
result.iloc[0].text

['Не',
 'встигла',
 'новостворена',
 'П’ядицька',
 'ОТГ',
 'запрацювати',
 'на',
 'повну',
 'силу',
 ',',
 'як',
 'вже',
 'виникли',
 'перші',
 'проблеми',
 'з',
 'прийняттям',
 'важливих',
 'рішень',
 ',',
 'йдеться',
 'на',
 'місцевих',
 'інформаційних',
 'ресурсах',
 '.']

## Saving results

In [33]:
# Persist the frame with joblib at the configured `data.path_to_preproc_data` path.
# NOTE(review): presumably the target directory has to exist already — confirm.
joblib.dump(result, PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

['../data/preproc-data/preproc-data.bin']

## Reloading the saved data

In [34]:
# Read the frame back from the same path to verify the round trip.
# NOTE(review): joblib persistence is pickle-based as far as I know — only load trusted files.
result = joblib.load(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

In [35]:
# Tag list of the second row of the reloaded frame.
result.iloc[1:2]['tags'].values

array([list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'LOC'])],
      dtype=object)