# Preparing the dataset

## Importing modules

In [30]:
import os

import numpy as np
import pandas as pd
import yaml

In [31]:
from pathlib import Path
from typing import Dict

from tqdm import tqdm

## Settings and useful functions

In [32]:
def get_config(filename: Path) -> Dict:
    """Load a YAML configuration file into a dictionary.

    Args:
        filename: Path to the YAML file to read.

    Returns:
        The parsed YAML document, or an empty dict when the file holds no
        document at all (``yaml.load`` yields ``None`` in that case).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        # NOTE(security): FullLoader is more permissive than SafeLoader. That is
        # acceptable for a project-owned config; switch to yaml.safe_load if
        # this file can ever come from an untrusted source.
        config = yaml.load(file, Loader=yaml.FullLoader)
    return config if config is not None else {}

In [33]:
# Project root, expressed relative to the current working directory.
PATH2ROOT = Path('..')
# Folder the configuration file is read from.
PATH2CONFIG = PATH2ROOT / 'configs'

In [34]:
# Parsed contents of configs/config.yml; later cells read their paths from it.
CONFIG = get_config(PATH2CONFIG.joinpath('config.yml'))

In [35]:
# Corpus folder, resolved against the project root. The spelling of the name
# is kept as-is because the functions below refer to it.
PATH2COURPUS = PATH2ROOT / CONFIG['data']['path_to_corpus_folder']

## Preparing the dataset

In [36]:
def get_filenames() -> Dict[str, Dict[str, str]]:
    """Pair up the text and annotation files found in the corpus folder.

    Only entries of ``PATH2COURPUS`` whose name ends in ``.tok.ann`` or
    ``.tok.txt`` are considered. Everything else, including names with no
    dot at all, is ignored.

    Returns:
        Mapping from document key to a dict holding the file name under
        ``'ann'`` and/or ``'txt'``. The key is the file name without its last
        two dot-separated parts, with the remaining parts joined by spaces.
        An entry contains only one of the two keys when the counterpart file
        is missing from the folder.
    """
    result: Dict[str, Dict[str, str]] = {}
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and everything built from it) reproducible between runs.
    for filename in sorted(os.listdir(PATH2COURPUS)):
        parts = filename.split('.')

        # A name with fewer than two parts has no parts[-2]; indexing it
        # unguarded would raise IndexError (e.g. for 'README').
        if len(parts) < 2 or parts[-2] != 'tok':
            continue

        kind = parts[-1]
        if kind not in ('ann', 'txt'):
            continue

        # Key format kept from the original implementation: dots inside the
        # stem become spaces.
        key = ' '.join(parts[:-2])
        result.setdefault(key, {})[kind] = filename
    return result

In [37]:
def raw_text_to_tabular() -> pd.DataFrame:
    """Build a token-level table of the corpus: one row per token with its tag.

    For every document returned by ``get_filenames()`` the annotation file is
    read as a headerless tab-separated table. The first whitespace-separated
    word of its second column is taken as the tag and its third column as the
    annotated phrase. Each phrase is replaced (first occurrence only) in a
    copy of the text by its tag repeated once per token of the phrase; the
    original and the tagged copy are then compared token by token.

    Documents that lack either file of the pair, or whose annotation file is
    empty, are skipped.

    Returns:
        DataFrame with columns ``'filename'``, ``'sentence'``
        (``'sentence <n>'``, where n is the 1-based line number in the text
        file), ``'token'`` and ``'tag'`` (the string ``'None'`` for tokens
        left unchanged by the substitution), indexed 0..n-1.

    Raises:
        AssertionError: If the substitution changes the number of tokens in a
            document or in one of its lines.
    """
    # Collect plain records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for filename, paths in tqdm(get_filenames().items()):
        # Both files are required; a lone .txt or .ann cannot be aligned.
        if 'ann' not in paths or 'txt' not in paths:
            continue

        try:
            ann = (
                # dtype=str keeps purely numeric phrases (e.g. a year) as text
                # so that .split() below works on them.
                pd.read_csv(
                    PATH2COURPUS / paths['ann'], sep='\t', header=None, dtype=str
                )
                .dropna()
                .reset_index(drop=True)
            )
        except pd.errors.EmptyDataError:
            # Annotation file with no content: nothing to tag, skip the document.
            continue
        ann.columns = ['index', 'tag', 'phrase']

        # Explicit encoding: the default depends on the platform locale.
        with open(PATH2COURPUS / paths['txt'], 'r', encoding='utf-8') as file:
            txt = file.read()

        # NOTE(review): phrases are matched by text, not by position, so a
        # phrase embedded in a longer token leaves the surrounding characters
        # attached to the tag (e.g. '«ORG»').
        parallel_txt = txt
        for tag_field, phrase in zip(ann['tag'], ann['phrase']):
            tag = tag_field.split()[0]
            n_tokens_in_phrase = len(phrase.split())
            parallel_txt = parallel_txt.replace(
                phrase, ' '.join([tag] * n_tokens_in_phrase), 1
            )

        assert len(parallel_txt.split()) == len(txt.split()), filename

        for idx, (sentence, parallel_sentence) in enumerate(
            zip(txt.split('\n'), parallel_txt.split('\n'))
        ):
            if sentence == '':
                continue

            tokens = sentence.split()
            parallel_tokens = parallel_sentence.split()
            assert len(tokens) == len(parallel_tokens), (filename, idx + 1)

            for token, parallel_token in zip(tokens, parallel_tokens):
                records.append(
                    {
                        'filename': filename,
                        'sentence': f'sentence {idx+1}',
                        'token': token,
                        # An unchanged token means no annotation covered it.
                        'tag': 'None' if token == parallel_token else parallel_token,
                    }
                )
    # Explicit columns keep the schema stable even when no document qualifies.
    return pd.DataFrame(records, columns=['filename', 'sentence', 'token', 'tag'])

In [44]:
def raw_text_to_parallel() -> pd.DataFrame:
    """Build a sentence-level table: one row per line with parallel token/tag lists.

    For every document returned by ``get_filenames()`` the annotation file is
    read as a headerless tab-separated table. The first whitespace-separated
    word of its second column is taken as the tag and its third column as the
    annotated phrase. Each phrase is replaced (first occurrence only) in a
    copy of the text by its tag repeated once per token of the phrase; the
    original and the tagged copy are then compared token by token.

    Documents that lack either file of the pair, or whose annotation file is
    empty, are skipped.

    Returns:
        DataFrame with columns ``'filename'``, ``'text'`` (list of the tokens
        of one non-empty line) and ``'tag'`` (list of the same length holding
        the tag of each token, or the string ``'None'`` for tokens left
        unchanged by the substitution), indexed 0..n-1.

    Raises:
        AssertionError: If the substitution changes the number of tokens in a
            document or in one of its lines.
    """
    # Collect plain records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for filename, paths in tqdm(get_filenames().items()):
        # Both files are required; a lone .txt or .ann cannot be aligned.
        if 'ann' not in paths or 'txt' not in paths:
            continue

        try:
            ann = (
                # dtype=str keeps purely numeric phrases (e.g. a year) as text
                # so that .split() below works on them.
                pd.read_csv(
                    PATH2COURPUS / paths['ann'], sep='\t', header=None, dtype=str
                )
                .dropna()
                .reset_index(drop=True)
            )
        except pd.errors.EmptyDataError:
            # Annotation file with no content: nothing to tag, skip the document.
            continue
        ann.columns = ['index', 'tag', 'phrase']

        # Explicit encoding: the default depends on the platform locale.
        with open(PATH2COURPUS / paths['txt'], 'r', encoding='utf-8') as file:
            txt = file.read()

        # NOTE(review): phrases are matched by text, not by position, so a
        # phrase embedded in a longer token leaves the surrounding characters
        # attached to the tag (e.g. '«ORG»').
        parallel_txt = txt
        for tag_field, phrase in zip(ann['tag'], ann['phrase']):
            tag = tag_field.split()[0]
            n_tokens_in_phrase = len(phrase.split())
            parallel_txt = parallel_txt.replace(
                phrase, ' '.join([tag] * n_tokens_in_phrase), 1
            )

        assert len(parallel_txt.split()) == len(txt.split()), filename

        for line_no, (sentence, parallel_sentence) in enumerate(
            zip(txt.split('\n'), parallel_txt.split('\n')), start=1
        ):
            if sentence == '':
                continue

            tokens = sentence.split()
            parallel_tokens = parallel_sentence.split()
            assert len(tokens) == len(parallel_tokens), (filename, line_no)

            # An unchanged token means no annotation covered it.
            tags = [
                'None' if token == parallel_token else parallel_token
                for token, parallel_token in zip(tokens, parallel_tokens)
            ]
            records.append({'filename': filename, 'text': tokens, 'tag': tags})
    # Explicit columns keep the schema stable even when no document qualifies.
    return pd.DataFrame(records, columns=['filename', 'text', 'tag'])

In [45]:
# Alternative output format: raw_text_to_tabular() yields one row per token
# instead of one row per sentence. Uncomment to use it.
# result = raw_text_to_tabular()
result = raw_text_to_parallel()

100%|██████████| 264/264 [00:01<00:00, 190.16it/s]


In [46]:
# Display the frame built above.
result

Unnamed: 0,filename,text,tag
0,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Не, встигла, новостворена, П’ядицька, ОТГ, за...","[None, None, None, LOC, LOC, None, None, None,..."
1,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Як, наслідок, ,, на, коломийському, стадіоні,...","[None, None, None, None, None, None, «ORG», No..."
2,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Мовляв, ,, тренерам, повідомили, про, звільне...","[None, None, None, None, None, None, None, Non..."
3,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Присутні, створили, ініціативну, групу, ,, об...","[None, None, None, None, None, None, None, Non..."
4,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Тренери, ,, зі, свого, боку, ,, нагадали, ,, ...","[None, None, None, None, None, None, None, Non..."
...,...,...,...
13574,I_Ivanychuk_1_Torhovytsia_2013(5),"[На, початку, Личаківської, звернув, у, Круняр...","[None, None, LOC, None, None, LOC, LOC, None, ..."
13575,I_Ivanychuk_1_Torhovytsia_2013(5),"[Юрків, родич, ,, член, ОУН, ,, день, відо, дн...","[PERS, None, None, None, ORG, None, None, None..."
13576,I_Ivanychuk_1_Torhovytsia_2013(5),"[Примістив, його, в, потаємній, комірці, ,, де...","[None, None, None, None, None, None, None, Non..."
13577,I_Ivanychuk_1_Torhovytsia_2013(5),"[Професор, Сербин, приносив, щораз, тривожніші...","[None, PERS, None, None, None, None, None]"


## Saving results

In [47]:
# Destination of the preprocessed dataset, resolved against the project root.
PATH2PREPROC = PATH2ROOT / CONFIG['data']['path_to_preproc_data']
# Create the output folder if needed so a fresh checkout can run this cell;
# to_pickle does not create missing parent directories.
PATH2PREPROC.parent.mkdir(parents=True, exist_ok=True)
result.to_pickle(PATH2PREPROC)

## Reading the saved results back

In [48]:
# Reload the dataset from the path it was saved to.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# produced by this project.
result = pd.read_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

In [49]:
# Show the first 20 rows of the reloaded frame.
result.head(20)

Unnamed: 0,filename,text,tag
0,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Не, встигла, новостворена, П’ядицька, ОТГ, за...","[None, None, None, LOC, LOC, None, None, None,..."
1,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Як, наслідок, ,, на, коломийському, стадіоні,...","[None, None, None, None, None, None, «ORG», No..."
2,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Мовляв, ,, тренерам, повідомили, про, звільне...","[None, None, None, None, None, None, None, Non..."
3,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Присутні, створили, ініціативну, групу, ,, об...","[None, None, None, None, None, None, None, Non..."
4,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Тренери, ,, зі, свого, боку, ,, нагадали, ,, ...","[None, None, None, None, None, None, None, Non..."
5,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Усю, опубліковану, інформацію, про, закриття,...","[None, None, None, None, None, None, None, Non..."
6,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Запевняє, ,, що, до, нього, ніхто, не, зверта...","[None, None, None, None, None, None, None, Non..."
7,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[А, він, із, задоволенням, зустрівся, б, із, н...","[None, None, None, None, None, None, None, Non..."
8,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Переведення, в, ОТГ]","[None, None, None]"
9,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,"[Нагадаємо, ,, що, 29, грудня, відбулися, вибо...","[None, None, None, None, None, None, None, Non..."
