# Preparing dataset

## Imports

In [1]:
import os

import numpy as np
import pandas as pd
import yaml

In [2]:
from pathlib import Path
from typing import Dict

from tqdm import tqdm

## Settings and useful functions

In [3]:
def get_config(filename: Path) -> Dict:
    """Load a YAML configuration file.

    Args:
        filename: Path to the YAML file to read.

    Returns:
        The object parsed from the YAML document (a dict when the document's
        top level is a mapping).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        # NOTE(review): FullLoader can construct more than plain data types;
        # if the config only holds scalars/lists/mappings, yaml.safe_load
        # would be the safer choice. Kept as-is to preserve behaviour.
        config = yaml.load(file, Loader=yaml.FullLoader)
    return config

In [4]:
# Repository root relative to this notebook, and the folder holding configs.
PATH2ROOT = Path('..')
PATH2CONFIG = PATH2ROOT / 'configs'

In [5]:
# Parse `config.yml` from the configs folder once; later cells read paths from it.
CONFIG = get_config(PATH2CONFIG / 'config.yml')

In [6]:
# Corpus folder, resolved against the repository root. The (misspelled) name is
# kept because the functions below refer to it.
PATH2COURPUS = PATH2ROOT / CONFIG['data']['path_to_corpus_folder']

## Building the token-level dataset

In [7]:
def get_filenames() -> Dict[str, Dict[str, str]]:
    """Pair up the tokenized text and annotation files of the corpus folder.

    Scans ``PATH2COURPUS`` for files named ``<stem>.tok.ann`` and
    ``<stem>.tok.txt``. Every other file is ignored.

    Returns:
        A mapping from document key to ``{'ann': <filename>, 'txt': <filename>}``.
        The key is the dot-separated stem joined with single spaces. An entry
        holds only the kinds of file that were actually found, so it may
        contain just ``'ann'`` or just ``'txt'``.
    """
    result = dict()
    # Sorted so the document order does not depend on the file system.
    for filename in sorted(os.listdir(PATH2COURPUS)):
        file_tokenize = filename.split('.')

        # A name without any dot has a single component; indexing [-2] on it
        # would raise IndexError, so rule it out before looking for 'tok'.
        if len(file_tokenize) < 2 or file_tokenize[-2] != 'tok':
            continue

        extension = file_tokenize[-1]
        if extension in ('ann', 'txt'):
            t = ' '.join(file_tokenize[:-2])
            result.setdefault(t, {})[extension] = filename
    return result

In [15]:
def raw_text_to_tabular() -> pd.DataFrame:
    """Convert the annotated corpus into one row per token.

    For every document returned by ``get_filenames()`` the annotation file is
    read as a tab-separated table (index, tag, phrase). A "parallel" copy of
    the text is built in which the first occurrence of each annotated phrase
    is replaced by its tag repeated once per whitespace-separated token, so
    the text and its parallel copy can be zipped token by token.

    Documents are skipped when either the annotation or the text file is
    missing, or when the annotation file is empty.

    Returns:
        A DataFrame with columns ``filename``, ``sentence``, ``token`` and
        ``tag``, indexed by a single RangeIndex over all documents. ``sentence``
        is ``'sentence <n>'`` where ``n`` is the 1-based line number in the
        text file; ``tag`` is the literal string ``'None'`` for tokens that
        were not replaced by a tag.

    Raises:
        AssertionError: If the substitution changed the number of tokens of a
            document or of one of its lines.
    """
    records = []
    for filename, paths in tqdm(get_filenames().items()):
        # Both files are needed; an unpaired one cannot be aligned.
        if 'ann' not in paths or 'txt' not in paths:
            continue

        try:
            ann = (
                # dtype=str keeps purely numeric phrases (e.g. years) as text,
                # so the .split() calls below work on every row.
                pd.read_csv(
                    PATH2COURPUS / paths['ann'], sep='\t', header=None, dtype=str
                )
                .dropna()
                .reset_index(drop=True)
            )
        except pd.errors.EmptyDataError:
            continue
        ann.columns = ['index', 'tag', 'phrase']

        # Explicit encoding: the result must not depend on the platform's
        # default text encoding.
        with open(PATH2COURPUS / paths['txt'], 'r', encoding='utf-8') as file:
            txt = file.read()

        parallel_txt = txt
        for tag_field, phrase in zip(ann['tag'], ann['phrase']):
            # Only the first whitespace-separated item of the field is the tag.
            tag = tag_field.split()[0]
            n_tokens_in_phrase = len(phrase.split())
            # NOTE(review): this replaces the first textual match, which is
            # presumably — but not necessarily — the annotated occurrence.
            parallel_txt = parallel_txt.replace(
                phrase, ' '.join([tag] * n_tokens_in_phrase), 1
            )

        assert len(parallel_txt.split()) == len(txt.split()), (
            f'token count changed after tag substitution in {filename!r}'
        )

        for idx, (sentence, parallel_sentence) in enumerate(
            zip(txt.split('\n'), parallel_txt.split('\n'))
        ):
            if sentence == '':
                continue

            tokens = sentence.split()
            parallel_tokens = parallel_sentence.split()
            assert len(tokens) == len(parallel_tokens), (
                f'token count mismatch in {filename!r}, line {idx + 1}'
            )

            sentence_label = f'sentence {idx+1}'
            for token, parallel_token in zip(tokens, parallel_tokens):
                records.append(
                    {
                        'filename': filename,
                        'sentence': sentence_label,
                        'token': token,
                        # An unchanged token was not covered by any annotation.
                        'tag': 'None' if token == parallel_token else parallel_token,
                    }
                )

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0
    # and growing a frame inside the loop is quadratic.
    return pd.DataFrame(records, columns=['filename', 'sentence', 'token', 'tag'])

In [16]:
# Build the token-level table for the whole corpus folder.
result = raw_text_to_tabular()

100%|██████████| 264/264 [00:03<00:00, 72.86it/s]


## Saving results

In [18]:
# Persist the table as a pickle at the path configured under
# data -> path_to_preproc_data, resolved against PATH2ROOT.
result.to_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

## Reading the saved results back

In [19]:
# Reload the pickle from the same configured path; this rebinds `result`
# to what is stored on disk.
result = pd.read_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

In [21]:
# Preview the first five rows of the reloaded table.
result.head(5)

Unnamed: 0,filename,sentence,token,tag
0,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,sentence 1,Не,
1,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,sentence 1,встигла,
2,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,sentence 1,новостворена,
3,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,sentence 1,П’ядицька,LOC
4,A_Halytskyi_korespondent_Fedoliak_Baseyn_peret...,sentence 1,ОТГ,LOC
