# Preparing dataset

## Importing modules

In [120]:
import os

import numpy as np
import pandas as pd
import yaml

In [121]:
from pathlib import Path
from typing import Dict

from tqdm import tqdm

## Settings and useful functions

In [122]:
def get_config(filename: Path) -> Dict:
    """Load a YAML configuration file.

    Args:
        filename: Path to the YAML file to read.

    Returns:
        The document as parsed by ``yaml.load`` with ``yaml.FullLoader``.
    """
    with open(filename, 'r') as stream:
        return yaml.load(stream, Loader=yaml.FullLoader)

In [123]:
# Root folder all other paths are built from: the parent of the current
# working directory.
PATH2ROOT = Path('..')
# Folder that holds the configuration files.
PATH2CONFIG = PATH2ROOT / 'configs'

In [124]:
# Parsed contents of 'config.yml' from the configuration folder.
CONFIG = get_config(PATH2CONFIG / 'config.yml')

In [125]:
# Corpus folder, taken from the config and joined onto the root folder.
# NOTE: the spelling "COURPUS" is kept because other cells use this name.
PATH2COURPUS = PATH2ROOT / CONFIG['data']['path_to_corpus_folder']

## Preparing dataset

In [None]:
def get_filenames(corpus_dir=None) -> Dict[str, Dict[str, str]]:
    """Group the tokenized corpus files by document name.

    Only entries named ``<name>.tok.ann`` or ``<name>.tok.txt`` are kept;
    everything else in the folder is ignored.

    Args:
        corpus_dir: Folder to scan. When omitted, ``PATH2COURPUS`` is used.

    Returns:
        A mapping from document name to a dict with an ``'ann'`` and/or a
        ``'txt'`` key. The values are bare file names (not full paths).
        The document name is everything before ``.tok``, with the
        dot-separated parts joined by single spaces.
    """
    if corpus_dir is None:
        corpus_dir = PATH2COURPUS

    result: Dict[str, Dict[str, str]] = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting mapping (and anything built from it) reproducible.
    for filename in sorted(os.listdir(corpus_dir)):
        parts = filename.split('.')

        # Entries without a dot have a single part, so indexing parts[-2]
        # would raise IndexError; they cannot be '.tok.*' files anyway.
        if len(parts) < 2 or parts[-2] != 'tok':
            continue

        kind = parts[-1]
        if kind not in ('ann', 'txt'):
            continue

        document = ' '.join(parts[:-2])
        result.setdefault(document, {})[kind] = filename
    return result

In [117]:
def raw_text_to_tabular() -> pd.DataFrame:
    """Convert the annotated corpus into a word-per-row table.

    For every document returned by ``get_filenames()`` the annotation file
    is read as a headerless, tab-separated table with three columns
    (index, tag, phrase). Each annotated phrase is replaced in a copy of
    the text, first occurrence only, by its tag repeated once per word.
    The original and the replaced text are then compared word by word:
    a word that is unchanged gets the tag ``'None'``, otherwise it gets
    the tag that took its place.

    Documents that lack either file, have an empty annotation file, or
    contain no words are skipped.

    Returns:
        A DataFrame with the columns ``word``, ``tag`` and ``filename``.
        The index restarts at 0 for every document, as it did when rows
        were added with ``DataFrame.append``.

    Raises:
        AssertionError: If the replaced text no longer has the same
            number of words as the original text.
    """
    columns = ['word', 'tag', 'filename']
    frames = []
    for filename, paths in tqdm(get_filenames().items()):
        # A document is only usable when both of its files are present;
        # indexing a missing key would otherwise raise KeyError.
        if 'ann' not in paths or 'txt' not in paths:
            continue

        try:
            # dtype=str keeps numeric-looking phrases (e.g. "2019") as
            # text so that .split() below works on every row.
            ann = (
                pd.read_csv(
                    PATH2COURPUS / paths['ann'],
                    sep='\t',
                    header=None,
                    dtype=str,
                )
                .dropna()
                .reset_index(drop=True)
            )
        except pd.errors.EmptyDataError:
            continue
        ann.columns = ['index', 'tag', 'phrase']

        # read_csv decodes as UTF-8 by default; read the text the same way
        # so that phrases can be found in it regardless of the locale.
        with open(PATH2COURPUS / paths['txt'], 'r', encoding='utf-8') as file:
            txt = file.read()

        parallel_txt = txt
        for tag_field, phrase in zip(ann['tag'], ann['phrase']):
            # Only the first whitespace-separated token of the second
            # column is used as the tag; the rest is ignored.
            # NOTE(review): presumably the rest holds character offsets
            # that could locate the phrase more reliably - confirm.
            tag = tag_field.split()[0]
            n_words_in_phrase = len(phrase.split())
            parallel_txt = parallel_txt.replace(
                phrase, ' '.join([tag] * n_words_in_phrase), 1
            )

        words = txt.split()
        labels = parallel_txt.split()
        # Raised explicitly (rather than with `assert`) so the check is
        # still performed when Python runs with -O.
        if len(words) != len(labels):
            raise AssertionError(
                f'Word count changed after tagging {filename!r}: '
                f'{len(words)} words vs {len(labels)} tags'
            )

        rows = [
            {
                'filename': filename,
                'word': word,
                'tag': 'None' if word == label else label,
            }
            for word, label in zip(words, labels)
        ]
        if not rows:
            continue
        frames.append(pd.DataFrame(rows, columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat at the end replaces the per-file DataFrame.append, which
    # was removed in pandas 2.0 and copied the whole table on every call.
    return pd.concat(frames)

In [119]:
# Build the word/tag table for the whole corpus.
result = raw_text_to_tabular()

100%|██████████| 264/264 [00:02<00:00, 95.83it/s] 


## Saving results

In [126]:
# Pickle the table to the path given in the config, joined onto the root folder.
result.to_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])