# Import Packages

In [1]:
import os
import re
import numpy as np
import pandas as pd
import pyarrow as pa
from transformers import (AutoTokenizer,)
from datasets import Dataset, DatasetDict

import utility.utility as util

# Display-only setup: the import and call below change how the notebook is
# rendered and play no part in the data processing.
from IPython.display import display, HTML
# Inject a CSS rule that sets elements of class "container" to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

# automatically reload imported modules before running code, so edits to them take effect
%load_ext autoreload
%autoreload 2

# Config

In [2]:
# Class labels: integer label assigned to each class name
_labels = {
    "cons": 1,
    "uncons": 0,
}

# Languages to include in the dataset
"""
options:
    fr: French
    de: German
    da: Danish
"""
_languages = ["de"]

# Columns of interest kept from the metadata
_coi = ["filename", "language", "label"]

# Number of instances per language, in total and broken down by class
no_inst_lang = {}
no_inst_class_lang = {}

# Relative sizes of the data splits
_split_sizes = {"train": 0.7, "validation": 0.15, "test": 0.15}
# Group-by arguments for the stratified data split
_strat_split_groupby = []

# Files per split per label (not populated in the cells shown here)
files_per_split_per_label = {}

# Seed for the random number generator
_seed = 10

# Directory and File Paths

In [3]:
# Directories: per-class data folders (filled in by the directory scan) and
# the raw-data root they live under
paths_data_classes = dict()
path_data = os.path.join(os.getcwd(), "raw_data")

# Files: per-class metadata file paths (filled in by the directory scan)
paths_meta_files = dict()

# Target directory for the exported dataset dictionary
path_dataset_dict = os.path.join(os.getcwd(), "datasets")

In [4]:
# Discover the class folders under the raw-data root and, inside each one, the
# Excel workbook holding that class's metadata. Listings are sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make the
# class order (and everything derived from it) vary between machines.
for _class in sorted(os.listdir(path_data)):
    class_dir = os.path.join(path_data, _class)
    # Stray files next to the class folders (e.g. hidden OS files) are not
    # classes and cannot be listed.
    if not os.path.isdir(class_dir):
        continue
    paths_data_classes[_class] = class_dir
    for file in sorted(os.listdir(class_dir)):
        # splitext inspects only the final extension, so names without a dot
        # or with several dots are handled correctly.
        is_workbook = os.path.splitext(file)[1] == ".xlsx"
        # Names starting with "~$" are lock files Excel keeps next to an open
        # workbook; they are not readable metadata.
        if is_workbook and not file.startswith("~$"):
            paths_meta_files[_class] = os.path.join(class_dir, file)

# Load Meta Data

In [5]:
# Accumulators: the per-class metadata frames and the combined metadata
meta_per_class = {}
meta = pd.DataFrame()


def path_join_df(args):
    """Join the path components in ``args`` into a single path.

    Thin wrapper around ``os.path.join`` that accepts the components as one
    iterable instead of separate positional arguments.
    """
    components = tuple(args)
    return os.path.join(*components)
    

# Read each class's metadata workbook, keep the rows and columns of interest,
# and attach the class label plus the path of the text file each row refers to.
class_frames = []
for _class in paths_meta_files:
    tmp_meta = pd.read_excel(paths_meta_files[_class])
    tmp_meta.columns = tmp_meta.columns.str.lower()

    # filter for desired languages
    tmp_meta = tmp_meta[tmp_meta["language"].isin(_languages)].copy()

    tmp_meta["label"] = _labels[_class]
    # keep only the columns of interest, in the order they already have
    tmp_meta = tmp_meta[[col for col in tmp_meta.columns if col in _coi]].copy()

    # Build the file paths straight from the filename column rather than
    # applying a function to every row of the frame.
    class_dir = paths_data_classes[_class]
    tmp_meta["filepath"] = [
        os.path.join(class_dir, str(name) + ".txt") for name in tmp_meta["filename"]
    ]

    meta_per_class[_class] = tmp_meta
    class_frames.append(tmp_meta)

# Concatenate once instead of growing `meta` on every iteration; pd.concat
# rejects an empty list, so fall back to an empty frame in that case.
meta = pd.concat(class_frames) if class_frames else pd.DataFrame()

## Number of files per language and class

In [6]:
# For every language, count the instances of each class and their total
for lang in _languages:
    per_class = {}
    for _class, class_meta in meta_per_class.items():
        per_class[_class] = class_meta[class_meta["language"] == lang].shape[0]
    no_inst_class_lang[lang] = per_class
    no_inst_lang[lang] = sum(per_class.values())

In [7]:
# show the instance counts per language, broken down by class
no_inst_class_lang

{'de': {'cons': 5881, 'uncons': 180}}

In [8]:
# show the total instance count per language
no_inst_lang

{'de': 6061}

# Create splits on file names

In [9]:
# Stratified train/validation/test split of the metadata, one per language
data_splits_meta = {}

for lang in _languages:
    print(lang)
    lang_meta = meta[meta.language == lang]
    # Call through `util`, the alias under which utility.utility is imported at
    # the top of the notebook; no name `udp` is imported anywhere, so using it
    # raises a NameError on a fresh kernel.
    # NOTE(review): assumes createStratifiedSplit is defined in utility.utility — confirm.
    data_splits_meta[lang] = util.createStratifiedSplit(lang_meta, "label", _split_sizes, seed=_seed)

de
{1: {'train': [0, 4117], 'validation': [4117, 4999], 'test': [4999, 5881]}, 0: {'train': [0, 126], 'validation': [126, 153], 'test': [153, 180]}}


# Parse Text Files

In [10]:
# Matches a run of word characters; compiled once rather than on every call
_WORD_PATTERN = re.compile(r'\w+')


def parse_txt(file_path, encoding="utf-8"):
    """Read a text file and return its content, or None if it is unusable.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file.
    encoding : str, default "utf-8"
        Encoding used to decode the file. Passing it explicitly keeps the
        result independent of the platform's default encoding.
        NOTE(review): assumes the corpus is stored as UTF-8 — confirm.

    Returns
    -------
    str or None
        The full file content, or None when ``file_path`` is not an existing
        regular file or the content has no word character at all.
    """
    if not os.path.isfile(file_path):
        return None
    with open(file_path, "r", encoding=encoding) as file:
        res = file.read()
    # temporary solution, data should be cleaned before creating datasets
    if _WORD_PATTERN.search(res) is None:
        return None
    return res

In [11]:
# For every split, stack the path and label columns of all languages
data_splits = {}

for s in _split_sizes:
    split_frames = [data_splits_meta[lang][s][["filepath", "label"]] for lang in _languages]
    # One concat per split instead of growing a frame language by language;
    # pd.concat rejects an empty list, so fall back to an empty frame.
    data_splits[s] = pd.concat(split_frames) if split_frames else pd.DataFrame()

In [12]:
# Load the text behind every file path, then discard the path column.
# The drop happens in place, so running this cell a second time raises a
# KeyError on "filepath".
for frame in data_splits.values():
    frame["text"] = frame["filepath"].apply(parse_txt)
    frame.drop(columns="filepath", inplace=True)

In [13]:
# discard the rows whose text is missing
for frame in data_splits.values():
    frame.dropna(subset=["text"], inplace=True)

# Create arrow datasets

In [14]:
# Replace every split's DataFrame with a Dataset built from its label and text
# columns, renaming the "__index_level_0__" column to "idx".
for split in data_splits:
    arrow_split = Dataset.from_pandas(data_splits[split][["label", "text"]])
    data_splits[split] = arrow_split.rename_column("__index_level_0__", "idx")

In [15]:
# show the features and row count of each split
data_splits

{'train': Dataset({
     features: ['label', 'text', 'idx'],
     num_rows: 4213
 }),
 'validation': Dataset({
     features: ['label', 'text', 'idx'],
     num_rows: 904
 }),
 'test': Dataset({
     features: ['label', 'text', 'idx'],
     num_rows: 903
 })}

# Create dataset dictionary for HuggingFace API

In [16]:
# bundle the per-split datasets into a single DatasetDict
dataset_dict = DatasetDict(data_splits)

In [17]:
# show the assembled dataset dictionary
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'idx'],
        num_rows: 4213
    })
    validation: Dataset({
        features: ['label', 'text', 'idx'],
        num_rows: 904
    })
    test: Dataset({
        features: ['label', 'text', 'idx'],
        num_rows: 903
    })
})

# Export Dataset

In [18]:
# Write the dataset dictionary to the "German_ConsUncons" folder inside path_dataset_dict.
# NOTE(review): the folder name is fixed to German while `_languages` is
# configurable — adjust it if other languages are selected.
dataset_dict.save_to_disk(os.path.join(path_dataset_dict, "German_ConsUncons"))

Saving the dataset (0/2 shards):   0%|          | 0/4213 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/904 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/903 [00:00<?, ? examples/s]