# Package Imports

In [1]:
import os
import re
import glob
import numpy as np
import pandas as pd
from collections import Counter
from transformers import (AutoTokenizer,)
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value, concatenate_datasets
from huggingface_hub import login

import utility.utility as util

# Display only: widen the notebook container to 95% of the browser window
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Automatically reload imported modules before code is executed, so edits to
# local code (e.g. utility/utility.py) take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

# Configuration:

In [2]:
# --- dataset names -----------------------------------------------------------
# folder (inside "raw_data") that holds the raw dataset
_name_raw_dataset_dir = "french_german"
# name under which the processed dataset dict is stored (inside "datasets")
_name_datasetdict = "test_fr_ger_da"

# --- paths -------------------------------------------------------------------
# root directory: the current working directory of the kernel
path_cwd = os.getcwd()
# directory with the raw dataset
path_raw_dataset = os.path.join(path_cwd, "raw_data", _name_raw_dataset_dir)
# directory the dataset dict is written to
path_datasetdict = os.path.join(path_cwd, "datasets", _name_datasetdict)

# --- splitting ---------------------------------------------------------------
# micro-class columns to stratify over, mapped to their data types
_micro_strat_labels = dict(language="string", year="string")
# fraction of the data assigned to each split
_split_sizes = dict(train=0.7, val=0.15, test=0.15)
# seed for the random number generator
_seed = 10

# --- parsing -----------------------------------------------------------------
# file extension of the raw text files (kept configurable in case other
# parsing schemes are added)
_file_format = ".txt"

# Set file paths to class folders and, if present, the meta CSV file

In [3]:
# Scan the raw dataset directory:
#   * a "*.csv" entry is taken as the meta file (if several exist, the last one
#     in sorted order wins),
#   * every remaining sub-directory is taken as one class; its name is the label.
paths_class_dirs = {}
meta_file = None
labels = []

# os.listdir returns entries in arbitrary order; sorting keeps the label order
# (and therefore the default label ids) reproducible across machines and runs.
for item in sorted(os.listdir(path_raw_dataset)):
    # hidden entries such as .ipynb_checkpoints or .DS_Store are neither
    # class folders nor metadata
    if item.startswith("."):
        continue

    item_path = os.path.join(path_raw_dataset, item)
    if item.endswith(".csv"):
        meta_file = item_path
    elif os.path.isdir(item_path):
        # only directories can be class folders; stray files would otherwise
        # break the later os.listdir call on each class path
        paths_class_dirs[item] = item_path
        labels.append(item)

# Set label ids

Note: if specific labels should map to specific integers, adjust the variable `label2id` below; otherwise the ids are assigned in the order in which the class folders were listed.

In [4]:
# Default mapping: ids follow the order of the discovered class folders
label2id = {label: idx for idx, label in enumerate(labels)}
# manual adjust: pin specific labels to specific integer ids
label2id = {"cons":1, "uncons":0}

# Fail early with a clear message instead of a bare KeyError further down,
# where every discovered class folder is looked up in label2id.
_unmapped_labels = [label for label in labels if label not in label2id]
if _unmapped_labels:
    raise KeyError(
        f"label2id has no id for class folder(s) {_unmapped_labels}; "
        f"known labels: {sorted(label2id)}"
    )

# The class names are later built by indexing ids 0..n-1, so the ids must be
# unique and contiguous; duplicates would silently merge two classes.
if sorted(label2id.values()) != list(range(len(label2id))):
    raise ValueError(
        f"label ids must be exactly 0..{len(label2id) - 1} without duplicates, "
        f"got {label2id}"
    )

In [5]:
# Inverse lookup: integer id -> label name
id2label = {idx: label for label, idx in label2id.items()}

Adjust label to integer value in dataframe

# Parse Class Folders

In [6]:
# Map each class label to the names of the files inside its class folder.
# os.listdir returns names in arbitrary, filesystem-dependent order; sorting
# fixes the row order of the data so the seeded split is reproducible.
paths_class_files = {
    _class: sorted(os.listdir(class_dir))
    for _class, class_dir in paths_class_dirs.items()
}

# Create Data DataFrame

In [7]:
# Build one frame per class (filename, integer label, full file path) and
# concatenate them once, instead of growing data_df inside the loop.
_class_frames = []

for _class, _filenames in paths_class_files.items():
    _class_frame = pd.DataFrame({"filename": _filenames})
    _class_frame["label"] = label2id[_class]
    _class_frame["filepath"] = [
        os.path.join(paths_class_dirs[_class], _filename) for _filename in _filenames
    ]
    _class_frames.append(_class_frame)

if _class_frames:
    # ignore_index gives every row a unique index; without it each class
    # restarts at 0 and the combined frame carries duplicate index values
    data_df = pd.concat(_class_frames, ignore_index=True)
else:
    # no class folders found: keep the expected columns so later cells can
    # still access them
    data_df = pd.DataFrame(columns=["filename", "label", "filepath"])

# If present, read meta file and merge with data

In [8]:
# Attach the stratification columns from the meta file (if one was found) to
# the file-level data. Only files that have a meta entry are kept.
meta_df = None
if meta_file:
    meta_df = pd.read_csv(meta_file)
    # keep only the join key and the columns used for stratification
    meta_df = meta_df[["filename"] + list(_micro_strat_labels.keys())].copy()
    # meta filenames carry no extension; append the configured one so they
    # match the names found on disk
    meta_df["filename"] = meta_df["filename"].astype(str) + _file_format

    # the inner merge drops files without a meta entry; report how many so the
    # loss of samples does not go unnoticed
    _n_without_meta = int((~data_df["filename"].isin(meta_df["filename"])).sum())
    if _n_without_meta:
        print(f"{_n_without_meta} of {len(data_df)} files have no entry in the meta file and are dropped")

    data_df = pd.merge(data_df, meta_df, on="filename", how="inner")

# Ratio of Label Classes:

In [9]:
# Share of samples per label id
counts = Counter(data_df["label"])
total = sum(counts.values())
{label: n_samples / total for label, n_samples in counts.items()}

{1: 0.8655044458447105, 0: 0.1344955541552895}

# Create Data Splits

In [10]:
# Split data_df according to the configured split sizes, passing the
# micro-class columns and the seed on to the project helper
datasets = util.create_stratified_split(
    data_df,
    split_sizes=_split_sizes,
    micro_labels=list(_micro_strat_labels),
    seed=_seed,
)

# Parse Text Files

In [11]:
# columns no longer needed once the text has been read: the file path and the
# stratification columns
_obsolete_columns = ["filepath", *_micro_strat_labels]

for split in datasets:
    frame = datasets[split]
    # read every file into a "text" column
    frame["text"] = frame["filepath"].apply(util.parse_txt)
    frame.drop(columns=_obsolete_columns, inplace=True)
    # discard rows for which no text was obtained, then renumber the rows
    frame.dropna(subset=["text"], inplace=True)
    frame.reset_index(drop=True, inplace=True)

# Adjust data types

In [12]:
# Cast the textual columns of every split to the pandas "string" dtype
for split in datasets:
    frame = datasets[split]
    for column in ("text", "filename"):
        frame[column] = frame[column].astype("string")

# Create DatasetDict

In [13]:
# Class names ordered by integer id: position i holds the label for id i
class_names = [id2label[idx] for idx in range(len(id2label))]

# Schema of the resulting datasets
features = Features(
    {
        "filename": Value("string"),
        "text": Value("string"),
        "label": ClassLabel(names=class_names),
    }
)

In [14]:
# Build the DatasetDict from the per-split DataFrames using the schema above
# (presumably one Dataset per split — see utility/utility.py to confirm)
datasetdict = util.create_dataset_dict(datasets, features)

In [15]:
# Show the splits with their features and row counts
datasetdict

DatasetDict({
    train: Dataset({
        features: ['filename', 'text', 'label'],
        num_rows: 9987
    })
    val: Dataset({
        features: ['filename', 'text', 'label'],
        num_rows: 2134
    })
    test: Dataset({
        features: ['filename', 'text', 'label'],
        num_rows: 2138
    })
})

# Save DatasetDict

In [16]:
# Write all splits to path_datasetdict
datasetdict.save_to_disk(path_datasetdict)

Saving the dataset (0/5 shards):   0%|          | 0/9987 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2134 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2138 [00:00<?, ? examples/s]