# Import Packages

In [112]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
from transformers import (AutoTokenizer,)
from datasets import Dataset, DatasetDict
from huggingface_hub import login

import utility.utility as util

# Purely cosmetic: widen the notebook container to 95% of the browser window.
# Has no effect on the data pipeline below.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# automatically reload imported modules (e.g., utility.utility) whenever their source files change
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Config

In [86]:
# name directory with desired raw data
_name_dir_raw_data = "english"

# name meta file - note this file needs to be within the folder with the raw data files (text, json, etc.)
_name_meta_file = "data.xlsx"

# name to be used for dataset and resepctive folder
_name_dataset_and_folder = "english_uncons_cons"

# columns of interest
_coi = ["filename", "label"]

# label column name
_col_label = "label"

# column with filepath to text file
_col_file = "filename"

# split sizes - proportion of each dataset (e.g., training, validation/hold-out, test)
_split_sizes = {"train": .7, "validation": .15, "test": .15}

# random state/seed for random number generator
_seed = 10

# Directory and file paths

In [87]:
# All paths are resolved relative to the directory the notebook is run from
path_cwd = os.getcwd()

# --- Directory paths ---
# raw data directory: <cwd>/raw_data/<_name_dir_raw_data>
path_raw_data = os.path.join(path_cwd, "raw_data", _name_dir_raw_data)
# DatasetDict directory: <cwd>/datasets/<_name_dataset_and_folder>
path_dataset_dict = os.path.join(path_cwd, "datasets", _name_dataset_and_folder)

# --- File paths ---
# the meta file sits inside the raw data directory
path_meta_file = os.path.join(path_raw_data, _name_meta_file)

# Load Meta Data

In [96]:
# read the Excel meta sheet into a DataFrame
meta = pd.read_excel(io=path_meta_file)

# Keep columns of interest

In [97]:
# discard every column that is not listed in _coi; a new frame is bound rather than mutating in place
columns_to_discard = [name for name in meta.columns if name not in _coi]
meta = meta.drop(columns=columns_to_discard)

# Number of files per class

In [98]:
# tally how often each label value occurs in the meta table
count = Counter()
count.update(meta[_col_label])
count

Counter({1: 1257, 0: 1091})

In [99]:
# inspect the reduced meta table (pandas abbreviates the display of long frames)
meta

Unnamed: 0,filename,label
0,62545838.txt,1
1,62429795.txt,1
2,62428876.txt,1
3,62405849.txt,1
4,62405848.txt,1
...,...,...
2343,LV0000100881_2012_AR_english_01.txt,0
2344,LV0000100949_2009_AR_english_01.txt,0
2345,NL0000289817_2005_AR_english_01.txt,0
2346,SI0031103805_2009_AR_english_01.txt,0


# Convert file names to full file paths

In [100]:
# Prefix every file name with the raw data directory so each file can be opened directly.
# The column is transformed element-wise with Series.map: a row-wise DataFrame.apply(axis=1)
# builds a Series per row and yields a DataFrame instead of a Series when meta is empty.
meta[_col_file] = meta[_col_file].map(lambda file_name: os.path.join(path_raw_data, file_name))

# Create splits

In [102]:
# Split the meta table with the project-local helper; the result is iterated by split name below.
# NOTE(review): presumably stratified on _col_label, sized by _split_sizes and seeded with _seed - confirm in utility.utility
data_split = util.create_stratified_split(meta, _col_label, _split_sizes, _seed)

# Parse Text Files

In [104]:
# Load the content of every listed file into a new "text" column, then drop the path column.
# The path column is _col_file: the config defines no _col_path, so referring to that name
# raises a NameError on a fresh kernel (Restart & Run All).
for split_name in list(data_split):
    split_frame = data_split[split_name]
    data_split[split_name] = (
        split_frame
        # util.parse_txt is called once per row with the full file path
        .assign(text=split_frame[_col_file].apply(util.parse_txt))
        # the path is no longer needed once the text has been loaded
        .drop(columns=[_col_file])
    )

# Remove rows with None values - these indicate a missing, corrupt, or empty file.

In [105]:
# Discard, in every split, the rows whose "text" value is missing.
# NOTE(review): the surviving rows keep their original index labels; the Dataset features printed
# further down include '__index_level_0__', presumably for that reason - chain
# .reset_index(drop=True) here if that extra column is unwanted.
for split_name in list(data_split):
    data_split[split_name] = data_split[split_name].dropna(subset=["text"])

# Create Arrow Dataset

In [106]:
# Build the DatasetDict from the per-split frames with the project-local helper.
# NOTE(review): the list presumably names the columns to carry over - confirm in utility.utility
dataset_dict = util.create_dataset_dict(data_split, [_col_label, "text"])

In [107]:
# show the splits with their features and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 1632
    })
    validation: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 348
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 346
    })
})

# Export dataset

## Local Export

In [110]:
# Target directory for the local export.
# NOTE(review): path_dataset_dict already ends with _name_dataset_and_folder, so the dataset is
# written to datasets/<name>/<name> - confirm that this nesting is intended.
path_local_export = os.path.join(path_dataset_dict, _name_dataset_and_folder)
dataset_dict.save_to_disk(path_local_export)

Saving the dataset (0/1 shards):   0%|          | 0/1632 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/348 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/346 [00:00<?, ? examples/s]

## Export to HF Hub

In [114]:
# log in to the Hugging Face Hub (renders an interactive widget in the notebook)
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [115]:
# username HF Hub
_user = "IMN"
_HFHub_dataset = _user + "/" + "test"

In [116]:
# upload all splits to the Hub repository named by _HFHub_dataset
dataset_dict.push_to_hub(repo_id=_HFHub_dataset)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/IMN/test/commit/20ce68d6d56e70aa4fc4c7244824b5669e35cd3b', commit_message='Upload dataset', commit_description='', oid='20ce68d6d56e70aa4fc4c7244824b5669e35cd3b', pr_url=None, pr_revision=None, pr_num=None)