# Import Packages

In [16]:
import os
import re
import numpy as np
import pandas as pd
import pyarrow
from collections import Counter
from transformers import (AutoTokenizer,)
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value, concatenate_datasets
from huggingface_hub import login

import utility.utility as util

# Display-only tweak: widen the notebook's ".container" element to 95% of the page
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# automatically reload imported modules whenever their source files change
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Configuration

In [5]:
# --- dataset names ---------------------------------------------------------
# folder (below "raw_data") that holds the raw dataset
_name_raw_dataset_dir = "en_pred"
# base name (below "datasets") used for the processed dataset
_name_datasetdict = "test_en_predict"

# --- locations, all anchored at the current working directory ---------------
path_cwd = os.getcwd()
# directory containing the raw dataset
path_raw_dataset = os.path.join(path_cwd, "raw_data", _name_raw_dataset_dir)
# target location for the processed DatasetDict(s)
path_processed_dataset = os.path.join(path_cwd, "datasets", _name_datasetdict)

# --- meta file / parsing -----------------------------------------------------
# columns of interest to keep from the meta file
_coi = ["eu"]
# file format to parse - currently not needed, kept in case different
# parsing schemes are provided
_file_format = ".txt"

# --- partitioning ------------------------------------------------------------
# whether to split the data into partitions, and into how many
_flag_partition = True
_num_partitions = 10

# --- reproducibility ---------------------------------------------------------
# random number generator seed
_seed = 10

# Set file paths

In [6]:
# Locate the two inputs inside the raw dataset folder:
#   * meta_file -> the entry whose name ends in ".csv" (optional)
#   * path_data -> the sub-directory that holds the raw files
path_data = None
meta_file = None

# os.listdir returns entries in arbitrary order; sorting makes the choice
# reproducible when more than one candidate exists (the last match wins).
for item in sorted(os.listdir(path_raw_dataset)):
    item_path = os.path.join(path_raw_dataset, item)
    if item.endswith(".csv"):
        meta_file = item_path
    elif os.path.isdir(item_path):
        # Only directories qualify: path_data is listed with os.listdir later on,
        # so stray files in the folder must not overwrite it.
        path_data = item_path

# Fail early: os.listdir(None) would silently list the current working directory.
if path_data is None:
    raise FileNotFoundError(f"No data directory found in {path_raw_dataset!r}")

In [7]:
# Display the resolved data directory as a quick sanity check
path_data

'C:\\Users\\ilias\\Desktop\\UniMaResearch2023\\DatasetPrep\\raw_data\\en_pred\\predict'

# Parse files

In [8]:
# Names of all entries in the data directory.
# os.listdir returns them in arbitrary order, so sort them: the row order of the
# DataFrame built from this list (and therefore the content of each partition)
# stays the same across runs and machines.
files = sorted(os.listdir(path_data))

# Create DataFrame

In [9]:
# One row per raw file: its name and the full path it is read from.
data_df = pd.DataFrame({"filename": files})
# Element-wise Series.map instead of a row-wise DataFrame.apply(axis=1):
# same values, without building a Series object for every row.
data_df["filepath"] = data_df["filename"].map(lambda name: os.path.join(path_data, name))

# If present, read meta file 

In [10]:
# If a meta file was found, keep only the files it selects; otherwise data_df
# is left untouched and meta_df stays None.
meta_df = None
if meta_file:
    meta_df = pd.read_csv(meta_file)
    # reduce to coi; .copy() yields an independent frame so the assignment below
    # does not write into a slice (avoids SettingWithCopyWarning)
    meta_df = meta_df[["filename"] + _coi].copy()
    # TODO (original note): tell hala to save the entire document name in the
    # meta file; until then the ".txt" extension is appended here
    meta_df["filename"] = meta_df["filename"].astype(str) + ".txt"

    # Here we limit to only EU files - modify according to needs
    meta_df = meta_df[meta_df["eu"] == 1].drop(columns="eu")
    # Fully identical meta rows would duplicate files in the inner merge below
    # (and again on every re-run of this cell), so collapse them first.
    meta_df = meta_df.drop_duplicates()

    data_df = pd.merge(data_df, meta_df, on="filename", how="inner")

# Indexes for processing

In [11]:
# (start, stop) row ranges into data_df, one tuple per partition to process.
n_rows = data_df.shape[0]
if _flag_partition:
    # Integer arithmetic keeps the boundaries exact. The float form
    # int(i / p * n) can round down (e.g. int(57 / 100 * 100) == 56), which
    # produces uneven or even empty partitions.
    bounds = [i * n_rows // _num_partitions for i in range(_num_partitions + 1)]
    # consecutive boundaries form contiguous, non-overlapping ranges covering all rows
    indexes = list(zip(bounds[:-1], bounds[1:]))
else:
    # single range spanning every row
    indexes = [(0, n_rows)]

# Define Features

In [12]:
# Schema of the saved dataset: both columns are stored as strings.
features = Features({column: Value("string") for column in ("filename", "text")})

# Process Partitions and Save to Disk

In [21]:
import time

# Parse, clean and save one partition at a time so that only a single
# partition's text is held in memory.
for part_id, (start, stop) in enumerate(indexes):
    print(part_id, (start, stop))

    # rows of this partition, with the text parsed from each file path
    part_df = data_df.iloc[start:stop, :].copy()
    part_df["text"] = part_df["filepath"].apply(util.parse_txt)

    # drop the helper column and rows without text, then fix the dtypes
    part_df = (
        part_df
        .drop(columns=["filepath"])
        .dropna(subset=["text"])
        .astype({"filename": "string", "text": "string"})
        .reset_index(drop=True)
    )

    # wrap as a DatasetDict with a single "predict" split
    part_ds = Dataset.from_pandas(part_df, features)
    del part_df
    part_dict = DatasetDict({"predict": part_ds})

    # each partition goes to its own directory: <processed path>_<partition number>
    part_dict.save_to_disk(f"{path_processed_dataset}_{part_id}")

    del part_ds, part_dict

0 (0, 5442)



KeyboardInterrupt

