# Sample code for the hands-on

## Step 1: Download datasets

In [None]:
from datasets import load_dataset

# Sample dataset used throughout this notebook:
# https://huggingface.co/datasets/tyqiangz/multilingual-sentiments
dataset_path = "tyqiangz/multilingual-sentiments"

# Load the "all" configuration of the dataset and show a summary of what was loaded.
data = load_dataset(dataset_path, name="all")
print(data)

In [None]:
# Pick the "train" split out of the loaded dataset and print its summary.
train = data["train"]
print(train)

In [None]:
import itertools
# Peek at the first five rows; islice stops after five items instead of
# iterating over the whole split.
for row in itertools.islice(train, 5):
    print(row)

## Step 2: Convert datasets into Parquet files

In [None]:
# The dataset ID is reused below as the root directory for all local output folders.
print(dataset_path)

In [None]:
import os

# Directory that will hold the unmodified dataset as Parquet files.
original_data_path = os.path.join(dataset_path, "original")

# Create the directory (and any missing parents) from Python instead of shelling
# out to `mkdir -p`: this needs no external command, is not affected by shell
# quoting of the path, and is a no-op when the directory already exists.
os.makedirs(original_data_path, exist_ok=True)

In [None]:
# Write every split of the dataset to <original_data_path>/<split name>.parquet.
for split_name, split in data.items():
    parquet_file = os.path.join(original_data_path, split_name + ".parquet")
    split.to_parquet(parquet_file)

In [None]:
!ls -l {original_data_path}

from pandas import read_parquet
d = read_parquet(os.path.join(original_data_path, "test.parquet"))
d.to_json("test.json", orient="records")
d.to_csv("test.csv")

## Step 3-1: Apply Language Identification with Transform Wrapper

In [None]:
#
# Provide your Hugging Face credential (access token) in c.
#
# Preferred: set the HF_TOKEN environment variable before starting the
# notebook, so the secret is never saved inside this file.
# Fallback: replace "YOUR_TOKEN" below with your token.
#
# The following page explains how to generate a token:
# https://huggingface.co/docs/hub/security-tokens
#
import os

# An unset or empty HF_TOKEN falls back to the placeholder.
c = os.environ.get("HF_TOKEN") or "YOUR_TOKEN"

In [None]:
# Output folder for the language-identification results (Steps 3-1 and 3-2).
lang_id_output_path = os.path.join(dataset_path, "lang_id")

In [None]:
from dpk_lang_id.transform_python import LangId

# Gather the wrapper settings in one place: where to read and write the Parquet
# files, which model to use, and which column holds the text to classify.
lang_id_params = {
    "input_folder": original_data_path,
    "output_folder": lang_id_output_path,
    "lang_id_model_credential": c,
    "lang_id_model_kind": "fasttext",
    "lang_id_model_url": "facebook/fasttext-language-identification",
    "lang_id_content_column_name": "text",
}

lang_id_transform = LangId(**lang_id_params)
lang_id_transform.transform()

In [None]:
from pandas import read_parquet
# Load the language-identification output for the train split and show the
# rows labelled 0 through 10 (a .loc slice includes both end labels).
d = read_parquet(os.path.join(lang_id_output_path, "train.parquet"))
print(d.loc[0:10])

## Step 3-2: Apply Language Identification with Transform API

In [None]:
import glob

from data_processing.data_access import DataAccessLocal
from dpk_lang_id.transform import (
    LangIdentificationTransform,
    model_credential_key,
    model_kind_key,
    model_url_key,
    content_column_name_key,
)

# Same model settings as the wrapper-based cell in Step 3-1, expressed as the
# configuration dictionary that the transform class takes directly.
conf = {
    model_credential_key: c,
    model_kind_key: "fasttext",
    model_url_key: "facebook/fasttext-language-identification",
    content_column_name_key: "text",
}

transform = LangIdentificationTransform(conf)
data_access = DataAccessLocal()

# Apply the transform file by file. glob returns paths in no guaranteed order,
# so sort them to make the processing order (and the printed log) reproducible.
for p in sorted(glob.glob(os.path.join(original_data_path, "*.parquet"))):
    print(p)
    table, _ = data_access.get_table(p)
    if table is None:
        # NOTE(review): get_table is expected to return None when the file
        # cannot be read -- confirm against the installed data-prep-kit version.
        print(f"skipped (could not read): {p}")
        continue
    # transform() returns a list of output tables plus metadata; keep the list
    # under its own name instead of rebinding `table`.
    tables, metadata = transform.transform(table)
    if not tables:
        # Nothing to write for this file; avoid an IndexError on tables[0].
        print(f"skipped (transform produced no output): {p}")
        continue
    # Only the first output table is saved, under the input file's name.
    data_access.save_table(os.path.join(lang_id_output_path, os.path.basename(p)), tables[0])

In [None]:
# Reload the train split from the language-identification output folder and
# show the rows labelled 0 through 10 (a .loc slice includes both end labels).
d = read_parquet(os.path.join(lang_id_output_path, "train.parquet"))
print(d.loc[0:10])

## Step 4-1: Apply Filter to choose English texts only

In [None]:
# Output folder for the filtered data (Steps 4-1 and 4-2 both write here).
filter_output_path = os.path.join(dataset_path, "filter")

In [None]:
from dpk_filter.runtime import Filter

# Keep a row only when both conditions hold: the language column is 'en'
# and the score column is at least 0.9.
filter_criteria = ["lang = 'en'", "score >= 0.9"]
filter_logical_operator = "AND"

# Read the language-identification output and write the surviving rows
# to the filter output folder.
english_filter = Filter(
    input_folder=lang_id_output_path,
    output_folder=filter_output_path,
    filter_criteria_list=filter_criteria,
    filter_logical_operator=filter_logical_operator,
)
english_filter.transform()

In [None]:
# Load the filtered train split and show the rows labelled 0 through 10
# (a .loc slice includes both end labels).
d = read_parquet(os.path.join(filter_output_path, "train.parquet"))
print(d.loc[0:10])

## Step 4-2: Apply Filter with Ray

In [None]:
# Ray-based variant of the filter; this import replaces the Filter class
# bound by the pure-Python cell in Step 4-1.
from dpk_filter.ray.runtime import Filter

# Same criteria as Step 4-1: language is 'en' AND score is at least 0.9.
filter_criteria = ["lang = 'en'", "score >= 0.9"]
filter_logical_operator = "AND"

# NOTE(review): runtime_num_workers / run_locally presumably start a local Ray
# runtime with two workers -- confirm against the data-prep-kit documentation.
ray_filter = Filter(
    input_folder=lang_id_output_path,
    output_folder=filter_output_path,
    filter_criteria_list=filter_criteria,
    filter_logical_operator=filter_logical_operator,
    runtime_num_workers=2,
    run_locally=True,
)
ray_filter.transform()

In [None]:
# Load the train split written by the Ray-based filter run and show the rows
# labelled 0 through 10 (a .loc slice includes both end labels).
d = read_parquet(os.path.join(filter_output_path, "train.parquet"))
print(d.loc[0:10])

## Step 5: Run Your Transform with Transform API

In [None]:
import glob
import importlib

from data_processing.data_access import DataAccessLocal

#
# Reload my_transform on every run of this cell so that edits to your code
# take effect in this notebook immediately, without restarting the kernel.
#
import my_transform
importlib.reload(my_transform)

# Import the names after the reload so they refer to the refreshed module.
from my_transform import (
    MyTransform,
    column_key,
)

# Tell the transform which column to work on.
conf = {
    column_key: "text",
}

my_transform_output_path = os.path.join(dataset_path, "my_transform")

transform = MyTransform(conf)
data_access = DataAccessLocal()

# Apply the transform to each filtered file. glob returns paths in no
# guaranteed order, so sort them to make the processing order reproducible.
for p in sorted(glob.glob(os.path.join(filter_output_path, "*.parquet"))):
    print(p)
    table, _ = data_access.get_table(p)
    if table is None:
        # NOTE(review): get_table is expected to return None when the file
        # cannot be read -- confirm against the installed data-prep-kit version.
        print(f"skipped (could not read): {p}")
        continue
    # transform() returns a list of output tables plus metadata; keep the list
    # under its own name instead of rebinding `table`.
    tables, metadata = transform.transform(table)
    if not tables:
        # A transform may return no tables; avoid an IndexError on tables[0].
        print(f"skipped (transform produced no output): {p}")
        continue
    # Only the first output table is saved, under the input file's name.
    data_access.save_table(os.path.join(my_transform_output_path, os.path.basename(p)), tables[0])

In [None]:
# Load the train split produced by MyTransform and show the rows labelled
# 0 through 10 (a .loc slice includes both end labels).
d = read_parquet(os.path.join(my_transform_output_path, "train.parquet"))
print(d.loc[0:10])