# Install pyrecdp from GitHub

In [None]:
!pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

# Install a Java runtime (required to run PySpark)

In [None]:
# Install a Java 8 runtime for PySpark. Java 8 matches the JAVA_HOME default
# that is reported when pyrecdp is imported
# (/usr/lib/jvm/java-8-openjdk-amd64/).
# DEBIAN_FRONTEND=noninteractive together with -y keeps apt-get from prompting.
!DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre

# Prepare test data

In [None]:
%mkdir -p /content/test_data
%cd /content/test_data
!wget https://raw.githubusercontent.com/intel/e2eAIOK/main/RecDP/tests/data/llm_data/arxiv_sample_100.jsonl

# Import toxicity score function

In [2]:
from pyrecdp.primitives.llmutils import toxicity_score_spark, toxicity_score
from pyrecdp.core import SparkDataProcessor

JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/




# Specify variables

In [4]:
# Directory that receives the parquet output of toxicity_score().
save_path = '/content/test_data/output/toxicity_score'
# arXiv JSONL sample fetched into /content/test_data by the wget cell.
data_file = '/content/test_data/arxiv_sample_100.jsonl'

# Load data

In [6]:
# SparkDataProcessor is pyrecdp's entry point to Spark; per the log output it
# reports the cores and memory it assigns to Spark when constructed.
rdp = SparkDataProcessor()
# Keep a handle on the underlying Spark session for direct DataFrame reads.
spark = rdp.spark
# Read the JSONL sample; the resulting DataFrame has `meta` and `text` columns.
spark_df = spark.read.json(data_file)
# Preview the first rows (long cell values are truncated in the printed table).
spark_df.show()

Will assign 1 cores and 10386 M memory for spark
per core memory size is 10.143 GB and shuffle_disk maximum capacity is 8589934592.000 GB
+--------------------+--------------------+
|                meta|                text|
+--------------------+--------------------+
|{2203.15369, en, ...|\section{Introduc...|
|{math/9807097, en...|\section{Introduc...|
|{2008.06948, en, ...|\section{Introduc...|
|{cond-mat/9807071...|\section{Introduc...|
|{2210.10650, en, ...|\section{\label{s...|
|{astro-ph/9807119...|\section{Introduc...|
|{2111.03152, en, ...|\section{Introduc...|
|{1606.04992, en, ...|\n\n\section{Intr...|
|{1608.03404, en, ...|\section{introduc...|
|{1904.10101, en, ...|\section{Introduc...|
|{cond-mat/9807275...|\section{Introduc...|
|{2109.05334, en, ...|\section{Introduc...|
|{1512.06966, en, ...|\section{Introduc...|
|{2112.04926, en, ...|\section{Introduc...|
|{2202.01000, en, ...|\section{Introduc...|
|{2209.13421, en, ...|\section{Introduc...|
|{1103.5603, en, 2...|\sec

# Process the 'text' column and generate a toxicity score, which determines whether each row is kept (Spark DataFrame interface)

In [7]:
# Score the DataFrame loaded above; the result keeps `meta` and `text` and adds
# a `text_toxicity` column. The first run downloads the Detoxify checkpoint
# (about 1 GB according to the progress output).
toxicity_score_spark_df = toxicity_score_spark(spark_df)
# Preview the scored rows (long values are truncated in the printed table).
toxicity_score_spark_df.show()

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.4-alpha/multilingual_debiased-0b549669.ckpt" to /root/.cache/torch/hub/checkpoints/multilingual_debiased-0b549669.ckpt
100%|██████████| 1.04G/1.04G [00:04<00:00, 252MB/s]


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

+--------------------+--------------------+--------------------+
|                meta|                text|       text_toxicity|
+--------------------+--------------------+--------------------+
|{2203.15369, en, ...|\section{Introduc...|2.222859766334295...|
|{math/9807097, en...|\section{Introduc...|2.392058377154171...|
|{2008.06948, en, ...|\section{Introduc...|3.047155332751572E-4|
|{cond-mat/9807071...|\section{Introduc...|2.939336118288338E-4|
|{2210.10650, en, ...|\section{\label{s...|2.322625950910151E-4|
|{astro-ph/9807119...|\section{Introduc...|2.402916288701817...|
|{2111.03152, en, ...|\section{Introduc...|2.399834484094753...|
|{1606.04992, en, ...|\n\n\section{Intr...|2.001557877520099...|
|{1608.03404, en, ...|\section{introduc...|1.896593603305518...|
|{1904.10101, en, ...|\section{Introduc...|3.666119009722024E-4|
|{cond-mat/9807275...|\section{Introduc...|2.205321216024458...|
|{2109.05334, en, ...|\section{Introduc...|1.802597107598558E-4|
|{1512.06966, en, ...|\se

# Process the 'text' column, generate toxicity scores, and save the data in Parquet format

In [5]:
toxicity_score(data_file, save_path, data_file_type="jsonl")
!ls $save_path

init ray
init ray with total mem of 8167961395, total core of 1


2023-10-17 13:41:06,195	INFO worker.py:1642 -- Started a local Ray instance.


execute with ray started ...


2023-10-17 13:41:07,953	INFO read_api.py:406 -- To satisfy the requested parallelism of 5, each read task output is split into 5 smaller blocks.
2023-10-17 13:41:13,137	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadText->SplitBlocks(5)] -> TaskPoolMapOperator[Map(convert_json)->Map(<lambda>)->Write]
2023-10-17 13:41:13,140	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-17 13:41:13,153	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/25 [00:00<?, ?it/s]

2023-10-17 13:44:18,661	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadText->SplitBlocks(5)] -> TaskPoolMapOperator[Map(convert_json)->Map(<lambda>)]
2023-10-17 13:44:18,664	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-17 13:44:18,666	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/25 [00:00<?, ?it/s]



execute with ray took 403.94510068599993 sec
b63fe36529b64d6c8bcc397fdc69cb5f_000000_000000.parquet
b63fe36529b64d6c8bcc397fdc69cb5f_000001_000000.parquet
b63fe36529b64d6c8bcc397fdc69cb5f_000002_000000.parquet
b63fe36529b64d6c8bcc397fdc69cb5f_000003_000000.parquet
b63fe36529b64d6c8bcc397fdc69cb5f_000004_000000.parquet
