# RecDP LLM - Downsize public dataset

This notebook shows how to use the RecDP tools and pipelines to downsize a public fine-tuning dataset and to compute several kinds of evaluation scores (for example toxicity, diversity and quality scores) on the result.

# Get Started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -q -y openjdk-8-jre
# ! pip install recdp
! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. Prepare your data

In [2]:
!mkdir -p content/test_data
!cp ../../../tests/data/alpaca/alpaca_data_50.jsonl /content/test_data
!cp ../../../tests/data/dolly/dolly_sample_50.parquet /content/test_data
!cp ../../../tests/data/openorca/openorca_sample_50.parquet /content/test_data

In [None]:
from pyrecdp.LLM import TextPipeline, ResumableTextPipeline
from pyrecdp.primitives.operations import *
import os

def run_prompt_pipeline(reader_cls, input_path, dataset_name, out_dir):
    """Render one dataset with the ``causal_llm_1`` prompt and write it out as Parquet.

    Builds a fresh ``ResumableTextPipeline`` with statistics enabled and runs
    three operations in order: read ``input_path`` with ``reader_cls``, apply
    ``TextPrompt`` for ``dataset_name``, and write the result with
    ``ParquetWriter`` into ``out_dir``.

    Args:
        reader_cls: Reader operation class used to load the input file
            (``JsonlReader`` or ``ParquetReader`` in this notebook).
        input_path: Path of the input file handed to ``reader_cls``.
        dataset_name: Dataset name forwarded to ``TextPrompt``.
        out_dir: Output directory handed to ``ParquetWriter``.

    Returns:
        The value returned by ``ResumableTextPipeline.execute()``.
    """
    # The pipeline is created before the operations, as in the original cell.
    pipeline = ResumableTextPipeline()
    pipeline.enable_statistics()
    pipeline.add_operations([
        reader_cls(input_path),
        TextPrompt(dataset_name=dataset_name, prompt_name="causal_llm_1"),
        ParquetWriter(out_dir),
    ])
    # `pipeline` is a local name, so the object is released when the function
    # returns; this replaces the explicit `del <pipeline>` after each run.
    return pipeline.execute()


# One entry per dataset: (reader class, input file, dataset name, output directory).
PROMPT_JOBS = [
    (JsonlReader, "/content/test_data/alpaca_data_50.jsonl",
     "alpaca", "ResumableTextPipeline_output-alpaca"),
    (ParquetReader, "/content/test_data/openorca_sample_50.parquet",
     "openorca", "ResumableTextPipeline_output-openorca"),
    (ParquetReader, "/content/test_data/dolly_sample_50.parquet",
     "dolly", "ResumableTextPipeline_output-dolly"),
]

# Run the datasets one after another, in the same order as before.
for reader_cls, input_path, dataset_name, out_dir in PROMPT_JOBS:
    ret = run_prompt_pipeline(reader_cls, input_path, dataset_name, out_dir)

recdp_promptsource: /host/mnt/DP_disk1/code/recllm/e2eAIOK/RecDP/pyrecdp/promptsource
promptsource_templates_path: /opt/conda/envs/chatbot-finetuning/lib/python3.9/site-packages/promptsource/templates
recdp_promptsource: /host/mnt/DP_disk1/code/recllm/e2eAIOK/RecDP/pyrecdp/promptsource
promptsource_templates_path: /opt/conda/envs/chatbot-finetuning/lib/python3.9/site-packages/promptsource/templates
[DatasetReader, PerfileSourcedJsonlReader, TextPrompt, PerfileParquetWriter]
init ray with total mem of 162212234035


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
2023-12-04 03:11:54,463	INFO worker.py:1642 -- Started a local Ray instance.
2023-12-04 03:11:57,227	INFO read_api.py:406 -- To satisfy the requested parallelism of 144, each read task output is split into 144 smaller blocks.
2023-12-04 03:11:57,254	INFO dataset.py:2380 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-12-04 03:11:57,258	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadJSON->SplitBlocks(144)] -> LimitOperator[limit=20]
2023-12-04 03:11:57,260	INFO streaming_executor.py

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}
{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.'}
{'instruction': 'Describe the structure of an atom.', 'input': '', 'output': 'An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.'}
{'instruction': 'How can we reduce air pollution?', 'input': '', 'output': 'There are a number of ways to reduce air pollution, such as shifting to renewab

2023-12-04 03:11:58,479	INFO read_api.py:406 -- To satisfy the requested parallelism of 144, each read task output is split into 144 smaller blocks.
2023-12-04 03:11:58,495	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadJSON->SplitBlocks(144)] -> TaskPoolMapOperator[Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)]
2023-12-04 03:11:58,499	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-04 03:11:58,500	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20736 [00:00<?, ?it/s]

ResumableTextPipeline, current on alpaca_data_50.jsonl:   0%|          | 0/1 [00:00<?, ?it/s]2023-12-04 03:12:00,635	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadJSON->SplitBlocks(144)] -> TaskPoolMapOperator[Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write]
2023-12-04 03:12:00,636	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-04 03:12:00,637	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


dataset_name:alpaca, prompt_name:causal_llm_1, subset_name:None


Running 0:   0%|          | 0/20736 [00:00<?, ?it/s]

[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=54483)[0m Skipping writing empty dataset with UUID 9cc563c56c61472dadfed22e817182ea at ResumableTextPipeline_output-alpaca/alpaca_data_50.jsonl


[32m2023-12-04 03:12:18.173[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m285[0m - [1mTextPrompt: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-04 03:12:18.181[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m285[0m - [1mPerfileParquetWriter: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m


ResumableTextPipeline, current on alpaca_data_50.jsonl: 100%|██████████| 1/1 [00:17<00:00, 17.58s/it]

[32m2023-12-04 03:12:18.191[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mexecute[0m:[36m425[0m - [1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output-alpaca for verification.[0m



[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=54507)[0m Skipping writing empty dataset with UUID 9cc563c56c61472dadfed22e817182ea at ResumableTextPipeline_output-alpaca/alpaca_data_50.jsonl[32m [repeated 93x across cluster][0m


recdp_promptsource: /host/mnt/DP_disk1/code/recllm/e2eAIOK/RecDP/pyrecdp/promptsource
promptsource_templates_path: /opt/conda/envs/chatbot-finetuning/lib/python3.9/site-packages/promptsource/templates
recdp_promptsource: /host/mnt/DP_disk1/code/recllm/e2eAIOK/RecDP/pyrecdp/promptsource
promptsource_templates_path: /opt/conda/envs/chatbot-finetuning/lib/python3.9/site-packages/promptsource/templates
[DatasetReader, PerfileSourcedParquetReader, TextPrompt, PerfileParquetWriter]
init ray with total mem of 162212234035


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
2023-12-04 03:15:06,725	INFO worker.py:1642 -- Started a local Ray instance.


(pid=59535) Parquet Files Sample 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(_get_reader pid=59535)[0m   pq_ds.pieces, **prefetch_remote_args
[2m[36m(_get_reader pid=59535)[0m   self._pq_pieces = [_SerializedPiece(p) for p in pq_ds.pieces]
[2m[36m(_get_reader pid=59535)[0m   self._pq_paths = [p.path for p in pq_ds.pieces]
2023-12-04 03:15:10,367	INFO read_api.py:406 -- To satisfy the requested parallelism of 144, each read task output is split into 144 smaller blocks.
ResumableTextPipeline, current on openorca_sample_50.parquet:   0%|          | 0/1 [00:00<?, ?it/s]2023-12-04 03:15:10,425	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet->SplitBlocks(144)] -> TaskPoolMapOperator[Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write]
2023-12-04 03:15:10,426	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enab

dataset_name:openorca, prompt_name:causal_llm_1, subset_name:None


Running 0:   0%|          | 0/20736 [00:00<?, ?it/s]

[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=59532)[0m Skipping writing empty dataset with UUID 26cfaa1c1eac4a198b0dab09036e0abc at ResumableTextPipeline_output-openorca/openorca_sample_50.parquet
[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=59532)[0m Skipping writing empty dataset with UUID 26cfaa1c1eac4a198b0dab09036e0abc at ResumableTextPipeline_output-openorca/openorca_sample_50.parquet


[32m2023-12-04 03:15:29.285[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m285[0m - [1mTextPrompt: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-04 03:15:29.290[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m285[0m - [1mPerfileParquetWriter: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m


ResumableTextPipeline, current on openorca_sample_50.parquet: 100%|██████████| 1/1 [00:18<00:00, 18.89s/it]

[32m2023-12-04 03:15:29.296[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mexecute[0m:[36m425[0m - [1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output-openorca for verification.[0m



[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=59510)[0m Skipping writing empty dataset with UUID 26cfaa1c1eac4a198b0dab09036e0abc at ResumableTextPipeline_output-openorca/openorca_sample_50.parquet[32m [repeated 92x across cluster][0m


recdp_promptsource: /host/mnt/DP_disk1/code/recllm/e2eAIOK/RecDP/pyrecdp/promptsource
promptsource_templates_path: /opt/conda/envs/chatbot-finetuning/lib/python3.9/site-packages/promptsource/templates
recdp_promptsource: /host/mnt/DP_disk1/code/recllm/e2eAIOK/RecDP/pyrecdp/promptsource
promptsource_templates_path: /opt/conda/envs/chatbot-finetuning/lib/python3.9/site-packages/promptsource/templates
[DatasetReader, PerfileSourcedParquetReader, TextPrompt, PerfileParquetWriter]
init ray with total mem of 162212234035


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
2023-12-04 03:18:18,229	INFO worker.py:1642 -- Started a local Ray instance.
[2m[36m(_get_reader pid=64482)[0m   pq_ds.pieces, **prefetch_remote_args
[2m[36m(_get_reader pid=64482)[0m   self._pq_pieces = [_SerializedPiece(p) for p in pq_ds.pieces]
[2m[36m(_get_reader pid=64482)[0m   self._pq_paths = [p.path for p in pq_ds.pieces]


(pid=64482) Parquet Files Sample 0:   0%|          | 0/1 [00:00<?, ?it/s]

2023-12-04 03:18:21,765	INFO read_api.py:406 -- To satisfy the requested parallelism of 144, each read task output is split into 144 smaller blocks.

  0%|          | 0/1 [00:00<?, ?it/s][A
ResumableTextPipeline, current on dolly_sample_50.parquet:   0%|          | 0/1 [00:00<?, ?it/s][A2023-12-04 03:18:21,841	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet->SplitBlocks(144)] -> TaskPoolMapOperator[Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write]
2023-12-04 03:18:21,842	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-04 03:18:21,844	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


dataset_name:dolly, prompt_name:causal_llm_1, subset_name:None


Running 0:   0%|          | 0/20736 [00:00<?, ?it/s]

[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=64485)[0m Skipping writing empty dataset with UUID ed229cdea4a4496da3e49f5a6c3d44e3 at ResumableTextPipeline_output-dolly/dolly_sample_50.parquet
[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=64485)[0m Skipping writing empty dataset with UUID ed229cdea4a4496da3e49f5a6c3d44e3 at ResumableTextPipeline_output-dolly/dolly_sample_50.parquet
[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=64485)[0m Skipping writing empty dataset with UUID ed229cdea4a4496da3e49f5a6c3d44e3 at ResumableTextPipeline_output-dolly/dolly_sample_50.parquet
[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=64485)[0m Skipping writing empty dataset with UUID ed229cdea4a4496da3e49f5a6c3d44e3 at ResumableTextPipeline_output-dolly/dolly_sample_50.parquet
[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Ma

[32m2023-12-04 03:18:40.686[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m285[0m - [1mTextPrompt: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-04 03:18:40.694[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m285[0m - [1mPerfileParquetWriter: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m



ResumableTextPipeline, current on dolly_sample_50.parquet: 100%|██████████| 1/1 [00:18<00:00, 18.90s/it][A

[32m2023-12-04 03:18:40.705[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mexecute[0m:[36m425[0m - [1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output-dolly for verification.[0m



[2m[36m(Map(<lambda>)->Map(<lambda>)->MapBatches(<lambda>)->Map(<lambda>)->Write pid=64449)[0m Skipping writing empty dataset with UUID ed229cdea4a4496da3e49f5a6c3d44e3 at ResumableTextPipeline_output-dolly/dolly_sample_50.parquet[32m [repeated 86x across cluster][0m


## 3. Downsize dataset

### 3.1 Pipeline on Spark

In [1]:
from pyrecdp.LLM import TextPipeline, ResumableTextPipeline
from pyrecdp.primitives.operations import *
import os
# Downsize the Alpaca sample: prompt -> random subset -> scoring -> dedup -> Parquet.
out_dir = "ResumableTextPipeline_output_alpaca_result"

alpaca_pipeline = ResumableTextPipeline()
alpaca_pipeline.enable_statistics()

# Load the raw JSONL sample and apply the prompt template to it.
ops = [
    JsonlReader("/content/test_data/alpaca_data_50.jsonl"),
    TextPrompt(dataset_name="alpaca", prompt_name="causal_llm_1"),
]

# Downsizing step.
# NOTE(review): presumably keeps a random ~30% of the rows — confirm.
ops.append(RandomSelect(fraction=0.3))

# Scoring stages: toxicity, diversity and quality.
# NOTE(review): the Hugging Face config path is an absolute path under /root;
# adjust it when running as a different user or on a different machine.
ops.extend([
    TextToxicity(huggingface_config_path="/root/.cache/huggingface/hub/models--xlm-roberta-base"),
    TextDiversityIndicate(out_dir=out_dir, language="en", first_sent=False),
    TextQualityScorer(model="gpt3"),
])

# ROUGE-based deduplication; its scores are stored next to the pipeline output.
rouge_score_path = os.path.join(out_dir, 'RougeScorefiltered.parquet')
ops.append(RougeScoreDedup(max_ratio=0.7, batch_size=10, score_store_path=rouge_score_path))

# Final stage: write the result into out_dir.
ops.append(ParquetWriter(out_dir))

alpaca_pipeline.add_operations(ops)
ret = alpaca_pipeline.execute()
# Drop the pipeline object once the run has finished.
del alpaca_pipeline

JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/




['psutil', 'tqdm', 'pyyaml', 'pandas', 'pyarrow', 'transformers', 'graphviz', 'requests', 'distro', 'pyspark==3.4.0', 'matplotlib', 'datasketch==1.5.9', 'ftfy==6.1.1', 'jsonlines==3.1.0', 'networkit==10.1', 'nltk==3.8.1', 'regex==2023.6.3', 'scipy==1.10.1', 'typer>=0.6.1', 'phonenumbers', 'fasttext==0.9.2', 'wget==3.2', 'alt-profanity-check==1.3.0', 'huggingface-hub', 'loguru==0.7.2', 'tabulate==0.9.0', 'sentencepiece', 'selectolax', 'spacy', 'torch', 'Faker', 'ray==2.7.1', 'loguru', 'detoxify', 'emoji==2.2.0', 'kenlm', 'rouge-score']


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




[DatasetReader, PerfileSourcedJsonlReader, TextPrompt, RandomSelect, TextToxicity, TextDiversityIndicate, TextQualityScorer, RougeScoreDedup, PerfileParquetWriter]
Will assign 36 cores and 206263 M memory for spark


23/12/04 23:45:52 WARN Utils: Your hostname, sr533 resolves to a loopback address: 127.0.1.1; using 10.0.2.133 instead (on interface eno0)
23/12/04 23:45:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/04 23:45:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


per core memory size is 5.595 GB and shuffle_disk maximum capacity is 8589934592.000 GB
execute with spark for global tasks started ...
DatasetReader
[32m2023-12-04 23:45:57.083[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mDatasetReader: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
execute with spark for global tasks took 0.0023543089628219604 sec
PerfileSourcedJsonlReader


ResumableTextPipeline, current on alpaca_data_50.jsonl:   0%|          | 0/1 [00:00<?, ?it/s]

alpaca_data_50.jsonl
TextPrompt


                                                                                

RandomSelect
TextToxicity
statistics_decorator spark


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to tru

TextDiversityIndicate
statistics_decorator spark
[32m2023-12-04 23:47:19.244[0m | [1mINFO    [0m | [36mpyrecdp.core.model_utils[0m:[36mprepare_diversity_model[0m:[36m195[0m - [1mLoading spacy model [en_core_web_md-3.5.0]...[0m


                                                                                

TextQualityScorer
statistics_decorator spark
model_name is gpt3
[32m2023-12-04 23:47:35.000[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_qualityscorer[0m:[36mprepare_model[0m:[36m122[0m - [1mPreparing scorer model in [/root/.cache/recdp/models/gpt3_quality_model]...[0m
real_model_path is /root/.cache/recdp/models/gpt3_quality_model


[Stage 45:>                                                         (0 + 1) / 1]

[32m2023-12-04 23:47:38.434[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_qualityscorer[0m:[36mpredict[0m:[36m252[0m - [1mStart scoring dataset...[0m


                                                                                

RougeScoreDedup
statistics_decorator spark



  0%|          | 0/2 [00:00<?, ?it/s][A

Round 0 started ...




[32m2023-12-04 23:47:49.248[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_compare_dedup[0m:[36mprocess_spark[0m:[36m105[0m - [1mRound 0: total processing num_samples is 55, detected high score num_samples is 0[0m


                                                                                
 50%|█████     | 1/2 [00:09<00:09,  9.56s/it][A

Round 0 took 9.555126171559095 sec
Round 1 started ...




[32m2023-12-04 23:47:52.515[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_compare_dedup[0m:[36mprocess_spark[0m:[36m105[0m - [1mRound 1: total processing num_samples is 0, detected high score num_samples is 0[0m


                                                                                
100%|██████████| 2/2 [00:12<00:00,  6.18s/it][A


Round 1 took 2.8002537600696087 sec
generate_connected_components => duplicates started ...



0it [00:00, ?it/s][A

generate_connected_components => duplicates took 0.03802505135536194 sec





[32m2023-12-04 23:47:53.768[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mTextPrompt: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-04 23:47:53.771[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mRandomSelect: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-04 23:47:53.774[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m284[0m - [1mTextToxicity: A total of 11 rows of data were processed, using 69.80172777175903 seconds, Get max toxicity 0.0010136101627722383, Get min toxicity 0.0001716656406642869, Get average toxicity 0.00036636507800060576,Get the std of toxicity 0.0002271174606949808[0m
[32m2023-12-04 23:47:53.781[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[

ResumableTextPipeline, current on alpaca_data_50.jsonl: 100%|██████████| 1/1 [01:47<00:00, 107.56s/it]

[32m2023-12-04 23:47:53.796[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mexecute[0m:[36m429[0m - [1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output_alpaca_result for verification.[0m





In [4]:
from pyrecdp.LLM import TextPipeline, ResumableTextPipeline
from pyrecdp.primitives.operations import *
import os
# Downsize the OpenOrca sample: prompt -> random subset -> scoring -> dedup -> Parquet.
out_dir = "ResumableTextPipeline_output_openorca_result"

openorca_pipeline = ResumableTextPipeline()
openorca_pipeline.enable_statistics()

# Load the raw Parquet sample and apply the prompt template to it.
ops = [
    ParquetReader("/content/test_data/openorca_sample_50.parquet"),
    TextPrompt(dataset_name="openorca", prompt_name="causal_llm_1"),
]

# Downsizing step.
# NOTE(review): presumably keeps a random ~30% of the rows — confirm.
ops.append(RandomSelect(fraction=0.3))

# Scoring stages: diversity and quality. The toxicity stage is disabled for
# this dataset, exactly as in the original cell:
# TextToxicity(huggingface_config_path="/root/.cache/huggingface/hub/models--xlm-roberta-base"),
ops.extend([
    TextDiversityIndicate(out_dir=out_dir, language="en", first_sent=False),
    TextQualityScorer(model="gpt3"),
])

# ROUGE-based deduplication; its scores are stored next to the pipeline output.
rouge_score_path = os.path.join(out_dir, 'RougeScorefiltered.parquet')
ops.append(RougeScoreDedup(max_ratio=0.7, batch_size=10, score_store_path=rouge_score_path))

# Final stage: write the result into out_dir.
ops.append(ParquetWriter(out_dir))

openorca_pipeline.add_operations(ops)
ret = openorca_pipeline.execute()
# Drop the pipeline object once the run has finished.
del openorca_pipeline

[DatasetReader, PerfileSourcedParquetReader, TextPrompt, RandomSelect, TextDiversityIndicate, TextQualityScorer, RougeScoreDedup, PerfileParquetWriter]
Will assign 36 cores and 206263 M memory for spark
per core memory size is 5.595 GB and shuffle_disk maximum capacity is 8589934592.000 GB
execute with spark for global tasks started ...
DatasetReader
[32m2023-12-05 00:12:04.078[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mDatasetReader: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
execute with spark for global tasks took 0.0032089054584503174 sec
PerfileSourcedParquetReader


ResumableTextPipeline, current on openorca_sample_50.parquet:   0%|          | 0/1 [00:00<?, ?it/s]

openorca_sample_50.parquet
TextPrompt


                                                                                

RandomSelect
TextDiversityIndicate
statistics_decorator spark


                                                                                

TextQualityScorer
statistics_decorator spark
model_name is gpt3
[32m2023-12-05 00:12:20.480[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_qualityscorer[0m:[36mprepare_model[0m:[36m122[0m - [1mPreparing scorer model in [/root/.cache/recdp/models/gpt3_quality_model]...[0m
real_model_path is /root/.cache/recdp/models/gpt3_quality_model
[32m2023-12-05 00:12:21.237[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_qualityscorer[0m:[36mpredict[0m:[36m252[0m - [1mStart scoring dataset...[0m
RougeScoreDedup
statistics_decorator spark



  0%|          | 0/2 [00:00<?, ?it/s][A

Round 0 started ...




[32m2023-12-05 00:12:28.534[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_compare_dedup[0m:[36mprocess_spark[0m:[36m105[0m - [1mRound 0: total processing num_samples is 55, detected high score num_samples is 0[0m


                                                                                
 50%|█████     | 1/2 [00:06<00:06,  6.89s/it][A

Round 0 took 6.891998417675495 sec
Round 1 started ...




[32m2023-12-05 00:12:30.800[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_compare_dedup[0m:[36mprocess_spark[0m:[36m105[0m - [1mRound 1: total processing num_samples is 0, detected high score num_samples is 0[0m


                                                                                
100%|██████████| 2/2 [00:09<00:00,  4.62s/it][A


Round 1 took 2.336260261014104 sec
generate_connected_components => duplicates started ...



0it [00:00, ?it/s][A

generate_connected_components => duplicates took 0.015356998890638351 sec





[32m2023-12-05 00:12:31.702[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mTextPrompt: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-05 00:12:31.705[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mRandomSelect: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-05 00:12:31.713[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m284[0m - [1mTextDiversityIndicate: A total of 11 rows of data were processed, using 14.471717834472656 seconds, Get max diversity types 11, Get average diversity types 1.826086956521739,Get the std of diversity types 2.328717436956612[0m
[32m2023-12-05 00:12:31.715[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m284[0m -

ResumableTextPipeline, current on openorca_sample_50.parquet: 100%|██████████| 1/1 [00:27<00:00, 27.38s/it]

[32m2023-12-05 00:12:31.724[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mexecute[0m:[36m429[0m - [1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output_openorca_result for verification.[0m





In [5]:
from pyrecdp.LLM import TextPipeline, ResumableTextPipeline
from pyrecdp.primitives.operations import *
import os

# Downsize the Dolly sample with a resumable pipeline; per-operator statistics
# and the final parquet files are all written below `out_dir`.
out_dir = "ResumableTextPipeline_output_dolly_result"
# Location handed to RougeScoreDedup as its `score_store_path`.
rouge_score_store = os.path.join(out_dir, 'RougeScorefiltered.parquet')

dolly_pipeline = ResumableTextPipeline()
dolly_pipeline.enable_statistics()

# Operations are listed in the order they are registered with the pipeline.
ops = [
    ParquetReader("/content/test_data/dolly_sample_50.parquet"),
    TextPrompt(
        dataset_name="dolly",
        prompt_name="causal_llm_1",
    ),
    RandomSelect(fraction=0.3),
    # Toxicity scoring is disabled for this dataset; to enable it, add:
    # TextToxicity(huggingface_config_path="/root/.cache/huggingface/hub/models--xlm-roberta-base"),
    TextDiversityIndicate(
        out_dir=out_dir,
        language="en",
        first_sent=False,
    ),
    TextQualityScorer(model="gpt3"),
    RougeScoreDedup(
        max_ratio=0.7,
        batch_size=10,
        score_store_path=rouge_score_store,
    ),
    ParquetWriter(out_dir),
]
dolly_pipeline.add_operations(ops)

# The run log states that ResumableTextPipeline does not return the dataset;
# inspect the files under `out_dir` to verify the result.
ret = dolly_pipeline.execute()
del dolly_pipeline

[DatasetReader, PerfileSourcedParquetReader, TextPrompt, RandomSelect, TextDiversityIndicate, TextQualityScorer, RougeScoreDedup, PerfileParquetWriter]
Will assign 36 cores and 206263 M memory for spark
per core memory size is 5.595 GB and shuffle_disk maximum capacity is 8589934592.000 GB
execute with spark for global tasks started ...
DatasetReader
[32m2023-12-05 00:13:50.953[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mDatasetReader: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
execute with spark for global tasks took 0.00310460664331913 sec
PerfileSourcedParquetReader


ResumableTextPipeline, current on dolly_sample_50.parquet:   0%|          | 0/1 [00:00<?, ?it/s]

dolly_sample_50.parquet
TextPrompt


                                                                                

RandomSelect
TextDiversityIndicate
statistics_decorator spark


                                                                                

TextQualityScorer
statistics_decorator spark
model_name is gpt3
[32m2023-12-05 00:14:08.386[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_qualityscorer[0m:[36mprepare_model[0m:[36m122[0m - [1mPreparing scorer model in [/root/.cache/recdp/models/gpt3_quality_model]...[0m
real_model_path is /root/.cache/recdp/models/gpt3_quality_model
[32m2023-12-05 00:14:09.059[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_qualityscorer[0m:[36mpredict[0m:[36m252[0m - [1mStart scoring dataset...[0m
RougeScoreDedup
statistics_decorator spark



  0%|          | 0/2 [00:00<?, ?it/s][A

Round 0 started ...




[32m2023-12-05 00:14:15.341[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_compare_dedup[0m:[36mprocess_spark[0m:[36m105[0m - [1mRound 0: total processing num_samples is 55, detected high score num_samples is 0[0m


                                                                                
 50%|█████     | 1/2 [00:06<00:06,  6.10s/it][A

Round 0 took 6.102168790996075 sec
Round 1 started ...




[32m2023-12-05 00:14:17.457[0m | [1mINFO    [0m | [36mpyrecdp.primitives.operations.text_compare_dedup[0m:[36mprocess_spark[0m:[36m105[0m - [1mRound 1: total processing num_samples is 0, detected high score num_samples is 0[0m


                                                                                
100%|██████████| 2/2 [00:08<00:00,  4.20s/it][A


Round 1 took 2.2937835920602083 sec
generate_connected_components => duplicates started ...



0it [00:00, ?it/s][A

generate_connected_components => duplicates took 0.014340104535222054 sec





[32m2023-12-05 00:14:18.406[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mTextPrompt: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-05 00:14:18.410[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m288[0m - [1mRandomSelect: A total of 0 rows of data were processed, using 0 seconds, with 0 rows modified or removed, 0 rows of data remaining.[0m
[32m2023-12-05 00:14:18.419[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m284[0m - [1mTextDiversityIndicate: A total of 11 rows of data were processed, using 14.846248388290405 seconds, Get max diversity types 11, Get average diversity types 1.7692307692307692,Get the std of diversity types 2.7735009811261455[0m
[32m2023-12-05 00:14:18.421[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mop_summary[0m:[36m284[0m

ResumableTextPipeline, current on dolly_sample_50.parquet: 100%|██████████| 1/1 [00:27<00:00, 27.20s/it]

[32m2023-12-05 00:14:18.432[0m | [1mINFO    [0m | [36mpyrecdp.LLM.TextPipeline[0m:[36mexecute[0m:[36m429[0m - [1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output_dolly_result for verification.[0m





### 3.2 Check the result and score

In [10]:
import json
import pandas as pd
import glob
def load_statistics(stats_path):
    """Parse one JSON statistics file and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left open.
    """
    with open(stats_path, 'r') as stats_file:
        return json.load(stats_file)


def count_parquet_rows(pattern):
    """Return the total row count of all parquet files matching ``pattern``.

    Each matching file is read on its own and the lengths are summed.

    Raises:
        FileNotFoundError: if no file matches ``pattern``, so a missing
            pipeline output is reported with a clear message.
    """
    part_files = sorted(glob.glob(pattern))
    if not part_files:
        raise FileNotFoundError(f'No parquet files match {pattern!r}')
    return sum(len(pd.read_parquet(part_file)) for part_file in part_files)


# One entry per dataset: (title, result directory, source file name under
# /content/test_data, whether TextToxicity statistics are reported).
# Toxicity is only reported for Alpaca, as in the original version of this cell.
DATASETS = [
    ('Alpaca', 'ResumableTextPipeline_output_alpaca_result', 'alpaca_data_50.jsonl', True),
    ('Dolly', 'ResumableTextPipeline_output_dolly_result', 'dolly_sample_50.parquet', False),
    ('Openorca', 'ResumableTextPipeline_output_openorca_result', 'openorca_sample_50.parquet', False),
]

for title, out_dir, source_name, has_toxicity in DATASETS:
    print(f'============ {title} Result ============')
    print('The Rouge score: ', load_statistics(f'{out_dir}/RougeScoreDedup-statistics'))
    if has_toxicity:
        print('The toxicity: ', load_statistics(f'{out_dir}/TextToxicity-statistics'))
    print('The diversity intricate: ', load_statistics(f'{out_dir}/TextDiversityIndicate-statistics'))
    print('The quality score :', load_statistics(f'{out_dir}/TextQualityScorer-statistics'))

    # The source is either JSON-lines or parquet; pick the reader by extension.
    source_path = f'/content/test_data/{source_name}'
    if source_name.endswith('.jsonl'):
        source_df = pd.read_json(source_path, lines=True)
    else:
        source_df = pd.read_parquet(source_path)
    origin_dataset_length = len(source_df)

    # The writer output is read from a sub-directory named after the source file.
    downsized_dataset_length = count_parquet_rows(f'{out_dir}/{source_name}/*.parquet')

    print(f'The original dataset length: {origin_dataset_length}, the processed dataset length: {downsized_dataset_length}')

The Rouge score:  {'dup_num': 0, 'dup_ratio': 0.0}
The toxicity:  {'min': 0.0001716656406642869, 'max': 0.0010136101627722383, 'mean': 0.00036636507800060576, 'std': 0.0002271174606949808}
The diversity intricate:  {'max': 11, 'mean': 1.625, 'std': 2.5}
The quality score : {'mean': 0.9509534303108631}
The original dataset length: 50, the processed dataset length: 11
The Rouge score:  {'dup_num': 0, 'dup_ratio': 0.0}
The diversity intricate:  {'max': 11, 'mean': 1.7692307692307692, 'std': 2.7735009811261455}
The quality score : {'mean': 0.9174850873367049}
The original dataset length: 50, the processed dataset length: 11
The Rouge score:  {'dup_num': 0, 'dup_ratio': 0.0}
The diversity intricate:  {'max': 11, 'mean': 1.826086956521739, 'std': 2.328717436956612}
The quality score : {'mean': 0.9467587080940774}
The original dataset length: 50, the processed dataset length: 11
