# Install pyrecdp from GitHub

In [None]:
!pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

# Install Java (OpenJDK 8 JRE), which PySpark needs to run

In [None]:
# Install the OpenJDK 8 runtime without prompting: DEBIAN_FRONTEND=noninteractive
# suppresses package configuration dialogs and -y auto-confirms the installation.
!DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre

# Prepare test data

In [None]:
%mkdir -p /content/test_data
%cd /content/test_data
!wget https://raw.githubusercontent.com/intel/e2eAIOK/main/RecDP/tests/data/llm_data/arxiv_sample_100.jsonl

# Import classifier functions

In [None]:
from pyrecdp.primitives.llmutils import classify_spark, language_identify_spark
from pyrecdp.primitives.llmutils.utils import get_target_file_list
from huggingface_hub import hf_hub_download
from pyrecdp.core import SparkDataProcessor

# Specify variables

In [6]:
# Download model.bin from the facebook/fasttext-language-identification repository
# on the Hugging Face Hub. NOTE(review): despite the "_dir" suffix, this presumably
# holds the local path of the downloaded file rather than a directory; confirm.
fasttext_model_dir = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")

# All input and output locations are derived from data_dir, so relocating the
# notebook's working directory only requires changing this one value.
data_dir = "/content/test_data/"
data_file = data_dir + "arxiv_sample_100.jsonl"
classify_save_path = data_dir + "output/classify_spark"
lid_save_path = data_dir + "output/lid_spark"

Downloading model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

# Load data and identify the language of each document

In [7]:
# Create the RecDP data processor and take its Spark handle for reading data.
rdp = SparkDataProcessor()
spark=rdp.spark
# Read the JSON-lines sample file into a Spark DataFrame.
spark_df = spark.read.json(data_file)
# Run fastText language identification. Judging by the output shown for this cell,
# the result carries a new 'lang' column and is also saved under lid_save_path.
# NOTE(review): 'text' is presumably the input column and 'lang' the output column;
# confirm against the language_identify_spark signature.
lid_df = language_identify_spark(spark_df, fasttext_model_dir, 'text', 'lang', lid_save_path)
print("input is")
# Preview the first rows of the language-tagged DataFrame.
lid_df.show()

Will assign 1 cores and 10386 M memory for spark
per core memory size is 10.143 GB and shuffle_disk maximum capacity is 8589934592.000 GB
process data started ...
process data took 0.3967957800005024 sec
Save data started ...
Save data took 24.597423498999888 sec
Completed!!
    total identify the language for 100 documents
    All the processed data are saving under the folder: /content/test_data/output/lid_spark
input is
+--------------------+--------------------+--------+
|                meta|                text|    lang|
+--------------------+--------------------+--------+
|{2203.15369, en, ...|\section{Introduc...|eng_Latn|
|{math/9807097, en...|\section{Introduc...|eng_Latn|
|{2008.06948, en, ...|\section{Introduc...|eng_Latn|
|{cond-mat/9807071...|\section{Introduc...|eng_Latn|
|{2210.10650, en, ...|\section{\label{s...|eng_Latn|
|{astro-ph/9807119...|\section{Introduc...|eng_Latn|
|{2111.03152, en, ...|\section{Introduc...|eng_Latn|
|{1606.04992, en, ...|\n\n\section{Intr...|

# Split the data into sub-buckets according to the `lang` column

In [8]:
# Split lid_df by the values of its "lang" column and save the result under
# classify_save_path. NOTE(review): the "file://" argument appears to be a
# filesystem prefix prepended to the save path (the log reports
# file:///content/...); confirm against the classify_spark signature.
# As the last expression in the cell, the returned DataFrame is echoed as output.
classify_spark(lid_df, "lang", classify_save_path, "file://")

Spilt data started ...
Spilt data took 1.7437168729993573 sec
Completed!!
    total classify the spark dataframe by lang for 100 documents
    All the classified data are saving under the folder: file:///content/test_data/output/classify_spark


DataFrame[meta: struct<arxiv_id:string,language:string,timestamp:string,url:string,yymm:string>, text: string, lang: string]

In [9]:
!ls $classify_save_path

'lang=eng_Latn'   _SUCCESS
