# RecDP LLM - Dataset Score Assessment

This notebook shows how to use several RecDP LLM operations to evaluate the quality, diversity, toxicity, perplexity, and ROUGE scores of a dataset.

# Get Started

## 1. Install pyrecdp and dependencies

In [None]:
# Install the OpenJDK 8 runtime without interactive prompts
# (presumably needed by pyrecdp's backend — TODO confirm).
! DEBIAN_FRONTEND=noninteractive apt-get install -qq -y openjdk-8-jre
# Install pyrecdp quietly; --pre lets pip select a pre-release version.
! pip install -q pyrecdp --pre
# Alternative (disabled): install pyrecdp from the RecDP subdirectory of the e2eAIOK GitHub repository.
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. Prepare your data

In [None]:
# Create the data directory (no error if it already exists) and make it the
# kernel's working directory; relative paths in later cells are affected by this.
%mkdir -p /content/test_data
%cd /content/test_data
# Download the tiny_c4_sample_10.jsonl sample file from the RecDP test data (needs network access).
!wget https://raw.githubusercontent.com/intel/e2eAIOK/main/RecDP/tests/data/llm_data/tiny_c4_sample_10.jsonl

## 3. Score Assessment

In [None]:
import json
import os

from pyrecdp.LLM import TextPipeline, ResumableTextPipeline
from pyrecdp.primitives.operations import *

### 3.1 Process with toxicity and perplexity scorer

In [None]:
# Build and run a pipeline that applies the perplexity and toxicity scorers
# to the sample data and writes the result as Parquet.
pipeline = ResumableTextPipeline()
# Enable statistics; the "View score" cell reads "<operation>-statistics" files from
# the output directory (presumably produced because of this call — TODO confirm).
pipeline.enable_statistics()
ops = [
    # Input: the directory that holds the downloaded .jsonl sample.
    JsonlReader("/content/test_data/"),
    TextPerplexityScore(),
    # NOTE(review): hard-coded Hugging Face cache location under /root; presumably the
    # xlm-roberta-base files must already exist there — confirm before running elsewhere.
    TextToxicity(huggingface_config_path="/root/.cache/huggingface/hub/models--xlm-roberta-base"),
    # Output location for this run.
    ParquetWriter("ResumableTextPipeline_output-1")
]
pipeline.add_operations(ops)
ret = pipeline.execute()
# Remove the name so the next section starts from a fresh pipeline object.
del pipeline

### 3.2 Process with QualityScorer, Diversity and Rouge scorer

In [None]:
# Build and run a second pipeline that applies the quality, diversity and
# ROUGE-based operations to the same sample data and writes the result as Parquet.
pipeline = ResumableTextPipeline()
# Enable statistics; the "View score" cell reads "<operation>-statistics" files from
# the output directory (presumably produced because of this call — TODO confirm).
pipeline.enable_statistics()
ops = [
    # Input: the directory that holds the downloaded .jsonl sample.
    JsonlReader("/content/test_data/"),
    TextQualityScorer(),
    TextDiversityIndicate(),
    # NOTE(review): max_ratio presumably is the ROUGE similarity threshold for treating
    # documents as duplicates and batch_size the comparison batch size — confirm in pyrecdp docs.
    RougeScoreDedup(max_ratio=0.7, batch_size=20),
    # Output location for this run (kept separate from the first pipeline's output).
    ParquetWriter("ResumableTextPipeline_output-2")
]
pipeline.add_operations(ops)
ret = pipeline.execute()
# Remove the name now that the run has finished.
del pipeline

### 3.3 View score 

In [None]:
def load_statistics(output_dir, op_name):
    """Load the statistics file written for one pipeline operation.

    The file is looked up at ``<output_dir>/<op_name>-statistics`` and parsed
    as JSON (the format is assumed from the original use of ``json.load`` —
    verify against the pyrecdp version in use).

    Args:
        output_dir: Directory passed to the pipeline's ParquetWriter.
        op_name: Operation name used as the statistics file prefix.

    Returns:
        The decoded JSON content of the statistics file.

    Raises:
        FileNotFoundError: If the statistics file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    stats_path = os.path.join(output_dir, op_name + "-statistics")
    # json.load needs a readable file object, not a path string, so open the
    # file first; the context manager closes it again after parsing.
    with open(stats_path, "r", encoding="utf-8") as stats_file:
        return json.load(stats_file)


# Statistics produced by the first pipeline (perplexity and toxicity).
ppl_score = load_statistics("ResumableTextPipeline_output-1", "TextPerplexityScore")
toxicity_score = load_statistics("ResumableTextPipeline_output-1", "TextToxicity")
# Statistics produced by the second pipeline (quality, diversity and ROUGE).
quality_score = load_statistics("ResumableTextPipeline_output-2", "TextQualityScorer")
diversity_score = load_statistics("ResumableTextPipeline_output-2", "TextDiversityIndicate")
rouge_score = load_statistics("ResumableTextPipeline_output-2", "RougeScoreDedup")

print("Perplexity scores: ", ppl_score)
print("Toxicity scores: ", toxicity_score)
print("Quality scores: ", quality_score)
print("Diversity scores: ", diversity_score)
print("Rouge scores: ", rouge_score)