### 1. Download data
#### 1.1 CC WET files
- Since I do not have access, I will just download 5k `.warc.wet.gz` files to local disk.

In [None]:
# Fetch the gzipped list of WET file paths for the CC-MAIN-2025-18 crawl.
! wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-18/wet.paths.gz

In [None]:
import os
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from xopen import xopen

# URL prefix prepended to each relative path read from wet.paths.gz.
base_url = "https://data.commoncrawl.org/"
# Base directory for local data; downloads are written to its "CC" subdirectory.
MOUNT_DIR = Path("/home/azureuser/mount/")
# Number of CPUs this process may run on; used as the download thread-pool size.
N_CPU = len(os.sched_getaffinity(0))
# Target number of newly downloaded WET files (files already on disk are not counted).
N_WET = 100

def download_file(url, output_dir, timeout=60):
    """Download ``url`` into ``output_dir`` unless the file already exists.

    The payload is streamed into a sibling ``<name>.part`` file and only
    renamed to its final name once the transfer has completed, so an
    interrupted download never leaves a truncated file that a later run
    would mistake for a finished one.

    Args:
        url: Full URL of the file; its last path component is the filename.
        output_dir: ``pathlib.Path`` of an existing directory to write into.
        timeout: Seconds to wait for the connection / each read before the
            request is abandoned (passed straight to ``requests.get``).

    Returns:
        A ``(success, message)`` tuple. ``message`` starts with
        ``"Skipped"``, ``"Downloaded"`` or ``"Error"``.
    """
    filename = Path(url).name
    output_path = output_dir / filename

    if output_path.exists():
        return True, f"Skipped: {filename}"

    tmp_path = output_path.with_name(filename + ".part")
    try:
        # The context manager releases the connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an HTTP 4xx/5xx error page would be saved as data.
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file under its final name only once it is complete.
        tmp_path.replace(output_path)
        return True, f"Downloaded: {filename}"
    except Exception as e:
        # Deliberately broad: this runs in a worker thread and reports
        # failures through its return value instead of raising.
        tmp_path.unlink(missing_ok=True)
        return False, f"Error {filename}: {e}"

# Read all paths, dropping blank lines so we never request the bare base URL
with xopen('wet.paths.gz', 'rt') as f:
    all_paths = [line.strip() for line in f if line.strip()]

output_dir = MOUNT_DIR / "CC"
output_dir.mkdir(parents=True, exist_ok=True)

# Download until we have N_WET successful downloads.
# Files that already exist ("Skipped") do not count towards the target.
successful_downloads = 0
path_idx = 0
futures = {}  # in-flight future -> index of its path in all_paths

with ThreadPoolExecutor(max_workers=N_CPU) as executor:
    while True:
        # Keep the pool busy, but never have more jobs in flight than
        # downloads still needed, so we cannot overshoot N_WET.
        max_in_flight = min(N_CPU, N_WET - successful_downloads)
        while path_idx < len(all_paths) and len(futures) < max_in_flight:
            url = base_url + all_paths[path_idx]
            future = executor.submit(download_file, url, output_dir)
            futures[future] = path_idx
            path_idx += 1

        # Nothing running and nothing more to submit: target reached or
        # the path list is exhausted.
        if not futures:
            break

        # Block until any one job finishes instead of busy-polling .done().
        finished = next(as_completed(futures))
        del futures[finished]

        success, message = finished.result()
        print(message)

        if success and not message.startswith("Skipped"):
            successful_downloads += 1

print(f"\nCompleted: {successful_downloads} successful downloads")

#### 1.2 validation data - paloma c4_100_domains - val

In [None]:
# Uncomment the two lines below to log in to the Hugging Face Hub first, if needed.
# from huggingface_hub import login
# login(token="")

from datasets import load_dataset
# Load the "val" split of the "c4_100_domains" configuration of allenai/paloma.
paloma_c4_100_domains_val = load_dataset("allenai/paloma", "c4_100_domains", split="val")
# Print the number of examples in the split.
print(len(paloma_c4_100_domains_val))

14059


### 2. Processing
- The validation dataset appears to be entirely English, so I will keep only English data.
    - To determine the threshold, I ran the language-identification model on the validation set and averaged the scores: `0.95 +/- 0.05`.
- Harmful-content removal.
    - To determine the threshold, I ran the model on the validation set and averaged the scores: `0.95`.
- Quality rules.
    - Not using classifiers
- Deduplication?

In [28]:
from cs336_data.language_identification import identify_language
from cs336_data.harmful_content import classify_nsfw, classify_toxic_speech

# Shuffle with a fixed seed so the same sample is drawn on every run.
subset = paloma_c4_100_domains_val.shuffle(seed=42).select(range(1000))  # Random 1000 examples

In [30]:
# Option 1: Using .map() - most efficient for datasets
# This applies the function to each item and returns a new dataset with the results
def process_language(example):
    """Build the ``lang`` / ``score`` fields for one dataset row.

    Calls ``identify_language`` a single time on ``example["text"]`` and
    returns the two values it yields under the keys ``lang`` and ``score``.
    """
    language, lang_score = identify_language(example["text"])
    return dict(lang=language, score=lang_score)

# Apply process_language to every row of the sample; the returned dataset
# exposes the function's "lang" and "score" values as columns.
lang_results = subset.map(
    process_language,
    num_proc=16,  # Use multiple processes for speed
    batch_size=100,  # NOTE(review): batched=True is not passed, so this probably has no effect — confirm against the datasets docs
    load_from_cache_file=True  # Cache results
)

Map (num_proc=16):   0%|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=16): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:19<00:00, 50.86 examples/s]


In [51]:
import numpy as np

# Fetch the score column once, then report its mean and standard deviation.
scores = np.asarray(lang_results['score'])
print(scores.mean())
print(scores.std())

0.950964916318655
0.05012996444529241


In [None]:
def process_single_wet_file