### 1. Download data
#### 1.1 CC WET files
- Since I do not have access, I will just download 5k `.warc.wet.gz` files to local storage.

In [None]:
! wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-18/wet.paths.gz

In [None]:
import os
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from xopen import xopen

# Root URL that the relative paths listed in wet.paths.gz are appended to.
base_url = "https://data.commoncrawl.org/"
# Local directory under which the downloaded data is stored.
MOUNT_DIR = Path("/home/azureuser/mount/")
# Number of CPUs this process is allowed to run on; used as the size of the
# download thread pool.
N_CPU = len(os.sched_getaffinity(0))
# Target number of newly downloaded WET files.
N_WET = 100

def download_file(url, output_dir):
    """Download ``url`` into ``output_dir`` unless the file already exists.

    Args:
        url: Full URL of the file; its last path component is the file name.
        output_dir: ``Path`` of the directory to store the file in.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is True when the file was downloaded
        or was already present (message starts with ``"Skipped"``), and False
        when the download failed (message starts with ``"Error"``). Download
        errors are reported through the return value, never raised.
    """
    filename = Path(url).name
    output_path = output_dir / filename

    if output_path.exists():
        return True, f"Skipped: {filename}"

    # Stream into a temporary name and rename only after the whole body has
    # been written, so an interrupted transfer never leaves a truncated file
    # that a later run would mistake for a finished download and skip.
    tmp_path = output_path.with_name(filename + ".part")
    try:
        # The timeout keeps a stalled connection from blocking a worker forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Without this, a 4xx/5xx error page would be saved as if it were data.
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        tmp_path.replace(output_path)
        return True, f"Downloaded: {filename}"
    except Exception as e:
        # Best-effort cleanup of the partial file; the failure itself is
        # reported to the caller via the return value.
        tmp_path.unlink(missing_ok=True)
        return False, f"Error {filename}: {e}"

# Read all paths (one relative path per line; blank lines are ignored so they
# do not turn into requests for the bare base URL)
with xopen('wet.paths.gz', 'rt') as f:
    all_paths = [line.strip() for line in f if line.strip()]

output_dir = Path(MOUNT_DIR/"CC")
output_dir.mkdir(exist_ok=True)

# Download until we have N_WET successful downloads.
# Files that already exist are reported as "Skipped" and do not count.
successful_downloads = 0
path_idx = 0
futures = {}  # in-flight future -> index of its path in all_paths

with ThreadPoolExecutor(max_workers=N_CPU) as executor:
    while successful_downloads < N_WET:
        # Top up the in-flight jobs. Never keep more jobs running than
        # downloads are still needed, so the run cannot overshoot N_WET.
        max_in_flight = min(N_CPU, N_WET - successful_downloads)
        while path_idx < len(all_paths) and len(futures) < max_in_flight:
            url = base_url + all_paths[path_idx]
            future = executor.submit(download_file, url, output_dir)
            futures[future] = path_idx
            path_idx += 1

        if not futures:
            # Ran out of paths before reaching N_WET.
            break

        # Block until any in-flight job finishes (no busy polling).
        future = next(as_completed(futures))
        del futures[future]

        success, message = future.result()
        print(message)

        if success and not message.startswith("Skipped"):
            successful_downloads += 1

print(f"\nCompleted: {successful_downloads} successful downloads")

#### 1.2 validation data - paloma c4_100_domains - val

In [None]:
# Optional Hugging Face login, left disabled (fill in a token to enable it).
# from huggingface_hub import login
# login(token="")

from datasets import load_dataset
# Load the "val" split of the "c4_100_domains" configuration of
# allenai/paloma and print how many examples it contains.
paloma_c4_100_domains_val = load_dataset("allenai/paloma", "c4_100_domains", split="val")
print(len(paloma_c4_100_domains_val))

### 2. Processing
- TLD (top-level domain) filtering
    - I checked the TLDs in the `paloma` dataset and they look quite normal.
- Quality rules (number of words, lengths of words, etc.)
    - The allowed range for the number of words is also taken from `paloma`.
- The validation dataset looks all English, so I will keep only English data.
    - To determine the threshold, I ran the language-identification model on `paloma` and got an average score of `0.95`. Thus I will use `0.9` for filtering to be on the safe side, and do further removal if needed.
- Harmful content removal.
    - To determine the threshold, I ran the model on `paloma` and got an average score of `0.99`. Thus I will use `0.9` for filtering to be on the safe side, and do further removal if needed.
- Deduplication?

#### Performance Optimizations Applied

To speed up WET file processing, the following improvements were implemented in `leaderboard_process_wet.py`:

**1. Batch Processing for ML Models** (Batch size: 64)
- Language identification, NSFW, and toxicity models now process texts in batches via `filter_batch()`
- Reduces model invocation overhead by ~64x compared to individual calls
- Uses `model.predict(batch)` to leverage vectorized operations

**2. Incremental File Writing**
- Write filtered content to JSONL as each batch completes
- Reduces memory footprint (no buffering all content in memory)
- Starts I/O earlier for better throughput
- Opens file once and writes progressively

**3. Optimized Filter Ordering**
- **Cheap filters first:** record type → URL (fast, rule-based)
- **Expensive ML models last:** language → quality → NSFW → toxic (batched)
    - The quality rules assume English text, so they run after language identification.
- Early filtering reduces number of texts sent to expensive ML models
- Cascading batch filters: lang batch → nsfw batch → toxic batch → kept

**4. Parallel Processing with ProcessPoolExecutor**
- 16 workers (`concurrent.futures`) process WET files independently
- Each worker runs complete pipeline on one file
- Single progress bar tracks overall completion
- Automatic work distribution as workers finish tasks

In [None]:
from cs336_data.leaderboard_process_wet import process_single_wet_file
# Run the processing pipeline on a single WET file.
# NOTE(review): the arguments look like (input WET path, output JSONL path);
# confirm against process_single_wet_file.
wet_file = "CC-MAIN-20250417135010-20250417165010-00065.warc.wet.gz"
process_single_wet_file(wet_file, "CC-MAIN-20250417135010-20250417165010-00065.jsonl")

In [None]:
from cs336_data.minhash_dedpulication import normalize_text, minhashing, get_signatures
from pathlib import Path

# Directory that is globbed for the "*.jsonl" files signatures are computed from.
input_dir = Path("/home/azureuser/mount/CC-filtered-50")

In [None]:
import random
from os import PathLike
import json
from tqdm import tqdm
import string
from os import PathLike
import unicodedata
import re
import mmh3
import random
import shutil

# `normalize_text`: punctuation is removed, text is lowercased, NFD unicode normalization is applied, accents are removed, and whitespace is normalized.
# `minhashing` and `get_signatures`: require arguments `num_hashes`, `ngrams`.
# `get_candidates` and `get_clusters`: require arguments `num_bands` and `jaccard_threshold`.
# `minhash_deduplication`: puts it all together.

# See the "NFD unicode normalization" section in my notes for this lecture.
def strip_accents(text):
    """Return ``text`` in NFD form with every combining character dropped."""
    decomposed = unicodedata.normalize("NFD", text)
    # A combining class of 0 marks a base (non-combining) character.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

# Translation table mapping every ASCII punctuation character to a space.
_PUNCT_TO_SPACE = str.maketrans(dict.fromkeys(string.punctuation, " "))


def normalize_text(text: str) -> list[str]:
    """Normalize ``text`` and split it into a list of words.

    Steps, in order: ASCII punctuation (``string.punctuation``) is replaced by
    spaces, the text is lowercased, accents are removed via ``strip_accents``
    (NFD normalization), and the result is split on runs of whitespace.

    Args:
        text: Raw document text.

    Returns:
        The non-empty words of the normalized text, in order. Empty or
        whitespace/punctuation-only text yields ``[]``.
    """
    # remove punctuation (single pass instead of a per-character join)
    text = text.translate(_PUNCT_TO_SPACE)
    # lowercase
    text = text.lower()
    # NFD normalization and remove accents
    text = strip_accents(text)
    # Split on any run of whitespace. Unlike re.split(r"\s+", ...), this does
    # not emit empty-string tokens for leading/trailing whitespace (e.g. text
    # ending in punctuation), which would otherwise leak into the n-grams.
    return text.split()

def minhashing(doc_words: list[str], ngrams: int, seed: int) -> int:
    """Return the minimum hash over the word n-grams of one document.

    Each seed corresponds to one hash function.

    Args:
        doc_words: The document as a list of words.
        ngrams: Number of consecutive words per n-gram.
        seed: Seed passed to ``mmh3.hash``.

    Returns:
        The smallest ``mmh3.hash`` value over all n-grams. A document with
        fewer than ``ngrams`` words is hashed as one n-gram holding the whole
        document (an empty document hashes the empty string), so the result
        is always an int and never ``float("inf")``.
    """
    # There are len - ngrams + 1 n-grams; the "+ 1" keeps the last one.
    # At least one n-gram is always hashed so short documents get a real value.
    n_shingles = max(len(doc_words) - ngrams + 1, 1)

    # no need to get set of words because we are taking the min of hash values
    return min(
        mmh3.hash(" ".join(doc_words[i:i + ngrams]), seed)
        for i in range(n_shingles)
    )

def get_signatures(input_files: list[str | PathLike], num_hashes: int, ngrams: int) -> list[list[int]]:
    """Compute a MinHash signature for every document in ``input_files``.

    Each file is read as JSONL: one JSON object per line with a ``'text'``
    field. Blank lines are skipped.

    Args:
        input_files: Paths of the JSONL files to read.
        num_hashes: Number of hash functions, i.e. the signature length.
        ngrams: Number of words per n-gram, forwarded to ``minhashing``.

    Returns:
        One signature (list of ``num_hashes`` values) per document, in file
        order and then line order.

    Note:
        The seeds are drawn from the global ``random`` state on every call, so
        signatures from separate calls are only comparable if that state was
        seeded identically beforehand.
    """
    signatures = []
    seeds = [random.randint(0, 2**32-1) for _ in range(num_hashes)]

    for file_path in tqdm(input_files, total=len(input_files)):
        with open(file_path, encoding="utf-8") as f:
            # Iterate lazily instead of readlines() so the whole file is not
            # held in memory at once.
            for line in f:
                if not line.strip():
                    continue
                doc = json.loads(line)['text']
                doc_words = normalize_text(doc)

                signature = [minhashing(doc_words, ngrams, seed) for seed in seeds]
                signatures.append(signature)
    return signatures

def get_signatures_fast(input_files: list[str | PathLike], num_hashes: int, ngrams: int) -> list[list[int]]:
    """Compute MinHash signatures, building each document's n-grams only once.

    Each file is read as JSONL: one JSON object per line with a ``'text'``
    field. Blank lines are skipped.

    Args:
        input_files: Paths of the JSONL files to read.
        num_hashes: Number of hash functions, i.e. the signature length.
        ngrams: Number of consecutive words per n-gram.

    Returns:
        One signature (list of ``num_hashes`` ints) per document, in file
        order and then line order. A document with fewer than ``ngrams``
        words is hashed as a single n-gram holding the whole document, so no
        signature ever contains ``float("inf")``.

    Note:
        The seeds are drawn from the global ``random`` state on every call, so
        signatures from separate calls are only comparable if that state was
        seeded identically beforehand.
    """
    signatures = []
    seeds = [random.randint(0, 2**32-1) for _ in range(num_hashes)]

    for file_path in tqdm(input_files, total=len(input_files)):
        with open(file_path, encoding="utf-8") as f:
            # Iterate lazily instead of readlines() so the whole file is not
            # held in memory at once.
            for line in f:
                if not line.strip():
                    continue
                doc = json.loads(line)['text']
                doc_words = normalize_text(doc)

                # There are len - ngrams + 1 n-grams (the "+ 1" keeps the last
                # one); always build at least one so short docs get real hashes.
                n_shingles = max(len(doc_words) - ngrams + 1, 1)
                shingles = [" ".join(doc_words[i:i + ngrams]) for i in range(n_shingles)]

                # The n-gram strings are built once and reused for every seed.
                signatures.append([
                    min(mmh3.hash(shingle, seed) for shingle in shingles)
                    for seed in seeds
                ])
    return signatures

In [None]:
# All JSONL files directly inside input_dir, sorted for a stable order;
# the second line displays how many were found.
input_files = sorted(input_dir.glob("*.jsonl"))
len(input_files)

In [None]:
# Signatures for the first two files only: 1000 hash functions over 5-word n-grams.
sigs = get_signatures_fast(input_files[:2], 1000, 5)

In [None]:
import xxhash
import mmh3
import time

# Micro-benchmark: hash the same string 10000 times with each library.
# NOTE: mmh3.hash produces a 32-bit value while xxh3_64_intdigest produces a
# 64-bit one, so this compares throughput, not like-for-like hash widths.
text = "the quick brown fox jumps"*1000
seed = 42

# Benchmark mmh3
# perf_counter is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(10000):
    mmh3.hash(text, seed)
print(f"mmh3: {time.perf_counter() - start:.3f}s")

# Benchmark xxh3
start = time.perf_counter()
for _ in range(10000):
    xxhash.xxh3_64_intdigest(text, seed=seed)
print(f"xxh3: {time.perf_counter() - start:.3f}s")

In [None]:
from concurrent.futures import ProcessPoolExecutor
import os

def process_file_signatures(file_path, seeds, ngrams):
    """Compute the MinHash signatures of every document in one JSONL file.

    Args:
        file_path: Path of a JSONL file; each non-blank line is a JSON object
            with a ``'text'`` field.
        seeds: One ``mmh3.hash`` seed per hash function; the signature length
            equals ``len(seeds)``.
        ngrams: Number of consecutive words per n-gram.

    Returns:
        One signature (list of ``len(seeds)`` ints) per document, in line
        order. A document with fewer than ``ngrams`` words is hashed as a
        single n-gram holding the whole document, so no signature ever
        contains ``float("inf")``.
    """
    file_signatures = []
    with open(file_path, encoding="utf-8") as f:
        # Iterate lazily instead of readlines() so the whole file is not held
        # in memory at once.
        for line in f:
            if not line.strip():
                continue
            doc = json.loads(line)['text']
            doc_words = normalize_text(doc)

            # There are len - ngrams + 1 n-grams (the "+ 1" keeps the last
            # one); always build at least one so short docs get real hashes.
            n_shingles = max(len(doc_words) - ngrams + 1, 1)
            shingles = [" ".join(doc_words[i:i + ngrams]) for i in range(n_shingles)]

            file_signatures.append([
                min(mmh3.hash(shingle, seed) for shingle in shingles)
                for seed in seeds
            ])
    return file_signatures

def get_signatures_parallel(input_files: list[str | PathLike], num_hashes: int, ngrams: int) -> list[list[int]]:
    """Parallel processing across files"""
    seeds = [random.randint(0, 2**32-1) for _ in range(num_hashes)]
    n_workers = len(os.sched_getaffinity(0))
    
    signatures = []
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(process_file_signatures, fp, seeds, ngrams) 
                   for fp in input_files]
        
        for future in tqdm(futures, total=len(input_files)):
            signatures.extend(future.result())
    
    return signatures

In [None]:
# Take the first "*.jsonl" path yielded by glob (glob order is not sorted).
jl = next(input_dir.glob("*.jsonl"))

In [None]:
import json
# Collect the 'text' field of every JSON line in the file `jl`.
contents = []
with open(jl) as f:
    for line in f.readlines():
        content = json.loads(line)
        contents.append(content['text'])

In [None]:
import os
# Display the names of all entries in input_dir.
os.listdir(input_dir)