### 1. Download data
#### 1.1 CC WET files
- Since I do not have access, I will just download 5k `.warc.wet.gz` files to local storage (note: the download cell below currently caps this at `N_WET = 100`).

In [None]:
# Fetch the gzip-compressed list of WET file paths for the CC-MAIN-2025-18 crawl.
# The download cell below reads it line by line and prefixes each path with `base_url`.
! wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-18/wet.paths.gz

In [None]:
import os
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from xopen import xopen

# Prefix prepended to every path read from wet.paths.gz to form a download URL.
base_url = "https://data.commoncrawl.org/"
# Root of the mounted storage; WET files are saved under MOUNT_DIR / "CC".
MOUNT_DIR = Path("/home/azureuser/mount/")
# Number of CPUs this process is allowed to run on; used as the thread-pool size.
N_CPU = len(os.sched_getaffinity(0))
# Target number of newly downloaded files (already-present files do not count).
N_WET = 100

def download_file(url, output_dir, timeout=60):
    """
    Download ``url`` into ``output_dir`` unless the file is already there.

    The payload is streamed to ``<filename>.part`` and only renamed to its
    final name once the transfer completed, so an interrupted download never
    leaves a truncated file that a later run would skip as "already present".

    Args:
        url: Full URL of the file; its last path component is the filename.
        output_dir: ``pathlib.Path`` of the (existing) destination directory.
        timeout: Seconds to wait for the server to connect / send data.

    Returns:
        ``(success, message)`` where ``message`` starts with ``"Skipped"``,
        ``"Downloaded"`` or ``"Error"``. Errors are reported, never raised.
    """
    filename = Path(url).name
    output_path = output_dir / filename

    if output_path.exists():
        return True, f"Skipped: {filename}"

    partial_path = output_dir / f"{filename}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an HTTP error page (404, 503 throttling, ...) would
            # be saved to disk and reported as a successful download.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Rename only after the whole body was written.
        partial_path.replace(output_path)
        return True, f"Downloaded: {filename}"
    except Exception as e:
        # Best-effort cleanup of the incomplete file.
        try:
            partial_path.unlink()
        except OSError:
            pass
        return False, f"Error {filename}: {e}"

# Read all paths (blank lines are dropped so the bare base URL is never requested)
with xopen('wet.paths.gz', 'rt') as f:
    all_paths = [line.strip() for line in f if line.strip()]

output_dir = MOUNT_DIR / "CC"
output_dir.mkdir(parents=True, exist_ok=True)

# Download until we have N_WET successful downloads.
# At most N_CPU downloads are in flight; each finished one is replaced by the
# next path until the target is reached or the path list is exhausted.
successful_downloads = 0
path_idx = 0
futures = {}  # future -> index into all_paths

with ThreadPoolExecutor(max_workers=N_CPU) as executor:
    # Submit initial batch
    while path_idx < len(all_paths) and len(futures) < N_CPU:
        url = base_url + all_paths[path_idx]
        future = executor.submit(download_file, url, output_dir)
        futures[future] = path_idx
        path_idx += 1

    # Process results and submit new jobs as needed
    while successful_downloads < N_WET and futures:
        # Block until one in-flight download finishes instead of spinning on
        # future.done(); as_completed yields already-finished futures first.
        future = next(as_completed(futures))
        del futures[future]

        success, message = future.result()
        print(message)

        # Files that were already on disk do not count towards the target.
        if success and not message.startswith("Skipped"):
            successful_downloads += 1

        # Submit new job if we need more downloads
        if successful_downloads < N_WET and path_idx < len(all_paths):
            url = base_url + all_paths[path_idx]
            new_future = executor.submit(download_file, url, output_dir)
            futures[new_future] = path_idx
            path_idx += 1

    # Downloads still running when the target was hit cannot be cancelled and
    # will finish anyway; report them so the final count matches what is on disk.
    for future in as_completed(futures):
        success, message = future.result()
        print(message)
        if success and not message.startswith("Skipped"):
            successful_downloads += 1
    futures.clear()

print(f"\nCompleted: {successful_downloads} successful downloads")

#### 1.2 validation data - paloma c4_100_domains - val

In [2]:
# Hugging Face Hub login, left disabled; presumably only needed if the
# dataset is gated for this account — confirm before re-enabling.
# from huggingface_hub import login
# login(token="")

from datasets import load_dataset
# Validation split of the `c4_100_domains` config of allenai/paloma; it is the
# reference data the filtering thresholds below are calibrated against.
paloma_c4_100_domains_val = load_dataset("allenai/paloma", "c4_100_domains", split="val")
# Number of validation documents, printed as a sanity check.
print(len(paloma_c4_100_domains_val))

  from .autonotebook import tqdm as notebook_tqdm


14059


### 2. Processing
- TLD (top-level domain) filtering
    - checked TLD from `paloma` ds and they are quite normal
- Quality rules (number of words, lengths of words, etc.)
- The validation dataset looks all English, so I will keep only English data
    - To determine the threshold, I ran the model on `paloma` and got an average score of `0.95`. I will therefore use `0.9` for filtering to be on the safe side, and do further removal if needed.
- Harmful removal. 
    - To determine the threshold, I ran the model on `paloma` and got an average score of `0.99`. I will therefore use `0.9` for filtering to be on the safe side, and do further removal if needed.
- Deduplication?

In [None]:
# URL filtering strategies using tldextract
from tldextract import TLDExtract
# Domain policy, based on exploration of the paloma dataset: it has no
# intersection with the `adult` and `social` domains listed here.
_ALLOWED_TLDS = frozenset({'com', 'org', 'edu', 'gov', 'net', 'uk', 'ca', 'au', 'us'})
_ADULT_DOMAINS = frozenset({
    'pornhub', 'xvideos', 'redtube', 'youporn', 'xhamster',
    'tube8', 'spankbang', 'chaturbate', 'cam4', 'livejasmin'
})
_SOCIAL_DOMAINS = frozenset({
    'facebook', 'twitter', 'instagram', 'tiktok', 'snapchat',
    'reddit', '4chan', '8chan', 'discord', 'telegram'
})

# Shared extractor, created on first use rather than once per URL.
_TLD_EXTRACTOR = None


def _get_tld_extractor():
    """Return the shared TLDExtract instance, creating it on first use."""
    global _TLD_EXTRACTOR
    if _TLD_EXTRACTOR is None:
        _TLD_EXTRACTOR = TLDExtract()
    return _TLD_EXTRACTOR


def should_filter_url(url: str) -> bool:
    """
    Return True if URL should be filtered out (removed).

    A URL is removed when it is empty, cannot be parsed, has a top-level
    domain outside the allow-list, or belongs to a known adult or social
    media domain.
    """
    if not url:
        return True
    try:
        extracted = _get_tld_extractor()(url)
        domain = extracted.domain.lower()
        # tldextract reports the whole public suffix ('co.uk', 'com.au'), so
        # compare only its last label; otherwise the country-code entries in
        # the allow-list would never match the usual second-level suffixes.
        tld = extracted.suffix.lower().rsplit('.', 1)[-1]

        # 1. Filter by top-level domain (keep only certain TLDs)
        if tld not in _ALLOWED_TLDS:
            return True
        # 2. Filter out adult/inappropriate domains
        if domain in _ADULT_DOMAINS:
            return True
        # 3. Filter out social media/forum content (often low quality)
        if domain in _SOCIAL_DOMAINS:
            return True
        return False
    except Exception:
        # Filter out URLs that can't be parsed
        return True

In [None]:
from cs336_data.language_identification import identify_language
from cs336_data.harmful_content import classify_nsfw, classify_toxic_speech

# Reproducible random sample of 1000 validation documents (shuffle seed 42).
subset = paloma_c4_100_domains_val.shuffle(seed=42).select(range(1000))

def process_language(example):
    """Return the predicted language and its score for one dataset row.

    Intended for ``Dataset.map``: the returned keys become new columns.
    The classifier is invoked exactly once per row.
    """
    prediction = identify_language(example["text"])
    lang, score = prediction
    return dict(lang=lang, score=score)

# Score every sampled document; `lang_results` gains `lang` and `score` columns.
lang_results = subset.map(
    process_language,
    num_proc=16,  # Use multiple processes for speed (hard-coded worker count)
    batch_size=100,  # NOTE(review): appears to apply only with batched=True, so likely inert here — confirm
    load_from_cache_file=True  # Cache results
)

import numpy as np
# Mean and spread of the language-ID score on the reference data; these
# numbers are what the filtering threshold is chosen from.
print(np.mean(lang_results['score']))
print(np.std(lang_results['score']))

In [1]:
# Absolute path to one locally available WET file, kept for ad-hoc inspection.
wet_file = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/CC-MAIN-20250417135010-20250417165010-00065.warc.wet.gz"

In [None]:
from fastwarc.warc import ArchiveIterator, WarcRecordType
from resiliparse.parse.encoding import detect_encoding
from cs336_data.gopher_quality_filter import gopher_quality_filter

# Minimum classifier confidence required to keep a document.
SCORE_LANG = 0.90
SCORE_NSFW = 0.90
SCORE_TOXIC = 0.90

def process_single_wet_file(input_path: str, output_path: str) -> str:
    """
    Process a single WET file with language, toxicity, and NSFW filtering.

    A conversion record is kept only if it passes, in order: the URL filter,
    the Gopher quality rules, English language ID, the NSFW classifier and
    the toxicity classifier. Kept documents are written to ``output_path``
    as gzip-compressed UTF-8 text, each followed by a blank line.

    Args:
        input_path: Path to a ``.warc.wet.gz`` file.
        output_path: Path of the gzip text file to create.

    Returns:
        A one-line summary with the number of conversion records seen and kept.
    """
    import gzip

    n_conversion = 0
    n_kept = 0

    # Context managers make sure both files are closed even if a record fails.
    with open(input_path, "rb") as wet_stream, \
         gzip.open(output_path, "wt", encoding="utf-8") as sink:
        for record in ArchiveIterator(wet_stream):
            # 0. check record type
            if record.record_type != WarcRecordType.conversion:
                continue
            n_conversion += 1

            # 1. check URL (cheapest test, so it runs before the body is decoded)
            if should_filter_url(record.headers.get('WARC-Target-URI', '')):
                continue

            byte_string = record.reader.read()
            encoding = detect_encoding(byte_string)
            content = byte_string.decode(encoding, errors="ignore")

            # 2. Quality check, rule-based
            if not gopher_quality_filter(content):
                continue

            # 3. Identify language
            lang, lang_score = identify_language(content)
            if lang != "en" or lang_score < SCORE_LANG:
                continue

            # 4. Classify NSFW (previously called identify_language by mistake,
            #    so this check could never pass)
            nsfw, nsfw_score = classify_nsfw(content)
            if not nsfw.startswith("non-") or nsfw_score < SCORE_NSFW:
                continue

            # 5. Classify toxic speech
            # assumes the same (label, score) shape with a "non-" prefixed
            # benign label as the NSFW classifier — TODO confirm
            toxic, toxic_score = classify_toxic_speech(content)
            if not toxic.startswith("non-") or toxic_score < SCORE_TOXIC:
                continue

            sink.write(content + "\n\n")
            n_kept += 1

    return f"{input_path}: kept {n_kept} of {n_conversion} conversion records"


In [64]:
# Scratch check: is the current `record` a WET conversion record?
record.record_type == WarcRecordType.conversion

True

In [70]:
# Scratch check: show the WARC headers of the current `record`
# (the page URL is under 'WARC-Target-URI').
record.headers

(('WARC-Type', 'conversion'), ('WARC-Target-URI', 'http://96-hi.com/index.phtml?PUT=gift_send&AID=207437&FID=1381841'), ('WARC-Date', '2025-04-17T14:47:56Z'), ('WARC-Record-ID', '<urn:uuid:ade66f60-c05b-4e80-8020-fed0882340e0>'), ('WARC-Refers-To', '<urn:uuid:191653f5-aefc-4a64-b8d1-e6e78c8ce753>'), ('WARC-Block-Digest', 'sha1:IHBR2RSEACFSWCDACZOHOAZZCSY6PNMX'), ('WARC-Identified-Content-Language', 'zho,eng'), ('Content-Type', 'text/plain'), ('Content-Length', '1413'))

In [None]:
# Minimum classifier confidence required to keep a document.
LANG_BAR = 0.90
NSFW_BAR = 0.90
TOXIC_BAR = 0.90

def process_single_wet_file(input_path: str, output_path: str) -> str:
    """
    Process a single WET file with language, toxicity, and NSFW filtering.

    Conversion records are kept when their URL passes ``should_filter_url``,
    the text has at least 100 non-whitespace-trimmed characters, and the
    language, NSFW and toxicity classifiers all accept it. Kept documents are
    written to ``output_path`` (gzip, UTF-8), each followed by a blank line.

    Args:
        input_path: Path to a gzip-compressed WET file.
        output_path: Path of the gzip text file to create.

    Returns:
        A multi-line, human-readable summary of the per-stage counts.
        Errors on individual records are printed and the record is skipped.
    """
    import gzip
    from fastwarc.warc import ArchiveIterator, WarcRecordType

    stats = {
        'total_records': 0,
        'conversion_records': 0,
        'passed_language': 0,
        'passed_nsfw': 0,
        'passed_toxic': 0,
        'final_kept': 0,
        'total_chars_kept': 0
    }

    with gzip.open(input_path, 'rb') as input_file, \
         gzip.open(output_path, 'wt', encoding='utf-8') as output_file:

        for record in ArchiveIterator(input_file):
            stats['total_records'] += 1

            # Only process conversion records (WET content)
            if record.record_type != WarcRecordType.conversion:
                continue
            stats['conversion_records'] += 1

            try:
                # URL filtering first: it is the cheapest test and avoids
                # reading/decoding the body of records that are dropped anyway.
                url = record.headers.get('WARC-Target-URI', '')
                if should_filter_url(url):
                    continue

                # `reader` is an attribute of the record, not a method; calling
                # it raised on every record and nothing was ever kept.
                text = record.reader.read().decode('utf-8', errors='ignore')

                # Skip very short texts
                if len(text.strip()) < 100:
                    continue

                # Language identification
                lang, lang_score = identify_language(text)
                if lang != 'en' or lang_score < LANG_BAR:
                    continue
                stats['passed_language'] += 1

                # NSFW classification
                # assumes the classifier returns (label, confidence) with the
                # benign label prefixed "non-" — TODO confirm against
                # cs336_data.harmful_content
                nsfw_label, nsfw_score = classify_nsfw(text)
                if not nsfw_label.startswith('non-') or nsfw_score < NSFW_BAR:
                    continue
                stats['passed_nsfw'] += 1

                # Toxicity classification (same assumed return convention)
                toxic_label, toxic_score = classify_toxic_speech(text)
                if not toxic_label.startswith('non-') or toxic_score < TOXIC_BAR:
                    continue
                stats['passed_toxic'] += 1

                # All filters passed - write to output
                output_file.write(text + '\n\n')
                stats['final_kept'] += 1
                stats['total_chars_kept'] += len(text)

                # Progress indicator
                if stats['final_kept'] % 100 == 0:
                    print(f"Kept {stats['final_kept']} documents so far...")

            except Exception as e:
                # Deliberately best-effort: report and skip problematic records
                print(f"Error processing record: {e}")
                continue

    # Return summary (max(..., 1) guards the rates against division by zero)
    summary = f"""
Processing complete for {input_path}:
- Total records: {stats['total_records']}
- Conversion records: {stats['conversion_records']}
- Passed language filter: {stats['passed_language']}
- Passed NSFW filter: {stats['passed_nsfw']}
- Passed toxicity filter: {stats['passed_toxic']}
- Final documents kept: {stats['final_kept']}
- Total characters kept: {stats['total_chars_kept']:,}
- Filter rates: Lang={stats['passed_language']/max(stats['conversion_records'],1):.2%}, NSFW={stats['passed_nsfw']/max(stats['passed_language'],1):.2%}, Toxic={stats['passed_toxic']/max(stats['passed_nsfw'],1):.2%}
"""

    return summary

In [5]:
from fastwarc.warc import ArchiveIterator, WarcRecordType
from tldextract import TLDExtract

# Absolute path to one locally available WET file used for exploration.
wet_file = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/CC-MAIN-20250417135010-20250417165010-00065.warc.wet.gz"

# Record iterator for stepping through the file by hand with next().
# NOTE(review): the file handle opened here is not closed in this cell.
iterator = ArchiveIterator(open(wet_file, 'rb'))

In [45]:
# Advance to the next record; each re-run of this cell moves one record forward,
# so `record` depends on how many times the cell has been executed.
record = next(iterator)
record.record_type

<WarcRecordType.conversion: 128>

In [46]:
# Scratch check: display the enum member that marks WET conversion records.
WarcRecordType.conversion

<WarcRecordType.conversion: 128>

In [None]:
# Test the function on a single file
# (smoke test: one local WET file in, one filtered gzip text file out)
test_input = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/CC-MAIN-20250417135010-20250417165010-00065.warc.wet.gz"
test_output = "/tmp/test_filtered.txt.gz"

# Run the processing function and show the summary string it returns
result = process_single_wet_file(test_input, test_output)
print(result)

In [None]:
# Process all downloaded WET files in parallel
from concurrent.futures import ThreadPoolExecutor, as_completed
import glob

def process_all_wet_files(wet_dir: str, output_dir: str, max_workers: int = N_CPU):
    """Run ``process_single_wet_file`` over every ``*.warc.wet.gz`` in ``wet_dir``.

    Each input ``<name>.gz`` produces ``<name>_filtered.txt.gz`` in
    ``output_dir`` (created if missing). Files are handled by a thread pool
    of ``max_workers`` threads; every file's summary, or its failure, is
    printed as soon as that file finishes. Nothing is returned.
    """
    # Discover the inputs
    wet_files = glob.glob(f"{wet_dir}/*.warc.wet.gz")
    print(f"Found {len(wet_files)} WET files to process")

    # Make sure the destination exists
    out_root = Path(output_dir)
    out_root.mkdir(exist_ok=True, parents=True)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # One job per input; .stem strips only the trailing ".gz"
        futures = {
            executor.submit(
                process_single_wet_file,
                src,
                str(out_root / f"{Path(src).stem}_filtered.txt.gz"),
            ): src
            for src in wet_files
        }

        # Report in completion order, not submission order
        for finished in as_completed(futures):
            src_name = Path(futures[finished]).name
            try:
                result = finished.result()
                print(f"✓ Completed: {src_name}")
                print(result)
                print("-" * 80)
            except Exception as e:
                print(f"✗ Failed: {src_name} - {e}")

# Example usage (uncomment to run):
# wet_directory = "/home/azureuser/mount/CC"  # Where your downloaded files are
# output_directory = "/home/azureuser/mount/CC_filtered"  # Where to save filtered files
# process_all_wet_files(wet_directory, output_directory)

In [None]:
# URL filtering strategies using tldextract
from tldextract import TLDExtract

# Create the extractor instance
extractor = TLDExtract()

def should_filter_url(url: str) -> bool:
    """
    Return True if URL should be filtered out (removed).

    A URL is removed when it is empty or unparsable, when its public suffix
    is not in the allow-list, when its registered domain is on one of the
    blocklists (blog hosts, adult, social media, e-commerce), when its
    subdomain is a known ad/tracking name, or when the URL contains an
    ad, tracking, referral or navigation-page pattern.
    """
    if not url:
        return True

    try:
        extracted = extractor(url)  # Use the shared module-level instance
        domain = extracted.domain.lower()
        suffix = extracted.suffix.lower()
        subdomain = extracted.subdomain.lower()

        # 1. Filter by top-level domain (keep only certain TLDs)
        # NOTE(review): tldextract reports multi-label suffixes in full
        # ('co.uk', 'com.au'), so 'au' here presumably only matches bare
        # `.au` registrations — confirm this is intended.
        allowed_tlds = {'com', 'org', 'edu', 'gov', 'net', 'co.uk', 'ca', 'au'}
        if suffix not in allowed_tlds:
            return True

        # 2.-5. Filter by registered domain
        blocked_domains = {
            # Known low-quality or spam domains (personal blogs)
            'blogspot', 'wordpress', 'tumblr', 'livejournal', 'geocities',
            'angelfire', 'weebly', 'wix', 'squarespace', 'medium',
            # Adult/inappropriate domains
            'pornhub', 'xvideos', 'redtube', 'youporn', 'xhamster',
            'tube8', 'spankbang', 'chaturbate', 'cam4', 'livejasmin',
            # Social media/forum content (often low quality)
            'facebook', 'twitter', 'instagram', 'tiktok', 'snapchat',
            'reddit', '4chan', '8chan', 'discord', 'telegram',
            # E-commerce/shopping sites (product descriptions)
            'amazon', 'ebay', 'alibaba', 'aliexpress', 'etsy',
            'shopify', 'walmart', 'target', 'bestbuy',
        }
        if domain in blocked_domains:
            return True

        # 6. Filter suspicious subdomains (exact match on the whole subdomain)
        suspicious_subdomains = {
            'ads', 'ad', 'advertisement', 'promo', 'spam',
            'affiliate', 'click', 'tracker', 'analytics'
        }
        if subdomain in suspicious_subdomains:
            return True

        # 7. High-quality domains (wikipedia, bbc, cnn, reuters, nytimes, mit,
        #    stanford, harvard, cambridge, oxford, nasa, nih, cdc, who, unesco)
        #    could be prioritized here; the list was never used, so it is kept
        #    only as this note instead of being rebuilt on every call.

        # 8. Filter by URL patterns
        url_lower = url.lower()
        blocked_patterns = (
            '/ads/', '/advertisement/', '/promo/', '/affiliate/',
            'utm_source', 'utm_medium', 'utm_campaign',  # Tracking URLs
            '?ref=', '&ref=', 'referrer=',
            '/category/', '/tag/', '/search?',  # Navigation pages
        )
        if any(pattern in url_lower for pattern in blocked_patterns):
            return True

        return False

    except Exception:
        # Filter out URLs that can't be parsed
        return True

# Example usage in your processing function:
def process_single_wet_file_with_url_filter(input_path: str, output_path: str) -> str:
    """Placeholder showing where URL filtering belongs in the WET pipeline.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    # Intended integration point: right after the URL has been read from the
    # record headers and before any of the other filters run.
    #     url = record.headers.get('WARC-Target-URI', '')
    #     if should_filter_url(url):
    #         continue
    return None

# Quick demonstration of the URL filter on a few hand-picked URLs.
print("URL filtering functions defined!")
print("Example:")
test_urls = [
    "https://en.wikipedia.org/wiki/Machine_Learning",
    "https://myblog.blogspot.com/2023/random-post",
    "https://www.amazon.com/product/12345",
    "https://ads.google.com/click?id=123",
]

# One line per URL: the URL followed by the filter's verdict.
for url in test_urls:
    verdict = 'FILTER' if should_filter_url(url) else 'KEEP'
    print(f"{url}: {verdict}")