## Parse and extract text from a `WARC` file
1. With `fastwarc` and `resiliparse`
    1. https://resiliparse.chatnoir.eu/en/stable/index.html

In [None]:
from fastwarc import ArchiveIterator
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.encoding import detect_encoding

def extract_text(record):
    """Return the plain text of a WARC record's HTML payload.

    The record body is read as raw bytes, its character encoding is detected
    with ``detect_encoding``, and the decoded HTML is handed to
    ``extract_plain_text``.

    Raises:
        UnicodeDecodeError: if the bytes cannot be decoded with the detected
            encoding (nothing is caught here).
    """
    raw_payload = record.reader.read()
    html = raw_payload.decode(encoding=detect_encoding(raw_payload))
    return extract_plain_text(html)

# Common Crawl WARC shard explored by the cells below (machine-specific absolute path).
warc_file = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/CC-MAIN-20250417135010-20250417165010-00065.warc.gz"
# Only records whose 'WARC-Identified-Payload-Type' header equals 'text/html' pass the filter.
# NOTE(review): the file handle is never closed explicitly; it has to stay open because
# this iterator is consumed lazily by several later cells.
iterator = ArchiveIterator(open(warc_file, "rb"), func_filter=lambda r: r.headers.get('WARC-Identified-Payload-Type') == 'text/html')

# record = next(iterator)
# print(extract_text(record))

In [None]:
# To get random records for ensuing tests
# Method: Skip randomly through iterator
import random

def get_random_records(iterator, N=20, skip_prob=0.9):
    """Randomly sample up to ``N`` records and return their extracted text.

    Each record pulled from ``iterator`` is kept when ``random.random()``
    exceeds ``skip_prob`` (so roughly ``1 - skip_prob`` of them are kept).

    Args:
        iterator: iterable of records accepted by ``extract_text``. It is
            advanced in place and stops right after the N-th sampled record,
            so no extra record is pulled and thrown away.
        N: maximum number of samples; ``N <= 0`` returns ``[]`` without
            touching the iterator.
        skip_prob: probability of skipping any given record.

    Returns:
        A list of ``(position, text)`` tuples, where ``position`` is the
        zero-based index of the record within this call's iteration.
    """
    random_data = []
    if N <= 0:
        return random_data
    for i, record in enumerate(iterator):
        if random.random() <= skip_prob:
            continue
        try:
            # Read and process immediately while record is still fresh
            extracted_text = extract_text(record)
        except UnicodeDecodeError:
            # An undecodable page should not abort the whole sampling run.
            continue
        random_data.append((i, extracted_text))
        # Check the quota right after appending so the loop never pulls a
        # record from the shared iterator only to discard it.
        if len(random_data) >= N:
            break
    return random_data

## Language identification

In [None]:
import fasttext
# Load the fastText language-identification model; the path is relative to the
# notebook's working directory.
model = fasttext.load_model("lid.176.bin")

# sanity check: as the last expression of the cell, the prediction is displayed
model.predict("Hello world.")

In [None]:
def language_generator():
    """Step through ``random_data``, printing one language prediction per step.

    Reads the module-level ``random_data`` (``(index, text)`` pairs) and
    ``model`` when the generator is advanced, not when it is created. Every
    ``next()`` call prints a preview of at most 200 characters, the top
    predicted label and its score, then yields ``None``.
    """
    for record_index, body in random_data:
        preview = body if len(body) <= 200 else body[:200] + "..."
        print(f"=== Record {record_index} ===")
        print(preview)
        # Newlines are flattened to spaces before prediction
        # (presumably because the model expects single-line input).
        labels, scores = model.predict(body.replace("\n", " "))
        print(f"Language: {labels[0]}, Score: {scores[0]:.4f}")
        print("-" * 50)
        yield

# Get random records with extracted text.
# This advances the shared `iterator`, so later cells continue from wherever sampling stopped.
random_data = get_random_records(iterator)

# Create the generator; nothing is printed until `next(lang_gen)` is called.
lang_gen = language_generator()


In [None]:
# Run this cell multiple times to get one result at a time.
# Raises StopIteration once every entry of `random_data` has been shown.
next(lang_gen)

## PII
### email address
- Length: The total length of an email address is capped at 320 characters, with 64 for the username and 255 for the domain.
- Spaces: Spaces are not allowed.
- Case sensitivity: Email addresses are generally not case-sensitive, meaning User@Example.com is the same as user@example.com.

- Special characters:
    - Periods (.), hyphens (-), and underscores (_) are often allowed in the local part.
    - They cannot be the first or last character of the local part and cannot appear consecutively (e.g., john..doe@example.com is invalid).
    - In the domain, hyphens are allowed but not at the beginning or end of a label (a part between periods). 

### US phone number
1. Use a regex pattern built from these pieces, in order:
    - (\+1\s*)? - optional `+1` followed by optional spaces
    - \(? - optional opening parenthesis
    - \d{3} - 3 digits
    - \)? - optional closing parenthesis
    - [\s-]? - at most one space or hyphen (the ? means zero or one)
    - \s* - zero or more additional spaces
    - \d{3} - 3 digits
    - [\s-]? - at most one space or hyphen
    - \s* - zero or more additional spaces
    - \d{4} - 4 digits

### IP address
- Use `\b` to avoid matching things like:
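    - 1.2.3.4.5 (too many octets) — note: `\b` alone still matches `1.2.3.4` inside this string, because `.` is a non-word character; rejecting it needs lookarounds such as `(?<![\d.])` and `(?![\d.])`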
    - version1.2.3.4 (prefix attached)
    - 1.2.3.4th (suffix attached)

In [None]:
from mask_pii import mask_email, mask_phone, mask_ip

# Pull the next record from the shared iterator and show its extracted text.
# NOTE(review): next() raises StopIteration if the iterator is already exhausted.
record = next(iterator)
text = extract_text(record)
print(text)

In [None]:
# Run the three maskers back to back; each call returns the rewritten text
# together with a value that is compared against zero below.
text, mask_email_counts = mask_email(text)
text, mask_phone_counts = mask_phone(text)
text, mask_ip_counts = mask_ip(text)

# Only print the text when at least one masker reported a positive count.
if mask_email_counts > 0 or mask_phone_counts > 0 or mask_ip_counts > 0:
    print(text)

## Harmful content

In [None]:
import fasttext

# Rebinds the global `model` to the fastText classifier stored in the NSFW model file below.
model = fasttext.load_model("/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/jigsaw_fasttext_bigrams_nsfw_final.bin")


In [None]:
# Censored English sample for probing the classifier.
# NOTE: this assignment is dead code as written; `text` is overwritten a few lines below.
text = (
    "SUCK MY C*CK WIKIPEDIA EDITORS...F*CKING *SSH*LE DORKS. "
    "JUST TRYING TO MAKE THE SITE BETTER YOU UPTIGHT C*NTS"
)
# text = "Umm, theres no actual article for prostitution ring.  - Crunch Captain."
# Chinese-language sample; this is the value actually passed to the model.
text = "操你妈 你个傻逼 干死你"

# Last expression of the cell, so the notebook displays the prediction.
model.predict(text)

In [None]:
# Echo whatever sample is currently bound to `text`.
print(text)

In [None]:
from harmful_content import classify_nsfw, classify_toxic_speech

# Scan forward through the shared iterator until one record is flagged as
# NSFW or toxic, then print it together with both labels and scores.
found = False
# Iterating (rather than calling next() by hand) lets an exhausted iterator end
# the search cleanly instead of raising StopIteration out of the cell.
for record in iterator:
    try:
        text = extract_text(record)
    except UnicodeDecodeError:
        # Undecodable payload: skip it and keep scanning.
        continue
    is_nsfw, score_nsfw = classify_nsfw(text)
    is_toxic, score_toxic = classify_toxic_speech(text)

    if is_nsfw == "nsfw" or is_toxic == "toxic":
        print(text)
        print("="*80)
        print("Harmful content detected!")
        print(is_nsfw, is_toxic)
        print(score_nsfw, score_toxic)

        found = True
        break

if not found:
    print("No harmful content found before the iterator was exhausted.")


## Quality Classifier

- To get positive texts (restricting the URL list to English Wikipedia pages is arguably a bit of a cheat):
    - `cat enwiki-20240420-extracted_urls.txt | grep "https://en.wikipedia.org/wiki" > enwiki-20240420-extracted_urls_subset.txt`
    - `wget --tries=2 --timeout=5 -i enwiki-20240420-extracted_urls_subset.txt --warc-file=subsampled_positive_urls -O /dev/null`

In [None]:
from fastwarc import ArchiveIterator
from fastwarc.warc import WarcRecord
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.encoding import detect_encoding

def extract_text(record):
    """Return the plain text of a record's HTML payload, or ``""`` on decode failure.

    The payload bytes are decoded with the encoding reported by
    ``detect_encoding``; if that decoding raises ``UnicodeDecodeError`` the
    record is treated as having no text. Otherwise the decoded HTML is passed
    through ``extract_plain_text``.
    """
    payload = record.reader.read()
    guessed_encoding = detect_encoding(payload)
    try:
        markup = payload.decode(encoding=guessed_encoding)
    except UnicodeDecodeError:
        return ""
    else:
        return extract_plain_text(markup)


In [None]:
def is_html_record(record: "WarcRecord") -> bool:
    """Return True when the record's HTTP ``Content-Type`` starts with ``text/html``.

    Records without usable HTTP headers (missing attribute, ``None``, or a
    non-string header value) are reported as non-HTML instead of raising.

    Args:
        record: object exposing an ``http_headers`` mapping-like attribute.
    """
    headers = getattr(record, "http_headers", None)
    if headers is None:
        return False
    try:
        content_type = headers.get('Content-Type', '')
    except Exception:
        # Best-effort filter: a header lookup failure means "not HTML".
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate, so a long scan can still be interrupted.
        return False
    return isinstance(content_type, str) and content_type.startswith('text/html')

# WARC produced by the wget command above.
warc_file = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/subsampled_positive_urls.warc.gz"

from language_identification import identify_language

positive_texts = []
# The archive is consumed completely inside this cell, so the file handle is
# scoped with `with` and closed as soon as the loop finishes.
with open(warc_file, "rb") as warc_stream:
    iterator = ArchiveIterator(warc_stream)
    for record in iterator:
        if not is_html_record(record):
            continue
        text = extract_text(record)
        if len(text) <= 512:  # very low bar for wikipedia articles
            continue
        lang, score = identify_language(text)
        # Keep only confidently-English pages.
        if lang == "en" and score > 0.8:
            positive_texts.append(text)

print(f"Raw number of positive texts: {len(positive_texts)}")

In [None]:
import random
from language_identification import identify_language

def get_random_english_records_as_negative_texts(iterator, N=500, skip_prob=0.8):
    """Randomly sample up to ``N`` confidently-English texts from ``iterator``.

    Each record is considered when ``random.random()`` exceeds ``skip_prob``.
    A considered record is kept only if its extracted text is non-empty and
    ``identify_language`` reports ``"en"`` with a score above 0.9.

    Args:
        iterator: iterable of records accepted by ``extract_text``. It is
            advanced in place and stops right after the N-th accepted record.
        N: maximum number of texts; ``N <= 0`` returns ``[]`` without touching
            the iterator.
        skip_prob: probability of skipping any given record.

    Returns:
        A list of extracted text strings.
    """
    random_data = []
    if N <= 0:
        return random_data
    for record in iterator:
        if random.random() <= skip_prob:
            continue
        # Read and process immediately while record is still fresh
        extracted_text = extract_text(record)
        if not extracted_text:
            # Empty text (e.g. a page that failed to decode) cannot be
            # meaningfully language-identified.
            continue
        lang, score = identify_language(extracted_text)
        if lang == "en" and score > 0.9:
            random_data.append(extracted_text)
            # Stop right away so no further record is pulled and discarded.
            if len(random_data) >= N:
                break
    return random_data

# Same Common Crawl shard as at the top of the notebook; a fresh iterator restarts from its beginning.
warc_file = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/CC-MAIN-20250417135010-20250417165010-00065.warc.gz"
# Only records whose 'WARC-Identified-Payload-Type' header equals 'text/html' pass the filter.
# NOTE(review): the file handle is not closed; the iterator is only partly consumed here.
iterator = ArchiveIterator(open(warc_file, "rb"), func_filter=lambda r: r.headers.get('WARC-Identified-Payload-Type') == 'text/html')
# Uses the function defaults (N=500, skip_prob=0.8).
negative_texts = get_random_english_records_as_negative_texts(iterator)

print(f"Raw number of negative texts: {len(negative_texts)}")

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the same "distilbert/distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def chunk_text_simple(text, tokenizer, max_length=512, stride=64):
    """Tokenize ``text`` into fixed-length chunks via ``return_overflowing_tokens``.

    The tokenizer is called with truncation, ``padding="max_length"`` and
    ``return_overflowing_tokens=True``, so text that does not fit in one
    window comes back as several rows.

    Args:
        text: the string to tokenize.
        tokenizer: callable tokenizer returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: forwarded to the tokenizer as ``max_length``.
        stride: forwarded to the tokenizer as ``stride``.

    Returns:
        One dict per row, each holding plain-Python lists under
        ``'input_ids'`` and ``'attention_mask'``.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        stride=stride,
        truncation=True,
        return_overflowing_tokens=True,
        return_tensors="pt",
        padding="max_length",
    )

    # Convert both tensors to nested lists once, then pair the rows up.
    id_rows = encoded['input_ids'].tolist()
    mask_rows = encoded['attention_mask'].tolist()
    return [
        {'input_ids': ids, 'attention_mask': mask}
        for ids, mask in zip(id_rows, mask_rows)
    ]

In [None]:
# Tokenize every positive text and flatten the per-text chunk lists into one list.
positive_chunks = [
    piece
    for document in positive_texts
    for piece in chunk_text_simple(document, tokenizer)
]
print(len(positive_chunks))

In [None]:
# Tokenize every negative text and flatten the per-text chunk lists into one list.
negative_chunks = [
    piece
    for document in negative_texts
    for piece in chunk_text_simple(document, tokenizer)
]
print(len(negative_chunks))

In [None]:
# Shuffle chunks before splitting
import random

random.seed(42)  # for reproducibility
random.shuffle(negative_chunks)
random.shuffle(positive_chunks)

# Report how many chunks each class contributes.
n_neg, n_pos = len(negative_chunks), len(positive_chunks)

print(f"Total negative chunks: {n_neg}")
print(f"Total positive chunks: {n_pos}")

# Per class: the first 500 shuffled chunks are for training, the next 100 for
# validation. No test split is created; chunks past index 600 are unused.
train_neg, valid_neg = negative_chunks[:500], negative_chunks[500:600]
train_pos, valid_pos = positive_chunks[:500], positive_chunks[500:600]

# Create datasets
from datasets import Dataset


def _to_dataset(chunks, labels):
    """Pack tokenized chunks and their labels into a ``Dataset``."""
    return Dataset.from_dict({
        "input_ids": [item['input_ids'] for item in chunks],
        "attention_mask": [item['attention_mask'] for item in chunks],
        "label": labels,
    })


# Label 0 marks negative chunks, label 1 marks positive chunks.
train_chunks = train_neg + train_pos
train_labels = [0] * len(train_neg) + [1] * len(train_pos)

valid_chunks = valid_neg + valid_pos
valid_labels = [0] * len(valid_neg) + [1] * len(valid_pos)

ds_train = _to_dataset(train_chunks, train_labels)
ds_valid = _to_dataset(valid_chunks, valid_labels)

print(f"\nTrain: {len(ds_train)} samples (neg: {train_labels.count(0)}, pos: {train_labels.count(1)})")
print(f"Valid: {len(ds_valid)} samples (neg: {valid_labels.count(0)}, pos: {valid_labels.count(1)})")

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two-way label mapping; ids match the 0/1 labels used when building the datasets.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# NOTE: this rebinds the global `model`, replacing whatever was loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Fine-tuning configuration for the quality classifier.
training_args = TrainingArguments(
    # Checkpoints are written under this directory (relative to the working directory).
    output_dir="quality_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    # Evaluate and save once per epoch; both strategies are set to the same value.
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): presumably reloads the best-scoring checkpoint when training ends — confirm
    # against the Trainer documentation.
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Train on `ds_train` and evaluate on `ds_valid`, both built from the tokenized chunks above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    processing_class=tokenizer,
)

# Runs the fine-tuning loop; `model` is updated in place.
trainer.train()

In [None]:
# Load the high- and low-quality fixture texts. The encoding is given
# explicitly so the result does not depend on the platform's locale default
# (assumes the fixtures are UTF-8 — TODO confirm).
with open("/home/azureuser/localfiles/cs336-assignment4-data-mine/tests/fixtures/high_quality_wiki_reference.txt", encoding="utf-8") as f:
    hq_text = f.read()
with open("/home/azureuser/localfiles/cs336-assignment4-data-mine/tests/fixtures/low_quality_cc.txt", encoding="utf-8") as f:
    lq_text = f.read()

from transformers import pipeline

# hq_text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
# NOTE: this literal replaces the `hq_text` read from the fixture file above.
hq_text = 'Anarchism\nFirst published Tue Oct 3, 2017; substantive revision Tue Oct 26, 2021\nAnarchism is a political theory that is skeptical of the justification of authority and power. Anarchism is usually grounded in moral claims about the importance of individual liberty, often conceived as freedom from domination. Anarchists also offer a positive theory of human flourishing, based upon an ideal of equality, community, and non-coercive consensus building. Anarchism has inspired practical efforts at establishing utopian communities, radical and revolutionary political agendas, and various forms of direct action. This entry primarily describes “philosophical anarchism”: it focuses on anarchism as a theoretical idea and not as a form of political activism. While philosophical anarchism describes a skeptical theory of political legitimation, anarchism is also a concept that has been employed in philosophical and literary theory to describe a sort of anti-foundationalism. Philosophical anarchism can mean either a theory of political life that is skeptical of attempts to justify state authority or a philosophical theory that is skeptical of the attempt to assert firm foundations for knowledge.\n\n1. Varieties of Anarchism\nThere are various forms of anarchism. Uniting this variety is the general critique of centralized, hierarchical power and authority.'

# The pipeline task is named "sentiment-analysis", but the model passed in is the
# NEGATIVE/POSITIVE quality classifier fine-tuned above, using the in-memory weights.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
# classifier = pipeline("sentiment-analysis", model="quality_classifier/checkpoint-152")
# Last expression of the cell, so the notebook displays the prediction for `hq_text`.
classifier(hq_text)

In [None]:
# Display the low-quality fixture text loaded in the previous cell.
lq_text