## Parse and extract text from a `warc` file
1. With `fastwarc` and `resiliparse`
    1. https://resiliparse.chatnoir.eu/en/stable/index.html

In [None]:
from fastwarc import ArchiveIterator
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.encoding import detect_encoding

# Path to the gzipped WARC file that the rest of this notebook reads from.
warc_file = "/home/azureuser/localfiles/cs336-assignment4-data-mine/cs336_data/CC-MAIN-20250417135010-20250417165010-00065.warc.gz"
# Shared record iterator over the archive. The filter keeps only records whose
# 'WARC-Identified-Payload-Type' header equals 'text/html'.
# NOTE(review): the file handle is opened inline and never closed explicitly here.
iterator = ArchiveIterator(open(warc_file, "rb"), func_filter=lambda r: r.headers.get('WARC-Identified-Payload-Type') == 'text/html')

In [None]:
def extract_text(record):
    """Decode a record's payload and return its plain-text content.

    The raw bytes are read from ``record.reader``, their encoding is guessed
    with ``detect_encoding`` and the decoded HTML is passed to
    ``extract_plain_text``.

    Args:
        record: An object exposing ``reader.read()`` that returns the payload
            bytes (the records yielded by ``ArchiveIterator`` in this file).

    Returns:
        The text extracted from the decoded HTML.
    """
    byte_string = record.reader.read()
    encoding = detect_encoding(byte_string)
    # The detected encoding is only a guess, so real-world pages regularly
    # contain bytes that are invalid for it. Replace undecodable bytes with
    # U+FFFD instead of letting one bad page raise UnicodeDecodeError and abort
    # the whole scan.
    html_content = byte_string.decode(encoding=encoding, errors="replace")
    return extract_plain_text(html_content)

# Smoke test: take the next record from the shared iterator and print its extracted text.
# Note: this advances `iterator`, so later cells continue from the following record.
record = next(iterator)
print(extract_text(record))

In [None]:
# To get random records for ensuing tests
# Method: Skip randomly through iterator
import random

def get_random_records(iterator, N=20, skip_prob=0.9):
    """Randomly sample records from ``iterator`` and extract their text.

    Each record is kept when ``random.random() > skip_prob``; kept records are
    converted to text immediately, while the record is still the iterator's
    current one.

    Args:
        iterator: Iterable of records accepted by ``extract_text``.
        N: Maximum number of samples to collect.
        skip_prob: Threshold compared against ``random.random()``; higher
            values skip more records.

    Returns:
        A list of ``(index, extracted_text)`` tuples, where ``index`` is the
        record's position in this pass over ``iterator``. The list is shorter
        than ``N`` if the iterator runs out first.
    """
    random_data = []
    if N <= 0:
        # Nothing requested: do not consume anything from the caller's iterator.
        return random_data

    for i, record in enumerate(iterator):
        if random.random() > skip_prob:
            # Read and process immediately while record is still fresh
            random_data.append((i, extract_text(record)))
            # Stop right after the last sample. Checking at the top of the
            # loop would first pull (and silently discard) one more record
            # from the shared iterator.
            if len(random_data) >= N:
                break
    return random_data

## Language identification

In [None]:
import fasttext
# Load the fastText model used for language identification below.
# The path "lid.176.bin" is relative to the current working directory.
model = fasttext.load_model("lid.176.bin")

# sanity check
model.predict("Hello world.")

In [None]:
def language_generator():
    """Step through ``random_data`` and print one language prediction per step.

    For every ``(index, text)`` pair in the module-level ``random_data`` this
    prints a header, a preview of at most 200 characters, and the top label and
    score returned by the module-level ``model``. It then yields ``None`` so
    the caller can advance one record at a time with ``next()``.
    """
    for i, body in random_data:
        print(f"=== Record {i} ===")
        # Long texts are previewed as their first 200 characters plus an ellipsis.
        if len(body) > 200:
            preview = body[:200] + "..."
        else:
            preview = body
        print(preview)
        # Newlines are replaced with spaces before the text is passed to the model.
        single_line = body.replace("\n", " ")
        lang, score = model.predict(single_line)
        print(f"Language: {lang[0]}, Score: {score[0]:.4f}")
        print("-" * 50)
        yield

# Get random records with extracted text
# (consumes records from the shared `iterator`; uses the default N and skip_prob)
random_data = get_random_records(iterator)

# Create the generator
# (nothing is printed yet: the body runs only when next() is called on it)
lang_gen = language_generator()


In [None]:
# Run this cell multiple times to get one result at a time
# Each call prints the next record's preview and predicted language;
# StopIteration is raised once every entry of random_data has been shown.
next(lang_gen)

## PII
### email address
- Length: The total length of an email address is capped at 320 characters, with 64 for the username and 255 for the domain.
- Spaces: Spaces are not allowed.
- Case sensitivity: Email addresses are generally not case-sensitive, meaning User@Example.com is the same as user@example.com.

- Special characters:
    - Periods (.), hyphens (-), and underscores (_) are often allowed in the local part.
    - They cannot be the first or last character of the local part and cannot appear consecutively (e.g., john..doe@example.com is invalid).
    - In the domain, hyphens are allowed but not at the beginning or end of a label (a part between periods). 

In [None]:
import re
# Local part: starts with an alphanumeric; every later alphanumeric may be
# preceded by at most one of ". _ -", so separators can never be leading,
# trailing or consecutive.
email_address_pattern_username = re.compile(r'[a-zA-Z0-9]([._-]?[a-zA-Z0-9]){0,63}')
# Domain label: alphanumeric at both ends, hyphens allowed only in between,
# at most 63 characters in total. The first character class previously also
# admitted ". _ -", which let labels start with a separator, and the nested
# unbounded quantifier could backtrack heavily on long inputs.
email_address_pattern_label = re.compile(r'[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?')

In [None]:
import re 
# Alphanumeric first character; every later alphanumeric may be preceded by at
# most one separator, so ". _ -" can never be leading, trailing or doubled.
_USERNAME_RE = re.compile(r'[a-zA-Z0-9](?:[._-]?[a-zA-Z0-9])*')


def is_valid_username(username: str) -> bool:
    """Return True if ``username`` is an acceptable email local part.

    Rules:
        1. Only ASCII letters, digits and the separators ``.``, ``_``, ``-``.
        2. Separators may not start or end the name, nor appear consecutively.
        3. Length is between 1 and 64 characters.

    Args:
        username: The text before the ``@`` of a candidate address.

    Returns:
        ``True`` when every rule holds, otherwise ``False``.
    """
    if not username or len(username) > 64:
        return False
    # fullmatch (not match) so that trailing junk such as "john!!" or
    # "john..doe" is rejected instead of being accepted on its valid prefix.
    return _USERNAME_RE.fullmatch(username) is not None

# One DNS label: alphanumeric at both ends, hyphens only in between, and at
# most 63 characters (the per-label limit from RFC 1035).
_DOMAIN_LABEL_RE = re.compile(r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?')


def is_valid_domain(domain: str) -> bool:
    """Return True if ``domain`` is an acceptable email domain.

    Rules:
        1. Total length is between 1 and 255 characters.
        2. At least two dot-separated labels, none of them empty.
        3. Each label uses only ASCII letters, digits and ``-``, does not start
           or end with ``-``, and is at most 63 characters long.
        4. The last label (TLD) is at least two characters, letters only.

    Args:
        domain: The text after the ``@`` of a candidate address.

    Returns:
        ``True`` when every rule holds, otherwise ``False``.
    """
    if not domain or len(domain) > 255:
        return False

    labels = domain.split('.')
    if len(labels) < 2:  # at least two labels
        return False

    # fullmatch (not match) so a label such as "exa_mple" or "bad-" is rejected
    # rather than accepted on its valid prefix. Empty labels (from a leading,
    # trailing or doubled dot) fail here too.
    if not all(_DOMAIN_LABEL_RE.fullmatch(label) for label in labels):
        return False

    # Last label (TLD) should be at least 2 letters
    tld = labels[-1]
    return len(tld) >= 2 and tld.isalpha()

def is_valid_email(candidate: str) -> bool:
    """Check whether ``candidate`` looks like a single email address.

    Cheap structural checks run first (non-empty, at most 320 characters,
    exactly one ``@``, at least one ``.``, both sides of the ``@`` non-empty).
    Only then are the two halves handed to ``is_valid_username`` and
    ``is_valid_domain``.

    Args:
        candidate: A whitespace-free token to test.

    Returns:
        ``False`` when a structural check fails; otherwise the combined result
        of the username and domain validators.
    """
    # Reject empty or over-long input up front.
    if not (candidate and len(candidate) <= 320):
        return False

    # Exactly one "@" and at least one "." somewhere in the token.
    if candidate.count("@") != 1 or "." not in candidate:
        return False

    # Safe to split on the single "@" established above.
    local_part, _, host = candidate.partition("@")
    if not (local_part and host):
        return False

    return is_valid_username(local_part) and is_valid_domain(host)

# Locates an email address anywhere in running text.
#   - lookbehind: do not start in the middle of a run of local-part characters
#     (this also keeps scanning linear on long alphanumeric blobs)
#   - local:  alphanumerics with single ". _ -" separators between them
#   - domain: one or more dot-terminated labels (alphanumeric ends, hyphens
#     inside, at most 63 chars) followed by an alphabetic TLD of 2+ letters
#   - lookahead: the TLD must not continue into more label characters
_EMAIL_CANDIDATE_RE = re.compile(
    r'(?<![A-Za-z0-9._-])'
    r'(?P<local>[A-Za-z0-9](?:[._-]?[A-Za-z0-9])*)'
    r'@'
    r'(?P<domain>(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,63})'
    r'(?![A-Za-z0-9-])'
)


def mask_email(text: str, mask_str: str = "|||EMAIL_ADDRESS|||") -> str:
    """Replace every email address found in ``text`` with ``mask_str``.

    Addresses are located by pattern rather than by splitting on whitespace,
    so an address next to punctuation (``"mail me at a@b.com."``,
    ``"(a@b.com)"``, ``"mailto:a@b.com"``) is still masked, and only the
    address itself is replaced; the surrounding punctuation and all original
    whitespace are preserved.

    Args:
        text: Arbitrary text that may contain email addresses.
        mask_str: Replacement inserted verbatim for each address.

    Returns:
        ``text`` with each address replaced by ``mask_str``.
    """

    def _mask(match):
        # Enforce the length limits: 64 characters for the local part and 255
        # for the domain (which together also bound the total at 320).
        if len(match.group("local")) > 64 or len(match.group("domain")) > 255:
            return match.group()
        return mask_str

    # A callable replacement keeps mask_str literal (no backslash or
    # group-reference interpretation by re.sub).
    return _EMAIL_CANDIDATE_RE.sub(_mask, text)

In [None]:
# Sample sentences: one with a single address, one that already contains the
# mask string, and one with two addresses.
# NOTE(review): presumably meant as inputs for mask_email; not used in this cell.
texts = [
    "Feel free to  contact me at test@gmail.com if you have any questions.",
    "Some datasets use the string |||EMAIL_ADDRESS||| to represent masked PII. ",
    "The instructors are pl@fakedomain.ai and spl@fakedomain.ai",
]

### US phone number
1. use pattern
    - (\+1\s*)? - optional `+1` followed by optional spaces
    - \(? - optional opening parenthesis
    - \d{3} - 3 digits
    - \)? - optional closing parenthesis
    - [\s-]? - at most one space or hyphen (the ? means zero or one)
    - \s* - zero or more additional spaces
    - \d{3} - 3 digits
    - [\s-]? - at most one space or hyphen
    - \s* - zero or more additional spaces
    - \d{4} - 4 digits

In [None]:
# Phone-number strings in several formats.
# NOTE(review): this list is immediately overwritten by the next assignment, so it has no effect.
numbers = ["2831823829", "(283)-182-3829", "(283) 182 3829", "283-182-3829",  "+1-283-182-3829", "+1283-182-3829 "]
# A single sentence containing two phone numbers.
numbers = ["call me at 2831823829 or (283)-182-3829"]

In [None]:
# US phone number:
#   (?<!\d)                  not in the middle of a longer digit run
#   (?:\+1[\s-]?\s*)?        optional "+1", optionally followed by a hyphen/spaces
#   (?:\(\d{3}\)|\d{3})      area code, either fully parenthesised or bare
#   [\s-]?\s*                at most one hyphen plus optional spaces
#   \d{3} [\s-]?\s* \d{4}    exchange and subscriber number
#   (?!\d)                   not followed by further digits
_PHONE_PATTERN = re.compile(
    r'(?<!\d)(?:\+1[\s-]?\s*)?(?:\(\d{3}\)|\d{3})[\s-]?\s*\d{3}[\s-]?\s*\d{4}(?!\d)'
)
# Kept so existing references to `pattern` right after this cell still work.
# mask_phone uses the private name so that a later rebinding of `pattern`
# (e.g. to an IP-address regex) cannot change what it masks.
pattern = _PHONE_PATTERN


def mask_phone(text: str, mask_str: str = "|||PHONE_NUMBER|||") -> str:
    """Replace every US phone number found in ``text`` with ``mask_str``.

    Args:
        text: Arbitrary text that may contain phone numbers.
        mask_str: Replacement inserted verbatim for each number.

    Returns:
        ``text`` with each phone number replaced by ``mask_str``.
    """
    # A callable replacement keeps mask_str literal (no backslash or
    # group-reference interpretation by re.sub).
    return _PHONE_PATTERN.sub(lambda _match: mask_str, text)

# Demo: embed each sample number twice in a sentence and print the masked result.
numbers = ["2831823829", "(283)-182-3829", "(283) 182 3829", "283-182-3829"]
for number in numbers:
    text = f"Feel free to contact me at {number} if you have any questions, remember {number}."
    print(mask_phone(text))

### IP address
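- Use boundaries such as `\b` to avoid matching things like:
    - 1.2.3.4.5 (too many octets) — note that `\b` alone does not reject this, because `.` is a non-word character; it needs an extra check that the match is not adjacent to a `.` followed by a digit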
    - version1.2.3.4 (prefix attached)
    - 1.2.3.4th (suffix attached)

In [None]:
# IPv4 address: four dot-separated octets, each in the range 0-255.
#   (?<!\w)(?<![0-9]\.)   not glued to a preceding word character or to a
#                         preceding "<digit>." (rejects "version1.2.3.4" and
#                         the tail of "1.2.3.4.5")
#   (?!\w|\.[0-9])        not followed by a word character or by ".<digit>"
#                         (rejects "1.2.3.4th" and the head of "1.2.3.4.5"),
#                         while still allowing a sentence-ending "."
# A plain \b is not enough here: "." is a non-word character, so \b would still
# accept the first four octets of "1.2.3.4.5".
pattern = re.compile(
    r'(?<!\w)(?<![0-9]\.)'
    r'(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}'
    r'(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])'
    r'(?!\w|\.[0-9])'
)