# WET File Data Extraction
---

## 1. Import libraries 

In [0]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, lower, when, sum as spark_sum, collect_list, concat_ws, udf, explode
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
import re
import boto3
import botocore
import warnings
# Silence the UserWarning whose message matches "No Partition Defined for Window operation";
# convert_wet_dataframe builds a Window with orderBy and no partitionBy.
warnings.filterwarnings('ignore', category=UserWarning, message='.*No Partition Defined for Window operation.*')

import time

# Wall-clock start of the notebook run; the final cell prints the elapsed time from here.
time_start = time.time()

## 2. Connect to Boto3 

### 2.1 Set Secrets

In [0]:
# Fetch the AWS credentials from the 'aws_cc' secret scope via dbutils.secrets
# so that the key values are not written in the notebook source.
aws_access_key_id = dbutils.secrets.get(scope='aws_cc', key='aws_access_key_id')
aws_secret_access_key = dbutils.secrets.get(scope='aws_cc', key='aws_secret_access_key')

### 2.2 Initialize boto3 client

In [0]:
# Optional: build client once (faster)
# S3 client for region us-east-1, authenticated with the two secrets read above.
# NOTE(review): `s3` is not referenced by any later cell visible here (the WET
# file is read through spark.read.text) — confirm this client is still needed.
s3 = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

## 3. Text transformation & Basic Filtering

In [0]:
# Read raw WET file
# spark.read.text loads the file line by line; convert_wet_dataframe expects the
# resulting 'value' column (one row per line of the WET file).
df = spark.read.text(
    's3://mydbxbucketpractice/common_crawl/wet_files/CC-MAIN-20220629054527-20220629084527-00796.warc.wet.gz'
)

# Preview the raw lines
display(df)

### 3.1 Extract Header Function
- Finds and extracts specific metadata fields from WET record headers
- Example: Extracts the date, URL, or content type from the header section

In [0]:
# Function to extract header
def extract_header(text, header_name):
    """
    Extract a specific header value from WET headers.

    The header name must start a line, so asking for 'Content-Type' never
    picks up a longer header that merely ends in that name (for example
    'X-Content-Type'). The value is taken from the same line only; a header
    with nothing after the colon is treated as missing rather than returning
    the following line.

    Parameters:
    text (str): The text containing headers
    header_name (str): The name of the header to extract

    Returns:
    str: The header value with surrounding whitespace removed (and one pair of
         enclosing double quotes removed, if present), or None if the header
         is not found or its value is empty
    """
    if not text or not header_name:
        return None

    # ^ with re.MULTILINE anchors the name at a line start. [ \t]* (instead of
    # \s*) keeps the match on one line, and .*? accepts any character so a
    # value containing an interior double quote is still returned.
    pattern = rf'^{re.escape(header_name)}:[ \t]*(.*?)[ \t]*$'
    match = re.search(pattern, text, re.MULTILINE)
    if not match:
        return None

    # strip() also drops a trailing '\r' when the input uses CRLF line endings
    value = match.group(1).strip()

    # Remove quotes only when they enclose the whole value
    if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
        value = value[1:-1].strip()

    return value or None

### 3.2 Parse Single Record
- Takes one complete WET record and breaks it into organized fields
- Extracts: URL, date, language, content type, and the actual web page text
- Returns a structured dictionary with all extracted information

In [0]:
# Function to parse a single WET record
def parse_single_wet_record(record_text):
    """
    Break one WET record into its header fields and its text content.

    Parameters:
    record_text (str): A complete WET record as text

    Returns:
    dict: Keys 'Type', 'Target-URI', 'Date', 'Record-ID', 'Refers-To',
          'Block-Digest', 'Identified-Content-Language', 'Content-Type',
          'Content-Length' and 'Content'. Headers that are absent map to None.
          Returns None when the input is empty or not a string, when no
          Content-Length value is found, or when no Content-Length line
          followed by a blank line separates the headers from the content.
    """
    if not isinstance(record_text, str) or not record_text:
        return None

    # The Content-Length value is required; without it the record is rejected
    length_match = re.search(r'Content-Length:\s*(\d+)', record_text)
    if length_match is None:
        return None
    content_length = length_match.group(1)

    # Everything before the Content-Length line is header text; everything
    # after the blank line that follows it is the page content
    pieces = re.split(r'\nContent-Length:\s*\d+\s*\n\n', record_text, maxsplit=1)
    if len(pieces) < 2:
        return None
    header_block, body = pieces

    # The split consumed the Content-Length line, so append it back
    header_block = header_block + '\nContent-Length: ' + content_length

    # (output key, header name in the record)
    header_fields = (
        ('Type', 'WARC-Type'),
        ('Target-URI', 'WARC-Target-URI'),
        ('Date', 'WARC-Date'),
        ('Record-ID', 'WARC-Record-ID'),
        ('Refers-To', 'WARC-Refers-To'),
        ('Block-Digest', 'WARC-Block-Digest'),
        ('Identified-Content-Language', 'WARC-Identified-Content-Language'),
        ('Content-Type', 'Content-Type'),
    )

    record = {
        key: extract_header(header_block, header_name)
        for key, header_name in header_fields
    }
    record['Content-Length'] = content_length
    record['Content'] = body.strip()

    return record

### 3.3 Convert to Dataframe
- MAIN FUNCTION: Converts raw WET file into a clean, structured table
- Process:
  - Identifies where each web page record starts and ends in the file
  - Groups all lines belonging to each record together
  - Parses each record to extract metadata and content
  - Removes system records (warcinfo) that aren't actual web pages
- Input: Raw WET file loaded line-by-line
- Output: Table with one row per web page, columns for URL, date, content, etc.

In [0]:
# Main function that converts the text to columnar format
def convert_wet_dataframe(df):
    """
    Convert a Spark DataFrame (loaded via spark.read.text) containing WARC data
    into columnar format.

    Lines are numbered, every line equal to "WARC/1.0" starts a new record, and
    a running sum of those markers assigns each line to its record. The lines
    of a record are then joined in their original order and parsed with
    parse_single_wet_record.

    Parameters:
    df: Spark DataFrame with 'value' column containing lines from WARC file

    Returns:
    DataFrame: Parsed DataFrame with columns:
        Type, Target-URI, Date, Record-ID, Refers-To, Block-Digest,
        Identified-Content-Language, Content-Type, Content-Length, Content
        Records that fail to parse and records of type 'warcinfo' are removed.
    """
    from pyspark.sql.functions import monotonically_increasing_id, sort_array, struct

    # Add row number for ordering
    df_numbered = df.withColumn("row_id", monotonically_increasing_id())

    # Identify lines that start a new WARC record
    df_marked = df_numbered.withColumn(
        "is_record_start",
        when(col("value") == "WARC/1.0", 1).otherwise(0)
    )

    # Create record_id by cumulative sum.
    # The window has no partitionBy: the running count must span the whole file.
    window_spec = Window.orderBy("row_id").rowsBetween(Window.unboundedPreceding, Window.currentRow)
    df_grouped = df_marked.withColumn(
        "record_id",
        spark_sum("is_record_start").over(window_spec)
    )

    # Group all lines belonging to same record and concatenate with newlines.
    # collect_list gives no ordering guarantee after the groupBy shuffle, so each
    # line is collected together with its row_id and the array is sorted on
    # row_id (the first struct field) before the values are joined.
    df_records = (
        df_grouped.groupBy("record_id")
        .agg(
            sort_array(collect_list(struct("row_id", "value"))).alias("ordered_lines")
        )
        .select(
            "record_id",
            concat_ws("\n", col("ordered_lines.value")).alias("record_text"),
        )
    )

    # Filter out empty records
    df_records = df_records.filter(col("record_text") != "")

    # Define schema for parsed output
    record_schema = StructType([
        StructField('Type', StringType(), True),
        StructField('Target-URI', StringType(), True),
        StructField('Date', StringType(), True),
        StructField('Record-ID', StringType(), True),
        StructField('Refers-To', StringType(), True),
        StructField('Block-Digest', StringType(), True),
        StructField('Identified-Content-Language', StringType(), True),
        StructField('Content-Type', StringType(), True),
        StructField('Content-Length', StringType(), True),
        StructField('Content', StringType(), True)
    ])

    # UDF to parse each record
    parse_udf = udf(parse_single_wet_record, record_schema)

    # Parse records
    df_parsed = df_records.withColumn("parsed", parse_udf(col("record_text")))

    # Filter out null parsed records and expand struct
    df_final = df_parsed.filter(col("parsed").isNotNull()).select("parsed.*")

    # Remove warcinfo records.
    # NOTE(review): under SQL null semantics this comparison also drops rows
    # whose Type is null — confirm that is intended.
    df_final = df_final.filter(col("Type") != "warcinfo")

    return df_final

In [0]:
# Parse the raw lines into one row per WET record
df_clean = convert_wet_dataframe(df)

# Preview the first 10 parsed records
display(df_clean.limit(10))

In [0]:
# Number of parsed records.
# NOTE(review): df_clean is not cached in the visible cells, so each action
# presumably re-runs the parsing — confirm whether caching is wanted.
df_clean.count()

### 3.4 Basic Filtering
- Filters the parsed data to keep only relevant records
- Keeps only:
  * English-only content (no mixed languages)
  * Websites from .com, .org, .edu, or .gov domains
- Input: Parsed WET table
- Output: Filtered table with only English content from specified domains

In [0]:
# Function to apply basic filtering
def filter_wet_records(df):
    """
    Filter WET records to only include:
    - URLs with .com, .org, .edu, or .gov domains
    - Records where Identified-Content-Language is 'eng'

    The language test is an exact match after trimming and lower-casing, so
    records tagged with several languages are excluded.

    The domain test looks only at the host part of the URL: the pattern is
    anchored at the scheme, so a URL embedded in a query string cannot satisfy
    it. The host may be followed by an optional numeric port and then by '/',
    '?', '#', or the end of the URL. Matching is case-insensitive.

    Parameters:
    df: Parsed WET DataFrame with columns including 'Target-URI' and 'Identified-Content-Language'

    Returns:
    DataFrame: Filtered DataFrame
    """
    from pyspark.sql.functions import lower, trim

    # Filter for English content
    df_filtered = df.filter(
        (lower(trim(col("Identified-Content-Language"))) == "eng")
    )

    # Filter for specific domains (.com, .org, .edu, .gov)
    # Using regex to match domains properly (not matching .com.au, etc.)
    domain_pattern = (
        r'(?i)^[a-z][a-z0-9+.-]*://'   # scheme, anchored at the start of the URL
        r'[^/?#]*'                     # rest of the authority before the TLD
        r'\.(com|org|edu|gov)'         # accepted top-level domains
        r'(:[0-9]+)?'                  # optional port
        r'([/?#]|$)'                   # the host must end here
    )
    df_filtered = df_filtered.filter(col("Target-URI").rlike(domain_pattern))

    return df_filtered

In [0]:
# Keep only English records on .com/.org/.edu/.gov domains
df_filter = filter_wet_records(df_clean)

# Preview the first 10 filtered records
display(df_filter.limit(10))

In [0]:
# Number of records remaining after the language and domain filters
df_filter.count()

## 4. Keyword Filtering

### 4.1 Keyword Strategy

In [0]:
# Product-level keywords: names and web addresses that refer to the
# Census Bureau or to the American Community Survey
keywords_product = [
    "u.s. census bureau",
    "us census bureau",
    "uscb",
    "census bureau",
    "census.gov",
    "data.census.gov",
    "factfinder.census.gov",
    "american community survey",
]

# Geographic terms: words for the kind of area a statistic refers to
keywords_geographic = [
    "zip code",
    "zip codes",
    "census tract",
    "block group",
    "county",
    "state",
    "metropolitan area",
    "city",
    "radius",
    "geographic area",
    "location",
]



**Subject Keywords**:

The Census Bureau lists every subject covered by the American Community Survey on its website: <https://www.census.gov/programs-surveys/acs/guidance/subjects.html#descriptionaccordion-73370cfb1f-item-264b8c4d39>

In [0]:
# Define subject-level keywords
# Every entry ends with a comma: Python silently concatenates adjacent string
# literals, so a missing comma merges two keywords into one that never matches.
keywords_subject = [

    # Social keywords
    "ancestry",
    "citizen voting-age population",
    "citizenship status",
    "disability status",
    "educational attainment",
    "fertility",
    "grandparents as caregivers",
    "language spoken at home",
    "marital history",
    "marital status",
    "migration residence 1 year ago",
    "place of birth",
    "school enrollment",
    "undergraduate field of degree",
    "veterans status",
    "period of military service",
    "year of entry",

    # Economic keywords
    "class of worker",
    "commuting and place of work",
    "employment status",
    "food stamps",
    "supplemental nutrition assistance program",
    "health insurance coverage",
    "income and earnings",
    "industry",
    "occupation",
    "poverty status",
    "work status last year",

    # Housing keywords
    "bedrooms",
    "computer & internet use",
    "house heating fuel",
    "kitchen facilities",
    "occupancy/vacancy status",
    "occupants per room",
    "rent",
    "rooms",
    "selected monthly owner costs",
    "telephone service available",
    "tenure (owner/renter)",
    "units in structure",
    "value of home",
    "vehicles available",
    "year householder moved in unit",
    "year structure built",

    # Demographic keywords
    "age; sex",
    "group quarters population",
    "hispanic or latino origin",
    "race",
    "relationship to householder",
    "total population",
]

In [0]:
# Flattened list of all keywords (useful for regex or filtering)
def keyword_string(keyword_list):
    """
    Returns a text string for rlike filtering.

    Each keyword is escaped so that it is matched literally: characters such
    as '.', '(' and ')' would otherwise act as regex operators (for example
    'census.gov' would also match 'census-gov'). Empty keywords are skipped
    because an empty alternative matches every string.

    Parameters:
    keyword_list (list of str): Keywords to search for

    Returns:
    str: The escaped keywords joined with '|'; an empty string when no
         keywords are given
    """
    return '|'.join(re.escape(keyword) for keyword in keyword_list if keyword)


In [0]:
# Show the pattern string built from the product keywords
keyword_string(keywords_product)

### 4.2 Subject Filtering Strategy

In [0]:
# Bottom-up step 1: keep records whose lower-cased content matches any subject keyword
df_bottomup_1 = df_filter.filter(
    lower(col("Content")).rlike(keyword_string(keywords_subject)))

print(f"Row Count: {df_bottomup_1.count()}")

In [0]:
# Bottom-up step 2: of those, keep records that also match any geographic keyword
df_bottomup_2 = df_bottomup_1.filter(
    lower(col("Content")).rlike(keyword_string(keywords_geographic)))

print(f"Row Count: {df_bottomup_2.count()}")

In [0]:
# Bottom-up step 3: of those, keep records that also match any product keyword
df_bottomup_3 = df_bottomup_2.filter(
    lower(col("Content")).rlike(keyword_string(keywords_product)))

print(f"Row Count: {df_bottomup_3.count()}")

In [0]:
# Show the records that passed all three keyword filters
display(df_bottomup_3)

### 4.3 Product Filtering Strategy

In [0]:
# Top-down: filter the domain/language-filtered records by product keywords only
df_top_1 = df_filter.filter(
    lower(col("Content")).rlike(keyword_string(keywords_product)))

print(f"Row Count: {df_top_1.count()}")



In [0]:
# Report the wall-clock time elapsed since time_start was set in the first cell
time_end = time.time()
print(f"Elapsed time: {time_end - time_start:.2f} seconds")