# End-to-End WET Files Pipeline
---


## 1. Import Libraries


In [0]:
import sys
sys.path.append(
    '/Workspace/Shared/gu_census_crawl/common_crawl/nlp_pipeline/src/wet_file_extraction_pipeline'
)

# Import all modules
from helpers import (
    sample_crawls, get_key_list, copy_wet_files,
    convert_wet_dataframe, filter_by_domain, filter_by_keywords, keyword_string,
    KEYWORDS_PRODUCT, KEYWORDS_GEOGRAPHIC, KEYWORDS_SUBJECT
)
from utils import get_s3_client, read_wet_crawls, list_s3_files, read_text_files
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
from pyspark.sql import Row
import time


## 2. STEP 1: EXTRACTION - Read and Sample Crawls


In [0]:
# Stage banner: an 80-character rule above and below the step title.
print("=" * 80, "STEP 1: Reading and Sampling WET Crawls", "=" * 80, sep="\n")

# Load the listing of available WET crawls through the project helper
# (the original comment says this comes from the bronze layer).
df_all_crawls = read_wet_crawls(spark)

# Report how many crawls are available before any sampling happens,
# then render the listing for inspection.
total = df_all_crawls.count()
print(f"Total WET crawls available: {total:,}")
display(df_all_crawls)


In [0]:
# Sample crawls per year: `limit=2` is passed to the helper for each of the
# years listed below.
years = ['2021', '2022', '2023', '2024', '2025']
df_samples = sample_crawls(df_all_crawls, years, limit=2)

# Stop the notebook early when there is nothing to process.
# - An empty result is treated the same as None: the following stage builds a
#   Spark DataFrame from df_samples, which cannot work without rows.
# - dbutils.notebook.exit() is the supported way to end a Databricks notebook
#   run; the interactive exit() helper is not meant for notebook control flow.
if df_samples is None or len(df_samples) == 0:
    print("No samples found. Exiting.")
    dbutils.notebook.exit("No samples found.")

print(f"Sampled {len(df_samples)} crawls")
display(df_samples)


## 3. STEP 2: EXTRACTION - Save Samples to Bronze


In [0]:
# Stage banner (leading blank line separates it from the previous step's output).
print("\n" + "=" * 80, "STEP 2: Saving Samples to Bronze Table", "=" * 80, sep="\n")

# Turn the sampled crawls into a Spark DataFrame and append them to the
# bronze table. Append mode means every run adds rows to what is already there.
df_samples_spark = spark.createDataFrame(df_samples)
(
    df_samples_spark.write
    .mode("append")
    .saveAsTable("census_bureau_capstone.bronze.sample_size_raw")
)
print("Samples saved to bronze.sample_size_raw")


## 4. STEP 3: EXTRACTION - Copy WET Files from Common Crawl to Destination


In [0]:
# Stage banner.
print("\n" + "=" * 80, "STEP 3: Copying WET Files from Common Crawl", "=" * 80, sep="\n")

# Derive the object keys from the sampled crawls, build an S3 client from the
# notebook's dbutils handle, and copy the objects from the "commoncrawl"
# bucket into the "mydbxbucketpractice" bucket.
key_list = get_key_list(df_samples)
s3 = get_s3_client(dbutils)
copy_wet_files(
    s3,
    key_list,
    "commoncrawl",
    "mydbxbucketpractice",
)


## 5. STEP 4: TRANSFORMATION - Read and Parse WET Files


In [0]:
# Stage banner.
print("\n" + "=" * 80, "STEP 4: Reading and Parsing WET Files", "=" * 80, sep="\n")

# Enumerate the copied WET files under the destination prefix and load them
# as raw text.
file_paths = list_s3_files(
    dbutils,
    "mydbxbucketpractice",
    "common_crawl/wet_files/",
)
df_raw = read_text_files(spark, file_paths)

# Baseline record count before any parsing; reused later in the run summary.
total_records_raw = df_raw.count()
print(f"Total records in raw data: {total_records_raw:,}")


In [0]:
# Parse the raw text into structured records via the project helper.
df_clean = convert_wet_dataframe(df_raw)

# Record count after parsing; reused later in the run summary.
total_clean_count = df_clean.count()
print(f"Total records after parsing: {total_clean_count:,}")

# Preview only the first ten parsed records.
display(df_clean.limit(10))


## 6. STEP 5: TRANSFORMATION - Language Detection


In [0]:
# Stage banner.
print("\n" + "=" * 80, "STEP 5: Language Detection", "=" * 80, sep="\n")

# Tag every record with a language status, read from the "Content" column and
# written to "lang_status".
# NOTE(review): presumably the helper needs a fastText model to be available —
# confirm how it is located.
from language_detection import detect_language_status_fasttext_with_fallback

df_clean = detect_language_status_fasttext_with_fallback(
    df_clean,
    text_col="Content",
    out_col="lang_status",
    add_bool=False,
)

# Number the rows in Record-ID order, then keep only records whose
# Content-Length (cast to int) exceeds 30. Numbering happens before the
# filter, so the surviving row_num values may have gaps.
window_spec = Window.orderBy("Record-ID")
df_clean = df_clean.withColumn("row_num", row_number().over(window_spec))
df_clean = df_clean.filter(col("Content-Length").cast("int") > 30)

# NOTE(review): no filter on "lang_status" is applied in this cell, so the
# count below covers every language status despite its name. The original
# cell kept the English-only filter disabled:
#     df_clean.filter(col("lang_status") == "english_only")
# Confirm whether it should be enabled.
total_english_records_found = df_clean.count()
print(f"Total records after filtering: {total_english_records_found:,}")
display(df_clean)


## 7. STEP 6: TRANSFORMATION - Filter by Domain


In [0]:
# Stage banner.
print(
    "\n" + "=" * 80,
    "STEP 6: Filtering by Domain (.com, .org, .edu, .gov)",
    "=" * 80,
    sep="\n",
)

# Apply the project's domain filter to the cleaned records and report how
# many remain.
df_filter = filter_by_domain(df_clean)
domain_filter_count = df_filter.count()
print(f"Records after domain filtering: {domain_filter_count:,}")
display(df_filter)


## 8. STEP 7: TRANSFORMATION - Filter by Product Keywords


In [0]:
# Stage banner.
print("\n" + "=" * 80, "STEP 7: Filtering by Product Keywords", "=" * 80, sep="\n")

# Keep the domain-filtered records that match the product keyword list and
# report how many were found.
df_top_1 = filter_by_keywords(df_filter, KEYWORDS_PRODUCT)
total_products_found = df_top_1.count()
print(f"Records matching product keywords: {total_products_found:,}")
display(df_top_1)


## 9. STEP 8: SAVE RESULTS


In [0]:
# Stage banner.
print("\n" + "=" * 80, "STEP 8: Saving Results", "=" * 80, sep="\n")

# Persist the keyword-matched records to the silver layer. Append mode means
# every run adds rows to what is already in the table.
(
    df_top_1.write
    .mode("append")
    .saveAsTable("census_bureau_capstone.silver.census_product_cleaned")
)
print("Saved to silver.census_product_cleaned")


In [0]:
# Build a one-row summary of the counts gathered by the earlier stages.
summary_df = spark.createDataFrame(
    [
        Row(
            total_products_found=total_products_found,
            total_clean_count=total_clean_count,
            total_english_records_found=total_english_records_found,
            total_records_processed=total_records_raw,
        )
    ]
)
display(summary_df)

# Append the summary row to the gold table, one row per pipeline run.
(
    summary_df.write
    .mode("append")
    .saveAsTable("census_bureau_capstone.gold.nlp_pipeline_summary")
)
print("Summary saved to gold.nlp_pipeline_summary")


In [0]:
# Duplicate check on the silver table: count how many distinct Record-ID
# values occur more than once (not the number of surplus rows).
df_check = spark.read.table("census_bureau_capstone.silver.census_product_cleaned")
total_duplicates = (
    df_check
    .groupBy("Record-ID")
    .count()
    .filter(col("count") > 1)
    .count()
)
print(f"Total duplicates in silver table: {total_duplicates}")


## 10. STEP 9: CLEANUP - Remove Temporary Files


In [0]:
# Stage banner.
print("\n" + "=" * 80, "STEP 9: Cleaning Up Temporary Files", "=" * 80, sep="\n")

# Delete each WET file listed during the read stage, one path at a time.
for file_path in file_paths:
    dbutils.fs.rm(file_path)
print("Temporary files removed")

# Closing banner.
print("\n" + "=" * 80, "PIPELINE COMPLETE!", "=" * 80, sep="\n")
