# **Incremental Ingestion**
- Description: Incrementally ingests data from Common Crawl for the most recent crawl, using boto3.

---

## 1. Import helper files
- Add the helper package directory to `sys.path` and import everything from `helpers`

In [0]:
import sys

# Make the shared helper package importable from this notebook by putting its
# source directory on the module search path.
sys.path.append(
    '/Workspace/Shared/gu_census_crawl/common_crawl/cc_segement_ingestion/src/cc_segement_ingestion'
)

# NOTE(review): the star import hides where later names come from; `boto3` and
# `cc` are used below without an explicit import, so they are presumably
# provided by `helpers` — confirm.
from helpers import *

## 2. Set secret variables
- Read the AWS credentials used for S3 access from the `aws_cc` secret scope

In [0]:
# Pull both halves of the AWS credential pair from the `aws_cc` secret scope so
# that no key material is hard-coded in the notebook.
aws_access_key_id, aws_secret_access_key = (
    dbutils.secrets.get(scope='aws_cc', key=secret_name)
    for secret_name in ('aws_access_key_id', 'aws_secret_access_key')
)

## 3. Initialize boto3 client
- Create the boto3 S3 client

In [0]:
# Build the S3 client with the credentials read from the secret scope; the
# region is pinned explicitly in the call.
s3 = boto3.client(
    service_name="s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name="us-east-1",
)

## 4. Read most recent crawl
- Get the most recent crawl from the silver-layer table
- Sort by `master_index` in descending order
- Store it in a Python variable

In [0]:
# Look up the newest crawl identifier recorded in the silver table: sort by
# `master_index` descending and take the top row.
_latest_crawl_row = (
    spark.read.table("census_bureau_capstone.silver.cleaned_master_crawls_2025")
    .select('master_index')
    .orderBy('master_index', ascending=False)
    .first()
)

# `first()` returns None for an empty table; fail with a clear message instead
# of the bare IndexError that indexing an empty `collect()` result would raise.
if _latest_crawl_row is None:
    raise ValueError(
        "census_bureau_capstone.silver.cleaned_master_crawls_2025 is empty; "
        "cannot determine the most recent crawl"
    )

most_recent_crawl = _latest_crawl_row['master_index']

display(most_recent_crawl)

## 5. Create recent crawl df
- Utilize the `cc` class from the helper functions
- Call its `list_crawls` method with the most recent crawl as the argument to retrieve data from Common Crawl

In [0]:
# Fetch the data for the most recent crawl through the helper's
# `cc.list_crawls`.
# NOTE(review): `cc` is not defined in this notebook — presumably supplied by
# the `helpers` star import; confirm.
df_recent_crawl = cc.list_crawls(most_recent_crawl)

In [0]:
# Preview the helper's result before it is converted to a Spark DataFrame.
display(df_recent_crawl)

In [0]:
from pyspark.sql.functions import lit

# Convert the helper's result into a Spark DataFrame and tag every row with the
# crawl it came from.
# NOTE: this rebinds `df_recent_crawl`, so the cell should not be re-run on its
# own — re-run the `cc.list_crawls` cell first.
df_recent_crawl = (
    spark.createDataFrame(df_recent_crawl)
    .withColumn("crawl_prefix", lit(most_recent_crawl))
)

## 6. Get Count

In [0]:
# Row count of the most recent crawl's DataFrame; `count()` is a Spark action,
# so this line runs a job.
total_rows = df_recent_crawl.count()
print(f'Total rows in most recent crawl: {total_rows}')

## 7. Check duplicates

In [0]:
from pyspark.sql.functions import col

# Count how many distinct `Key` values occur more than once in the most recent
# crawl: group by key, keep groups with more than one row, count the groups.
duplicates_key = (
    df_recent_crawl
    .groupBy("Key")
    .count()
    .filter(col("count") > 1)
    .count()
)

# The figure is a number of keys, not a number of rows, so label it as such.
print(f'Total duplicated keys in most recent crawl: {duplicates_key}')

## 8. Read all Crawls from Bronze

In [0]:
# Current contents of the bronze table; used below both to detect keys that are
# already ingested and as the base of the updated table.
df_bronze = spark.read.table("census_bureau_capstone.bronze.raw_all_crawls")

## 9. Update recent crawls 
- Use a `left_anti` join to keep only the rows whose `Key` is not already in bronze


In [0]:
# Keep only the recent-crawl rows whose `Key` is not yet in bronze (left anti
# join), projected onto the columns that are unioned with bronze below.
df_new = df_recent_crawl.join(
    df_bronze.select("Key"), on="Key", how="left_anti"
).select(
    "Key",
    "LastModified",
    "ETag",
    "Size",
    "StorageClass",
    "ChecksumAlgorithm",
    "crawl_prefix",
)

# Existing bronze rows plus the newly discovered ones, matched by column name
# rather than by position.
df_bronze_updated = df_bronze.unionByName(df_new)

In [0]:
# Preview the rows that are new relative to bronze.
display(df_new)

In [0]:
# Preview bronze with the new rows unioned in.
display(df_bronze_updated)

## 10. Write to bronze table

In [0]:
# Append only the rows that are not yet in bronze instead of overwriting the
# table with `df_bronze_updated`. That DataFrame is built from this same table,
# so overwriting with it re-reads and rewrites every existing row on each run,
# and Spark rejects overwriting a table that is also being read from for
# non-Delta table formats. Appending `df_new` leaves the table with the same
# rows as the union would.
# NOTE(review): assumes `df_new`'s column names and types match the bronze
# table (the `unionByName` above already requires matching names) — confirm.
(df_new
 .write
 .mode("append")
 .saveAsTable("census_bureau_capstone.bronze.raw_all_crawls")
)
