# **S3 CC Data Ingestion**
***

## 1: Bronze Layer

### Step 1.1: Import libraries
- Import the `cc` (Common Crawl) class from the `src` folder
- Import all other required libraries

In [0]:
import sys

# Directory added to the module search path; presumably where the project's
# `helpers` module (imported in the next cell) lives -- confirm.
CC_SRC_DIR = '/Workspace/Shared/gu_census_crawl/common_crawl/cc_segement_ingestion/src/cc_segement_ingestion'

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if CC_SRC_DIR not in sys.path:
    sys.path.append(CC_SRC_DIR)

In [0]:
# Third-party and project-local imports used by this notebook.
import pandas as pd
# NOTE(review): star import -- the `cc` object used in later cells is not defined
# anywhere in this notebook, so it is presumably exported by `helpers`; confirm
# and consider importing it by name.
from helpers import *
import boto3

### Step 1.2: Set Secret Variables
- Retrieve the AWS credentials that are stored as secrets in Databricks utilities (`dbutils.secrets`)

In [0]:
# Fetch both AWS credentials from the "aws_cc" Databricks secret scope.
# Each secret key has the same name as the variable it is assigned to, and the
# lookups happen in the order listed.
aws_access_key_id, aws_secret_access_key = (
    dbutils.secrets.get(scope="aws_cc", key=secret_key)
    for secret_key in ("aws_access_key_id", "aws_secret_access_key")
)

In [0]:
# Optional: construct the S3 client a single time (faster than rebuilding it).
# NOTE(review): `s3` is not referenced by any later cell visible in this
# notebook -- confirm it is still needed.
_s3_client_options = dict(
    region_name="us-east-1",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
s3 = boto3.client("s3", **_s3_client_options)

In [0]:
# Fully qualified name of the silver-layer table with the cleaned master crawls.
MASTER_CRAWLS_TABLE = "census_bureau_capstone.silver.cleaned_master_crawls_2025"

# Load the table as a Spark DataFrame.
df_master_crawls_2025 = spark.read.table(MASTER_CRAWLS_TABLE)

# Materialise it as a pandas DataFrame (toPandas() collects every row onto the
# driver), which is the form passed to the crawl-listing helper below.
df_master_crawls_2025_pandas = df_master_crawls_2025.toPandas()

### Step 1.4: Call list crawls method
- Call the `list_crawls()` function from the helpers
- Utilize batching to iterate over and union the crawls from the filtered master crawls DataFrame

In [0]:
# Build the list of crawls by passing the pandas master-crawls frame to
# cc.batch_crawl_list.
# NOTE(review): "master_index" looks like the name of the column to read from
# that frame -- confirm against the helper's signature.
crawl_list = cc.batch_crawl_list(
    df_master_crawls_2025_pandas,
    "master_index",
)

In [0]:
# Ingest every crawl in crawl_list via cc.batch_ingest_crawls. The result is
# used as a Spark DataFrame below (it must expose `.write`).
df_crawls = cc.batch_ingest_crawls(crawl_list)

# Persist the ingested crawls to the bronze-layer table; "overwrite" mode
# replaces whatever the table currently holds.
(
    df_crawls.write
    .mode("overwrite")
    .saveAsTable("`census_bureau_capstone`.bronze.raw_all_crawls")
)