# **S3 CC Data Ingestion**
***

## 1: Bronze Layer

### Step 1.1: import libraries
- Import the `cc` (Common Crawl) class from the `src` folder
- import all other required libraries

In [0]:
import sys
# Make the project's `src` package importable from this notebook.
# The membership check keeps the cell idempotent: re-running it (or Run All)
# no longer stacks duplicate entries onto sys.path.
CC_SRC_DIR = '/Workspace/Shared/gu_census_crawl/common_crawl/cc_segement_ingestion/src/cc_segement_ingestion'
if CC_SRC_DIR not in sys.path:
    sys.path.append(CC_SRC_DIR)

In [0]:
import pandas as pd
from helpers import *
import boto3

### Step 1.2: Set Secret Variables
- Retrieve the AWS credentials stored as Databricks secrets (via `dbutils.secrets`)

In [0]:
# Read the AWS key pair from the Databricks secret scope instead of
# hard-coding credentials in the notebook.
aws_secret_scope = 'aws_cc'
aws_access_key_id = dbutils.secrets.get(
    scope=aws_secret_scope,
    key='aws_access_key_id',
)
aws_secret_access_key = dbutils.secrets.get(
    scope=aws_secret_scope,
    key='aws_secret_access_key',
)

In [0]:
# Optional: create the S3 client a single time so later cells can reuse it
# rather than constructing a new one per call.
s3_client_options = {
    "region_name": "us-east-1",
    "aws_access_key_id": aws_access_key_id,
    "aws_secret_access_key": aws_secret_access_key,
}
s3 = boto3.client("s3", **s3_client_options)

### Step 1.3: Call list master indexes method
- Save as a table in Unity Catalog


In [0]:
# Fetch the master index listing from the Common Crawl helper.
# Expected to be a pandas DataFrame — TODO confirm against `helpers`.
# It gets its own name so `df_master_crawls` only ever refers to the Spark
# DataFrame that the later cells use.
master_crawls_pdf = cc.list_master_indexes()

# Count rows on the local object; no Spark job is needed just for a total.
total = len(master_crawls_pdf)
print(f"Total Master Indexes: {total}")

# Fail fast on an empty listing: otherwise the conversion below either errors
# with an opaque schema-inference message or the overwrite wipes the bronze table.
if total == 0:
    raise ValueError(
        "cc.list_master_indexes() returned no rows; "
        "refusing to overwrite bronze.raw_master_crawls"
    )

# Convert to Spark DataFrame
df_master_crawls = spark.createDataFrame(master_crawls_pdf)

# Save to bronze table (replaces any existing contents)
df_master_crawls.write.mode("overwrite").saveAsTable("`census_bureau_capstone`.bronze.raw_master_crawls")

## 2: Silver Layer

---

### Step 2.1: Filter crawls

In [0]:
# Keep only the 2025 crawls: rows whose master_index value contains the
# "crawl-data/CC-MAIN-2025" substring.
is_2025_crawl = df_master_crawls["master_index"].contains("crawl-data/CC-MAIN-2025")
df_master_crawls_2025 = df_master_crawls.where(is_2025_crawl)

### Step 2.2: Save To silver layer
 - overwrite existing table

In [0]:
# Persist the 2025 subset as the silver table, replacing whatever the table
# currently holds.
silver_writer = df_master_crawls_2025.write.mode("overwrite")
silver_writer.saveAsTable("`census_bureau_capstone`.silver.cleaned_master_crawls_2025")