# **S3 CC Data Ingestion**
***

## 1: Bronze Layer

### Step 1.1: import libraries
- Import the `cc` (Common Crawl) class from the src folder
- Import all other required libraries

In [0]:
import sys
# Workspace directory added to the import search path; presumably where the
# project's `helpers` module lives (it is imported in the next cell) — confirm.
CC_SRC_PATH = (
    '/Workspace/Shared/gu_census_crawl/common_crawl/cc_segement_ingestion/src/cc_segement_ingestion'
)

# Append only when missing so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if CC_SRC_PATH not in sys.path:
    sys.path.append(CC_SRC_PATH)

In [0]:
import pandas as pd
from helpers import *
import boto3

### Step 1.2: Set Secret Variables
- Retrieve the AWS credentials that are stored as secrets in Databricks utilities (`dbutils.secrets`)

In [0]:
# Fetch both AWS credential values from the 'aws_cc' secret scope; each secret's
# key has the same name as the variable it is stored in.
aws_access_key_id, aws_secret_access_key = (
    dbutils.secrets.get(scope='aws_cc', key=secret_key)
    for secret_key in ('aws_access_key_id', 'aws_secret_access_key')
)

In [0]:
# Optional: create the S3 client a single time up front so it can be reused
# (faster than rebuilding it for every call).
s3 = boto3.client(
    service_name="s3",
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
    region_name="us-east-1",
)

In [0]:
# Read the raw master crawls table from the bronze layer into a Spark DataFrame.
df_master_crawls = spark.read.table("census_bureau_capstone.bronze.raw_master_crawls")


def _crawls_for_year(year):
    """Return the master-crawl rows for one year as a pandas DataFrame.

    Parameters
    ----------
    year : str
        Text matched as a substring of the ``master_index`` column
        (for example ``"2020"``).

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_master_crawls`` whose ``master_index`` contains ``year``,
        converted with ``toPandas()``.
    """
    return (
        df_master_crawls
        .where(df_master_crawls.master_index.contains(year))
        .toPandas()
    )


# Complete master crawls table as a pandas DataFrame.
df_master_crawls_pandas = df_master_crawls.toPandas()

# One pandas DataFrame per crawl year. Each name is bound exactly once, so it is
# never a Spark DataFrame at one point in the cell and a pandas one at another.
df_2020 = _crawls_for_year("2020")
df_2021 = _crawls_for_year("2021")
df_2022 = _crawls_for_year("2022")
df_2023 = _crawls_for_year("2023")
df_2024 = _crawls_for_year("2024")



### Step 1.4: Call list crawls method
- Call the list_crawls() function from the helpers
- Utilize batching to iterate over and union the crawls from the filtered master crawls dataframes

In [0]:
# Build one crawl list per year: each yearly frame is handed to
# cc.batch_crawl_list together with "master_index" (presumably the name of the
# column to read — confirm against the helper).
(
    crawl_list_2020,
    crawl_list_2021,
    crawl_list_2022,
    crawl_list_2023,
    crawl_list_2024,
) = (
    cc.batch_crawl_list(yearly_frame, "master_index")
    for yearly_frame in (df_2020, df_2021, df_2022, df_2023, df_2024)
)



In [0]:
# Run the batch ingestion for the 2024 crawl list.
df_crawls = cc.batch_ingest_crawls(crawl_list_2024)

# Append the ingested crawls to the bronze-layer table; rows already stored in
# the table are kept.
(
    df_crawls.write
    .mode("append")
    .saveAsTable("`census_bureau_capstone`.bronze.raw_all_crawls")
)

In [0]:
# Ingest each remaining year's crawl list with cc.batch_ingest_crawls and append
# the result to the bronze-layer table.
#
# 2024 is deliberately not repeated here: the preceding cell already ingests
# crawl_list_2024 and appends it to the same table, so processing it again in
# this cell would write the 2024 crawls a second time on a top-to-bottom run.
for crawl_list in (crawl_list_2020, crawl_list_2021, crawl_list_2022, crawl_list_2023):
    df_crawls = cc.batch_ingest_crawls(crawl_list)

    # "append" keeps the rows already in the table and adds this year's crawls.
    (
        df_crawls.write
        .mode("append")
        .saveAsTable("`census_bureau_capstone`.bronze.raw_all_crawls")
    )