# **Common Crawl Data ingestion**

## 1. Import notebook functions
- Import helper functions to aid in data ingestion and extraction

In [0]:
%run
/Workspace/Shared/gu_census_crawl/common_crawl/helper_functions

In [0]:
# Collect WET file URLs by querying the CC-MAIN-2022-40 index once per TLD pattern.
# NOTE(review): the `*.com` pattern is not US-specific — confirm it is meant to be included.
index_queries = (
    'https://index.commoncrawl.org/CC-MAIN-2022-40-index?url=*.edu&output=json',
    'https://index.commoncrawl.org/CC-MAIN-2022-40-index?url=*.gov&output=json',
    'https://index.commoncrawl.org/CC-MAIN-2022-40-index?url=*.us&output=json',
    'https://index.commoncrawl.org/CC-MAIN-2022-40-index?url=*.com&output=json',
)

# The first query's result is the accumulator; every later result is appended to it in order.
first_query, *remaining_queries = index_queries
us_wet_file_urls = CrawlExtractor.get_us_wet_file_urls(first_query)
for query in remaining_queries:
    us_wet_file_urls.extend(CrawlExtractor.get_us_wet_file_urls(query))

print(f"Found {len(us_wet_file_urls)} US-specific WET files.")

In [0]:
# Hand the collected URL list to the helper's save_urls under the name 'us_wet_file_urls.json'.
# Presumably this writes a JSON file (a later cell reads that name with pd.read_json) — confirm against the helper.
CrawlExtractor.save_urls(us_wet_file_urls, 'us_wet_file_urls.json')

In [0]:
# Show the collected URL list as this cell's output (the whole list, not a sample).
us_wet_file_urls

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd

# Single-column schema: one nullable string field named "url".
schema = StructType([
    StructField("url", StringType(), True)
])

# With a StructType schema, Spark expects every record to be row-like (tuple, list, dict, Row);
# a bare string is rejected with a TypeError. Wrap plain URL strings in 1-tuples and let
# records that are already row-like pass through unchanged.
url_rows = [(url,) if isinstance(url, str) else url for url in us_wet_file_urls]
df_wet_files = spark.createDataFrame(url_rows, schema)

display(df_wet_files)

In [0]:
# Register the Spark DataFrame as the temp view "wet_files" so it can be queried by name in SQL.
df_wet_files.createOrReplaceTempView("wet_files")



In [0]:
%sql
-- Number of rows in the wet_files view.
select count(*) from wet_files;

-- Every row of the wet_files view.
select * from wet_files

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd

# Read 'us_wet_file_urls.json' back with pandas (presumably the file written by save_urls earlier).
data = pd.read_json('us_wet_file_urls.json')

# One nullable string column named "url".
schema = StructType([StructField("url", StringType(), True)])

# `df` is the Spark DataFrame built from a pandas DataFrame wrapped around `data`.
df = spark.createDataFrame(pd.DataFrame(data), schema)
display(df)

In [0]:
# Count the rows of `df` and report the total.
total_records = df.count()
print(f"Total records: {total_records}")


In [0]:
# Ask the helper to download, passing '/tmp/wet_files' as the second argument.
# NOTE(review): `df_wet_files` is built with spark.createDataFrame, so `df_wet_files['url']` is a
# Spark Column expression rather than the URL strings themselves — confirm download_file can
# accept that; it may need the collected values (e.g. one call per URL) instead.
CrawlExtractor.download_file(df_wet_files['url'], '/tmp/wet_files')

In [0]:
# Install the cdx_toolkit package (no version pin) through a shell-escaped pip call.
# NOTE(review): consider `%pip install` with a pinned version in a cell at the top of the
# notebook — confirm how the runtime handles installs made mid-notebook before moving it.
!pip install cdx_toolkit

In [0]:
import cdx_toolkit

# Create a client
cdx = cdx_toolkit.CDXFetcher()

# Get all available collections (crawls)
# NOTE(review): confirm that CDXFetcher exposes `index_list()` in the installed cdx_toolkit version.
crawls = cdx.index_list()

# Show them: the original cell stopped at a "Print them" comment and never displayed the result,
# so emit `crawls` as the cell's output.
crawls


In [0]:
# Call list_coll() from cdx_toolkit.commoncrawl; its return value becomes the cell output.
# NOTE(review): confirm this function exists in the installed cdx_toolkit version.
cdx_toolkit.commoncrawl.list_coll()

In [0]:
import boto3
import botocore

# Read the AWS key pair from the 'aws_cc' secret scope so no credential is hardcoded here.
aws_access_key_id = dbutils.secrets.get(scope='aws_cc', key='aws_access_key_id')
aws_secret_access_key = dbutils.secrets.get(scope='aws_cc', key='aws_secret_access_key')

# S3 client authenticated with that key pair (no explicit region is passed).
s3 = boto3.client(
    "s3",
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

# Bucket ACL probe: print the ACL on success, or the client error if the call is refused.
try:
    acl = s3.get_bucket_acl(Bucket='commoncrawl')
except botocore.exceptions.ClientError as e:
    print("Error getting bucket ACL:", e)
else:
    print("Bucket ACL:", acl)

# Bucket policy probe: print the policy document on success, or the client error otherwise.
try:
    policy = s3.get_bucket_policy(Bucket='commoncrawl')
except botocore.exceptions.ClientError as e:
    print("Error getting bucket policy:", e)
else:
    print("Bucket Policy:", policy['Policy'])

In [0]:
# Probe 1: list a single object to check whether listing the bucket is permitted.
try:
    response = s3.list_objects_v2(Bucket='commoncrawl', MaxKeys=1)
    # Create DataFrame from the 'Contents' key if present
    if 'Contents' in response:
        df = pd.DataFrame(response['Contents'])
        display(df)
    else:
        print("No objects found in bucket.")
except botocore.exceptions.ClientError as e:
    print("List objects failed:", e)

# Probe 2: fetch the key 'robots.txt' to check whether reading an object is permitted.
try:
    obj = s3.get_object(Bucket='commoncrawl', Key='robots.txt')
    # A get_object response holds the payload under 'Body' plus metadata; 'Contents' only exists
    # on list responses, so the former `'Contents' in obj` DataFrame branch could never run.
    print("Get object succeeded:", obj)
except botocore.exceptions.ClientError as e:
    print("Get object failed:", e)



In [0]:
import boto3
import botocore
import pandas as pd

# Read the AWS key pair from the 'aws_cc' secret scope.
aws_access_key_id = dbutils.secrets.get(scope='aws_cc', key='aws_access_key_id')
aws_secret_access_key = dbutils.secrets.get(scope='aws_cc', key='aws_secret_access_key')

s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

bucket_name = "commoncrawl"
# Crawl index folders are named "CC-MAIN-<year>-<week>/" directly under cc-index/collections/,
# so the prefix must end in "CC-MAIN-": the previous "CC-MAIN/" matched no keys at all.
prefix = "cc-index/collections/CC-MAIN-"

# List crawl index folders: with Delimiter="/" each folder is reported once under CommonPrefixes.
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter="/")

# Each common prefix ends with "/", so the folder name is the second-to-last path segment.
crawl_ids = [
    content["Prefix"].split("/")[-2]
    for content in response.get("CommonPrefixes", [])
]

# Check if crawl_ids is empty before creating DataFrame
if crawl_ids:
    df = pd.DataFrame({"crawl_id": crawl_ids})
    display(df)
else:
    print("No crawl IDs found.")

In [0]:
# (Scratch cell) The incomplete expression `s3.` was left here; it is a SyntaxError that stops a
# full run of the notebook, so it has been reduced to this comment.

In [0]:
import boto3
import botocore
import pandas as pd

# AWS key pair from the 'aws_cc' secret scope; list_crawls reads these module-level names.
aws_access_key_id = dbutils.secrets.get(scope='aws_cc', key='aws_access_key_id')
aws_secret_access_key = dbutils.secrets.get(scope='aws_cc', key='aws_secret_access_key')

def list_crawls():
    """Display the object keys found under ``crawl-data/`` in the ``commoncrawl`` bucket.

    Builds an S3 client for region us-east-1 from the module-level credentials and issues
    a single ``list_objects_v2`` call (no pagination is performed). The keys are shown as
    a one-column DataFrame (``object_key``); if the response has no ``Contents``, a message
    is printed instead. Returns ``None``.
    """
    client = boto3.client(
        "s3",
        region_name='us-east-1',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

    listing = client.list_objects_v2(Bucket="commoncrawl", Prefix="crawl-data/")
    keys = [entry["Key"] for entry in listing.get("Contents", [])]

    # Guard clause: nothing came back, so report it and stop.
    if not keys:
        print("No objects found.")
        return

    display(pd.DataFrame({"object_key": keys}))


In [0]:
# Run the listing; list_crawls displays its result itself and returns nothing.
list_crawls()