# **Wet File Data Extraction**
---

## 1. Import libraries 

In [0]:
from pyspark.sql import functions as F
import boto3
import botocore
import os

## 2. Connect to Boto3 

### 2.1 Set Secrets

In [0]:
# Both AWS credentials are read from the 'aws_cc' secret scope; each variable
# carries the same name as the secret key it is loaded from.
aws_access_key_id, aws_secret_access_key = (
    dbutils.secrets.get(scope='aws_cc', key=secret_name)
    for secret_name in ('aws_access_key_id', 'aws_secret_access_key')
)

### 2.2 Initialize boto3 client

In [0]:
# Build the S3 client a single time; later cells use it through the `s3` name.
_s3_client_options = {
    "region_name": "us-east-1",
    "aws_access_key_id": aws_access_key_id,
    "aws_secret_access_key": aws_secret_access_key,
}
s3 = boto3.client("s3", **_s3_client_options)

## 3. Read Bronze Layer
- Read in all crawls from bronze


In [0]:
from pyspark.sql import functions as F

# The key embeds two 14-digit stamps: capture group 1 is the crawl start,
# capture group 2 is the crawl end.
crawl_stamp_pattern = r"CC-MAIN-(\d{14})-(\d{14})-"
key_col = F.col("key")

# Bronze crawl listing, restricted to rows whose key contains 'wet'.
wet_rows = (
    spark.read.table("census_bureau_capstone.bronze.raw_all_crawls")
    .filter(key_col.contains("wet"))
)

# Step 1: pull the two stamps out of the key as raw strings.
with_raw_stamps = (
    wet_rows
    .withColumn("crawl_start_raw", F.regexp_extract(key_col, crawl_stamp_pattern, 1))
    .withColumn("crawl_end_raw", F.regexp_extract(key_col, crawl_stamp_pattern, 2))
)

# Step 2: turn the raw strings into timestamp columns.
with_timestamps = (
    with_raw_stamps
    .withColumn(
        "crawl_start",
        F.expr("try_to_timestamp(crawl_start_raw, 'yyyyMMddHHmmss')"),
    )
    .withColumn(
        "crawl_end",
        F.expr("try_to_timestamp(crawl_end_raw, 'yyyyMMddHHmmss')"),
    )
)

# Step 3: sort by start time, discard rows whose crawl_start could not be
# parsed, and drop the helper columns plus ChecksumAlgorithm.
df_all_crawls = (
    with_timestamps
    .orderBy("crawl_start")
    .filter(F.col("crawl_start").isNotNull())
    .drop("crawl_start_raw", "crawl_end_raw", "ChecksumAlgorithm")
)

# Report the row count, then render the table.
total = df_all_crawls.count()

print(f"total: {total:,}")
display(df_all_crawls)

### 3.1 Sort crawls
- Draw a random sample of crawls from each of the past 5 years
- Limit to 1 sample per year
- Convert the sampled keys to a Python list

#### 3.1.1 Random Sample

In [0]:
from pyspark.sql import functions as F

years = ['2021', '2022', '2023', '2024', '2025']
samples = []


def sample_crawls(years, samples, df, n_per_year=1):
    """Draw a random sample of crawl rows for each requested year.

    The previous version ignored ``year`` and took ``df.limit(2)`` on every
    iteration, so the same first rows were unioned once per year. Each
    iteration now keeps only rows whose ``crawl_start`` falls in that year
    and picks rows at random.

    Args:
        years: Iterable of calendar years (strings or ints) to sample from.
        samples: Caller-supplied list used as an accumulator. Every non-empty
            per-year Spark sample is appended to it, and everything it holds
            is unioned into the result, so pass an empty list unless earlier
            samples should be included on purpose.
        df: Spark DataFrame of crawl rows with a ``crawl_start`` timestamp
            column. NOTE(review): assumed to be the already 'wet'-filtered
            crawl DataFrame built earlier in this notebook -- confirm at the
            call site.
        n_per_year: Maximum number of rows to keep for each year.

    Returns:
        A pandas DataFrame holding the union of all per-year samples, or
        ``None`` (after printing a message) when no sample was collected.
    """
    for year in years:
        # Restrict to this year first, then shuffle so limit() picks random
        # rows rather than whatever happens to come first.
        sample = (
            df.filter(F.year(F.col("crawl_start")) == int(year))
            .orderBy(F.rand())
            .limit(n_per_year)
        )

        # Skip years with no matching rows.
        if sample.count() > 0:
            samples.append(sample)

    if not samples:
        print("No samples found for the given criteria.")
        return None

    # Union the per-year samples into one Spark DataFrame, then hand back a
    # pandas copy.
    combined = samples[0]
    for sample in samples[1:]:
        combined = combined.union(sample)

    return combined.toPandas()

#### 3.1.2 Set to python list 

In [0]:
# `df_all_crawls_samples` is not assigned in the cells above, so build it here
# from the per-year sampler. A fresh list is passed as the accumulator so that
# re-running this cell does not carry over samples from an earlier run.
df_all_crawls_samples = sample_crawls(years, [], df_all_crawls)

# sample_crawls returns None when nothing was sampled; use an empty list then.
if df_all_crawls_samples is None:
    key_list = []
else:
    key_list = df_all_crawls_samples['Key'].tolist()

print(key_list)

## 4. NLP

### 4.1 Extract CC Wet file
- Extracts the randomly sampled WET text files

In [0]:
source_bucket = "commoncrawl"
destination_bucket = 'mydbxbucketpractice'


def download_and_upload(source_key, destination_key=None):
    """Copy one object from ``source_bucket`` to ``destination_bucket``.

    The object is downloaded to a temporary file under ``/tmp`` and then
    uploaded with the module-level ``s3`` client. The previous version
    overwrote both parameters by looping over the global ``key_list``, so the
    arguments were ignored. To copy every sampled key, call this once per
    key, e.g. ``for key in key_list: download_and_upload(key)``.

    Args:
        source_key: Key of the object to read from ``source_bucket``.
        destination_key: Key to write in ``destination_bucket``. When omitted
            (``None``), defaults to ``'common_crawl/wet_files/'`` followed by
            the last path segment of ``source_key``.

    Raises:
        Whatever ``s3.download_file`` / ``s3.upload_file`` raise; the
        temporary file is removed before the error propagates.
    """
    filename = source_key.split("/")[-1]
    if destination_key is None:
        destination_key = 'common_crawl/wet_files/' + filename
    local_filename = '/tmp/' + filename

    try:
        s3.download_file(source_bucket, source_key, local_filename)
        s3.upload_file(local_filename, destination_bucket, destination_key)
    finally:
        # Clean up even when the transfer fails; the file may not exist if
        # the download itself did not complete.
        if os.path.exists(local_filename):
            os.remove(local_filename)

    print(
        f"Copied s3://{source_bucket}/{source_key} to "
        f"s3://{destination_bucket}/{destination_key}"
    )

### 4.2. View raw file as df

In [0]:
# Folder in the destination bucket that the WET files were copied into.
wet_files_dir = f"s3://{destination_bucket}/common_crawl/wet_files/"

# One full s3:// path for every entry listed in that folder.
file_paths = [
    f"{wet_files_dir}{entry.name}"
    for entry in dbutils.fs.ls(wet_files_dir)
]

# Read all listed files with spark.read.text and render the result.
df = spark.read.text(file_paths)
display(df)

# Report how many rows were read.
total = df.count()

print(f"total: {total:,}")

#### 4.2.1 Drop Files if needed 

In [0]:
# Drop all files from the directory
# for file_path in file_paths:
   # dbutils.fs.rm(file_path)

### 4.3. Text transformation