In [None]:
"""
**Please Note: This code is meant to be run within a Databricks notebook.**

Securely mounts an S3 bucket to Databricks and loads data into DataFrames.

1. Load AWS credentials from a Delta table.
2. URL-encode the secret key so it can be safely embedded in the S3 source URL.
3. Construct the S3 source URL.
4. Mount the S3 bucket to the Databricks File System (DBFS).
5. Load JSON data into DataFrames.

DataFrames:
- df_pin: Pinterest post data.
- df_geo: Geolocation data.
- df_user: User data.
"""

In [None]:
# Import necessary libraries
import urllib
import urllib.parse

from pyspark.sql.functions import *

In [None]:
# Step 1: Read the AWS credentials that are stored as a Delta table
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"
# The format is passed to load() directly rather than through a chained .format() call
aws_keys_df = spark.read.load(delta_table_path, format="delta")

In [None]:
# Step 2: Extract and encode credentials
# Fetch both credential columns with a single Spark job and a single row,
# instead of collecting the whole table once per column.
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message
    # rather than an opaque IndexError.
    raise ValueError("The AWS credentials table is empty; cannot read the access keys")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# safe="" percent-encodes every reserved character (including "/"), so the
# secret can be embedded in the user-info part of the S3 source URL.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
# Step 3: Name the S3 bucket, pick the DBFS mount point and build the source URL
AWS_S3_BUCKET = "user-0affe460a4c9-bucket"  # Replace with your S3 bucket name
MOUNT_NAME = "/mnt/user-bucket"  # Choose a meaningful mount name
# Source URL layout: s3n://<access key>:<url-encoded secret key>@<bucket>
SOURCE_URL = "s3n://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)

In [None]:
# Step 4: Mount the S3 bucket
# Mounting onto a mount point that is already in use raises an error, so look
# at the existing mounts first. This keeps the cell safe to re-run.
existing_mount_points = {mount.mountPoint for mount in dbutils.fs.mounts()}

if MOUNT_NAME in existing_mount_points:
    print(f"{MOUNT_NAME} is already mounted; skipping mount of {AWS_S3_BUCKET}")
else:
    try:
        dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)
        print(f"Successfully mounted {AWS_S3_BUCKET} to {MOUNT_NAME}")
    except Exception as e:
        print(f"Error mounting {AWS_S3_BUCKET}: {str(e)}")
        # Re-raise so a failed mount stops the notebook here instead of
        # surfacing later as a confusing "path not found" error.
        raise

In [None]:
# Step 5: Define paths to the JSON files in the S3 bucket
# USER_ID is the prefix shared by the three topic names; change it here to
# point the notebook at a different user's topics.
USER_ID = "0affe460a4c9"

# Each topic is read from its partition=0 directory under <mount>/topics/
path_pin = f"{MOUNT_NAME}/topics/{USER_ID}.pin/partition=0/"
path_geo = f"{MOUNT_NAME}/topics/{USER_ID}.geo/partition=0/"
path_user = f"{MOUNT_NAME}/topics/{USER_ID}.user/partition=0/"

In [None]:
%sql
/*
Step 6: Set spark.databricks.delta.formatCheck.enabled to false, i.e. disable
the Delta format check, before the data is read in the next step.
NOTE(review): presumably this is needed so the JSON reads below are not
rejected by the Delta format check -- confirm it is still required.
*/
SET spark.databricks.delta.formatCheck.enabled=false

In [None]:
# Step 7: Load the JSON data into DataFrames
def load_json_topic(path):
    """Read the JSON files under ``path`` into a Spark DataFrame.

    The JSON reader infers the schema by itself, so no ``inferSchema``
    option is passed (that option belongs to the CSV reader).

    Args:
        path: Directory (or file) path containing the JSON data.

    Returns:
        A Spark DataFrame with the inferred schema.
    """
    return spark.read.format("json").load(path)


try:
    df_pin = load_json_topic(path_pin)
    df_geo = load_json_topic(path_geo)
    df_user = load_json_topic(path_user)

    # Display loaded data to verify successful loading
    for label, frame in (
        ("Pinterest Data:", df_pin),
        ("Geolocation Data:", df_geo),
        ("User Data:", df_user),
    ):
        print(label)
        display(frame)

except Exception as e:
    print(f"Error loading data from S3: {str(e)}")
    # Re-raise so the failure is not hidden: otherwise df_pin / df_geo /
    # df_user may be left undefined for later cells.
    raise