## Mounting an S3 Bucket to Databricks

### Imports required for the entire batch-processing workflow

In [None]:
import urllib
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, TimestampType, DateType

### Retrieving AWS Credentials

In [None]:
# List the DBFS upload directory that holds the credentials CSV loaded in this notebook
dbutils.fs.ls("/FileStore/tables")

In [None]:
# Reader settings for the credentials file: comma-delimited CSV with a header row
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Load the AWS credentials CSV from DBFS into a Spark DataFrame
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

In [None]:
# `import urllib` alone does not guarantee that the `urllib.parse` submodule is
# loaded, so import it explicitly before calling `urllib.parse.quote`.
import urllib.parse

# Fetch both credential columns with a single Spark action instead of two collects
aws_keys_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if aws_keys_row is None:
    # Fail with a clear message rather than an IndexError on an empty result
    raise ValueError("authentication_credentials.csv contains no credential rows")

# AWS access key and secret key taken from the first row of the credentials file
ACCESS_KEY = aws_keys_row['Access key ID']
SECRET_KEY = aws_keys_row['Secret access key']
# Percent-encode the secret key; safe="" means "/" is escaped as well
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

### Mount the S3 Bucket

In [None]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-0a40ea42f8d1-bucket"
# Mount point for the bucket, derived from the bucket name so the two cannot drift apart
MOUNT_NAME = "/mnt/{0}".format(AWS_S3_BUCKET)
# Source URL with the access key and encoded secret key embedded
# NOTE(review): Databricks' documented mount examples use the s3a:// scheme —
# confirm whether s3n:// is still required here before changing it
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the bucket only when the mount point is not already in use, so that
# re-running this cell does not fail on an existing mount
if not any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
# Display a directory listing to check the mount.
# NOTE(review): the trailing "/../.." points two levels above the mount point —
# confirm whether listing the mount point itself was intended
display(dbutils.fs.ls("/mnt/user-0a40ea42f8d1-bucket/../.."))

### Read contents of S3 bucket into dataframes and display them

In [None]:

# Input format and schema-inference setting shared by all three reads
file_type = "json"
infer_schema = "true"

# One glob per topic, matching every JSON file under partition=0 on the mounted bucket
pin_file_path = "/mnt/user-0a40ea42f8d1-bucket/topics/0a40ea42f8d1.pin/partition=0/*.json"
geo_file_path = "/mnt/user-0a40ea42f8d1-bucket/topics/0a40ea42f8d1.geo/partition=0/*.json"
user_file_path = "/mnt/user-0a40ea42f8d1-bucket/topics/0a40ea42f8d1.user/partition=0/*.json"

# Load each topic's JSON files into its own (not yet cleaned) DataFrame
dirty_pin_df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(pin_file_path)
)

dirty_geo_df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(geo_file_path)
)

dirty_user_df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(user_file_path)
)

# Render the three DataFrames to inspect their contents
display(dirty_pin_df)
display(dirty_geo_df)
display(dirty_user_df)

### Unmount S3 Bucket

In [None]:
# Optional clean-up: unmount the bucket's mount point; run only when the mount is no longer needed
dbutils.fs.unmount("/mnt/user-0a40ea42f8d1-bucket")