#### Reading in data from S3 bucket
- Load the AWS access key and secret key from the credentials Delta table
- Mount the S3 bucket
- Read the JSON files for each topic stored in the AWS S3 bucket into Spark DataFrames

In [0]:
# Disabled cell: the triple-quoted string below is not executed. It holds the code
# that read the AWS access key and secret key from a Delta table and URL-encoded
# the secret key so it could be embedded in the S3 mount URL.
'''# pyspark functions
from pyspark.sql.functions import *
# URL processing
import urllib

# Define the path to the Delta table
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Get the AWS access key and secret key from the spark dataframe
ACCESS_KEY = aws_keys_df.select('Access key ID').collect()[0]['Access key ID']
SECRET_KEY = aws_keys_df.select('Secret access key').collect()[0]['Secret access key']
# Encode the secrete key
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")'''

# This only needed to be done once, so it is now commented out.

In [0]:
# Disabled cell: the triple-quoted string below is not executed. It holds the code
# that built an s3n:// source URL from ACCESS_KEY, ENCODED_SECRET_KEY and the bucket
# name, mounted it at /mnt/tg_s3_bucket with dbutils.fs.mount, and listed the mount.
# NOTE(review): ACCESS_KEY and ENCODED_SECRET_KEY are only assigned inside the
# disabled credentials cell, so both cells would need re-enabling together to remount.
'''# AWS S3 bucket name
AWS_S3_BUCKET = "user-0ad8a60ac12f-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/tg_s3_bucket"
# Source url
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive
dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

display(dbutils.fs.ls("/mnt/tg_s3_bucket"))'''

# This was only necessary to do once, so it is now commented out.

In [0]:
# Reader settings for the pin topic.
file_type = "json"
# Passed to the reader as the "inferSchema" option.
# NOTE(review): Spark's JSON source infers the schema when none is supplied, so this
# option may be redundant - confirm before removing.
infer_schema = "true"
# The *.json glob matches every file with a .json extension in the partition=0 folder.
file_location = "/mnt/tg_s3_bucket/topics/0ad8a60ac12f.pin/partition=0/*.json"

# Load the pin JSON files from the mounted S3 bucket into a Spark DataFrame.
df_pin = (
    spark.read
    .format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Uncomment to inspect the loaded rows:
# display(df_pin)

In [0]:
# Reader settings for the geo topic.
file_type = "json"
# Passed to the reader as the "inferSchema" option.
# NOTE(review): Spark's JSON source infers the schema when none is supplied, so this
# option may be redundant - confirm before removing.
infer_schema = "true"
# The *.json glob matches every file with a .json extension in the partition=0 folder.
file_location = "/mnt/tg_s3_bucket/topics/0ad8a60ac12f.geo/partition=0/*.json"

# Load the geo JSON files from the mounted S3 bucket into a Spark DataFrame.
df_geo = (
    spark.read
    .format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Uncomment to inspect the loaded rows:
# display(df_geo)

In [0]:
# Reader settings for the user topic.
file_type = "json"
# Passed to the reader as the "inferSchema" option.
# NOTE(review): Spark's JSON source infers the schema when none is supplied, so this
# option may be redundant - confirm before removing.
infer_schema = "true"
# The *.json glob matches every file with a .json extension in the partition=0 folder.
file_location = "/mnt/tg_s3_bucket/topics/0ad8a60ac12f.user/partition=0/*.json"

# Load the user JSON files from the mounted S3 bucket into a Spark DataFrame.
df_user = (
    spark.read
    .format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Uncomment to inspect the loaded rows:
# display(df_user)

age,date_joined,first_name,ind,last_name
42,2017-02-18 00:31:22,Christopher,6353,Hernandez
27,2016-03-08 13:38:37,Christopher,2015,Bradshaw
59,2017-05-12 21:22:17,Alexander,10673,Cervantes
48,2016-02-27 16:57:44,Christopher,1857,Hamilton
45,2016-09-15 06:02:53,Christopher,10020,Hawkins
35,2015-10-22 22:42:23,Christopher,2041,Campbell
48,2016-06-13 17:09:14,Christopher,7031,Anderson
27,2016-03-08 13:38:37,Christopher,2015,Bradshaw
39,2016-06-29 20:43:59,Christina,6398,Davenport
20,2015-10-23 04:13:23,Alexandria,3599,Alvarado


#### Cleaning pinterest dataframe (df_pin)
- Check value counts for each column for strange repeated data and outliers in categorical columns
- Change descriptions which contain "No description available" to None
- Change image_src values which say "Image src error." to None
- Change tag_list values which say "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e" (i.e. "No Tags Available") to None
- Change title values which say "No Title Data Available" to None
- Change poster_name and follower_count values which say "User Info Error" to None
##
- Change save location to only include filepath
- Transform follower counts to only include digits
##
- Cast index column to integer
- Cast follower counts column to integer
- Rename index column to "ind"
- Reorder columns in df



In [0]:
from pyspark.sql.functions import col
# Disabled exploration code (kept inside a string so it does not run): for every
# column of df_pin it showed the value counts ordered from most to least frequent.
'''
columns = df_pin.columns
for column in columns:
    value_counts = df_pin.groupBy(column).count().orderBy("count", ascending=False)
    value_counts.show()'''
# This was originally done but is now commented out as it is unnecessary for data processing.

In [0]:
from pyspark.sql.functions import when

# Turn the placeholder strings in df_pin into nulls, one column group at a time.
df_pin = (
    df_pin
    # description uses a contains() test instead of an exact-match replace because
    # some records carry extra text after the "No description available" marker.
    .withColumn(
        'description',
        when(col('description').contains('No description available'), None)
        .otherwise(col('description')),
    )
    # The remaining placeholders are exact matches, restricted to the listed columns.
    .replace({'Image src error.': None}, subset=['image_src'])
    .replace({'N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e': None}, subset=['tag_list'])
    .replace({'No Title Data Available': None}, subset=['title'])
    .replace({'User Info Error': None}, subset=['follower_count', 'poster_name'])
)

In [0]:
# Disabled re-check (kept inside a string so it does not run): the same per-column
# value-count listing for df_pin, placed after the placeholder replacements.
'''from pyspark.sql.functions import col
columns = df_pin.columns
for column in columns:
    value_counts = df_pin.groupBy(column).count().orderBy("count", ascending=False)
    value_counts.show()'''

In [0]:
from pyspark.sql.functions import regexp_replace

df_pin = (
    df_pin
    # Remove the "Local save in " text so only the file path remains.
    .withColumn('save_location', regexp_replace(col('save_location'), 'Local save in ', ''))
    # Expand the magnitude letters textually: first k -> 000, then M -> 000000.
    # NOTE(review): because this is a plain text substitution, a value such as
    # "1.5k" would become "1.5000" - confirm the data contains no decimal counts.
    .withColumn(
        'follower_count',
        regexp_replace(regexp_replace(col('follower_count'), 'k', '000'), 'M', '000000'),
    )
)

In [0]:
# Finalise df_pin: cast the numeric columns, rename index to ind, and put the
# columns in their final order.
df_pin = (
    df_pin
    .withColumn("index", df_pin["index"].cast("integer"))
    .withColumn("follower_count", df_pin["follower_count"].cast("integer"))
    .withColumnRenamed("index", "ind")
    .select(
        "ind", "unique_id", "title", "description",
        "follower_count", "poster_name", "tag_list",
        "is_image_or_video", "image_src", "save_location", "category",
    )
)

#### Cleaning geolocation dataframe (df_geo)
- Create new column "coordinates" as an array of latitude and longitude
- Drop latitude and longitude columns
- Convert the timestamp column from a string to a timestamp data type
- Reorder columns




In [0]:
from pyspark.sql.functions import array

# Clean df_geo in one pass.
df_geo = (
    df_geo
    # Combine latitude and longitude into a single array column, then drop the originals.
    .withColumn("coordinates", array("latitude", "longitude"))
    .drop("latitude", "longitude")
    # Replace the timestamp column with its value cast to the timestamp type.
    .withColumn("timestamp", df_geo["timestamp"].cast("timestamp"))
    # Rename index to ind before selecting the final column order.
    .withColumnRenamed("index", "ind")
    .select("ind", "country", "coordinates", "timestamp")
)

#### Cleaning user data dataframe (df_user)
- Create a new column user_name that concatenates the information found in the first_name and last_name columns
- Drop the first_name and last_name columns 
- Convert the date_joined column from a string to a timestamp data type
- Reorder columns




In [0]:
from pyspark.sql.functions import concat, lit

# Clean df_user: build user_name, convert date_joined to a timestamp, reorder columns.

# user_name is "<first_name> <last_name>"; the source columns are dropped afterwards.
df_user = df_user.withColumn("user_name", concat("first_name", lit(" "), "last_name"))
df_user = df_user.drop("first_name", "last_name")
# Overwrite date_joined itself with the cast value. Writing the cast into a separate
# "timestamp" column would be discarded by the select below (which keeps only
# date_joined), leaving date_joined uncast.
df_user = df_user.withColumn("date_joined", df_user["date_joined"].cast("timestamp"))
# NOTE(review): the displayed user rows already show an "ind" column; if there is no
# "index" column this rename is expected to be a no-op - confirm against the schema.
df_user = df_user.withColumnRenamed("index", "ind")
df_user = df_user.select("ind", "user_name", "age", "date_joined")