# Imports

In [None]:
from pyspark.sql.functions import array, col, when, concat, lit, to_timestamp, regexp_replace

# Loading the topic data and cleaning the df_pin DataFrame

In [None]:
def topics_to_dataframe():
    """
    Load the JSON files of each topic into its own Spark DataFrame.

    For every topic suffix (.pin, .geo, .user) the function reads all
    ``*.json`` files under that topic's ``partition=0`` directory in the
    mounted bucket, passing the ``inferSchema`` option to the reader.
    Before any read it sets ``spark.databricks.delta.formatCheck.enabled``
    to "false" on the active session.

    Returns:
        List: One Spark DataFrame per topic, ordered .pin, .geo, .user.
    """
    # Switch off the Delta format check on the session before reading anything
    spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")

    # Fixed pieces of every topic path; only the suffix in the middle varies
    topic_root = "/mnt/databricks-bucket/topics/0ae9e110c9db"
    partition_glob = "/partition=0/*.json"

    # Reader settings shared by all topics
    source_format = "json"
    schema_inference = "true"

    def _read_topic(topic_suffix):
        # Build the glob for this topic and hand it to the JSON reader
        topic_path = topic_root + topic_suffix + partition_glob
        reader = spark.read.format(source_format).option("inferSchema", schema_inference)
        return reader.load(topic_path)

    # One DataFrame per topic, in the same order as the suffixes
    return [_read_topic(topic_suffix) for topic_suffix in (".pin", ".geo", ".user")]


In [None]:
# Load every topic once and keep the resulting list of DataFrames. The list
# follows the topic order used inside topics_to_dataframe:
# index 0 is .pin, index 1 is .geo, index 2 is .user.
df_array = topics_to_dataframe()

In [None]:
def clean_df_pin(df_pin):
    """
    Cleans the pin DataFrame: normalises placeholder text, converts
    follower counts to integers, tidies save_location and reorders columns.

    Placeholder handling:
    - follower_count: the placeholder text is replaced with "0".
    - description: a value that starts with the placeholder becomes the
      string "None"; any other value is kept unchanged.
    - image_src, poster_name, tag_list, title: occurrences of the
      placeholder text are replaced with the string "None".

    Note that the replacement is the literal string "None", not a SQL null.

    Parameters:
    - df_pin: Input DataFrame to be cleaned. It must contain the columns
      listed in the placeholder table below plus "save_location", "index",
      "unique_id", "is_image_or_video" and "category".

    Returns:
    - Cleaned DataFrame with "index" renamed to "ind", "follower_count" cast
      to int, and columns in a fixed order.
    """

    # Placeholder text found in the raw data, keyed by the column it appears in
    null_dicts = {
        "description": "No description available",
        "follower_count": "User Info Error",
        "image_src": "Image src error.",
        "poster_name": "User Info Error",
        "tag_list": "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
        "title": "No Title Data Available"
    }

    # Duplicate rows are intentionally not dropped here.

    # Replace empty and non-relevant data in each column with specified values
    for key, value in null_dicts.items():
        if key == "follower_count":
            # Use "0" so the column can still be cast to an integer below
            df_pin = df_pin.withColumn(key, regexp_replace(key, value, "0"))
        elif key == "description":
            # Only a prefix match is required: the whole value is replaced
            # when it starts with the placeholder, otherwise it is kept
            df_pin = df_pin.withColumn(key, when(col(key).startswith(value), "None").otherwise(col(key)))
        else:
            # The placeholder is treated as a regex pattern and substituted
            # wherever it occurs in the value
            df_pin = df_pin.withColumn(key, regexp_replace(key, value, "None"))

    # Expand magnitude suffixes to the matching number of zeros:
    # k = thousand (3 zeros), M = million (6 zeros), B = billion (9 zeros).
    # The previous code used 4 and 5 zeros for M and B, which under-counted
    # e.g. "5M" as 50000 instead of 5000000.
    suffix_zeros = {
        "k": "000",
        "M": "000000",
        "B": "000000000",
    }
    for suffix, zeros in suffix_zeros.items():
        df_pin = df_pin.withColumn("follower_count", regexp_replace("follower_count", suffix, zeros))

    # NOTE(review): this is a purely textual expansion, so fractional inputs
    # such as "1.5k" are not handled, and values beyond the 32-bit int range
    # will not survive the cast - confirm whether either occurs in the data.
    df_pin = df_pin.withColumn("follower_count", df_pin["follower_count"].cast("int"))

    # Remove "Local save in" from save_location column (any whitespace that
    # followed the removed text is left in place)
    df_pin = df_pin.withColumn("save_location", regexp_replace("save_location", "Local save in", ""))

    # Rename the "index" column to "ind"
    df_pin = df_pin.withColumnRenamed("index", "ind")

    # Define the desired order of columns
    column_order_pin = [
        "ind",
        "unique_id",
        "title",
        "description",
        "follower_count",
        "poster_name",
        "tag_list",
        "is_image_or_video",
        "image_src",
        "save_location",
        "category"
    ]

    # Select and reorder the columns in the specified order
    df_pin = df_pin.select(column_order_pin)

    return df_pin

# Clean the pin topic, which is index 0 of df_array. The select over all of
# its own columns passes a projection with the same columns to clean_df_pin;
# the cleaned result is bound to df_pin.
df_pin = clean_df_pin(df_array[0].select(*df_array[0].columns))



# Cleaning the df_geo DataFrame

In [None]:
def clean_df_geo(df_geo):
    """
    Cleans the DataFrame containing geographical data.

    The function combines "latitude" and "longitude" into a single array
    column "coordinates", drops the two source columns, converts "timestamp"
    to a timestamp type and returns the columns in a fixed order.

    It operates on the DataFrame passed in. It previously overwrote the
    argument with the module-level ``df_array[1]``, which made the parameter
    meaningless and tied the function to global state.

    Args:
        df_geo (DataFrame): Input DataFrame containing geographical data.
            It must contain the columns "ind", "country", "latitude",
            "longitude" and "timestamp".

    Returns:
        DataFrame: Cleaned DataFrame with the columns "ind", "country",
        "coordinates" and "timestamp", in that order.
    """
    # Duplicate rows are intentionally not dropped here.

    # Create a new column "coordinates" by combining "latitude" and "longitude"
    df_geo = df_geo.withColumn("coordinates", array("latitude", "longitude"))

    # Drop the original "latitude" and "longitude" columns
    df_geo = df_geo.drop("latitude", "longitude")

    # Convert the "timestamp" column to a timestamp type
    df_geo = df_geo.withColumn("timestamp", to_timestamp("timestamp"))

    # Define the desired order of columns
    column_order_geo = [
        "ind",
        "country",
        "coordinates",
        "timestamp"
    ]

    # Select and reorder the columns in the specified order
    df_geo = df_geo.select(column_order_geo)

    return df_geo

# Pass a projection of the geo topic (index 1 of df_array, selecting all of
# its own columns) to clean_df_geo and bind the returned DataFrame to df_geo.
df_geo = clean_df_geo(df_array[1].select(*df_array[1].columns))

# Cleaning the df_user DataFrame

In [None]:
def clean_df_user(df_user):
    """
    Tidy the user DataFrame into a fixed four-column layout.

    "first_name" and "last_name" are joined with a single space into a new
    "user_name" column and then dropped, "date_joined" is converted to a
    timestamp type, and the result is projected onto the output columns.

    Args:
        df_user (DataFrame): Input DataFrame containing user data.

    Returns:
        DataFrame: DataFrame with the columns "ind", "user_name", "age" and
        "date_joined", in that order.
    """
    # Column expressions, built up front so the pipeline below reads linearly
    full_name = concat("first_name", lit(" "), "last_name")
    joined_at = to_timestamp("date_joined")

    return (
        df_user
        # Add the combined name, then discard the two columns it came from
        .withColumn("user_name", full_name)
        .drop("first_name", "last_name")
        # Overwrite date_joined with its timestamp-typed version
        .withColumn("date_joined", joined_at)
        # Keep only the output columns, in this order
        .select("ind", "user_name", "age", "date_joined")
    )

# Clean the user topic, which is index 2 of df_array. The select over all of
# its own columns passes a projection with the same columns to clean_df_user;
# the cleaned result is bound to df_user.
df_user = clean_df_user(df_array[2].select(*df_array[2].columns))
