In [None]:
# This script contains the dataframe cleaning methods defined for each dataset (to be used in
# both the batch and the stream processing pipelines)

In [None]:
# necessary imports
import re

import pyspark.sql.dataframe
from pyspark.sql.functions import array, col, concat_ws, to_timestamp, udf
from pyspark.sql.types import IntegerType

In [None]:
def convert_follower_count_letters_to_digits(str: str) -> str:
    """
    Callback function for a User Defined Function (follower_count_UDF, intended for use on
    the follower_count column of the dataframe containing the Pinterest post data) which
    expands the numeric shorthand letters 'k' (thousand) and 'M' (million) into digits.

    Whole-number shorthand is expanded by substituting the zeros the letter stands for
    (e.g. '15k' --> '15000', '2M' --> '2000000'). Decimal shorthand is scaled by the
    multiplier (e.g. '1.5k' --> '1500') instead of having zeros appended after the
    decimal point (which would give '1.5000').

    Argument:
    --------
    str: str
        The original alphanumeric string to be transformed. May be None.

    Returns:
    -------
    str: The transformed string, or None when the input is None. A string containing
        neither 'k' nor 'M' is returned unchanged.
    """
    if str is None:
        return None

    # decimal shorthand such as '1.5k' or '0.25M' needs real scaling, not zero substitution
    decimal_shorthand = re.fullmatch(r'(\d+)\.(\d+)([kM])', str)
    if decimal_shorthand:
        whole, fraction, suffix = decimal_shorthand.groups()
        zeros = 6 if suffix == 'M' else 3
        # shift the decimal point right by `zeros` places: pad the fractional digits with
        # zeros, and drop any digits that would still be fractional after scaling
        scaled_fraction = fraction.ljust(zeros, '0')[:zeros]
        scaled_value = int(whole) * 10 ** zeros + int(scaled_fraction)
        # an f-string is used because the parameter name shadows the built-in str()
        return f"{scaled_value}"

    # normalising follower count column to contain only numeric characters
    # (e.g. 15k --> 15000, 2M --> 2000000)
    str = re.sub(r'M', '000000', str)
    str = re.sub(r'k', '000', str)
    return str
    
# Spark UDF wrapping convert_follower_count_letters_to_digits. The function is passed
# directly (no lambda needed); no return type is given, so udf() uses its default (string).
follower_count_UDF = udf(convert_follower_count_letters_to_digits)

def strip_down_to_save_location_filepath(str: str) -> str:
    """
    Callback function for the save_location_UDF User Defined Function, which is applied to
    the save_location column of the Pinterest post dataframe. Every occurrence of the
    phrase 'Local save in ' is deleted from the value so that only the filepath is left.

    Argument:
    --------
    str: str
        The raw save_location value. May be None.

    Returns:
    -------
    str: The value with the phrase removed, or None when the input is None.
    """
    # missing values have nothing to strip
    if str is None:
        return None

    # the phrase that precedes the filepath in the raw data
    phrase_pattern = r'Local save in '
    return re.sub(phrase_pattern, '', str)
    
# Spark UDF wrapping strip_down_to_save_location_filepath. The function is passed
# directly (no lambda needed); no return type is given, so udf() uses its default (string).
save_location_UDF = udf(strip_down_to_save_location_filepath)

def clean_pin_df(df_pin: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Bespoke function designed to clean a Spark DataFrame containing the Pinterest post data.
    To be used in both the Batch Layer and Stream Layer processing pipelines.

    The cleaning steps are:
      1. replace known placeholder strings with None, column by column;
      2. convert the follower_count shorthand to an integer column;
      3. reduce save_location to just the filepath;
      4. rename the index column to ind;
      5. select the output columns in a fixed order (any other column is dropped).

    Argument:
    --------
    df_pin: pyspark.sql.dataframe.DataFrame
        The uncleaned Spark DataFrame.

    Returns:
    -------
    pyspark.sql.dataframe.DataFrame:
        The cleaned Spark DataFrame.
    """
    # replace entries with no data or no useful data with None
    dict_column_values_to_be_replaced = {
        'follower_count': {"User Info Error": None},
        'description': {"No description available": None, "No description available Story format": None},
        'tag_list': {"N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e": None},
        'title': {"No Title Data Available": None},
        'image_src': {"Image src error.": None}
    }

    # `subset` must be passed by keyword: the second positional parameter of replace() is
    # `value`, which is ignored when to_replace is a dict, so a positionally-passed column
    # name would leave subset unset and apply the replacement to every column
    # NOTE(review): if any other column relied on that all-column behaviour to have the
    # same placeholder strings removed, add it to the mapping above explicitly
    for column, replacements in dict_column_values_to_be_replaced.items():
        df_pin = df_pin.na.replace(replacements, subset=column)

    # convert all follower_count entries to numeric values: the UDF expands the shorthand
    # and the result is then cast to an integer column
    df_pin = df_pin.withColumn("follower_count", follower_count_UDF(col("follower_count")).cast(IntegerType()))

    # clean save_location column to include just filepath
    df_pin = df_pin.withColumn("save_location", save_location_UDF(col("save_location")))

    # rename index column to ind
    df_pin = df_pin.withColumnRenamed("index", "ind")

    # reorder the columns (NB this drops the downloaded column)
    df_pin = df_pin.select(["ind", "unique_id", "title", "description", "follower_count",
                            "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category"])

    return df_pin

In [None]:
def clean_geo_df(df_geo: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Bespoke function designed to clean a Spark DataFrame containing the geolocation data.
    To be used in both the Batch Layer and Stream Layer processing pipelines.

    The latitude and longitude columns are merged into a single array column named
    coordinates, the timestamp column is converted to timestamp type, and the output is
    restricted to the columns ind, country, coordinates and timestamp (in that order).

    Argument:
    --------
    df_geo: pyspark.sql.dataframe.DataFrame
        The uncleaned Spark DataFrame.

    Returns:
    -------
    pyspark.sql.dataframe.DataFrame:
        The cleaned Spark DataFrame.
    """
    # NOTE(review): the timestamp pattern below uses ':' between year, month and day.
    # Confirm it matches the raw timestamp strings; values that do not match the pattern
    # will not be parsed into valid timestamps.
    df_geo = (
        df_geo
        # combine the latitude and longitude columns into a single column 'coordinates'
        .withColumn("coordinates", array(col("latitude"), col("longitude")))
        # convert the timestamp column from string to timestamp type
        .withColumn("timestamp", to_timestamp(col("timestamp"), "yyyy:MM:dd HH:mm:ss"))
        # reorder the columns and drop the longitude and latitude columns
        .select(["ind", "country", "coordinates", "timestamp"])
    )

    return df_geo

In [None]:
def clean_user_df(df_user: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Bespoke function designed to clean a Spark DataFrame containing the user data.
    To be used in both the Batch Layer and Stream Layer processing pipelines.

    The first_name and last_name columns are joined (separated by a space) into a new
    user_name column, the date_joined column is converted to timestamp type, and the
    output is restricted to the columns ind, user_name, age and date_joined (in that order).

    Argument:
    --------
    df_user: pyspark.sql.dataframe.DataFrame
        The uncleaned Spark DataFrame.

    Returns:
    -------
    pyspark.sql.dataframe.DataFrame:
        The cleaned Spark DataFrame.
    """
    # NOTE(review): the date pattern below uses ':' between year, month and day.
    # Confirm it matches the raw date_joined strings; values that do not match the pattern
    # will not be parsed into valid timestamps.
    df_user = (
        df_user
        # create a new column user_name that concatenates first_name and last_name
        .withColumn("user_name", concat_ws(" ", col("first_name"), col("last_name")))
        # convert date_joined column from string to timestamp data type
        .withColumn("date_joined", to_timestamp(col("date_joined"), "yyyy:MM:dd HH:mm:ss"))
        # reorder the columns, dropping the first_name and last_name columns
        .select(["ind", "user_name", "age", "date_joined"])
    )

    return df_user