In [None]:
# Define Variables
# Mount point under which the bucket's objects are read.
MOUNT_NAME = "/mnt/user-bucket"
# Prefix shared by the three topic names; change it here to read a different set of topics.
TOPIC_PREFIX = "0affe460a4c9"

# One directory per topic (partition 0 only); each is passed to spark.read...load() further down.
path_pin = f"{MOUNT_NAME}/topics/{TOPIC_PREFIX}.pin/partition=0/"
path_geo = f"{MOUNT_NAME}/topics/{TOPIC_PREFIX}.geo/partition=0/"
path_user = f"{MOUNT_NAME}/topics/{TOPIC_PREFIX}.user/partition=0/"

In [None]:
# Load S3 Bucket Data into Spark DataFrames
def _load_json_topic(path):
    """
    Reads the JSON files found under ``path`` into a Spark DataFrame.

    Parameters:
        path (str): Directory containing the JSON files of one topic.

    Returns:
        DataFrame: The raw (uncleaned) records of that topic.
    """
    # NOTE(review): "inferSchema" is kept exactly as before; it looks like a CSV
    # reader option and may be ignored by the JSON source — confirm before removing.
    return spark.read.format("json") \
        .option("inferSchema", "true") \
        .load(path)


try:
    df_pin = _load_json_topic(path_pin)
    df_geo = _load_json_topic(path_geo)
    df_user = _load_json_topic(path_user)

    # Display loaded data to verify successful loading (data is still uncleaned here)
    print("Pinterest Data:")
    display(df_pin)

    print("Geolocation Data:")
    display(df_geo)

    print("User Data:")
    display(df_user)

except Exception as e:
    print(f"Error loading data from S3: {str(e)}")
    # Re-raise: every later cell needs df_pin / df_geo / df_user. Swallowing the
    # error would let the notebook continue with undefined names or, worse, with
    # stale DataFrames left in the kernel by an earlier run.
    raise

In [None]:
# Import the necessary packages
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
    array,
    col,
    concat_ws,
    lit,
    regexp_replace,
    struct,
    to_timestamp,
    when,
)

# Define Cleaning Classes

In [None]:
class PinterestDataCleaner:
    """
    A class to clean and preprocess Pinterest DataFrames.

    Every cleaning step replaces ``self.df`` with the transformed DataFrame and
    returns the cleaner itself, so the steps can be chained.

    Attributes:
        df (DataFrame): The Pinterest DataFrame to be cleaned.
    """

    def __init__(self, df: DataFrame):
        """
        Initializes the PinterestDataCleaner with a DataFrame.

        Parameters:
            df (DataFrame): The Pinterest DataFrame to be cleaned.
        """
        self.df = df

    def replace_empty_with_none(self):
        """
        Replaces empty-string entries ("") with None in every string column.

        Columns of any other type are passed through untouched: they cannot hold
        an empty string, and comparing e.g. an array or struct column with ""
        is a type error rather than a no-op.

        NOTE(review): placeholder texts that merely signal "no data" are not
        handled here — only the exact empty string is replaced.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        projected = [
            when(col(name) == "", None).otherwise(col(name)).alias(name)
            if dtype == "string"
            else col(name)
            for name, dtype in self.df.dtypes
        ]
        # A single select keeps the query plan flat; calling withColumn once per
        # column would stack one projection on top of another.
        self.df = self.df.select(projected)
        return self

    def clean_follower_count(self):
        """
        Cleans the follower_count column by converting entries with 'k' to thousands and 'M' to millions.
        Ensures that the data type is an integer.

        Only whole numbers followed by the suffix are recognised (e.g. "12k", "3M").
        Anything else — including values with a decimal point such as "1.5k" —
        falls through to the plain cast to int.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        raw_count = col("follower_count")
        thousands = regexp_replace(raw_count, "k", "").cast("int") * 1000
        millions = regexp_replace(raw_count, "M", "").cast("int") * 1000000
        self.df = self.df.withColumn(
            "follower_count",
            when(raw_count.rlike(r'^[\d]+k$'), thousands)
            .when(raw_count.rlike(r'^[\d]+M$'), millions)
            .otherwise(raw_count.cast("int"))
        )
        return self

    def convert_columns_to_int(self, columns):
        """
        Converts specified columns to the int data type.

        Parameters:
            columns (list): List of columns to convert to int data type.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        for column in columns:
            self.df = self.df.withColumn(column, col(column).cast("int"))
        return self

    def convert_corrupt_record_to_boolean(self):
        """
        Converts the '_corrupt_record' column to a boolean.
        Sets null values to False and all other values to True.

        If the DataFrame has no '_corrupt_record' column at all, the column is
        created with False for every row. Spark adds that column to an inferred
        JSON schema only when it met malformed records, so a clean input would
        otherwise make this step (and the later column selection) fail.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        if "_corrupt_record" in self.df.columns:
            # isNotNull() is never null itself: null -> False, any value -> True.
            is_corrupt = col("_corrupt_record").isNotNull()
        else:
            is_corrupt = lit(False)
        self.df = self.df.withColumn("_corrupt_record", is_corrupt)
        return self

    def clean_save_location(self):
        """
        Cleans the save_location column by removing a leading-URL part
        ("http(s)://<host>/") wherever it occurs in the value.

        NOTE(review): if the raw values carry a textual prefix instead of a URL,
        this pattern leaves them unchanged — confirm against the actual data.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        self.df = self.df.withColumn(
            "save_location",
            regexp_replace(col("save_location"), r"https?://[^/]+/", "")
        )
        return self

    def rename_index_column(self, old_name: str, new_name: str):
        """
        Renames a specified index column.

        Parameters:
            old_name (str): The current name of the index column.
            new_name (str): The new name for the index column.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        self.df = self.df.withColumnRenamed(old_name, new_name)
        return self

    def drop_rows_where_corrupt(self):
        """
        Keeps only the rows whose '_corrupt_record' flag is False.

        Expects the flag to have been turned into a boolean by
        convert_corrupt_record_to_boolean() beforehand. When the DataFrame has no
        '_corrupt_record' column there is nothing to drop and the DataFrame is
        left unchanged.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        if "_corrupt_record" not in self.df.columns:
            return self
        # Explicit comparison with False (not `~flag`) keeps the original filter semantics.
        self.df = self.df.filter(self.df["_corrupt_record"] == False)  # noqa: E712
        return self

    def reorder_columns(self, column_order: list):
        """
        Reorders the DataFrame columns to a specified order.

        Columns that are not listed are dropped, because the reordering is done
        with a select.

        Parameters:
            column_order (list): A list specifying the desired column order.

        Returns:
            self (PinterestDataCleaner): Returns the instance itself to allow method chaining.
        """
        self.df = self.df.select(column_order)
        return self

    def clean_pin_df(self):
        """
        Cleans the Pinterest DataFrame by performing the necessary transformations.

        The steps run in this order: blank strings to None, follower_count
        normalisation, corrupt flag to boolean, save_location clean-up, int casts,
        rename 'index' to 'ind', removal of corrupt rows, final column selection.

        Returns:
            DataFrame: The cleaned Pinterest DataFrame.
        """
        columns_to_convert = ["index", "follower_count", "downloaded"]
        final_column_order = [
            "ind", "unique_id", "title", "description", "follower_count",
            "poster_name", "tag_list", "is_image_or_video", "image_src",
            "save_location", "category", "downloaded", "_corrupt_record"
        ]

        self.replace_empty_with_none()\
            .clean_follower_count()\
            .convert_corrupt_record_to_boolean()\
            .clean_save_location()\
            .convert_columns_to_int(columns_to_convert)\
            .rename_index_column(old_name="index", new_name="ind")\
            .drop_rows_where_corrupt()\
            .reorder_columns(final_column_order)
        return self.df

In [None]:
class GeoDataCleaner:
    """
    A class to clean and preprocess Geo DataFrames.

    Every cleaning step replaces ``self.df`` with the transformed DataFrame and
    returns the cleaner itself, so the steps can be chained.

    Attributes:
        df (DataFrame): The Geo DataFrame to be cleaned.
    """

    def __init__(self, df: DataFrame):
        """
        Initializes the GeoDataCleaner with a DataFrame.

        Parameters:
            df (DataFrame): The Geo DataFrame to be cleaned.
        """
        self.df = df

    def create_coordinates(self):
        """
        Creates a new struct column 'coordinates' from the 'longitude' and
        'latitude' columns, with fields named 'longitude' and 'latitude'
        (in that order).

        Relies on ``struct`` from ``pyspark.sql.functions`` being imported at the
        top of the notebook.

        Returns:
            self (GeoDataCleaner): Returns the instance itself to allow method chaining.
        """
        coordinates = struct(
            col("longitude").alias("longitude"),
            col("latitude").alias("latitude")
        )
        self.df = self.df.withColumn("coordinates", coordinates)
        return self

    def drop_columns(self, columns):
        """
        Drops specified columns from the DataFrame.

        Parameters:
            columns (list): A list of column names to be dropped.

        Returns:
            self (GeoDataCleaner): Returns the instance itself to allow method chaining.
        """
        self.df = self.df.drop(*columns)
        return self

    def convert_timestamp(self, column_name: str):
        """
        Converts a string column to a timestamp data type, in place
        (the column keeps its name).

        Parameters:
            column_name (str): The name of the column to be converted.

        Returns:
            self (GeoDataCleaner): Returns the instance itself to allow method chaining.
        """
        self.df = self.df.withColumn(column_name, to_timestamp(col(column_name)))
        return self

    def reorder_columns(self, column_order: list):
        """
        Reorders the DataFrame columns to a specified order.

        Columns that are not listed are dropped, because the reordering is done
        with a select.

        Parameters:
            column_order (list): A list specifying the desired column order.

        Returns:
            self (GeoDataCleaner): Returns the instance itself to allow method chaining.
        """
        self.df = self.df.select(column_order)
        return self

    def clean_geo_df(self):
        """
        Cleans the Geo DataFrame by performing the necessary transformations.

        The steps run in this order: build 'coordinates', drop the now redundant
        'latitude' / 'longitude' columns, convert 'timestamp', final column selection.

        Returns:
            DataFrame: The cleaned Geo DataFrame.
        """
        self.create_coordinates()\
            .drop_columns(["latitude", "longitude"])\
            .convert_timestamp("timestamp")\
            .reorder_columns(["ind", "country", "coordinates", "timestamp"])
        return self.df

In [None]:
class UserDataCleaner:
    """
    Cleaning helper for the User DataFrame.

    Each step stores its result back into ``self.df`` and hands back the
    cleaner, which makes the steps chainable.

    Attributes:
        df (DataFrame): The User DataFrame to be cleaned.
    """

    def __init__(self, df: DataFrame):
        """
        Stores the DataFrame that the cleaning steps will operate on.

        Parameters:
            df (DataFrame): The User DataFrame to be cleaned.
        """
        self.df = df

    def create_user_name(self):
        """
        Adds a 'user_name' column: 'first_name' and 'last_name' joined by a single space.

        Returns:
            self (UserDataCleaner): The cleaner, for chaining.
        """
        full_name = concat_ws(" ", col("first_name"), col("last_name"))
        self.df = self.df.withColumn("user_name", full_name)
        return self

    def drop_columns(self, columns):
        """
        Removes the given columns from the DataFrame.

        Parameters:
            columns (list): Names of the columns to remove.

        Returns:
            self (UserDataCleaner): The cleaner, for chaining.
        """
        trimmed = self.df.drop(*columns)
        self.df = trimmed
        return self

    def convert_timestamp(self, column_name: str):
        """
        Turns a column into a timestamp column, keeping its name.

        Parameters:
            column_name (str): Name of the column to convert.

        Returns:
            self (UserDataCleaner): The cleaner, for chaining.
        """
        as_timestamp = to_timestamp(col(column_name))
        self.df = self.df.withColumn(column_name, as_timestamp)
        return self

    def convert_columns_to_int(self, columns):
        """
        Casts each of the given columns to int, one after the other.

        Parameters:
            columns (list): Names of the columns to cast.

        Returns:
            self (UserDataCleaner): The cleaner, for chaining.
        """
        frame = self.df
        for name in columns:
            frame = frame.withColumn(name, col(name).cast("int"))
            # Keep the attribute in step with every cast, exactly as a per-column update would.
            self.df = frame
        return self

    def reorder_columns(self, column_order: list):
        """
        Selects the columns in the requested order.

        Parameters:
            column_order (list): Column names in the desired order.

        Returns:
            self (UserDataCleaner): The cleaner, for chaining.
        """
        reordered = self.df.select(column_order)
        self.df = reordered
        return self

    def clean_user_df(self):
        """
        Runs the complete User cleaning pipeline.

        Steps, in order: build 'user_name', drop the two name parts, convert
        'date_joined' to a timestamp, cast 'ind' and 'age' to int, select the
        final columns.

        Returns:
            DataFrame: The cleaned User DataFrame.
        """
        int_columns = ["ind", "age"]

        # Every step returns the cleaner, so calling them one by one is the same as chaining.
        self.create_user_name()
        self.drop_columns(["first_name", "last_name"])
        self.convert_timestamp("date_joined")
        self.convert_columns_to_int(int_columns)
        self.reorder_columns(["ind", "user_name", "age", "date_joined"])
        return self.df


# Initialize the cleaners & run the cleaning scripts

In [None]:
# Wrap each raw DataFrame in the cleaner written for it
pin_cleaner, geo_cleaner, user_cleaner = (
    PinterestDataCleaner(df_pin),
    GeoDataCleaner(df_geo),
    UserDataCleaner(df_user),
)

# Run each cleaner's full pipeline and keep the resulting DataFrames
df_pin_cleaned = pin_cleaner.clean_pin_df()
df_geo_cleaned = geo_cleaner.clean_geo_df()
df_user_cleaned = user_cleaner.clean_user_df()

In [None]:
# Display the cleaned Pinterest DataFrame (the result of PinterestDataCleaner.clean_pin_df())
# so the applied transformations can be checked visually.
display(df_pin_cleaned)

In [None]:
# Display the cleaned Geo DataFrame (the result of GeoDataCleaner.clean_geo_df())
# so the applied transformations can be checked visually.
display(df_geo_cleaned)

In [None]:
# Display the cleaned User DataFrame (the result of UserDataCleaner.clean_user_df())
# so the applied transformations can be checked visually.
display(df_user_cleaned)

# Save DataFrames as new Delta Tables

In [None]:
# Create global temporary views
# Each cleaned DataFrame is registered under a fixed view name; an existing view
# with the same name is replaced, so re-running this cell is safe.
# This cell only registers views — it does not write any table itself.
# NOTE(review): Spark exposes global temp views through the `global_temp`
# database — confirm that downstream queries use that prefix.
df_pin_cleaned.createOrReplaceGlobalTempView("global_cleaned_pin")
df_geo_cleaned.createOrReplaceGlobalTempView("global_cleaned_geo")
df_user_cleaned.createOrReplaceGlobalTempView("global_cleaned_user")