In [0]:
# This script contains the dataframe cleaning methods defined for each dataset (to be used in both the batch and the stream processing pipelines)

In [0]:
# necessary imports
import re

from pyspark.sql.functions import array, col, concat_ws, to_timestamp, udf
from pyspark.sql.types import IntegerType

In [0]:
# defining the method for cleaning the Pinterest posts dataframe

def convert_follower_count_letters_to_digits(str):
    """Expand an abbreviated follower count such as '5k' or '1.5M' into plain digits.

    Args:
        str: the raw follower count string, or None. (The parameter keeps its
            original name for backward compatibility, even though it shadows
            the builtin.)

    Returns:
        None when the input is None; a string of digits when the input is a
        number followed by 'k' (thousands) or 'M' (millions); otherwise the
        input unchanged, leaving it to the caller to decide how to treat it.
    """
    if str is None:
        return None

    # whole part, optional decimal part, then the magnitude suffix
    parsed = re.fullmatch(r'\s*(\d+)(?:\.(\d*))?\s*([kM])\s*', str)
    if parsed is None:
        return str

    whole, fraction, suffix = parsed.groups()
    zeros = 3 if suffix == 'k' else 6

    # Shift the decimal point by padding/truncating the fractional digits, e.g.
    # '1.5M' -> '1' + '500000'. Simply substituting 'M' with '000000' would
    # produce '1.5000000', which is not 1,500,000. Working on the digit strings
    # also avoids any floating point rounding.
    shifted_fraction = ((fraction or '') + '0' * zeros)[:zeros]
    return whole + shifted_fraction
    
# Spark UDF wrapper so the follower count conversion can be applied to a dataframe column
follower_count_UDF = udf(convert_follower_count_letters_to_digits)

def strip_down_to_save_location_filepath(str):
    """Remove every occurrence of the 'Local save in ' label from a save location entry.

    Args:
        str: the raw save location string, or None.

    Returns:
        None when the input is None, otherwise the input with the label removed.
    """
    if str is None:
        return None
    return re.sub(r'Local save in ', '', str)
    
# Spark UDF wrapper so the save location clean-up can be applied to a dataframe column
save_location_UDF = udf(strip_down_to_save_location_filepath)

def clean_pin_df(df_pin):
    """Clean the Pinterest posts dataframe.

    Steps, in order: drop duplicate rows, null out placeholder values in
    follower_count, description, tag_list, title and image_src, convert
    follower_count to an integer column, strip the label from save_location,
    rename index to ind, and select the final columns in their output order.

    Args:
        df_pin: the raw posts dataframe.

    Returns:
        The cleaned dataframe containing only the selected columns.
    """

    # remove duplicate rows
    df_pin = df_pin.distinct()

    # replace entries with no data or no useful data with None
    # NB: the column has to be passed as subset=. The second positional argument
    # of na.replace is `value`, which is ignored when to_replace is a dict, so a
    # positional column name would leave subset unset and apply the replacement
    # to every column instead of just the intended one.
    df_pin = df_pin.na.replace({"User Info Error": None}, subset=['follower_count'])
    df_pin = df_pin.na.replace({"No description available": None,
                                "No description available Story format": None}, subset=['description'])
    df_pin = df_pin.na.replace({"N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e": None}, subset=['tag_list'])
    df_pin = df_pin.na.replace({"No Title Data Available": None}, subset=['title'])
    df_pin = df_pin.na.replace({"Image src error.": None}, subset=['image_src'])

    # convert all follower_count entries to numeric values
    # (the UDF yields a string of digits, which is then cast to an integer)
    df_pin = df_pin.withColumn("follower_count", follower_count_UDF(col("follower_count")).cast(IntegerType()))

    # clean save_location column to include just filepath
    df_pin = df_pin.withColumn("save_location", save_location_UDF(col("save_location")))

    # rename index column to ind
    df_pin = df_pin.withColumnRenamed("index", "ind")

    # reorder the columns (NB any column not listed here, e.g. downloaded, is dropped)
    df_pin = df_pin.select(["ind", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category"])

    # TODO: also turn tag list strings into array values?

    return df_pin

In [0]:
# defining the method for cleaning the geolocation dataframe

def clean_geo_df(df_geo):
    """Clean the geolocation dataframe.

    Combines latitude and longitude into a single coordinates array column,
    parses the timestamp column into a timestamp type, and selects the final
    columns in their output order.

    Args:
        df_geo: the raw geolocation dataframe.

    Returns:
        The cleaned dataframe with columns ind, country, coordinates, timestamp.
    """
    # `array` comes from pyspark.sql.functions and must be imported alongside
    # `col` at the top of the notebook, otherwise this line raises NameError
    coordinates = array(col("latitude"), col("longitude"))

    # NOTE(review): the pattern uses ':' as the date separator — confirm it
    # matches the format of the incoming timestamp strings
    parsed_timestamp = to_timestamp(col("timestamp"), "yyyy:MM:dd HH:mm:ss")

    df_geo = df_geo.withColumn("coordinates", coordinates)
    df_geo = df_geo.withColumn("timestamp", parsed_timestamp)

    # reorder the columns; latitude and longitude are not selected and so are dropped
    df_geo = df_geo.select(["ind", "country", "coordinates", "timestamp"])

    return df_geo

In [0]:
# defining the method for cleaning the users dataframe

def clean_user_df(df_user):
    """Clean the users dataframe.

    Builds a user_name column from first_name and last_name, parses
    date_joined into a timestamp, and selects the final columns in their
    output order.

    Args:
        df_user: the raw users dataframe.

    Returns:
        The cleaned dataframe with columns ind, user_name, age, date_joined.
    """
    # full name: first_name and last_name joined by a single space
    full_name = concat_ws(" ", col("first_name"), col("last_name"))

    # date_joined converted from string to timestamp data type
    joined_at = to_timestamp(col("date_joined"), "yyyy:MM:dd HH:mm:ss")

    return (
        df_user
        .withColumn("user_name", full_name)
        .withColumn("date_joined", joined_at)
        # reorder the columns; first_name and last_name are not selected and so are dropped
        .select(["ind", "user_name", "age", "date_joined"])
    )