## Cleaning the batch data dataframes

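The code in this notebook was originally run after the code in the notebook mount_s3_bucket.ipynb. It expects the dataframes `dirty_pin_df`, `dirty_geo_df` and `dirty_user_df` to be defined already before any of the cells below are run.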

### Cleaning pin dataframe

In [0]:
# Imported inside this cell so the cell does not depend on an earlier cell's imports
from pyspark.sql.functions import col, when

pin_df = dirty_pin_df

# Cleaning all the invalid data

# Column names that contain invalid data
columns_for_null = ['description', 'follower_count', 'image_src', 'poster_name', 'tag_list', 'title']

# Dictionary of invalid data entries to change to null.
# DataFrame.replace compares each key against the WHOLE cell value, so an entry
# only matches cells that are exactly equal to it (no wildcard support).
values_for_null = {"No description available%": None,
                   "User Info Error": None,
                   "Image src error.": None,
                   "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e": None,
                   "No Title Data Available": None}

# `subset` accepts the full list of columns, so one call covers every column
# (no need to loop and call replace once per column)
pin_df = pin_df.replace(values_for_null, subset=columns_for_null)

# The trailing "%" in "No description available%" is treated by replace() as a
# literal character, not as a SQL-style wildcard, so descriptions that merely
# START with the placeholder text are left untouched by the call above.
# Null those out explicitly with a prefix match on the description column.
description_placeholder_prefix = "No description available"
pin_df = pin_df.withColumn(
    "description",
    when(col("description").startswith(description_placeholder_prefix), None)
    .otherwise(col("description")),
)
  

In [0]:
# Expands the "k" and "M" abbreviations in follower_count into zeros so that
# every entry is made up of digits only
# NOTE(review): the patterns are not anchored to the end of the value and a
# decimal entry such as "1.5k" would become "1.5000" - confirm the data never
# contains such values
follower_count_digits = regexp_replace(
    regexp_replace("follower_count", "k", "000"),
    "M",
    "000000",
)

# Strips the leading text from save_location, leaving only the path
save_location_path = regexp_replace("save_location", "Local save in ", "")

# Final column order for the cleaned pin dataframe
pin_column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
]

pin_df = (
    pin_df
    # follower_count is stored as an integer once the abbreviations are expanded
    .withColumn("follower_count", follower_count_digits.cast('int'))
    .withColumn("save_location", save_location_path)
    # The index column is called ind in the cleaned dataframe
    .withColumnRenamed("index", "ind")
    .select(*pin_column_order)
)

display(pin_df)

In [0]:
# Prints the schema of pin_df so that the data type of each column can be
# checked by eye after the cleaning steps above
pin_df.printSchema()

### Cleaning geo dataframe

In [0]:
geo_df = dirty_geo_df

# Single array column holding latitude followed by longitude
lat_long = array(col("latitude"), col("longitude"))

geo_df = (
    geo_df
    # Adds the combined 'coordinates' column
    .withColumn("coordinates", lat_long)
    # The separate latitude and longitude columns are no longer needed
    .drop("latitude", "longitude")
    # Stores timestamp as a timestamp data type
    .withColumn("timestamp", col("timestamp").cast(TimestampType()))
    # Puts the columns in their final order
    .select("ind", "country", "coordinates", "timestamp")
)

display(geo_df)

In [0]:
# Prints the schema of geo_df so that the data type of each column can be
# checked by eye after the cleaning steps above
geo_df.printSchema()

### Cleaning user dataframe

In [0]:
user_df = dirty_user_df

# first_name and last_name joined with a single space between them
full_name = concat_ws(" ", col("first_name"), col("last_name"))

user_df = (
    user_df
    # Adds the combined 'user_name' column
    .withColumn("user_name", full_name)
    # The separate name columns are no longer needed
    .drop("first_name", "last_name")
    # Stores date_joined as a timestamp data type
    .withColumn("date_joined", col("date_joined").cast(TimestampType()))
    # Puts the columns in their final order
    .select("ind", "user_name", "age", "date_joined")
)

display(user_df)
     

In [0]:
# Prints the schema of user_df so that the data type of each column can be
# checked by eye after the cleaning steps above
user_df.printSchema()