In [0]:

'''
AiCore Pinterest Data Pipeline Project
Load data from S3 into PySpark DataFrames; clean and analyse the data.
This code is intended to run in a Databricks notebook.
Author: Kristina Gorkovskaya
Date: 2023-09-30
'''

import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# Defaults that load_data uses to build the path of a topic: /mnt/<mount_name>/topics/<user_id>.<topic>
user_id = '0ec858bf1407'
mount_name = 'pinterest'

# Pandas display settings: no limit on displayed columns, at most 100 displayed rows
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

def load_data(topic_suffix: str, user_id: str = user_id, mount_name: str = mount_name) -> pyspark.sql.dataframe.DataFrame:
  '''
  Read the JSON files of one topic into a PySpark DataFrame.

  Files are read from /mnt/<mount_name>/topics/<user_id>.<topic_suffix>/partition=0/*.json
  using the notebook's `spark` session. A progress message and the number of
  loaded records are printed; the loaded DataFrame is returned.
  '''
  topic = f'{user_id}.{topic_suffix}'
  print(f'Loading topic {topic}...', end=' ')

  json_glob = f'/mnt/{mount_name}/topics/{topic}/partition=0/*.json'
  reader = spark.read.format('json').option('inferSchema', True)
  df = reader.load(json_glob)

  record_count = df.count()
  print(f'{record_count:,} records loaded.')
  return df

def show_non_numeric_patterns(df: pyspark.sql.dataframe.DataFrame, col: str) -> pyspark.sql.dataframe.DataFrame:
  '''
  Reveal non-numeric content in a column that is expected to hold numbers.

  Adds a "<col>_pattern" column in which every run of digits is replaced by a
  single "9", prints the number of rows per distinct pattern, and returns the
  DataFrame including the new pattern column.
  '''
  pattern_col = f'{col}_pattern'
  digits_collapsed = F.regexp_replace(col, '[0-9]+', '9')

  df_with_pattern = df.withColumn(pattern_col, digits_collapsed)
  df_with_pattern.groupBy(pattern_col).count().show()
  return df_with_pattern


In [0]:

# Read the pins topic into df_pin, then print its schema to inspect column names and data types
df_pin = load_data(topic_suffix='pin')
df_pin.printSchema()


In [0]:
#############################################################################################################
# TASK 1: Clean the df_pin DataFrame
#############################################################################################################
# (1) Replace empty entries and entries with no relevant data in each column with Nones

# Baseline: count the null / NaN entries of every column before anything is replaced
nulls = {col: df_pin.filter((df_pin[col].isNull()) | F.isnan(col)).count() for col in df_pin.columns}
print(nulls)

# Replace NaNs with None
df_pin = df_pin.replace(float('nan'), None)

# Replace empty strings with None.
# The select rebuilds every column, so the column list has to come from df_pin itself.
df_pin = df_pin.select([F.when(df_pin[c] == "", None).otherwise(df_pin[c]).alias(c) for c in df_pin.columns])

# Count nulls again; comparing with the baseline shows how many entries were replaced per column
nulls = {col: df_pin.filter((df_pin[col].isNull()) | F.isnan(col)).count() for col in df_pin.columns}
print(nulls)

In [0]:
# (2) Perform the necessary transformations on the follower_count to ensure every entry is a number.
# Step one: list the value patterns to find out which non-numeric characters occur
df_pin = show_non_numeric_patterns(df=df_pin, col='follower_count')

In [0]:
# Derive the factor by which each follower_count has to be scaled.
# Assumed suffix meanings: "k" = 1,000; "M" = 10^6; any other value is scaled by 1
multiplier_expr = (
    F.when(df_pin.follower_count.like('%k'), 1000)
    .when(df_pin.follower_count.like('%M'), 1000000)
    .otherwise(1)
)
df_pin = df_pin.withColumn('follower_count_multiplier', multiplier_expr)

# Sanity check: list which multiplier was assigned to each follower_count pattern
(
    df_pin
    .groupBy('follower_count_pattern', 'follower_count_multiplier')
    .count()
    .show()
)

In [0]:
# Keep only the digits of follower_count, cast them to int and scale by the multiplier.
# 'User Info Error' carries no number, so it is mapped to None instead.
digits_only = F.regexp_replace('follower_count', '[^0-9]+', '')
parsed_count = F.when(df_pin.follower_count == 'User Info Error', None).otherwise(digits_only)
df_pin = df_pin.withColumn(
    'follower_count_numeric',
    parsed_count.cast(IntegerType()) * df_pin.follower_count_multiplier,
)

# Spot check: show a random sample of 5 rows for each pattern to make sure the calculations are correct
df_pin.createOrReplaceTempView('df_pin')
sql = """
select distinct follower_count, follower_count_numeric 
from (
    select 
        follower_count, 
        follower_count_numeric, 
        row_number() over (partition by follower_count_pattern order by random()) as rn
    from df_pin 
)
where rn <= 5
order by follower_count_numeric
"""
spark.sql(sql).show(truncate=False)

In [0]:
# Overwrite follower_count with its parsed numeric version, then discard the helper columns
helper_cols = ('follower_count_numeric', 'follower_count_pattern', 'follower_count_multiplier')
df_pin = (
    df_pin
    .withColumn('follower_count', df_pin.follower_count_numeric)
    .drop(*helper_cols)
)
df_pin.printSchema()

In [0]:
# (3) Ensure that each column containing numeric data has a numeric data type
# Preview the first 10 rows so that candidate numeric columns can be picked out by eye
df_pin.show(n=10, truncate=True)

In [0]:
# Judging by the preview, the numeric columns are: downloaded, index, follower_count.
# follower_count has been converted already, so only the other two still need checking.
numeric_cols = ['downloaded', 'index']
for numeric_col in numeric_cols:
    df_pin = show_non_numeric_patterns(df=df_pin, col=numeric_col)

In [0]:
# The pattern check showed only integers in downloaded and index, so both columns
# can be cast to int directly, without any prior transformation
for numeric_col in numeric_cols:
    df_pin = df_pin.withColumn(numeric_col, df_pin[numeric_col].cast(IntegerType()))

# show_non_numeric_patterns added one "<column>_pattern" helper column per checked column; remove them
pattern_cols = [f'{name}_pattern' for name in numeric_cols]
df_pin = df_pin.drop(*pattern_cols)

# Print the schema to confirm the transformations were performed as expected
df_pin.printSchema()

In [0]:
# (4) Clean the data in the save_location column to include only the save location path
# Begin with a manual look at the values to spot patterns.
# Count the unique values, then cache df_pin to speed up any later calls to distinct()
num_unique_vals = df_pin.select('save_location').distinct().count()
df_pin.cache()

print(f'{num_unique_vals:,} unique values')
save_locations = df_pin.select('save_location')
if num_unique_vals >= 50:
    # Too many unique values to list them all: show the first 10 rows instead
    save_locations.show(10, truncate=False)
else:
    save_locations.distinct().show(truncate=False)

In [0]:
# Every record follows the same pattern, so one regexp replace is enough to strip the prefix
save_path = F.regexp_replace('save_location', 'Local save in ', '')
df_pin = df_pin.withColumn('save_location', save_path)
df_pin.select('save_location').distinct().show(truncate=False)

In [0]:
# (5) Rename the index column to ind.
# (6) Reorder the DataFrame columns.

# Target column order; the select below keeps only the columns listed here
cols = [
    'ind', 'unique_id', 'title', 'description', 'follower_count', 'poster_name',
    'tag_list', 'is_image_or_video', 'image_src', 'save_location', 'category',
]
df_pin = df_pin.withColumnRenamed('index', 'ind')
df_pin = df_pin.select(cols)
df_pin.printSchema()