In [None]:
# pyspark functions
from pyspark.sql.functions import *
# URL processing
import urllib

# Import the submodule explicitly: a bare `import urllib` does not by itself
# guarantee that `urllib.parse` has been loaded.
import urllib.parse

# Path to the Delta table that holds the AWS credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table into a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Fetch both keys from the same row with a single Spark job, and fail with a
# clear message when the credentials table is empty.
credentials_row = aws_keys_df.select("Access key ID", "Secret access key").first()
if credentials_row is None:
    raise ValueError("No AWS credentials found in {0}".format(delta_table_path))
ACCESS_KEY = credentials_row["Access key ID"]
SECRET_KEY = credentials_row["Secret access key"]
# URL-encode the secret key; safe="" also escapes "/" so the key cannot break the URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# AWS S3 bucket name
AWS_S3_BUCKET = "user-12f6b2c1ae4f-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/12f6b2c1ae4f-bucket"
# Source URL with the credentials embedded -- never print or log this value
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)

# Mount the bucket only when it is not mounted yet, so this cell can safely be
# re-run (mounting an already-mounted path raises an error).
# dbutils is available only inside the Databricks ecosystem.
already_mounted = any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts())
if not already_mounted:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
# Sanity check: if the S3 bucket is mounted correctly, its topic folders are listed here.
topics_dir = "/mnt/12f6b2c1ae4f-bucket/topics/"
topic_listing = dbutils.fs.ls(topics_dir)
display(topic_listing)

# Alternative way to inspect every mount on the cluster:
# %fs
# mounts


path,name,size,modificationTime
dbfs:/mnt/12f6b2c1ae4f-bucket/topics/12f6b2c1ae4f.geo/,12f6b2c1ae4f.geo/,0,1709426456068
dbfs:/mnt/12f6b2c1ae4f-bucket/topics/12f6b2c1ae4f.pin/,12f6b2c1ae4f.pin/,0,1709426456068
dbfs:/mnt/12f6b2c1ae4f-bucket/topics/12f6b2c1ae4f.user/,12f6b2c1ae4f.user/,0,1709426456068


In [None]:
# Location template and file type for the topic data.
# The {} placeholder is filled with the topic suffix; the asterisk (*) selects
# every .json file inside the partition folder.
file_location = "/mnt/12f6b2c1ae4f-bucket/topics/12f6b2c1ae4f.{}/partition=0/*.json"
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"


def _read_topic(topic):
    """Load all JSON files of one topic from the mounted S3 bucket into a DataFrame."""
    reader = spark.read.format(file_type).option("inferSchema", infer_schema)
    return reader.load(file_location.format(topic))


# One dataframe per topic
df_pin = _read_topic("pin")
df_geo = _read_topic("geo")
df_user = _read_topic("user")


In [None]:
# Preview the first seven rows of each freshly loaded dataframe.
for raw_frame in (df_pin, df_geo, df_user):
    display(raw_frame.head(7))

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
diy-and-crafts,Make this cute and easy gingerbread man paper bag puppet craft this Christmas! It comes with a FREE printable template and is great to create with preschool,1,267k,https://i.pinimg.com/originals/94/24/02/942402ca7264750c0cff719badabb10e.jpg,2732,image,Easy Kids Crafts & Activities | Preschool & Kindergarten Ideas,Local save in /data/diy-and-crafts,"Gingerbread Man Crafts,Gingerbread Man Activities,Christmas Activities For Kids,Winter Crafts For Kids,Preschool Christmas,Gingerbread Man Kindergarten,Gingerbread Man Template,Christmas Projects For Kids,Cheap Christmas Crafts",Gingerbread Man Paper Bag Puppet Craft [Free Template],800c4f90-50aa-4e3b-aa4f-2be20995f303
event-planning,20ftx10ft Blush Double Layer Polyester Chiffon Backdrop With Rod Pockets Write a review Item Number: BKDP300_046 Default Title - $107.99 USD $107.99Sale Price$166.59Retail Price,1,68k,https://i.pinimg.com/originals/55/1c/c6/551cc65e5aa06edf2eb25318c61ecbd4.jpg,4925,image,"eFavormart | One-Stop Shop for Wedding, Party & Event Supplies!",Local save in /data/event-planning,"Pink Photography,Background For Photography,Photography Backdrops,Wedding Photography,Photography Ideas,Burlap Backdrop,Fabric Backdrop,Pink Backdrop,Sequin Backdrop",20FT x 10FT Blush | Rose Gold Double Layer Polyester Chiffon Backdrop With Rod Pockets,662c77cd-d99a-45f8-9a36-62feadbb4f7b
christmas,"All the farmhouse Christmas decor ideas to create a gorgeous home for the holidays. Ideas to create a beautiful mantle, stocking you willl love, pillow covers and more!",1,91k,https://i.pinimg.com/originals/6f/df/3b/6fdf3bb828b057c76050d5f158694cf6.png,2288,image,"Kristen | Lifestyle, Mom Tips & Teacher Stuff Blog",Local save in /data/christmas,"Christmas Decorations For The Home,Winter Home Decor,Farmhouse Christmas Decor,Rustic Christmas,Christmas Fireplace,Christmas Mantels,Livingroom Christmas Decor,Christmas Home Decorating,Stocking Decorating Ideas",97 Farmhouse Christmas Decor Ideas For Your Home - Chaylor & Mads,991f5083-e4f2-4eee-bf85-d76bf84a08e2
christmas,Traditional Red Tartan Plaid Christmas Tree | 2016 Michaels Dream Tree Challenge #michaelsmakers - Classic red and gold Christmas Tree Decorating Ideas and Inspiration,1,116k,https://i.pinimg.com/originals/c8/ae/49/c8ae4959583799f661570eb7efb7aeaf.jpg,1928,image,Stephanie Lynn,Local save in /data/christmas,"Red And Gold Christmas Tree,Tartan Christmas,Traditional Christmas Tree,Gold Christmas Decorations,Vintage Halloween Decorations,Beautiful Christmas Trees,Christmas Tree Themes,Plaid Christmas,Christmas Traditions",Traditional Red Tartan Plaid Christmas Tree | 2016 Michaels Dream Tree Challenge - bystephanielynn,444c659e-146a-4536-88a6-5524f494a4c1
diy-and-crafts,These clay pot Christmas tree ornaments are so cute and fun for kids to make! They are an easy Christmas ornament craft that kids of all ages will be able to create.,1,20k,https://i.pinimg.com/originals/fd/5d/51/fd5d5164125a68b18f7b1186d68d2d4d.jpg,3232,image,Projects with Kids,Local save in /data/diy-and-crafts,"Easy Christmas Ornaments,Christmas Gifts For Teen Girls,Christmas Crafts For Gifts,Christmas Fun,Church Christmas Craft,Ornaments For Teachers,Ideas For Christmas Presents,Christmas Gifts For Teachers,Christmas Tree Decorations For Kids",How to Make Adorable Clay Pot Christmas Tree Ornaments,9958caec-20ac-477f-9e13-b16192f187b9
christmas,"With this Dollar Tree Christmas Village Hack you can create your own high end Christmas Village without the huge expense, and it's so easy!",1,24k,https://i.pinimg.com/originals/fb/37/55/fb375515aaebda118ab9c8c518fbcfc4.png,2424,image,Yami @ The Latina Next Door | Blogger | DIY | Budget Friendly Home Decor | Recipes | Latina,Local save in /data/christmas,"Christmas Tree Village,Dollar Tree Christmas,Christmas Hacks,Diy Christmas Ornaments,Christmas Projects,Diy Christmas Gifts,Christmas Decorations Dollar Tree,Christmas Villages,Homemade Christmas",Dollar Tree Christmas Village Hack - The Latina Next Door,c1e1a65b-20f6-4fe1-a89c-f5207486754f
christmas,"Discover how to make DIY yarn Christmas trees, easy and last minute tutorial for a beautiful home decoration idea!",1,436,https://i.pinimg.com/originals/2e/fc/51/2efc51831d8fc04f31496358cee4e3ef.jpg,1819,image,"Francine's Place Blog | DIY, lifestyle, creativity",Local save in /data/christmas,"Diy Christmas Crafts To Sell,Christmas On A Budget,Christmas Ornament Crafts,Simple Christmas Tree Decorations,How To Make Christmas Tree,Flocked Christmas Trees Decorated,White Christmas Trees,Diy Crafts With Yarn,Ideas For Christmas",DIY YARN CHRISTMAS TREES (EASY & LAST MINUTE DECORATION IDEA) - Francine's Place Blog,e20ac887-9f16-4848-9620-8798d8dce270


country,index,latitude,longitude,timestamp
British Indian Ocean Territory (Chagos Archipelago),9455,-82.9272,-150.346,2022-03-15 01:46:32
British Indian Ocean Territory (Chagos Archipelago),6814,-86.5675,-149.565,2022-09-02 11:34:28
British Indian Ocean Territory (Chagos Archipelago),5111,-83.7472,8.65953,2021-04-01 00:56:57
British Indian Ocean Territory (Chagos Archipelago),2989,-87.013,133.062,2020-01-09 19:18:54
Antarctica (the territory South of 60 deg S),10073,-32.8885,-170.295,2021-06-29 19:56:04
Antarctica (the territory South of 60 deg S),10073,-32.8885,-170.295,2021-06-29 19:56:04
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27 11:30:59


age,date_joined,first_name,index,last_name
42,2017-02-18 00:31:22,Christopher,6353,Hernandez
27,2016-03-08 13:38:37,Christopher,2015,Bradshaw
59,2017-05-12 21:22:17,Alexander,10673,Cervantes
48,2016-02-27 16:57:44,Christopher,1857,Hamilton
45,2016-09-15 06:02:53,Christopher,10020,Hawkins
35,2015-10-22 22:42:23,Christopher,2041,Campbell
48,2016-06-13 17:09:14,Christopher,7031,Anderson


## Cleaning the dataframe containing Pinterest data

In [None]:
from pyspark.sql.functions import col,when
from pyspark.sql.functions import regexp_extract

# Clean the pin data:
#   * placeholder texts that mark missing values become real nulls (not the
#     string "None"), so isNull()/IS NULL and aggregations treat them as missing;
#   * follower_count ("267k", "5M", "436") is converted to an integer;
#   * index is cast to int and renamed to ind;
#   * columns are reordered; "downloaded" and any other column not listed is dropped.
follower_text = col("follower_count")
# Numeric part of the count, with an optional decimal fraction and k/M suffix.
# Anything else (e.g. "User Info Error") does not match and yields "".
follower_digits = regexp_extract(follower_text, r"^(\d+(?:\.\d+)?)[kM]?$", 1)
# Only cast when something was extracted, so unparsable values become null.
follower_base = when(follower_digits != "", follower_digits.cast("double"))
# Scale by the suffix instead of appending zeros as text, which would not
# produce the right number for decimal counts such as "1.5M".
follower_scale = (
    when(follower_text.endswith("M"), 1_000_000)
    .when(follower_text.endswith("k"), 1_000)
    .otherwise(1)
)

df_pin = (
    df_pin
    .withColumn(
        "description",
        when(col("description").contains("No description available"), lit(None))
        .otherwise(col("description")),
    )
    .withColumn(
        "image_src",
        when(col("image_src").contains("Image src error"), lit(None))
        .otherwise(col("image_src")),
    )
    .withColumn("follower_count", (follower_base * follower_scale).cast("int"))
    .withColumn("index", col("index").cast("int"))
    .withColumnRenamed("index", "ind")
    .select(
        "ind", "unique_id", "title", "description", "follower_count", "poster_name",
        "tag_list", "is_image_or_video", "image_src", "save_location", "category",
    )
)

display(df_pin.head(6))

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
2732,800c4f90-50aa-4e3b-aa4f-2be20995f303,Gingerbread Man Paper Bag Puppet Craft [Free Template],Make this cute and easy gingerbread man paper bag puppet craft this Christmas! It comes with a FREE printable template and is great to create with preschool,267000,Easy Kids Crafts & Activities | Preschool & Kindergarten Ideas,"Gingerbread Man Crafts,Gingerbread Man Activities,Christmas Activities For Kids,Winter Crafts For Kids,Preschool Christmas,Gingerbread Man Kindergarten,Gingerbread Man Template,Christmas Projects For Kids,Cheap Christmas Crafts",image,https://i.pinimg.com/originals/94/24/02/942402ca7264750c0cff719badabb10e.jpg,Local save in /data/diy-and-crafts,diy-and-crafts
4925,662c77cd-d99a-45f8-9a36-62feadbb4f7b,20FT x 10FT Blush | Rose Gold Double Layer Polyester Chiffon Backdrop With Rod Pockets,20ftx10ft Blush Double Layer Polyester Chiffon Backdrop With Rod Pockets Write a review Item Number: BKDP300_046 Default Title - $107.99 USD $107.99Sale Price$166.59Retail Price,68000,"eFavormart | One-Stop Shop for Wedding, Party & Event Supplies!","Pink Photography,Background For Photography,Photography Backdrops,Wedding Photography,Photography Ideas,Burlap Backdrop,Fabric Backdrop,Pink Backdrop,Sequin Backdrop",image,https://i.pinimg.com/originals/55/1c/c6/551cc65e5aa06edf2eb25318c61ecbd4.jpg,Local save in /data/event-planning,event-planning
2288,991f5083-e4f2-4eee-bf85-d76bf84a08e2,97 Farmhouse Christmas Decor Ideas For Your Home - Chaylor & Mads,"All the farmhouse Christmas decor ideas to create a gorgeous home for the holidays. Ideas to create a beautiful mantle, stocking you willl love, pillow covers and more!",91000,"Kristen | Lifestyle, Mom Tips & Teacher Stuff Blog","Christmas Decorations For The Home,Winter Home Decor,Farmhouse Christmas Decor,Rustic Christmas,Christmas Fireplace,Christmas Mantels,Livingroom Christmas Decor,Christmas Home Decorating,Stocking Decorating Ideas",image,https://i.pinimg.com/originals/6f/df/3b/6fdf3bb828b057c76050d5f158694cf6.png,Local save in /data/christmas,christmas
1928,444c659e-146a-4536-88a6-5524f494a4c1,Traditional Red Tartan Plaid Christmas Tree | 2016 Michaels Dream Tree Challenge - bystephanielynn,Traditional Red Tartan Plaid Christmas Tree | 2016 Michaels Dream Tree Challenge #michaelsmakers - Classic red and gold Christmas Tree Decorating Ideas and Inspiration,116000,Stephanie Lynn,"Red And Gold Christmas Tree,Tartan Christmas,Traditional Christmas Tree,Gold Christmas Decorations,Vintage Halloween Decorations,Beautiful Christmas Trees,Christmas Tree Themes,Plaid Christmas,Christmas Traditions",image,https://i.pinimg.com/originals/c8/ae/49/c8ae4959583799f661570eb7efb7aeaf.jpg,Local save in /data/christmas,christmas
3232,9958caec-20ac-477f-9e13-b16192f187b9,How to Make Adorable Clay Pot Christmas Tree Ornaments,These clay pot Christmas tree ornaments are so cute and fun for kids to make! They are an easy Christmas ornament craft that kids of all ages will be able to create.,20000,Projects with Kids,"Easy Christmas Ornaments,Christmas Gifts For Teen Girls,Christmas Crafts For Gifts,Christmas Fun,Church Christmas Craft,Ornaments For Teachers,Ideas For Christmas Presents,Christmas Gifts For Teachers,Christmas Tree Decorations For Kids",image,https://i.pinimg.com/originals/fd/5d/51/fd5d5164125a68b18f7b1186d68d2d4d.jpg,Local save in /data/diy-and-crafts,diy-and-crafts
2424,c1e1a65b-20f6-4fe1-a89c-f5207486754f,Dollar Tree Christmas Village Hack - The Latina Next Door,"With this Dollar Tree Christmas Village Hack you can create your own high end Christmas Village without the huge expense, and it's so easy!",24000,Yami @ The Latina Next Door | Blogger | DIY | Budget Friendly Home Decor | Recipes | Latina,"Christmas Tree Village,Dollar Tree Christmas,Christmas Hacks,Diy Christmas Ornaments,Christmas Projects,Diy Christmas Gifts,Christmas Decorations Dollar Tree,Christmas Villages,Homemade Christmas",image,https://i.pinimg.com/originals/fb/37/55/fb375515aaebda118ab9c8c518fbcfc4.png,Local save in /data/christmas,christmas


## Cleaning the dataframe containing geolocation data

In [None]:
# Clean the geolocation data:
#   * timestamp is cast from string to a timestamp;
#   * index is cast to int and renamed to ind;
#   * latitude and longitude are combined into a single coordinates array;
#   * columns are reordered, dropping latitude and longitude.
df_geo = (
    df_geo
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
    # The cast must stay in the df_geo chain; assigning it to a separate
    # variable would silently discard it.
    .withColumn("index", col("index").cast("int"))
    .withColumnRenamed("index", "ind")
    .withColumn("coordinates", array("latitude", "longitude"))
    .select("ind", "country", "coordinates", "timestamp")
)

display(df_geo.head(6))

ind,country,coordinates,timestamp
9455,British Indian Ocean Territory (Chagos Archipelago),"List(-82.9272, -150.346)",2022-03-15T01:46:32.000+0000
6814,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2022-09-02T11:34:28.000+0000
5111,British Indian Ocean Territory (Chagos Archipelago),"List(-83.7472, 8.65953)",2021-04-01T00:56:57.000+0000
2989,British Indian Ocean Territory (Chagos Archipelago),"List(-87.013, 133.062)",2020-01-09T19:18:54.000+0000
10073,Antarctica (the territory South of 60 deg S),"List(-32.8885, -170.295)",2021-06-29T19:56:04.000+0000
10073,Antarctica (the territory South of 60 deg S),"List(-32.8885, -170.295)",2021-06-29T19:56:04.000+0000


## Cleaning the dataframe containing user data

In [None]:
# Clean the user data: build a full name, parse the join date, rename the key
# column and keep only the columns needed for the analysis.
full_name = concat(col("first_name"), lit(" "), col("last_name"))
joined_at = col("date_joined").cast("timestamp")

df_user = (
    df_user
    .withColumn("user_name", full_name)
    .withColumn("date_joined", joined_at)
    .withColumnRenamed("index", "ind")
    .select("ind", "user_name", "age", "date_joined")
)

display(df_user.head(5))

ind,user_name,age,date_joined
6353,Christopher Hernandez,42,2017-02-18T00:31:22.000+0000
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
10673,Alexander Cervantes,59,2017-05-12T21:22:17.000+0000
1857,Christopher Hamilton,48,2016-02-27T16:57:44.000+0000
10020,Christopher Hawkins,45,2016-09-15T06:02:53.000+0000


#### View all 3 transformed dataframes

In [None]:
# Preview the first five rows of each cleaned dataframe.
for cleaned_frame in (df_pin, df_geo, df_user):
    display(cleaned_frame.head(5))

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
2732,800c4f90-50aa-4e3b-aa4f-2be20995f303,Gingerbread Man Paper Bag Puppet Craft [Free Template],Make this cute and easy gingerbread man paper bag puppet craft this Christmas! It comes with a FREE printable template and is great to create with preschool,267000,Easy Kids Crafts & Activities | Preschool & Kindergarten Ideas,"Gingerbread Man Crafts,Gingerbread Man Activities,Christmas Activities For Kids,Winter Crafts For Kids,Preschool Christmas,Gingerbread Man Kindergarten,Gingerbread Man Template,Christmas Projects For Kids,Cheap Christmas Crafts",image,https://i.pinimg.com/originals/94/24/02/942402ca7264750c0cff719badabb10e.jpg,Local save in /data/diy-and-crafts,diy-and-crafts
4925,662c77cd-d99a-45f8-9a36-62feadbb4f7b,20FT x 10FT Blush | Rose Gold Double Layer Polyester Chiffon Backdrop With Rod Pockets,20ftx10ft Blush Double Layer Polyester Chiffon Backdrop With Rod Pockets Write a review Item Number: BKDP300_046 Default Title - $107.99 USD $107.99Sale Price$166.59Retail Price,68000,"eFavormart | One-Stop Shop for Wedding, Party & Event Supplies!","Pink Photography,Background For Photography,Photography Backdrops,Wedding Photography,Photography Ideas,Burlap Backdrop,Fabric Backdrop,Pink Backdrop,Sequin Backdrop",image,https://i.pinimg.com/originals/55/1c/c6/551cc65e5aa06edf2eb25318c61ecbd4.jpg,Local save in /data/event-planning,event-planning
2288,991f5083-e4f2-4eee-bf85-d76bf84a08e2,97 Farmhouse Christmas Decor Ideas For Your Home - Chaylor & Mads,"All the farmhouse Christmas decor ideas to create a gorgeous home for the holidays. Ideas to create a beautiful mantle, stocking you willl love, pillow covers and more!",91000,"Kristen | Lifestyle, Mom Tips & Teacher Stuff Blog","Christmas Decorations For The Home,Winter Home Decor,Farmhouse Christmas Decor,Rustic Christmas,Christmas Fireplace,Christmas Mantels,Livingroom Christmas Decor,Christmas Home Decorating,Stocking Decorating Ideas",image,https://i.pinimg.com/originals/6f/df/3b/6fdf3bb828b057c76050d5f158694cf6.png,Local save in /data/christmas,christmas
1928,444c659e-146a-4536-88a6-5524f494a4c1,Traditional Red Tartan Plaid Christmas Tree | 2016 Michaels Dream Tree Challenge - bystephanielynn,Traditional Red Tartan Plaid Christmas Tree | 2016 Michaels Dream Tree Challenge #michaelsmakers - Classic red and gold Christmas Tree Decorating Ideas and Inspiration,116000,Stephanie Lynn,"Red And Gold Christmas Tree,Tartan Christmas,Traditional Christmas Tree,Gold Christmas Decorations,Vintage Halloween Decorations,Beautiful Christmas Trees,Christmas Tree Themes,Plaid Christmas,Christmas Traditions",image,https://i.pinimg.com/originals/c8/ae/49/c8ae4959583799f661570eb7efb7aeaf.jpg,Local save in /data/christmas,christmas
3232,9958caec-20ac-477f-9e13-b16192f187b9,How to Make Adorable Clay Pot Christmas Tree Ornaments,These clay pot Christmas tree ornaments are so cute and fun for kids to make! They are an easy Christmas ornament craft that kids of all ages will be able to create.,20000,Projects with Kids,"Easy Christmas Ornaments,Christmas Gifts For Teen Girls,Christmas Crafts For Gifts,Christmas Fun,Church Christmas Craft,Ornaments For Teachers,Ideas For Christmas Presents,Christmas Gifts For Teachers,Christmas Tree Decorations For Kids",image,https://i.pinimg.com/originals/fd/5d/51/fd5d5164125a68b18f7b1186d68d2d4d.jpg,Local save in /data/diy-and-crafts,diy-and-crafts


ind,country,coordinates,timestamp
9455,British Indian Ocean Territory (Chagos Archipelago),"List(-82.9272, -150.346)",2022-03-15T01:46:32.000+0000
6814,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2022-09-02T11:34:28.000+0000
5111,British Indian Ocean Territory (Chagos Archipelago),"List(-83.7472, 8.65953)",2021-04-01T00:56:57.000+0000
2989,British Indian Ocean Territory (Chagos Archipelago),"List(-87.013, 133.062)",2020-01-09T19:18:54.000+0000
10073,Antarctica (the territory South of 60 deg S),"List(-32.8885, -170.295)",2021-06-29T19:56:04.000+0000


ind,user_name,age,date_joined
6353,Christopher Hernandez,42,2017-02-18T00:31:22.000+0000
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
10673,Alexander Cervantes,59,2017-05-12T21:22:17.000+0000
1857,Christopher Hamilton,48,2016-02-27T16:57:44.000+0000
10020,Christopher Hawkins,45,2016-09-15T06:02:53.000+0000


#### Create temporary views to run queries on

In [None]:
# Register each cleaned dataframe as a temporary view so it can be queried with Spark SQL.
for view_name, frame in (("pin", df_pin), ("geo", df_geo), ("user", df_user)):
    frame.createOrReplaceTempView(view_name)

#### Most popular category in each country

In [None]:
# Most popular category per country.
# The inner query joins the geo and pin views on ind, counts the rows for every
# (country, category) pair and ranks the categories inside each country by that
# count, highest first. The outer query keeps only rank 1.
# DENSE_RANK gives tied counts the same rank, so a country can return more than
# one category when several share the top count.
result= spark.sql("""SELECT 
                        country, category, category_count
                    FROM (
                        SELECT 
                            country, category, count(*) as category_count,
                            DENSE_RANK() OVER (PARTITION by country ORDER BY count(*) DESC) AS ranking
                        FROM geo
                        JOIN pin 
                        ON geo.ind = pin.ind
                        GROUP BY country, category
                    )
                    where ranking = 1"""
                )

display(result)

country,category,category_count
Afghanistan,education,6
Albania,education,12
Algeria,quotes,18
American Samoa,art,9
Andorra,tattoos,7
Angola,diy-and-crafts,3
Anguilla,diy-and-crafts,7
Antarctica (the territory South of 60 deg S),tattoos,7
Antigua and Barbuda,christmas,5
Argentina,tattoos,11


#### Most popular category each year

In [None]:
# Most popular category per year.
# The inner query joins the geo and pin views on ind, counts the rows for every
# (year of geo timestamp, category) pair and ranks the categories inside each
# year by that count, highest first. The outer query keeps only rank 1.
# DENSE_RANK gives tied counts the same rank, so a year can return more than one
# category. No year range filter is applied here.
result= spark.sql("""SELECT 
                        post_year, category, category_count
                    FROM (
                        SELECT 
                            year(timestamp) as post_year, category, count(*) as category_count,
                            DENSE_RANK() OVER (PARTITION by year(timestamp) ORDER BY count(*) DESC) AS ranking
                        FROM geo
                        JOIN pin 
                        ON geo.ind = pin.ind
                        GROUP BY year(timestamp), category
                    )
                    where ranking = 1"""
                )

display(result)

post_year,category,category_count
2017,mens-fashion,6
2018,diy-and-crafts,31
2019,art,22
2020,home-decor,22
2021,education,24
2021,quotes,24
2022,christmas,23


#### User with most followers in each country & Country with most followers

In [None]:
# User with the most followers in each country.
# A poster can appear with different follower counts across posts (the count may
# have grown between posts), so MAX() is used to take the highest count recorded
# for each (country, poster) pair.
# The posters are then ranked inside each country by that maximum and only rank 1
# is kept, so each country returns just its top poster(s). DENSE_RANK keeps ties,
# so a country can return several posters that share the highest count.
result_a = spark.sql("""SELECT
                            poster_name, country, max_follower_count
                        FROM (
                            SELECT
                                poster_name, country, MAX(follower_count) AS max_follower_count,
                                DENSE_RANK() OVER (PARTITION BY country ORDER BY MAX(follower_count) DESC) AS ranking
                            FROM pin
                            JOIN geo
                            ON pin.ind = geo.ind
                            GROUP BY country, poster_name
                        )
                        WHERE ranking = 1
                        ORDER BY max_follower_count DESC, country
                        """
                    )

display(result_a)

# Countries ranked by their single highest follower count.
# Joins the geo and pin views on ind, takes the maximum follower_count seen in
# each country, sorts by that maximum (highest first) and keeps the top five rows.
result_b = spark.sql("""SELECT 
                            country, MAX(follower_count) as max_follower_count
                        FROM geo
                        JOIN pin
                        ON pin.ind = geo.ind
                        GROUP BY country
                        ORDER BY max_follower_count DESC
                        LIMIT 5
                        """
                    )

display(result_b)


poster_name,country,max_follower_count
Mamas Uncut,American Samoa,8000000.0
Tastemade,Angola,8000000.0
Style Me Pretty,Azerbaijan,6000000.0
Behance,Burkina Faso,6000000.0
The Minds Journal,Albania,5000000.0
Apartment Therapy,Algeria,5000000.0
BuzzFeed,American Samoa,5000000.0
POPSUGAR,Bouvet Island (Bouvetoya),5000000.0
Ruffled,Comoros,4000000.0
imgur,Guam,4000000.0


country,max_follower_count
American Samoa,8000000
Angola,8000000
Burkina Faso,6000000
Azerbaijan,6000000
Albania,5000000


#### Popular categories based on age groups

In [None]:
# Most popular category per age group.
# The innermost query joins the user and pin views on ind, buckets each row into
# an age group (18-24, 25-35, 36-50, 50+) and counts the rows per
# (age_group, category). The CTE ranks the categories inside each age group by
# that count, highest first, and the final SELECT keeps only rank 1.
# The CASE has no ELSE branch (it is commented out), so rows whose age matches no
# bucket (under 18 or null) get a NULL age_group.
# DENSE_RANK gives tied counts the same rank, so a group can return more than one category.
result = spark.sql("""WITH CTE AS (
                        SELECT 
                            age_group, category, category_count,
                            DENSE_RANK() OVER (PARTITION BY age_group ORDER BY category_count DESC) AS ranking
                        FROM (
                            SELECT
                                CASE 
                                    WHEN age BETWEEN 18 AND 24 THEN "18-24"
                                    WHEN age BETWEEN 25 AND 35 THEN "25-35"
                                    WHEN age BETWEEN 36 AND 50 THEN "36-50"
                                    WHEN age > 50 THEN "50+"
                                    --ELSE "NONE" 
                                END AS age_group,
                                category,
                                COUNT(*) AS category_count
                            FROM user
                            JOIN pin ON pin.ind = user.ind
                            GROUP BY age_group, category
                        )
                    )
                    
                    SELECT 
                        age_group, category, category_count
                    FROM CTE
                    WHERE ranking = 1"""
                )

display(result)

age_group,category,category_count
18-24,diy-and-crafts,57
25-35,christmas,31
36-50,travel,30
50+,vehicles,15


#### Median follower count based on age group

In [None]:
# Median follower count per age group.
# Joins the user and pin views on ind, buckets each row into an age group
# (18-24, 25-35, 36-50, 50+) and computes the median follower_count per group
# with PERCENTILE_CONT(0.5). Results are sorted by the median, highest first.
# The CASE has no ELSE branch (it is commented out), so rows whose age matches no
# bucket (under 18 or null) get a NULL age_group.
result = spark.sql("""SELECT
                        CASE 
                            WHEN age BETWEEN 18 AND 24 THEN "18-24"
                            WHEN age BETWEEN 25 AND 35 THEN "25-35"
                            WHEN age BETWEEN 36 AND 50 THEN "36-50"
                            WHEN age > 50 THEN "50+"
                            --ELSE "NONE" 
                            END AS age_group, 
                        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY follower_count) AS median_follower_count
                FROM user 
                JOIN pin on pin.ind = user.ind
                GROUP BY age_group
                ORDER BY median_follower_count DESC"""
            )

display(result)

age_group,median_follower_count
18-24,190000.0
25-35,25500.0
36-50,9000.0
50+,1000.0


#### Users joined between 2015 and 2020

In [None]:
# Number of joined rows per joining year, restricted to 2015-2020.
# Joins the user and pin views on ind, groups by the year of date_joined and
# counts the rows with a non-null date_joined; HAVING keeps the years 2015-2020.
# NOTE(review): the count is taken over the joined rows, not over distinct users,
# so it is inflated if ind is not unique in either view -- confirm this is intended.
result = spark.sql("""SELECT 
                        YEAR(date_joined) as year_joined, count(date_joined) as new_users_joined
                    FROM user
                    JOIN pin
                    ON pin.ind = user.ind
                    GROUP BY year_joined
                    HAVING year_joined BETWEEN 2015 AND 2020
                    ORDER BY year_joined"""
)

display(result)

year_joined,new_users_joined
2015,394
2016,408
2017,135


#### Median follower count of users that have joined between 2015 and 2020

In [None]:
# Median follower count per joining year, restricted to 2015-2020.
# Joins the user and pin views on ind, groups by the year of date_joined and
# computes the median follower_count with PERCENTILE_CONT(0.5); HAVING keeps the
# years 2015-2020. The query has no ORDER BY, so the row order is not guaranteed.
results = spark.sql("""SELECT 
                        YEAR(date_joined) as year_joined, 
                        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY follower_count) AS median_follower_count
                    FROM user 
                    JOIN pin 
                    ON pin.ind = user.ind
                    GROUP by year_joined
                    HAVING year_joined BETWEEN 2015 and 2020"""
                )
display(results)

year_joined,median_follower_count
2015,203000.0
2016,20500.0
2017,4000.0


#### Median follower count based on joining year and age group

In [None]:
# Median follower count per (age group, joining year).
# Joins the user and pin views on ind, buckets each row into an age group
# (18-24, 25-35, 36-50, 50+), groups by age group and year of date_joined, and
# computes the median follower_count with PERCENTILE_CONT(0.5). Results are
# sorted by joining year, then age group. No year range filter is applied here.
# The CASE has no ELSE branch (it is commented out), so rows whose age matches no
# bucket (under 18 or null) get a NULL age_group.
result = spark.sql("""SELECT
                        CASE 
                            WHEN age BETWEEN 18 AND 24 THEN "18-24"
                            WHEN age BETWEEN 25 AND 35 THEN "25-35"
                            WHEN age BETWEEN 36 AND 50 THEN "36-50"
                            WHEN age > 50 THEN "50+"
                            --ELSE "NONE" 
                            END AS age_group,
                        YEAR(date_joined) as year_joined,
                        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY follower_count) AS median_follower_count
                    FROM user 
                    JOIN pin on pin.ind = user.ind
                    GROUP BY age_group, year_joined
                    ORDER BY year_joined, age_group"""
            )

display(result)

age_group,year_joined,median_follower_count
18-24,2015,296000.0
25-35,2015,100000.0
36-50,2015,19000.0
50+,2015,30000.0
18-24,2016,55000.0
25-35,2016,24000.0
36-50,2016,10500.0
50+,2016,504.0
18-24,2017,4000.0
25-35,2017,5000.0


In [None]:
# Clean up: detach the S3 bucket mounted at MOUNT_NAME at the start of the notebook.
# After this, the /mnt paths used above are no longer readable until the bucket is mounted again.
dbutils.fs.unmount(MOUNT_NAME)