In [0]:
dbutils.fs.ls("/FileStore/tables")

In [0]:
# pyspark functions
from pyspark.sql.functions import *
# URL processing
import urllib

In [0]:
# Specify file type to be csv
file_type = "csv"
# Indicates file has first row as the header
first_row_is_header = "true"
# Indicates file has comma as the delimeter
delimiter = ","
# Read the CSV file to spark dataframe
aws_keys_df = spark.read.format(file_type)\
.option("header", first_row_is_header)\
.option("sep", delimiter)\
.load("/FileStore/tables/authentication_credentials.csv")

In [0]:
# Get the AWS access key and secret key from the spark dataframe
ACCESS_KEY = aws_keys_df.where(col('User name')=='databricks-user').select('Access key ID').collect()[0]['Access key ID']
SECRET_KEY = aws_keys_df.where(col('User name')=='databricks-user').select('Secret access key').collect()[0]['Secret access key']
# Encode the secrete key
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-0a48d8473ced-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/user-0a48d8473ced-bucket"
# Source url
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive
dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [0]:
display(dbutils.fs.ls("/mnt/user-0a48d8473ced-bucket/../.."))

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,1697025807192
dbfs:/Volume/,Volume/,0,0
dbfs:/Volumes/,Volumes/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/delta/,delta/,0,1697025807192
dbfs:/df_pin.csv/,df_pin.csv/,0,1697025807192
dbfs:/local_disk0/,local_disk0/,0,1697025807192
dbfs:/mnt/,mnt/,0,1697025807192
dbfs:/pin_kinesis_events/,pin_kinesis_events/,0,1697025807192


In [0]:
# File location and type
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location_pin = "/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/*.json"
file_location_geo = "/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.geo/partition=0/*.json" 
file_location_user = "/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.user/partition=0/*.json" 

file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
df_pin = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location_pin)
df_geo = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location_geo)
df_user = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location_user)

# Display Spark dataframe to check its content
display(df_pin)
display(df_geo)
display(df_user)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
event-planning,Το όνομα που επέλεξε η μαμά Ανδριανή για τη γλυκιά Τιτίκα δεν είναι καθόλου τυχαίο. Και φυσικά δεν άφησε τίποτα στην τύχη ούτε την ημέρα της βάπτισης. Ανέθεσε την οργάνωση στην…,1,4,https://i.pinimg.com/originals/db/aa/d2/dbaad28fa85012a4ea6958540d98a8e5.jpg,4387,image,Manosbojana Katsareas,Local save in /data/event-planning,"Diy Flowers,Flower Diy,Baptism Decorations,Christening,Event Planning,Wedding Planner,Baptism Ideas,Birthday,Party",Βάπτιση: H παραμυθένια βάπτιση της Τιτίκας με θέμα το μονόκερο από την e.m. for you,ae5e7377-f1bd-4ac5-94de-bee317f51a43
home-decor,"Традиционные шведские коттеджи, обычно с красным фасадом — это настоящее воплощением идеального зимнего уюта. Они обычно оформлены очень просто и ✌PUFIK. Beautiful Interiors. On…",1,136k,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,6717,image,PUFIK Interiors & Inspirations,Local save in /data/home-decor,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",〚 Уютные шведские коттеджи от Carina Olander 〛 ◾ Фото ◾ Идеи ◾ Дизайн,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3
event-planning,"15.1k Likes, 83 Comments - THE EVENT COLLECTIVE ✖️ (@theeventcollectivex) on Instagram: “I’ve always loved emerald green 🌲 by @a.purnellproduction Beautiful balloons by…”",1,311,https://i.pinimg.com/originals/91/0b/5c/910b5c120f7d1570ffc840302d7b49f4.jpg,4858,image,Marie Bradford,Local save in /data/event-planning,"Diy Birthday Decorations,Balloon Decorations,Table Decorations,Emerald Green Decor,40th Birthday Parties,24th Birthday,Surprise Birthday,Brunch Decor,Quinceanera Themes",THE EVENT COLLECTIVE ✖️ on Instagram: “I’ve always loved emerald green 🌲 by @a.purnellproduction Beautiful balloons by @basicallycuteevents @inspiredengravings for the acrylic…”,58101415-9273-4311-a5bd-0015a56579b4
event-planning,"Wow your guests! Our backdrops are a great option for providing a personalized, stylish and fun addition to your party .It will be the focal point in any event! They are great a…",1,1k,https://i.pinimg.com/originals/15/1f/93/151f93d662dc158ca2c9bbfed198f556.jpg,4608,image,"Iconica Design | Personalized Event Decor, Stationery & Gifts",Local save in /data/event-planning,"Christmas Party Backdrop,Holiday Banner,Birthday Backdrop,Circus First Birthday,First Birthday Banners,Dinasour Birthday,Birthday Bash,Banner Backdrop,Photo Booth Backdrop","Virtual Baby Shower Little Man Baby Shower Banner, Mustache Baby Shower Backdrop, Oh Boy, Any Color, Printed Or Printable File BBS0035 - 10x8 ft / Top Pole Pocket",d234e56f-5b18-4ef3-905b-44103f7719d9
home-decor,"6,636 Likes, 141 Comments - The Cottage Journal (@thecottagejournal) on Instagram: “Can you say color?! 😍😍😍 We are loving the cheery vibes that these aqua blue cabinets are g…",1,394,https://i.pinimg.com/originals/8c/17/a2/8c17a257b70780480bb89c3699363144.jpg,6633,image,Sarah Martin,Local save in /data/home-decor,"Diy Kitchen Cabinets,Kitchen Redo,Home Decor Kitchen,New Kitchen,Home Kitchens,Kitchen Remodeling,Aqua Kitchen,Kitchen Counters,Kitchen Islands",The Cottage Journal on Instagram: “Can you say color?! 😍😍😍 We are loving the cheery vibes that these aqua blue cabinets are giving. If you could paint your cabinets any…”,d136f6bc-840d-44f8-bbad-115eb7e6c51e
christmas,Make your own gingerbread person with our free Christmas craft. 4 pages of accessories to mix & match! #gingerbreadchristmasdecor #gingerbreadcraftspreschool #gingerbreadcraftfo…,1,7k,https://i.pinimg.com/originals/ca/59/b1/ca59b1055ca52521b9ebd01799513b8c.jpg,2539,image,"Mrs. Merry | Free Printables for Kids, Holiday Printables & Party",Local save in /data/christmas,"Christmas Projects For Kids,Christmas To Do List,Christmas Decorations For Kids,Christmas Activities For Kids,Preschool Christmas,Free Christmas Printables,Christmas Books,Christmas Themes,Gingerbread Christmas Decor",Free Kids Printable - Build a Gingerbread Person Craft - Christmas Activities for Kids | Mrs. Merry,cd2c667e-da47-4818-8f94-3def20b90864
christmas,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",1,5k,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,1706,image,Wear24-7,Local save in /data/christmas,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,b5c8a1b5-9e90-4522-9bec-2477b698d5b7
christmas,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",1,5k,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,1706,image,Wear24-7,Local save in /data/christmas,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,b5c8a1b5-9e90-4522-9bec-2477b698d5b7
diy-and-crafts,"This handprint reindeer is a fun DIY Christmas craft for kids to make. It's easy enough for toddlers, preschool and kindergarten children to make and it comes with a free printa…",1,267k,https://i.pinimg.com/originals/6d/56/f7/6d56f7f568174b9fd6160f1d96cd3186.jpg,3431,image,Easy Kids Crafts & Activities | Preschool & Kindergarten Ideas,Local save in /data/diy-and-crafts,"Christmas Arts And Crafts,Christmas Activities For Kids,Christmas Fun,Christmas Crafts For Kids To Make At School,Childrens Christmas Crafts,Homemade Christmas,Toddler Christmas Crafts,Christmas Decorations Diy For Kids,Easy Christmas Crafts For Toddlers",Handprint Reindeer Craft For Kids [Free Template],cb2453aa-3075-44a9-b8b1-a0d99cacab64
christmas,"You HAVE TO check out these modern minimalist Christmas decorations! They're SO GOOD! I'm so glad I found these understated Christmas decoration ideas, definitely going to use t…",1,37k,https://i.pinimg.com/originals/84/40/67/84406731f0b32ec71829e6ca9b2dddeb.jpg,1845,image,"Joyfully Growing Blog | All things DIY + home decor, budget friendly hacks, & blogging tips",Local save in /data/christmas,"Decoration Christmas,Farmhouse Christmas Decor,Decoration Table,Christmas Decorations For Apartment,Apartment Holiday Decor,Christmas Decor For Stairs,Tv Stand Christmas Decor,Home Decorations,Modern Christmas Decor",12 Minimalist Christmas Decorations You'll Want to Copy This Year,d1ec8a17-5517-42a9-82f3-fc0fb26a87b2


country,ind,latitude,longitude,timestamp
British Indian Ocean Territory (Chagos Archipelago),9455,-82.9272,-150.346,2022-03-15 01:46:32
British Indian Ocean Territory (Chagos Archipelago),6814,-86.5675,-149.565,2022-09-02 11:34:28
British Indian Ocean Territory (Chagos Archipelago),7151,-14.6744,-75.3714,2020-06-05 23:37:24
British Indian Ocean Territory (Chagos Archipelago),8221,-20.5574,-54.4834,2021-12-29 06:33:46
British Indian Ocean Territory (Chagos Archipelago),7569,-86.5675,-149.565,2018-10-16 08:40:26
British Indian Ocean Territory (Chagos Archipelago),5111,-83.7472,8.65953,2021-04-01 00:56:57
British Indian Ocean Territory (Chagos Archipelago),2989,-87.013,133.062,2020-01-09 19:18:54
British Indian Ocean Territory (Chagos Archipelago),1624,9.50751,119.757,2020-10-24 11:13:47
Antarctica (the territory South of 60 deg S),10073,-32.8885,-170.295,2021-06-29 19:56:04
Antarctica (the territory South of 60 deg S),10073,-32.8885,-170.295,2021-06-29 19:56:04


age,date_joined,first_name,ind,last_name
42,2017-02-18 00:31:22,Christopher,6353,Hernandez
27,2016-03-08 13:38:37,Christopher,2015,Bradshaw
59,2017-05-12 21:22:17,Alexander,10673,Cervantes
48,2016-02-27 16:57:44,Christopher,1857,Hamilton
45,2016-09-15 06:02:53,Christopher,10020,Hawkins
35,2015-10-22 22:42:23,Christopher,2041,Campbell
48,2016-06-13 17:09:14,Christopher,7031,Anderson
50,2017-02-07 08:09:03,Benjamin,8075,Fitzpatrick
36,2016-06-01 09:41:09,Christopher,2539,Williams
39,2016-06-29 20:43:59,Christina,6398,Davenport


In [0]:
display(dbutils.fs.ls("/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/"))


path,name,size,modificationTime
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000000.json,0a48d8473ced.pin+0+0000000000.json,444,1696877757000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000001.json,0a48d8473ced.pin+0+0000000001.json,681,1696877760000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000002.json,0a48d8473ced.pin+0+0000000002.json,714,1696877763000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000003.json,0a48d8473ced.pin+0+0000000003.json,640,1696877766000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000004.json,0a48d8473ced.pin+0+0000000004.json,444,1696877800000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000005.json,0a48d8473ced.pin+0+0000000005.json,681,1696877803000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000006.json,0a48d8473ced.pin+0+0000000006.json,444,1697024999000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000007.json,0a48d8473ced.pin+0+0000000007.json,681,1697025002000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000008.json,0a48d8473ced.pin+0+0000000008.json,714,1697025004000
dbfs:/mnt/user-0a48d8473ced-bucket/topics/0a48d8473ced.pin/partition=0/0a48d8473ced.pin+0+0000000009.json,0a48d8473ced.pin+0+0000000009.json,640,1697025007000


In [0]:
#Clean the df_pin DataFrame
#Replace empty entries and entries with no relevant data in each column with Nones
df_pin = df_pin.replace('', None)
df_pin = df_pin.replace(' ', None)
df_pin = df_pin.replace('nan', None)
#Perform the necessary transformations on the follower_count to ensure every entry is a number. 
# Make sure the data type of this column is an int.
#The follower count is a string some with a k or M at the end.
#Replace the k with 000 
#Replace the M with 000000 and convert the string to an integer
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', 'k', '000'))
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', 'M', '000000'))
df_pin = df_pin.withColumn('follower_count', df_pin['follower_count'].cast('int'))
#Clean the data in the save_location column to include only the save location path
#The column contains Local save in /data/path
#Replace Local save in with nothing
df_pin = df_pin.withColumn('save_location', regexp_replace('save_location', 'Local save in ', ''))
#Rename the index column to ind
df_pin = df_pin.withColumnRenamed('index', 'ind')
#Reorder the DataFrame columns to have the following column order:
# ind unique_id title description follower_count poster_name tag_list is_image_or_video
# image_src save_location category
df_pin = df_pin.select('ind', 'unique_id', 'title', 'description', 'follower_count', 'poster_name', 'tag_list', 'is_image_or_video', 'image_src', 'save_location', 'category')

In [0]:
display(df_pin)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
4387,ae5e7377-f1bd-4ac5-94de-bee317f51a43,Βάπτιση: H παραμυθένια βάπτιση της Τιτίκας με θέμα το μονόκερο από την e.m. for you,Το όνομα που επέλεξε η μαμά Ανδριανή για τη γλυκιά Τιτίκα δεν είναι καθόλου τυχαίο. Και φυσικά δεν άφησε τίποτα στην τύχη ούτε την ημέρα της βάπτισης. Ανέθεσε την οργάνωση στην…,4.0,Manosbojana Katsareas,"Diy Flowers,Flower Diy,Baptism Decorations,Christening,Event Planning,Wedding Planner,Baptism Ideas,Birthday,Party",image,https://i.pinimg.com/originals/db/aa/d2/dbaad28fa85012a4ea6958540d98a8e5.jpg,/data/event-planning,event-planning
6717,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3,〚 Уютные шведские коттеджи от Carina Olander 〛 ◾ Фото ◾ Идеи ◾ Дизайн,"Традиционные шведские коттеджи, обычно с красным фасадом — это настоящее воплощением идеального зимнего уюта. Они обычно оформлены очень просто и ✌PUFIK. Beautiful Interiors. On…",136000.0,PUFIK Interiors & Inspirations,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",image,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,/data/home-decor,home-decor
4858,58101415-9273-4311-a5bd-0015a56579b4,THE EVENT COLLECTIVE ✖️ on Instagram: “I’ve always loved emerald green 🌲 by @a.purnellproduction Beautiful balloons by @basicallycuteevents @inspiredengravings for the acrylic…”,"15.1k Likes, 83 Comments - THE EVENT COLLECTIVE ✖️ (@theeventcollectivex) on Instagram: “I’ve always loved emerald green 🌲 by @a.purnellproduction Beautiful balloons by…”",311.0,Marie Bradford,"Diy Birthday Decorations,Balloon Decorations,Table Decorations,Emerald Green Decor,40th Birthday Parties,24th Birthday,Surprise Birthday,Brunch Decor,Quinceanera Themes",image,https://i.pinimg.com/originals/91/0b/5c/910b5c120f7d1570ffc840302d7b49f4.jpg,/data/event-planning,event-planning
4608,d234e56f-5b18-4ef3-905b-44103f7719d9,"Virtual Baby Shower Little Man Baby Shower Banner, Mustache Baby Shower Backdrop, Oh Boy, Any Color, Printed Or Printable File BBS0035 - 10x8 ft / Top Pole Pocket","Wow your guests! Our backdrops are a great option for providing a personalized, stylish and fun addition to your party .It will be the focal point in any event! They are great a…",1000.0,"Iconica Design | Personalized Event Decor, Stationery & Gifts","Christmas Party Backdrop,Holiday Banner,Birthday Backdrop,Circus First Birthday,First Birthday Banners,Dinasour Birthday,Birthday Bash,Banner Backdrop,Photo Booth Backdrop",image,https://i.pinimg.com/originals/15/1f/93/151f93d662dc158ca2c9bbfed198f556.jpg,/data/event-planning,event-planning
6633,d136f6bc-840d-44f8-bbad-115eb7e6c51e,The Cottage Journal on Instagram: “Can you say color?! 😍😍😍 We are loving the cheery vibes that these aqua blue cabinets are giving. If you could paint your cabinets any…”,"6,636 Likes, 141 Comments - The Cottage Journal (@thecottagejournal) on Instagram: “Can you say color?! 😍😍😍 We are loving the cheery vibes that these aqua blue cabinets are g…",394.0,Sarah Martin,"Diy Kitchen Cabinets,Kitchen Redo,Home Decor Kitchen,New Kitchen,Home Kitchens,Kitchen Remodeling,Aqua Kitchen,Kitchen Counters,Kitchen Islands",image,https://i.pinimg.com/originals/8c/17/a2/8c17a257b70780480bb89c3699363144.jpg,/data/home-decor,home-decor
2539,cd2c667e-da47-4818-8f94-3def20b90864,Free Kids Printable - Build a Gingerbread Person Craft - Christmas Activities for Kids | Mrs. Merry,Make your own gingerbread person with our free Christmas craft. 4 pages of accessories to mix & match! #gingerbreadchristmasdecor #gingerbreadcraftspreschool #gingerbreadcraftfo…,7000.0,"Mrs. Merry | Free Printables for Kids, Holiday Printables & Party","Christmas Projects For Kids,Christmas To Do List,Christmas Decorations For Kids,Christmas Activities For Kids,Preschool Christmas,Free Christmas Printables,Christmas Books,Christmas Themes,Gingerbread Christmas Decor",image,https://i.pinimg.com/originals/ca/59/b1/ca59b1055ca52521b9ebd01799513b8c.jpg,/data/christmas,christmas
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",5000.0,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,/data/christmas,christmas
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",5000.0,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,/data/christmas,christmas
3431,cb2453aa-3075-44a9-b8b1-a0d99cacab64,Handprint Reindeer Craft For Kids [Free Template],"This handprint reindeer is a fun DIY Christmas craft for kids to make. It's easy enough for toddlers, preschool and kindergarten children to make and it comes with a free printa…",267000.0,Easy Kids Crafts & Activities | Preschool & Kindergarten Ideas,"Christmas Arts And Crafts,Christmas Activities For Kids,Christmas Fun,Christmas Crafts For Kids To Make At School,Childrens Christmas Crafts,Homemade Christmas,Toddler Christmas Crafts,Christmas Decorations Diy For Kids,Easy Christmas Crafts For Toddlers",image,https://i.pinimg.com/originals/6d/56/f7/6d56f7f568174b9fd6160f1d96cd3186.jpg,/data/diy-and-crafts,diy-and-crafts
1845,d1ec8a17-5517-42a9-82f3-fc0fb26a87b2,12 Minimalist Christmas Decorations You'll Want to Copy This Year,"You HAVE TO check out these modern minimalist Christmas decorations! They're SO GOOD! I'm so glad I found these understated Christmas decoration ideas, definitely going to use t…",37000.0,"Joyfully Growing Blog | All things DIY + home decor, budget friendly hacks, & blogging tips","Decoration Christmas,Farmhouse Christmas Decor,Decoration Table,Christmas Decorations For Apartment,Apartment Holiday Decor,Christmas Decor For Stairs,Tv Stand Christmas Decor,Home Decorations,Modern Christmas Decor",image,https://i.pinimg.com/originals/84/40/67/84406731f0b32ec71829e6ca9b2dddeb.jpg,/data/christmas,christmas


In [0]:
#Clean the df_geo dataframe
#Create a new column coordinates that contains an array based on the latitude and longitude columns
df_geo = df_geo.withColumn('coordinates', array('latitude', 'longitude'))
#Drop the latitude and longitude columns from the DataFrame
df_geo = df_geo.drop('latitude', 'longitude')
#Convert the timestamp column from a string to a timestamp data type
df_geo = df_geo.withColumn('timestamp', df_geo['timestamp'].cast('timestamp'))
#Reorder the DataFrame columns to have the following column order:
# ind country coordinates timestamp
df_geo = df_geo.select('ind', 'country', 'coordinates', 'timestamp')

display(df_geo)

ind,country,coordinates,timestamp
9455,British Indian Ocean Territory (Chagos Archipelago),"List(-82.9272, -150.346)",2022-03-15T01:46:32.000+0000
6814,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2022-09-02T11:34:28.000+0000
7151,British Indian Ocean Territory (Chagos Archipelago),"List(-14.6744, -75.3714)",2020-06-05T23:37:24.000+0000
8221,British Indian Ocean Territory (Chagos Archipelago),"List(-20.5574, -54.4834)",2021-12-29T06:33:46.000+0000
7569,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2018-10-16T08:40:26.000+0000
5111,British Indian Ocean Territory (Chagos Archipelago),"List(-83.7472, 8.65953)",2021-04-01T00:56:57.000+0000
2989,British Indian Ocean Territory (Chagos Archipelago),"List(-87.013, 133.062)",2020-01-09T19:18:54.000+0000
1624,British Indian Ocean Territory (Chagos Archipelago),"List(9.50751, 119.757)",2020-10-24T11:13:47.000+0000
10073,Antarctica (the territory South of 60 deg S),"List(-32.8885, -170.295)",2021-06-29T19:56:04.000+0000
10073,Antarctica (the territory South of 60 deg S),"List(-32.8885, -170.295)",2021-06-29T19:56:04.000+0000


In [0]:
#Clean the df_user dataframe
#Create a new column user_name that concatenates the information found in the first_name and last_name columns
df_user = df_user.withColumn('user_name', concat(df_user['first_name'], lit(' '), df_user['last_name']))
#Drop the first_name and last_name columns from the DataFrame
df_user = df_user.drop('first_name', 'last_name')
#Convert the date_joined column from a string to a timestamp data type
df_user = df_user.withColumn('date_joined', df_user['date_joined'].cast('timestamp'))
#Reorder the DataFrame columns to have the following column order:
# ind user_name age date_joined
df_user = df_user.select('ind', 'user_name', 'age', 'date_joined')

display(df_user)

ind,user_name,age,date_joined
6353,Christopher Hernandez,42,2017-02-18T00:31:22.000+0000
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
10673,Alexander Cervantes,59,2017-05-12T21:22:17.000+0000
1857,Christopher Hamilton,48,2016-02-27T16:57:44.000+0000
10020,Christopher Hawkins,45,2016-09-15T06:02:53.000+0000
2041,Christopher Campbell,35,2015-10-22T22:42:23.000+0000
7031,Christopher Anderson,48,2016-06-13T17:09:14.000+0000
8075,Benjamin Fitzpatrick,50,2017-02-07T08:09:03.000+0000
2539,Christopher Williams,36,2016-06-01T09:41:09.000+0000
6398,Christina Davenport,39,2016-06-29T20:43:59.000+0000


In [0]:
#Find the most popular Pinterest category people post to based on their country.
#Your query should return a DataFrame that contains the following columns:
# country category category_count, a new column containing the desired query output

#Create a new DataFrame that contains the country and category columns from the 
# country in df_geo DataFrame 
#and the category column from the df_pin DataFrame
#with the ind column as the join key
df_category_country = df_geo.join(df_pin, df_geo.ind == df_pin.ind).select(df_geo.country, df_pin.category)

df_category_country = df_category_country.groupBy('country', 'category').count()
df_category_country = df_category_country.withColumnRenamed('count', 'category_count')
df_category_country = df_category_country.orderBy('category_count', ascending=False)
df_category_country = df_category_country.dropDuplicates(['country'])
df_category_country = df_category_country.select('country', 'category', 'category_count')

display(df_category_country)


country,category,category_count
Afghanistan,education,29
Albania,art,40
Algeria,quotes,53
American Samoa,tattoos,18
Andorra,tattoos,11
Angola,diy-and-crafts,7
Anguilla,diy-and-crafts,9
Antarctica (the territory South of 60 deg S),christmas,11
Antigua and Barbuda,travel,8
Argentina,tattoos,22


In [0]:
#Find how many posts each category had between 2018 and 2022.

#Return a DataFrame that contains the following columns:
# post_year, a new column that contains only the year from the timestamp column in df_geo
# category from the df_pin DataFrame
# category_count, a new column containing the desired query output

#join the two DataFrames on the ind column and select the post_year using the year from the timestamp and category columns
df_category_year = df_geo.join(df_pin, df_geo.ind == df_pin.ind).select(year(df_geo.timestamp).alias('post_year'), df_pin.category)
df_category_year = df_category_year.groupBy('post_year', 'category').count()
df_category_year = df_category_year.withColumnRenamed('count', 'category_count')
df_category_year = df_category_year.orderBy('post_year', ascending=False)
df_category_year = df_category_year.select('post_year', 'category', 'category_count')
df_category_year = df_category_year.filter(df_category_year.post_year >= 2018)
df_category_year = df_category_year.filter(df_category_year.post_year <= 2022)
display(df_category_year)

post_year,category,category_count
2022,vehicles,27
2022,travel,38
2022,beauty,41
2022,finance,43
2022,event-planning,28
2022,tattoos,39
2022,mens-fashion,24
2022,christmas,58
2022,art,41
2022,diy-and-crafts,34


In [0]:
#Find the user with the most followers in each country.
#For each country find the user with the most followers.
#Your query should return a DataFrame that contains the following columns:
#country poster_name follower_count

#country is from df_geo
#poster_name and follower_count are from df_pin
#join the two DataFrames on the ind column and select the country, poster_name, and follower_count columns
df_most_followers_per_country = df_geo.join(df_pin, df_geo.ind == df_pin.ind).select(df_geo.country, df_pin.poster_name, df_pin.follower_count)
df_most_followers_per_country = df_most_followers_per_country.groupBy('country', 'poster_name').max('follower_count')
df_most_followers_per_country = df_most_followers_per_country.withColumnRenamed('max(follower_count)', 'follower_count')
df_most_followers_per_country = df_most_followers_per_country.orderBy('follower_count', ascending=False)
df_most_followers_per_country = df_most_followers_per_country.dropDuplicates(['country'])
df_most_followers_per_country = df_most_followers_per_country.select('country', 'poster_name', 'follower_count')
display(df_most_followers_per_country)

country,poster_name,follower_count
Afghanistan,9GAG,3000000
Albania,The Minds Journal,5000000
Algeria,Apartment Therapy,5000000
American Samoa,Mamas Uncut,8000000
Andorra,Teachers Pay Teachers,1000000
Angola,Tastemade,8000000
Anguilla,We Heart It,15000000
Antarctica (the territory South of 60 deg S),Refinery29,1000000
Antigua and Barbuda,Country Living Magazine,1000000
Argentina,Cheezburger,2000000


In [0]:
#Find the country with the user with most followers.
#Return a DataFrame (with one entry) that contains the following columns:
#country follower_count

df_most_followed = df_most_followers_per_country.groupBy('country').max('follower_count')
df_most_followed = df_most_followed.withColumnRenamed('max(follower_count)', 'follower_count')
df_most_followed = df_most_followed.orderBy('follower_count', ascending=False)

#limit to one entry
df_most_followed = df_most_followed.limit(1)
display(df_most_followed)

country,follower_count
Anguilla,15000000


In [0]:
#Find the most popular category for different age groups.

#The query returns a DataFrame that contains the following columns:
#age_group, a new column based on the original age column in df_user
#category from df_pin
#category_count, a new column containing the desired query output

#The following age groups are used:
# 18-24 25-35 36-50 50+

#Join DataFrames using ind column
df_age_groups_category = df_pin.join(df_user, df_pin.ind == df_user.ind).select(df_user.age, df_pin.category)
#Create a new column age_group that contains the age group based on the age column
df_age_groups_category = df_age_groups_category.withColumn('age_group', when((df_age_groups_category.age >= 18) & (df_age_groups_category.age <= 24), '18-24')
                                                           .when((df_age_groups_category.age >= 25) & (df_age_groups_category.age <= 35), '25-35')
                                                           .when((df_age_groups_category.age >= 36) & (df_age_groups_category.age <= 50), '36-50')
                                                           .when(df_age_groups_category.age > 50, '50+')
                                                           .otherwise('Unknown'))
df_age_groups_category = df_age_groups_category.groupBy('age_group', 'category').count()
df_age_groups_category = df_age_groups_category.withColumnRenamed('count', 'category_count')
df_age_groups_category = df_age_groups_category.orderBy('age_group', ascending=True)
df_age_groups_category = df_age_groups_category.select('age_group', 'category', 'category_count')
display(df_age_groups_category)


age_group,category,category_count
50+,travel,15
50+,event-planning,22
50+,christmas,17
50+,tattoos,5
50+,diy-and-crafts,9
50+,quotes,14
50+,beauty,26
50+,finance,9
50+,mens-fashion,22
50+,education,12


In [0]:
#Find the median follower count for these age groups:
#18-24 25-35 36-50 50+

#This query returns a DataFrame that contains the following columns:
#age_group, a new column based on the original age column from df_user
#median_follower_count, a new column containing the desired query output, follower_count is from df_pin

df_median_followers = df_pin.join(df_user, df_pin.ind == df_user.ind).select(df_user.age, df_pin.follower_count)
df_median_followers = df_median_followers.withColumn('age_group', when((df_median_followers.age >= 18) & (df_median_followers.age <= 24), '18-24')
							     .when((df_median_followers.age >= 25) & (df_median_followers.age <= 35), '25-35')
							     .when((df_median_followers.age >= 36) & (df_median_followers.age <= 50), '36-50')
							     .when(df_median_followers.age > 50, '50+')	
							     .otherwise('Unknown'))

#calculate median follower count
df_median_followers = df_median_followers.groupBy('age_group').agg(expr('percentile(follower_count, 0.5)').alias('median_follower_count'))
df_median_followers = df_median_followers.orderBy('age_group', ascending=True)
display(df_median_followers)

age_group,median_follower_count
18-24,130000.0
25-35,26000.0
36-50,7000.0
50+,877.0


In [0]:
#Find how many users have joined each year.

#This query returns a DataFrame that contains the following columns:
#post_year, a new column that contains only the year from the timestamp column in df_user
#number_users_joined, a new column containing the desired query output

df_users_joined = df_user.select(year(df_user.date_joined).alias('post_year'))
df_users_joined = df_users_joined.groupBy('post_year').count()
df_users_joined = df_users_joined.withColumnRenamed('count', 'number_users_joined')
df_users_joined = df_users_joined.orderBy('post_year', ascending=True)
display(df_users_joined)

post_year,number_users_joined
2015,894
2016,1004
2017,359


In [0]:
#Find median follower count of users based on the year they joined.

#For years joined between 2015 and 2020.
#This query should return a DataFrame that contains the following columns:
#post_year, a new column that contains only the year from the timestamp column
#median_follower_count, a new column containing the desired query output

df_users_joined_median_follower_count = df_user.join(df_pin, df_user.ind == df_pin.ind).select(year(df_user.date_joined).alias('post_year'), df_pin.follower_count)
df_users_joined_median_follower_count = df_users_joined_median_follower_count.groupBy('post_year').agg(expr('percentile(follower_count, 0.5)').alias('median_follower_count'))
df_users_joined_median_follower_count = df_users_joined_median_follower_count.orderBy('post_year', ascending=True)
df_users_joined_median_follower_count = df_users_joined_median_follower_count.filter(df_users_joined_median_follower_count.post_year >= 2015)
df_users_joined_median_follower_count = df_users_joined_median_follower_count.filter(df_users_joined_median_follower_count.post_year <= 2020)
display(df_users_joined_median_follower_count)


post_year,median_follower_count
2015,163000.0
2016,18000.0
2017,4000.0


In [0]:
#Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.

#This query should return a DataFrame that contains the following columns:
#age_group, a new column based on the original age column from df_user
#post_year, a new column that contains only the year from the timestamp column from df_user
#median_follower_count, a new column containing the desired query output, from df_pin

df_med_followers_for_year_age_group = df_user.join(df_pin, df_user.ind == df_pin.ind).select(df_user.age, year(df_user.date_joined).alias('post_year'), df_pin.follower_count)
df_med_followers_for_year_age_group = df_med_followers_for_year_age_group.withColumn('age_group', when((df_med_followers_for_year_age_group.age >= 18) & (df_med_followers_for_year_age_group.age <= 24), '18-24')
														 .when((df_med_followers_for_year_age_group.age >= 25) & (df_med_followers_for_year_age_group.age <= 35), '25-35')
														 .when((df_med_followers_for_year_age_group.age >= 36) & (df_med_followers_for_year_age_group.age <= 50), '36-50')
														 .when(df_med_followers_for_year_age_group.age > 50, '50+')
														 .otherwise('Unknown'))
df_med_followers_for_year_age_group = df_med_followers_for_year_age_group.groupBy('post_year', 'age_group').agg(expr('percentile(follower_count, 0.5)').alias('median_follower_count'))
df_med_followers_for_year_age_group = df_med_followers_for_year_age_group.orderBy('post_year', ascending=True)
df_med_followers_for_year_age_group = df_med_followers_for_year_age_group.filter(df_med_followers_for_year_age_group.post_year >= 2015)
df_med_followers_for_year_age_group = df_med_followers_for_year_age_group.filter(df_med_followers_for_year_age_group.post_year <= 2020)
display(df_med_followers_for_year_age_group)

post_year,age_group,median_follower_count
2015,50+,14000.0
2015,25-35,44000.0
2015,18-24,228000.0
2015,36-50,11000.0
2016,36-50,9500.0
2016,25-35,24000.0
2016,50+,457.0
2016,18-24,37000.0
2017,36-50,3000.0
2017,25-35,6000.0
