In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as spark_sum, countDistinct
import time

# Initialize Spark session
spark = SparkSession.builder \
    .appName("spotify-datalake") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "1024M") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")


25/02/07 19:50:04 WARN Utils: Your hostname, cloud1 resolves to a loopback address: 127.0.1.1; using 192.168.121.115 instead (on interface eth0)
25/02/07 19:50:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/usr/local/sdkman/candidates/spark/3.5.1/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/lucaslaredo/.ivy2/cache
The jars for the packages stored in: /home/lucaslaredo/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-36d6557f-8309-4691-9942-e23dec1d195d;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 124ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3  

In [2]:
playlists_v1_path = '/shared/sampled/playlists_v1.json'
playlists_v2_path = '/shared/sampled/playlists_v2.json'
playlists_v3_path = '/shared/sampled/playlists_v3.json'
tracks_v1_path = '/shared/sampled/tracks_v1.json'
tracks_v2_path = '/shared/sampled/tracks_v2.json'
tracks_v3_path = '/shared/sampled/tracks_v3.json'

In [3]:
# considering v2 as bronze layer to avoid disk usage
playlists_v2_df = spark.read.json(playlists_v2_path)
tracks_v2_df = spark.read.json(tracks_v2_path)

                                                                                

# Silver Layer

In [4]:
silver_playlists = spark.read.parquet("./silver/parquet/playlists/")
silver_tracks = spark.read.parquet("./silver/parquet/playlist_tracks")
silver_songs = spark.read.parquet("./silver/parquet/songs/")
silver_artists = spark.read.parquet("./silver/parquet/artists_information/")
silver_album = spark.read.parquet("./silver/parquet/album_information/")

In [5]:
from pyspark.sql.functions import col, coalesce

updated_tracks = silver_tracks.alias("old").join(
    tracks_v2_df.alias("new"),
    (col("old.pid") == col("new.pid")) & (col("old.pos") == col("new.pos")),
    "outer"
).select(
    coalesce(col("new.pid"), col("old.pid")).alias("pid"),
    coalesce(col("new.pos"), col("old.pos")).alias("pos"),
    coalesce(col("new.track_uri"), col("old.track_uri")).alias("track_uri"),
    coalesce(col("new.album_uri"), col("old.album_uri")).alias("album_uri"),
    coalesce(col("new.artist_uri"), col("old.artist_uri")).alias("artist_uri")
)

In [6]:
updated_playlists = silver_playlists.alias("old").join(
    playlists_v2_df.alias("new"),
    silver_playlists.pid == playlists_v2_df.pid,
    "outer"
).select(
    coalesce(col("new.pid"), col("old.pid")).alias("pid"),
    coalesce(col("new.name"), col("old.name")).alias("name"), 
    coalesce(col("new.description"), col("old.description")).alias("description"),
    coalesce(col("new.collaborative"), col("old.collaborative")).alias("collaborative")
)

In [7]:
updated_tracks.write.mode("overwrite").parquet("./silver/parquet/playlist_tracks/")
updated_playlists.write.mode("overwrite").parquet("./silver/parquet/playlists/")

25/02/07 19:50:19 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

# Gold Layer

In [8]:
updated_tracks = spark.read.parquet("./silver/parquet/playlist_tracks/")
updated_playlists = spark.read.parquet("./silver/parquet/playlists/")
silver_songs = spark.read.parquet("./silver/parquet/songs/")
silver_artists = spark.read.parquet("./silver/parquet/artists_information/")
silver_album = spark.read.parquet("./silver/parquet/album_information/")

In [9]:
gold_playlist_info = updated_tracks.alias("spt").join(
    silver_songs.alias("ss"), col("spt.track_uri") == col("ss.track_uri"), "inner"
).select(
    col("ss.duration_ms"),
    col("ss.artist_uri").alias("song_artist_uri"),
    col("ss.album_uri"),
    col("ss.track_uri"),
    col("spt.pid").alias("playlist_id")
).groupBy("playlist_id").agg(
    col("playlist_id").alias("pid"),
    spark_sum("duration_ms").alias("total_duration_ms"),
    countDistinct("track_uri").alias("num_tracks"),
    countDistinct("song_artist_uri").alias("num_artists"),
    countDistinct("album_uri").alias("num_albums")
).join(updated_playlists, "pid", "inner").select(
    col("pid").alias("playlist_id"),
    "total_duration_ms",
    "num_tracks",
    "num_artists",
    "num_albums",
    "name",
    "description"
)

In [10]:
gold_playlist_tracks = updated_tracks.alias("spt").join(
    silver_songs.alias("ss"), col("spt.track_uri") == col("ss.track_uri"), "inner"
).select(
    col("ss.artist_uri"),
    col("ss.album_uri"),
    col("ss.track_name"),
    col("spt.pos"),
    col("spt.pid").alias("playlist_id")
).join(
    silver_artists, "artist_uri", "inner"
).select(
    "playlist_id",
    "pos",
    "track_name",
    "album_uri",
    "artist_name"
).join(
    silver_album, "album_uri", "inner"
).select(
    "playlist_id",
    "pos",
    "track_name",
    "artist_name",
    "album_name"
)

In [11]:
gold_playlist_info.write.mode("overwrite").parquet("./gold/parquet/playlists/")
gold_playlist_tracks.write.mode("overwrite").parquet("./gold/parquet/playlist_tracks/")

25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:50:25 WARN RowBasedKeyValueBatch: Calling spill() on

# Update playlist 11992

In [34]:
silver_playlists = spark.read.parquet("./silver/parquet/playlists/")
updated_tracks = spark.read.parquet("./silver/parquet/playlist_tracks/")
updated_playlists = spark.read.parquet("./silver/parquet/playlists/")
silver_songs = spark.read.parquet("./silver/parquet/songs/")

In [35]:
# fix incorrect data entry for playlist 11992
from pyspark.sql.functions import col

updated_playlists = silver_playlists.alias("old").join(
    playlists_v2_df.alias("new"),
    col("old.pid") == col("new.pid"),
    "outer"
).select(
    coalesce(col("new.pid"), col("old.pid")).alias("pid"),
    coalesce(col("new.name"), col("old.name")).alias("name"), 
    coalesce(col("new.description"), col("old.description")).alias("description"),
    coalesce(col("new.collaborative"), col("old.collaborative")).alias("collaborative")
)

In [None]:
updated_playlists.write.mode("overwrite").parquet("./silver/parquet/playlists/")

                                                                                

In [41]:
updated_playlists = spark.read.parquet("./silver/parquet/playlists/")

In [42]:
gold_playlist_info = updated_tracks.alias("spt").join(
    silver_songs.alias("ss"), col("spt.track_uri") == col("ss.track_uri"), "inner"
).select(
    col("ss.duration_ms"),
    col("ss.artist_uri").alias("song_artist_uri"),
    col("ss.album_uri"),
    col("ss.track_uri"),
    col("spt.pid").alias("playlist_id")
).groupBy("playlist_id").agg(
    col("playlist_id").alias("pid"),
    spark_sum("duration_ms").alias("total_duration_ms"),
    countDistinct("track_uri").alias("num_tracks"),
    countDistinct("song_artist_uri").alias("num_artists"),
    countDistinct("album_uri").alias("num_albums")
).join(updated_playlists, "pid", "inner").select(
    col("pid").alias("playlist_id"),
    "total_duration_ms",
    "num_tracks",
    "num_artists",
    "num_albums",
    "name",
    "description"
)

In [43]:
gold_playlist_tracks = updated_tracks.alias("spt").join(
    silver_songs.alias("ss"), col("spt.track_uri") == col("ss.track_uri"), "inner"
).select(
    col("ss.artist_uri"),
    col("ss.album_uri"),
    col("ss.track_name"),
    col("spt.pos"),
    col("spt.pid").alias("playlist_id")
).join(
    silver_artists, "artist_uri", "inner"
).select(
    "playlist_id",
    "pos",
    "track_name",
    "album_uri",
    "artist_name"
).join(
    silver_album, "album_uri", "inner"
).select(
    "playlist_id",
    "pos",
    "track_name",
    "artist_name",
    "album_name"
)


In [44]:
gold_playlist_info.write.mode("overwrite").parquet("./gold/parquet/playlists/")
gold_playlist_tracks.write.mode("overwrite").parquet("./gold/parquet/playlist_tracks/")

25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 19:54:56 WARN RowBasedKeyValueBatch: Calling spill() on

# Ingest new dataset

In [45]:
# Ingest third sample (v3) and repeat the process
playlists_v3_df = spark.read.json(playlists_v3_path)
tracks_v3_df = spark.read.json(tracks_v3_path)

In [46]:
# ---> Considering bronze layer as v3 to avoid disk usage

# playlists_v3_df.write.mode("overwrite").json("/bronze/playlists_v3/")
# tracks_v3_df.write.mode("overwrite").json("/bronze/tracks_v3/")


In [53]:
silver_playlists = spark.read.parquet("./silver/parquet/playlists/")
updated_tracks = spark.read.parquet("./silver/parquet/playlist_tracks/")
updated_playlists = spark.read.parquet("./silver/parquet/playlists/")
silver_songs = spark.read.parquet("./silver/parquet/songs/")

In [54]:
updated_tracks = updated_tracks.alias("old").join(
    tracks_v3_df.alias("new"),
    (col("old.pid") == col("new.pid")) & (col("old.pos") == col("new.pos")),
    "outer"
).select(
    coalesce(col("new.pid"), col("old.pid")).alias("pid"),
    coalesce(col("new.pos"), col("old.pos")).alias("pos"),
    coalesce(col("new.track_uri"), col("old.track_uri")).alias("track_uri"),
    coalesce(col("new.album_uri"), col("old.album_uri")).alias("album_uri"),
    coalesce(col("new.artist_uri"), col("old.artist_uri")).alias("artist_uri")
)


In [55]:
updated_playlists = updated_playlists.alias("old").join(
    playlists_v3_df.alias("new"),
    col("old.pid") == col("new.pid"),
    "outer"
).select(
    coalesce(col("new.pid"), col("old.pid")).alias("pid"),
    coalesce(col("new.name"), col("old.name")).alias("name"),
    coalesce(col("new.description"), col("old.description")).alias("description"),
    coalesce(col("new.collaborative"), col("old.collaborative")).alias("collaborative")
)

In [57]:
updated_tracks.write.mode("overwrite").parquet("./silver/parquet/playlist_tracks/")
updated_playlists.write.mode("overwrite").parquet("./silver/parquet/playlists/")

25/02/07 20:03:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [61]:
updated_tracks = spark.read.parquet("./silver/parquet/playlist_tracks/")
updated_playlists = spark.read.parquet("./silver/parquet/playlists/")

In [62]:
gold_playlist_info = updated_tracks.alias("spt").join(
    silver_songs.alias("ss"), col("spt.track_uri") == col("ss.track_uri"), "inner"
).select(
    col("ss.duration_ms"),
    col("ss.artist_uri").alias("song_artist_uri"),
    col("ss.album_uri"),
    col("ss.track_uri"),
    col("spt.pid").alias("playlist_id")
).groupBy("playlist_id").agg(
    col("playlist_id").alias("pid"),
    spark_sum("duration_ms").alias("total_duration_ms"),
    countDistinct("track_uri").alias("num_tracks"),
    countDistinct("song_artist_uri").alias("num_artists"),
    countDistinct("album_uri").alias("num_albums")
).join(updated_playlists, "pid", "inner").select(
    col("pid").alias("playlist_id"),
    "total_duration_ms",
    "num_tracks",
    "num_artists",
    "num_albums",
    "name",
    "description"
)

In [63]:
gold_playlist_tracks = updated_tracks.alias("spt").join(
    silver_songs.alias("ss"), col("spt.track_uri") == col("ss.track_uri"), "inner"
).select(
    col("ss.artist_uri"),
    col("ss.album_uri"),
    col("ss.track_name"),
    col("spt.pos"),
    col("spt.pid").alias("playlist_id")
).join(
    silver_artists, "artist_uri", "inner"
).select(
    "playlist_id",
    "pos",
    "track_name",
    "album_uri",
    "artist_name"
).join(
    silver_album, "album_uri", "inner"
).select(
    "playlist_id",
    "pos",
    "track_name",
    "artist_name",
    "album_name"
)

In [64]:
gold_playlist_info.write.mode("overwrite").parquet("./gold/parquet/playlists/")
gold_playlist_tracks.write.mode("overwrite").parquet("./gold/parquet/playlist_tracks/")

25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/07 20:03:58 WARN RowBasedKeyValueBatch: Calling spill() on

A adoção do Parquet em Data Lakes traz desafios como a evolução do esquema, que exige gerenciamento rigoroso para adaptar mudanças na estrutura de dados sem comprometer a compatibilidade. Além disso, o formato não suporta nativamente operações de atualização/exclusão eficientes, obrigando a regravação de arquivos inteiros, o que é custoso em grandes volumes. Particionamento inadequado pode gerar problemas de desempenho, e a concorrência em leitura/gravação exige mecanismos extras para garantir consistência.

Outros obstáculos incluem a complexidade no gerenciamento de metadados (essencial para consultas eficientes) e a escolha de estratégias de compressão balanceadas entre armazenamento e desempenho. A integração com ferramentas de análise também pode demandar ajustes. Apesar dessas limitações, o Parquet permanece vantajoso para cenários de leitura, desde que seus desafios sejam mitigados com planejamento e boas práticas de arquitetura.