In [2]:
import csv 
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SpotifySongs").getOrCreate()

df = spark.read.csv("/home/ec2-user/stream-processing-template/my-files/phase01/spotify-2023.csv", header=True, inferSchema=True)


                                                                                

Select columns

In [3]:
df.select("track_name", "artist(s)_name", "released_year", "streams").show()

+--------------------+--------------------+-------------+----------+
|          track_name|      artist(s)_name|released_year|   streams|
+--------------------+--------------------+-------------+----------+
|Seven (feat. Latt...|    Latto, Jung Kook|         2023| 141381703|
|                LALA|         Myke Towers|         2023| 133716286|
|             vampire|      Olivia Rodrigo|         2023| 140003974|
|        Cruel Summer|        Taylor Swift|         2019| 800840817|
|      WHERE SHE GOES|           Bad Bunny|         2023| 303236322|
|            Sprinter|   Dave, Central Cee|         2023| 183706234|
|     Ella Baila Sola|Eslabon Armado, P...|         2023| 725980112|
|            Columbia|             Quevedo|         2023|  58149378|
|            fukumean|               Gunna|         2023|  95217315|
|     La Bebe - Remix|Peso Pluma, Yng L...|         2023| 553634067|
|           un x100to|Bad Bunny, Grupo ...|         2023| 505671438|
|           Super Shy|            

Filtering based on a condition

In [4]:
df.filter(df.streams > 140000000).show()

+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+----------+------------------+---------------+-------------------+----------------+----------------+---+----+-----+--------------+---------+--------+--------------+------------------+----------+-------------+
|          track_name|      artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|   streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm| key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|
+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+----------+------------------+---------------+-------------------+----------------+----------------+---+----+-----+--------------+---------+--------+--------------+-------

Grouping based on a certain condition

In [5]:
df.groupBy("released_year").count().show()

+-------------+-----+
|released_year|count|
+-------------+-----+
|         1959|    2|
|         1975|    2|
|         2003|    2|
|         2007|    1|
|         2018|   10|
|         2015|   11|
|         2023|  175|
|         2022|  402|
|         2013|   13|
|         1942|    1|
|         1952|    1|
|         1997|    1|
|         1994|    1|
|         1968|    1|
|         2014|   13|
|         1930|    1|
|         1973|    1|
|         1979|    1|
|         1946|    1|
|         2019|   36|
+-------------+-----+
only showing top 20 rows



Sorting Ascending/Descending

In [6]:
df.orderBy(df.streams.desc()).show()

+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+--------------------+------------------+---------------+-------------------+----------------+----------------+---+----+-----+--------------+---------+--------+--------------+------------------+----------+-------------+
|          track_name|      artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|             streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm| key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|
+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+--------------------+------------------+---------------+-------------------+----------------+----------------+---+----+-----+--------------+---------+-

Adding a new column

In [7]:
df.withColumn("total_charts_presence", df.in_spotify_charts + df.in_apple_charts + df.in_deezer_charts + df.in_shazam_charts).select("track_name", "total_charts_presence").show()

+--------------------+---------------------+
|          track_name|total_charts_presence|
+--------------------+---------------------+
|Seven (feat. Latt...|               1246.0|
|                LALA|                570.0|
|             vampire|               1283.0|
|        Cruel Summer|                867.0|
|      WHERE SHE GOES|                623.0|
|            Sprinter|               1267.0|
|     Ella Baila Sola|                703.0|
|            Columbia|                339.0|
|            fukumean|               1257.0|
|     La Bebe - Remix|                506.0|
|           un x100to|                508.0|
|           Super Shy|                430.0|
|             Flowers|                 null|
|            Daylight|                 null|
|           As It Was|                 null|
|           Kill Bill|                438.0|
|   Cupid - Twin Ver.|                295.0|
|"What Was I Made ...|                 null|
|          Classy 101|                341.0|
|         