In [1]:
from pyspark.sql import SparkSession

# Start a Spark session on the local machine, or reuse the one already
# running in this kernel if the cell is executed again.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

22/06/24 09:13:19 WARN Utils: Your hostname, DESKTOP-EJLBN3A resolves to a loopback address: 127.0.1.1; using 172.20.24.149 instead (on interface eth0)
22/06/24 09:13:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/24 09:13:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [201]:
# Read the CSV in the data directory into a Spark DataFrame.
# Only the header option is set (no schema, no schema inference), so every
# column comes back as a string -- see the printSchema() output below.
spark_df = (
    spark.read.format("csv")
    .options(header="true")
    .load("./data/spotify_artists.csv")
)

### DATA PROFILING

# describe() does not print anything: it returns a new DataFrame holding
# summary statistics (count, mean, stddev, min, max) for the columns.
# Call .show() on the result, otherwise it is silently discarded.
spark_df.describe().show()

# printSchema() is the call that prints the column names and data types.
spark_df.printSchema()

# Peek at the two columns the cleaning step focuses on.
spark_df.select(spark_df.name, spark_df.genres).show(10)

[Stage 454:>                                                        (0 + 1) / 1]

root
 |-- row: string (nullable = true)
 |-- artist_popularity: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name_prev: string (nullable = true)
 |-- type: string (nullable = true)

+--------------------+--------------------+
|                name|              genres|
+--------------------+--------------------+
|       Juliano Cezar|['sertanejo', 'se...|
|      The Grenadines|                  []|
|             Gangway| ['danish pop rock']|
|               FADES|['uk alternative ...|
| Jean-Pierre Guignon|  ['french baroque']|
|              Filhos|                  []|
|                Eloq|                  []|
|              Fravær|                  []|
|       Camille Pépin|                  []|
|Pepe Willberg & T...|['classic finnish...|
+--------------------+--------------------+
only showing top 10 ro

                                                                                

In [210]:
### DATA CLEANING

# Where the genre is an empty list, replace it with ['elevator music'].
# For the columns 'artist_popularity' and 'followers', cast the data type as integers.
# Sort the data in descending order by number of followers.
# 'artist_popularity' is a rank out of 100. Write a user defined function that will divide each popularity value by 100. Rename the column 'popularity_percent'.

from pyspark.sql.functions import regexp_replace, col, udf
from pyspark.sql.types import IntegerType

# Re-read the raw CSV so this cell starts from the untouched data each time
# it is run, instead of re-cleaning an already-cleaned DataFrame.
spark_df = (
    spark.read.format("csv")
    .options(header="true")
    .load("./data/spotify_artists.csv")
)

# Replace empty-list genres with ['elevator music'].
# The pattern is anchored with ^ and $ so that only values that are exactly
# "[]" are rewritten; an unanchored pattern would also rewrite a "[]" that
# happens to appear inside a longer genres string.
spark_df = spark_df.withColumn(
    "genres", regexp_replace("genres", r"^\[\]$", "['elevator music']")
)
spark_df.show(10)

# Cast the numeric columns from string to integer.
# NOTE(review): with Spark's ANSI mode off, values that cannot be parsed as
# integers become null rather than raising -- confirm for your Spark version.
for numeric_col in ("artist_popularity", "followers"):
    spark_df = spark_df.withColumn(numeric_col, col(numeric_col).cast(IntegerType()))
print(spark_df.dtypes)

# Sort by follower count, largest first.
# DataFrame.sort() takes an `ascending` keyword, not `descending`; either
# sort("followers", ascending=False) or Column.desc() (used here) gives a
# descending order.
spark_df = spark_df.sort(col("followers").desc())
spark_df.show(3)

def div_by_100(num):
    """
    Convert a score out of 100 into a fraction (e.g. 44 -> 0.44).

    Parameters
    ----------
    num : int, float or None
        The value to scale. None represents a missing (null) value.

    Returns
    -------
    float or None
        ``num / 100``, or None when ``num`` is None so that missing values
        pass through instead of raising ``TypeError``.
    """
    # Missing values must be propagated: None / 100 would raise TypeError.
    if num is None:
        return None
    return num / 100

# Wrap the plain Python function as a Spark UDF.
# The return type is declared explicitly: without it udf() defaults to
# StringType and the resulting column would hold text such as "0.44" rather
# than a number. The DDL string "double" avoids needing an extra import.
udf_div_by_100 = udf(div_by_100, returnType="double")

# Scale the popularity rank in place, then rename the column. Doing it in two
# steps keeps the column in its original position in the DataFrame.
spark_df = spark_df.withColumn("artist_popularity", udf_div_by_100("artist_popularity"))
spark_df = spark_df.withColumnRenamed("artist_popularity", "popularity_percent")
spark_df.show(3)

+---+-----------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|row|artist_popularity|followers|              genres|                  id|                name|            track_id|track_name_prev|  type|
+---+-----------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|  0|               44|    23230|['sertanejo', 'se...|4mGnpjhqgx4RUdsIJ...|       Juliano Cezar|0wmDmAILuW9e2aRtt...|        track_9|artist|
|  1|               22|      313|  ['elevator music']|1dLnVku4VQUOLswwD...|      The Grenadines|4wqwj0gA8qPZKLl5W...|       track_30|artist|
|  2|               26|     1596| ['danish pop rock']|6YVY310fjfUzKi8hi...|             Gangway|1bFqWDbvHmZe2f4Nf...|       track_38|artist|
|  3|               31|      149|['uk alternative ...|2VElyouiCfoYPDJlu...|               FADES|3MFSUBAidPzRBbIS7...|       track_34|artist|
|  4|        