# Assignment 3

In [1]:
import spotipy
import json

from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from kafka import KafkaProducer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, lit, rand
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

In [2]:
# Maven coordinates of the extra JARs Spark is asked to resolve through
# "spark.jars.packages" when the session is built (the Kafka SQL connector).
packages = set()
packages.add("org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0")

In [3]:
# Load variables from a local .env file into the process environment
# (presumably the Spotify credentials picked up by SpotifyOAuth — confirm).
load_dotenv()

# Build the Spark session step by step; the extra packages are passed as a
# comma-separated list of Maven coordinates.
session_builder = SparkSession.builder.appName('assignment3')
session_builder = session_builder.config("spark.jars.packages", ",".join(packages))
spark = session_builder.getOrCreate()

# Kafka producer used by the later cells to publish Spotify payloads.
producer = KafkaProducer(bootstrap_servers='localhost:9092')

# Keep Spark's own logging down to errors so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/home/hajta2/.local/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/hajta2/.ivy2/cache
The jars for the packages stored in: /home/hajta2/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d6b4fd0f-25f5-405f-beb3-a6508535f7fe;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 760ms :: artifacts dl 37ms
	:: 

In [4]:
# Space-separated OAuth scopes requested from Spotify: read the user's top
# items and modify public/private playlists.
scope = " ".join(
    [
        "user-top-read",
        "playlist-modify-public",
        "playlist-modify-private",
    ]
)

# Authenticated Spotify client used by every API call below.
auth_manager = SpotifyOAuth(scope=scope)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [5]:
# Publish one empty message to each topic before subscribing
# (presumably so the topics exist when the stream starts — confirm).
topic_names = ['tracks_topic', 'audio_features_topic', 'trending_tracks_topic']
for topic_name in topic_names:
    producer.send(topic_name, b'')

# One streaming reader subscribed to all three topics; only messages that
# arrive after the query starts are read ("latest").
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "localhost:9092")
kafka_reader = kafka_reader.option("subscribe", ", ".join(topic_names))
kafka_reader = kafka_reader.option("startingOffsets", "latest")
df = kafka_reader.load()

In [6]:
def _values_for_topic(topic_name):
    """Return the rows of `df` for one Kafka topic, keeping only the payload cast to a string."""
    return df.selectExpr("CAST(value AS STRING)").filter(f"topic = '{topic_name}'")


# Split the combined Kafka stream into one stream per topic.
tracks_stream = _values_for_topic("tracks_topic")
features_stream = _values_for_topic("audio_features_topic")
trending_tracks_stream = _values_for_topic("trending_tracks_topic")

In [7]:
def _start_memory_query(stream, query_name):
    """Start `stream` writing to an in-memory table registered under `query_name`."""
    writer = stream.writeStream.format("memory")
    return writer.queryName(query_name).start()


# Each query fills an in-memory table that is later read with spark.sql.
tracks_query = _start_memory_query(tracks_stream, "tracks")
features_query = _start_memory_query(features_stream, "features")
trending_tracks_query = _start_memory_query(trending_tracks_stream, "trending_tracks")

In [8]:
# Publish the user's top tracks (short-term window) and their audio features.
saved_tracks = sp.current_user_top_tracks(limit=50, offset=0, time_range='short_term')
saved_items = saved_tracks['items']

# Request all audio features in one batched call instead of one HTTP request
# per track; the result list is aligned with the ids that were passed in.
saved_features = sp.audio_features([track['id'] for track in saved_items]) if saved_items else []

for track, track_features in zip(saved_items, saved_features):
    producer.send('tracks_topic', json.dumps(track).encode('utf-8'))
    # Spotify returns None for tracks without audio features; skip those
    # rather than publishing a literal "null" message.
    if track_features is not None:
        producer.send('audio_features_topic', json.dumps(track_features).encode('utf-8'))

# send() is asynchronous: block until every queued message has been delivered
# so the streaming queries can actually see them.
producer.flush()

                                                                                

In [9]:
# Publish the tracks of the first featured playlist for Hungary, plus their
# audio features.
featured_playlists = sp.featured_playlists(limit=1, country='HU', locale='hu_HU')
trending_playlist_id = featured_playlists['playlists']['items'][0]['id']
trending_tracks = sp.playlist_tracks(trending_playlist_id, limit=50, offset=0, market='HU')['items']

# Playlist entries can carry no track (removed/unavailable) or a track without
# an id; neither can be looked up, so they are skipped.
trending_items = [
    item['track']
    for item in trending_tracks
    if item.get('track') and item['track'].get('id')
]

# One batched audio-features request instead of one HTTP request per track;
# the result list is aligned with the ids that were passed in.
trending_features = sp.audio_features([track['id'] for track in trending_items]) if trending_items else []

for track, track_features in zip(trending_items, trending_features):
    producer.send('trending_tracks_topic', json.dumps(track).encode('utf-8'))
    # Spotify returns None for tracks without audio features; skip those
    # rather than publishing a literal "null" message.
    if track_features is not None:
        producer.send('audio_features_topic', json.dumps(track_features).encode('utf-8'))

# send() is asynchronous: block until every queued message has been delivered
# so the streaming queries can actually see them.
producer.flush()

                                                                                

In [10]:
# Batch views over the in-memory tables filled by the streaming queries.
# Note: `trending_tracks` is rebound here to a DataFrame.
tracks = spark.table("tracks")
features = spark.table("features")
trending_tracks = spark.table("trending_tracks")

In [11]:
# Sanity check: how many raw messages landed in each in-memory table.
for label, frame in (
    ("Tracks", tracks),
    ("Features", features),
    ("Trending tracks", trending_tracks),
):
    print(f"{label}: {frame.count()}")

Tracks: 50
Features: 100
Trending tracks: 50


In [12]:
# Shut the streaming queries down now that the data has been collected.
for streaming_query in (tracks_query, features_query, trending_tracks_query):
    # Drain whatever is still available on the source before stopping, so
    # in-flight records are not dropped from the in-memory tables.
    streaming_query.processAllAvailable()
    streaming_query.stop()

In [13]:
# Each raw table holds a single string column with the JSON payload.
for raw_frame in (tracks, features, trending_tracks):
    raw_frame.printSchema()

root
 |-- value: string (nullable = true)

root
 |-- value: string (nullable = true)

root
 |-- value: string (nullable = true)



In [14]:
# Fields extracted from a Spotify track payload. `artists` is declared as a
# string, so the nested JSON is kept as raw text.
tracks_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("name", StringType(), True)
    .add("artists", StringType(), True)
    .add("duration_ms", IntegerType(), True)
    .add("popularity", FloatType(), True)
)

# Parse the JSON payload and flatten the resulting struct into columns.
tracks_parsed = tracks.select(
    from_json(col("value"), tracks_schema).alias("parsed_value")
).select("parsed_value.*")
trending_tracks_parsed = trending_tracks.select(
    from_json(col("value"), tracks_schema).alias("parsed_value")
).select("parsed_value.*")

In [15]:
# Both parsed frames share tracks_schema, so the two printed trees should match.
for parsed_frame in (tracks_parsed, trending_tracks_parsed):
    parsed_frame.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- popularity: float (nullable = true)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- popularity: float (nullable = true)



In [16]:
# Row counts after parsing the track payloads.
for label, frame in (
    ("Tracks", tracks_parsed),
    ("Trending tracks", trending_tracks_parsed),
):
    print(f"{label}: {frame.count()}")

Tracks: 50
Trending tracks: 50


In [17]:
# Preview one parsed row from each frame.
# show() prints the table itself and returns None, so it must not be
# interpolated into an f-string (that would print "Tracks: None").
print("Tracks:")
tracks_parsed.show(1)
print("Trending tracks:")
trending_tracks_parsed.show(1)

+--------------------+--------------------+--------------------+-----------+----------+
|                  id|                name|             artists|duration_ms|popularity|
+--------------------+--------------------+--------------------+-----------+----------+
|4Z4i631BesV0P6LTv...|Talk to Me You'll...|[{"external_urls"...|     417099|      55.0|
+--------------------+--------------------+--------------------+-----------+----------+
only showing top 1 row

Tracks: None
+--------------------+--------+--------------------+-----------+----------+
|                  id|    name|             artists|duration_ms|popularity|
+--------------------+--------+--------------------+-----------+----------+
|7n5JBAnjVBTFgTEsd...|Inkasszó|[{"external_urls"...|     149673|      38.0|
+--------------------+--------+--------------------+-----------+----------+
only showing top 1 row

Trending tracks: None


In [18]:
# Fields extracted from a Spotify audio-features payload.
features_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("danceability", FloatType(), True)
    .add("energy", FloatType(), True)
    .add("key", IntegerType(), True)
    .add("loudness", FloatType(), True)
    .add("mode", IntegerType(), True)
    .add("speechiness", FloatType(), True)
    .add("acousticness", FloatType(), True)
    .add("instrumentalness", FloatType(), True)
    .add("liveness", FloatType(), True)
    .add("valence", FloatType(), True)
    .add("tempo", FloatType(), True)
)

# Parse the JSON payload and flatten the resulting struct into columns.
features_parsed = features.select(
    from_json(col("value"), features_schema).alias("parsed_value")
).select("parsed_value.*")

In [19]:
# Confirm the parsed audio-feature columns and their types.
features_parsed.printSchema()

root
 |-- id: string (nullable = true)
 |-- danceability: float (nullable = true)
 |-- energy: float (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: float (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- acousticness: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- tempo: float (nullable = true)



In [20]:
# Row count after parsing; the parsing step applies no filter, so every raw
# message is still counted here.
print(f"Features: {features_parsed.count()}")

Features: 100


In [21]:
# Preview one parsed audio-features row.
# show() prints the table itself and returns None, so it must not be
# interpolated into an f-string (that would print "Features: None").
print("Features:")
features_parsed.show(1)

+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|                  id|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|4Z4i631BesV0P6LTv...|       0.741| 0.619|  1| -11.366|   0|     0.0514|       0.661|           0.674|   0.125|  0.146|126.008|
+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
only showing top 1 row

Features: None


In [22]:
# Audio features are published once for the top tracks and once for the
# trending tracks, so a track present in both lists appears twice in
# features_parsed. Deduplicate by id so the joins cannot multiply rows.
features_unique = features_parsed.dropDuplicates(["id"])

# Joining on the column name keeps a single `id` column in the result, with
# the track columns first and the audio-feature columns after them.
joined_tracks = tracks_parsed.join(features_unique, on="id", how="inner")
joined_trending_tracks = trending_tracks_parsed.join(features_unique, on="id", how="inner")

In [23]:
# Row counts after attaching audio features to each track list.
for label, frame in (
    ("Joined tracks", joined_tracks),
    ("Joined trending tracks", joined_trending_tracks),
):
    print(f"{label}: {frame.count()}")

Joined tracks: 50
Joined trending tracks: 50


In [24]:
# Add the classification label: 1 for the user's top tracks, 0 for the
# trending tracks.
joined_tracks, joined_trending_tracks = [
    frame.withColumn('favorite', lit(label))
    for frame, label in ((joined_tracks, 1), (joined_trending_tracks, 0))
]

In [25]:
# Stack the labelled favourites and trending tracks into one dataset.
# unionByName matches columns by name, so a change in column order on either
# side cannot silently misalign features (plain union is positional).
combined_tracks = joined_tracks.unionByName(joined_trending_tracks)

In [33]:
# 80/20 train/test split.
# randomSplit already assigns rows at random, so the previous
# orderBy(rand()) pre-shuffle added nothing and made the cell rebind its own
# input; a fixed seed makes the split repeatable for the same input data.
SPLIT_SEED = 42
train, X_test = combined_tracks.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [34]:
# Numeric columns fed to the classifier: two track attributes followed by the
# audio features.
feature_columns = [
    "duration_ms",
    "popularity",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

# Pack the numeric columns into the single vector column used for training.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_train = assembler.transform(train)
assembled_test = assembler.transform(X_test)

In [35]:
# Random forest predicting the `favorite` label from the assembled vector.
# The seed fixes the forest's random sampling so that refitting on the same
# training data gives the same model.
random_forest = RandomForestClassifier(
    featuresCol='features',
    labelCol='favorite',
    numTrees=100,
    maxDepth=5,
    maxBins=32,
    seed=42,
)
model = random_forest.fit(assembled_train)

In [36]:
# Score the held-out split with the fitted model.
predictions = model.transform(assembled_test)