<a href="https://colab.research.google.com/github/kl3778/APAN5400_Group/blob/Data-Analytics/Recommendation_Score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
# Open Colab's upload dialog and keep the returned mapping in `uploaded`.
# In the recorded run below, the file chosen was APAN5400_Group-data_storage.zip.
uploaded = files.upload()

Saving APAN5400_Group-data_storage.zip to APAN5400_Group-data_storage.zip


In [3]:
import zipfile

zip_path = "APAN5400_Group-data_storage.zip"
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall("data_storage")

!ls data_storage

APAN5400_Group-data_storage  Spotify


In [4]:
from pyspark.sql import SparkSession

# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("pipeline2")
    .getOrCreate()
)

In [6]:
from google.colab import files

# Second upload: this rebinds `uploaded`, so later cells that iterate over it
# see only the files chosen here, not the zip archive from the first upload.
uploaded = files.upload()   # <-- choose BOTH JSON files

Saving spotify_artist_info_20251201.json to spotify_artist_info_20251201.json
Saving spotify_artist_tracks_20251201.json to spotify_artist_tracks_20251201.json


In [8]:
# Sanity check: list the working directory to confirm the uploaded files are present.
!ls -l

total 8292
-rw-r--r-- 1 root root 4407445 Dec  4 20:48 APAN5400_Group-data_storage.zip
drwxr-xr-x 4 root root    4096 Dec  4 20:49 data_storage
drwxr-xr-x 1 root root    4096 Nov 20 14:30 sample_data
-rw-r--r-- 1 root root  233038 Dec  4 20:51 spotify_artist_info_20251201.json
-rw-r--r-- 1 root root 3834685 Dec  4 20:51 spotify_artist_tracks_20251201.json


In [9]:
import os

base_path_p2 = "/content/pipeline2_data"
os.makedirs(base_path_p2, exist_ok=True)

for fn in uploaded.keys():
    os.rename(f"/content/{fn}", f"{base_path_p2}/{fn}")

!ls -l {base_path_p2}

total 3976
-rw-r--r-- 1 root root  233038 Dec  4 20:51 spotify_artist_info_20251201.json
-rw-r--r-- 1 root root 3834685 Dec  4 20:51 spotify_artist_tracks_20251201.json


In [10]:
from pyspark.sql import SparkSession

# Reuse the running Spark session (or create one if none exists yet).
spark = (
    SparkSession.builder
    .appName("pipeline2")
    .getOrCreate()
)

# Date tag embedded in the uploaded JSON file names.
date_str = "20251201"

# Locations of the two inputs inside the pipeline folder.
tracks_path = f"{base_path_p2}/spotify_artist_tracks_{date_str}.json"
info_path = f"{base_path_p2}/spotify_artist_info_{date_str}.json"

# Both files are parsed in multi-line mode (a JSON value may span several lines).
tracks_raw = spark.read.json(tracks_path, multiLine=True)
info_raw = spark.read.json(info_path, multiLine=True)

# Print the inferred schema of each input, tracks first.
for raw_df in (tracks_raw, info_raw):
    raw_df.printSchema()

root
 |-- album_name: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- popularity: long (nullable = true)
 |-- release_date: string (nullable = true)
 |-- spotify_url: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)

root
 |-- artist_id: string (nullable = true)
 |-- followers: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- name: string (nullable = true)
 |-- popularity: long (nullable = true)



In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# NOTE(review): earlier cells already start a session named "pipeline2", and
# getOrCreate() hands back an existing session when one is active, so this app
# name may not actually take effect — confirm whether the rename is intended.
spark = (
    SparkSession.builder
    .appName("pipeline2_song_portfolio")
    .getOrCreate()
)

# Directory holding the uploaded JSON files, and the date tag in their names.
base_path_p2 = "/content/pipeline2_data"
date_str = "20251201"

# Re-read the tracks file in multi-line mode.
tracks_raw = (
    spark.read
    .options(multiLine=True)
    .json(f"{base_path_p2}/spotify_artist_tracks_{date_str}.json")
)

tracks_raw.printSchema()

root
 |-- album_name: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- popularity: long (nullable = true)
 |-- release_date: string (nullable = true)
 |-- spotify_url: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)



In [14]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Columns kept for scoring; `popularity` is cast to int and exposed as
# `track_popularity`.
track_columns = [
    F.col("artist_id"),
    F.col("track_id"),
    F.col("track_name"),
    F.col("popularity").cast("int").alias("track_popularity"),  # 0–100
]

# Rows missing either identifier are discarded.
tracks_clean = (
    tracks_raw
    .select(*track_columns)
    .na.drop(subset=["artist_id", "track_id"])
)

tracks_clean.show(n=5, truncate=False)

+----------------------+----------------------+------------------------------+----------------+
|artist_id             |track_id              |track_name                    |track_popularity|
+----------------------+----------------------+------------------------------+----------------+
|6sq1yF0OZEWA4xoXVKW1L9|4NUt1fcnO9aQAELBgXq3Kr|Me Dediqué a Perderte         |80              |
|6sq1yF0OZEWA4xoXVKW1L9|4tfawx7oL39IUS0GyOlmcP|Como Quien Pierde una Estrella|72              |
|6sq1yF0OZEWA4xoXVKW1L9|5JpOkusDXbxrcPj1Kd0O2Q|Qué Voy a Hacer Con Mi Amor   |73              |
|6sq1yF0OZEWA4xoXVKW1L9|6T2VP9Ez5YMTyAFGllEhL2|Me Está Doliendo              |66              |
|6sq1yF0OZEWA4xoXVKW1L9|1OJkKHmXsZVKtWlECdpDAe|Sé Que Te Duele               |72              |
+----------------------+----------------------+------------------------------+----------------+
only showing top 5 rows



In [15]:
# Mean track popularity per artist: one output row for each artist_id.
mean_popularity = F.avg(F.col("track_popularity")).alias("avg_track_popularity")
artist_pop = tracks_clean.groupBy("artist_id").agg(mean_popularity)

artist_pop.show(n=5, truncate=False)

+----------------------+--------------------+
|artist_id             |avg_track_popularity|
+----------------------+--------------------+
|3MdG05syQeRYPPcClLaUGl|52.8                |
|2zvJLk0gTH7r7A5Q6X5Bq8|58.7                |
|3ppQEG71r7jVpI8RudzycF|65.0                |
|2NrprO4CludRmaWtpqud10|60.7                |
|6YzSM19LzpLH0nVKKU6Jsy|63.1                |
+----------------------+--------------------+
only showing top 5 rows



In [16]:
# Upper bound of `song_portfolio_score`: `pop_norm` in [0, 1] is scaled to
# [0, SONG_PORTFOLIO_WEIGHT].
SONG_PORTFOLIO_WEIGHT = 40.0

# Global min and max of the per-artist averages, computed in a single aggregation
# and collected to the driver (an ungrouped agg always yields exactly one row).
pop_stats = artist_pop.agg(
    F.min("avg_track_popularity").alias("min_p"),
    F.max("avg_track_popularity").alias("max_p")
).collect()[0]

min_p = pop_stats["min_p"]
max_p = pop_stats["max_p"]

# Choose the normalisation expression on the driver, so that a division with a
# zero denominator is never built into the Spark plan.
if min_p is None or max_p is None:
    # No artist has a usable average (empty input or all-null popularity):
    # the score is explicitly null rather than the product of null arithmetic.
    pop_norm = F.lit(None).cast("double")
elif max_p == min_p:
    # All artists share the same average; use the midpoint to avoid divide-by-zero.
    pop_norm = F.lit(0.5)
else:
    # Min-max normalisation of the average popularity to [0, 1].
    pop_norm = (F.col("avg_track_popularity") - F.lit(min_p)) / F.lit(max_p - min_p)

artist_pop_scored = (
    artist_pop
    .withColumn("pop_norm", pop_norm)
    .withColumn("song_portfolio_score", F.col("pop_norm") * SONG_PORTFOLIO_WEIGHT)
)

In [17]:
# Load the artist-info file for the same date tag as the tracks file, so the two
# inputs stay in step when `date_str` changes (the date was hard-coded here before).
info_raw = (
    spark.read
    .option("multiLine", True)
    .json(f"{base_path_p2}/spotify_artist_info_{date_str}.json")
)

In [19]:
from pyspark.sql.functions import col

# Artist lookup: the ID plus `name` exposed as `artist_name`, one row per artist_id.
name_columns = ["artist_id", col("name").alias("artist_name")]
info_clean = info_raw.select(*name_columns).dropDuplicates(["artist_id"])

# Columns of the final score table, in output order.
output_columns = [
    "artist_id",
    "artist_name",
    "avg_track_popularity",
    "song_portfolio_score",
]

# Left join: every scored artist is kept; artists with no matching info row get a
# null artist_name.
scores_side = artist_pop_scored.alias("p")
names_side = info_clean.alias("i")
artist_pop_scored_named = (
    scores_side
    .join(names_side, on="artist_id", how="left")
    .select(*output_columns)
)

# Display the highest-scoring artists first.
(
    artist_pop_scored_named
    .orderBy(col("song_portfolio_score").desc())
    .show(truncate=False)
)

+----------------------+----------------------------------+--------------------+--------------------+
|artist_id             |artist_name                       |avg_track_popularity|song_portfolio_score|
+----------------------+----------------------------------+--------------------+--------------------+
|7ouEqUl1PCVPlNninecdcz|HAVEN.                            |81.0                |40.0                |
|1ebt9HnXdyYA6KgLXr1n4P|CORTIS                            |75.83333333333333   |36.69333333333333   |
|5t5FqBwTcgKTaWmfEbwQY9|ENHYPEN                           |75.6                |36.544              |
|1GMwSpFzrLd12jUX15bHB6|BLOK3                             |75.5                |36.480000000000004  |
|6PH3FLQAxtqYy46Zv08bpV|Ivan Cornejo                      |74.6                |35.903999999999996  |
|2n2RSaZqBuUUukhbLlpnE6|Sleep Token                       |74.5                |35.84               |
|250LLR8V4fFq1XFzAB1ZN5|Poizi                             |74.1                |35

In [21]:
# List the working directory again; the JSON files were moved into pipeline2_data earlier.
!ls -l

total 4320
-rw-r--r-- 1 root root 4407445 Dec  4 20:48 APAN5400_Group-data_storage.zip
drwxr-xr-x 4 root root    4096 Dec  4 20:49 data_storage
drwxr-xr-x 2 root root    4096 Dec  4 20:52 pipeline2_data
drwxr-xr-x 1 root root    4096 Nov 20 14:30 sample_data


In [22]:
# Collect the scored artists to the driver as a pandas DataFrame, then write them
# to a CSV in the working directory without the index column.
scores_pdf = artist_pop_scored_named.toPandas()
scores_pdf.to_csv("Recommendation_Score.csv", index=False)

In [23]:
# Confirm that Recommendation_Score.csv now exists in the working directory.
!ls -l

total 4388
-rw-r--r-- 1 root root 4407445 Dec  4 20:48 APAN5400_Group-data_storage.zip
drwxr-xr-x 4 root root    4096 Dec  4 20:49 data_storage
drwxr-xr-x 2 root root    4096 Dec  4 20:52 pipeline2_data
-rw-r--r-- 1 root root   66454 Dec  4 21:16 Recommendation_Score.csv
drwxr-xr-x 1 root root    4096 Nov 20 14:30 sample_data


In [24]:
from google.colab import files
# Ask the browser to download the CSV produced by the export cell.
files.download("Recommendation_Score.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>