In [1]:
from google.colab import files

uploaded = files.upload()

Saving spotify_artist_tracks_20251206.json to spotify_artist_tracks_20251206.json
Saving spotify_artist_info_20251206.json to spotify_artist_info_20251206.json


In [2]:
!ls -l

total 5260
drwxr-xr-x 1 root root    4096 Nov 20 14:30 sample_data
-rw-r--r-- 1 root root  303680 Dec  7 21:20 spotify_artist_info_20251206.json
-rw-r--r-- 1 root root 5072769 Dec  7 21:20 spotify_artist_tracks_20251206.json


In [3]:
import os

base_path_p2 = "/content/pipeline2_data"
os.makedirs(base_path_p2, exist_ok=True)

for fn in uploaded.keys():
    os.rename(f"/content/{fn}", f"{base_path_p2}/{fn}")

!ls -l {base_path_p2}

total 5256
-rw-r--r-- 1 root root  303680 Dec  7 21:20 spotify_artist_info_20251206.json
-rw-r--r-- 1 root root 5072769 Dec  7 21:20 spotify_artist_tracks_20251206.json


In [5]:
print("base_path_p2 =", base_path_p2)
print("tracks path =", f"{base_path_p2}/spotify_artist_tracks_{date_str}.json")
print("info path   =", f"{base_path_p2}/spotify_artist_info_{date_str}.json")

base_path_p2 = /content/pipeline2_data
tracks path = /content/pipeline2_data/spotify_artist_tracks_20251201.json
info path   = /content/pipeline2_data/spotify_artist_info_20251201.json


In [6]:
import os

print(os.listdir(base_path_p2))

['spotify_artist_tracks_20251206.json', 'spotify_artist_info_20251206.json']


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("pipeline2_song_portfolio").getOrCreate()

base_path_p2 = "/content/pipeline2_data"
date_str = "20251206"

tracks_raw = (
    spark.read
         .option("multiLine", True)
         .json(f"{base_path_p2}/spotify_artist_tracks_{date_str}.json")
)

tracks_raw.printSchema()

root
 |-- album_name: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- popularity: long (nullable = true)
 |-- release_date: string (nullable = true)
 |-- spotify_url: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)



In [9]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

tracks_clean = (
    tracks_raw
    .select(
        col("artist_id"),
        col("track_id"),
        col("track_name"),
        col("popularity").cast("int").alias("track_popularity")
    )
    .dropna(subset=["artist_id", "track_id"])
)

tracks_clean.show(5, truncate=False)

+----------------------+----------------------+----------------------------------------------+----------------+
|artist_id             |track_id              |track_name                                    |track_popularity|
+----------------------+----------------------+----------------------------------------------+----------------+
|1Xyo4u8uXC1ZmMpatF05PJ|7CyPwkp0oE8Ro9Dd5CUDjW|One Of The Girls (with JENNIE, Lily Rose Depp)|92              |
|1Xyo4u8uXC1ZmMpatF05PJ|0FIDCNYYjNvPVimz5icugS|Timeless (feat Playboi Carti)                 |91              |
|1Xyo4u8uXC1ZmMpatF05PJ|7MXVkk9YMctZqd1Srtv4MB|Starboy                                       |91              |
|1Xyo4u8uXC1ZmMpatF05PJ|0VjIjW4GlUZAMYd2vXMi3b|Blinding Lights                               |90              |
|1Xyo4u8uXC1ZmMpatF05PJ|2LBqCSwhJGcFQeTHMVGwy3|Die For You                                   |87              |
+----------------------+----------------------+----------------------------------------------+----------

In [10]:
artist_pop = (
    tracks_clean
    .groupBy("artist_id")
    .agg(F.avg("track_popularity").alias("avg_track_popularity"))
)

artist_pop.show(5, truncate=False)

+----------------------+--------------------+
|artist_id             |avg_track_popularity|
+----------------------+--------------------+
|5lpH0xAS4fVfLkACg9DAuM|66.7                |
|3c0gDdb9lhnHGFtP4prQpn|81.7                |
|5nLYd9ST4Cnwy6NHaCxbj8|55.7                |
|3oDbviiivRWhXwIE8hxkVV|68.3                |
|6kACVPfCOnqzgfEF5ryl0x|68.2                |
+----------------------+--------------------+
only showing top 5 rows



In [11]:
pop_stats = artist_pop.agg(
    F.min("avg_track_popularity").alias("min_p"),
    F.max("avg_track_popularity").alias("max_p")
).collect()[0]

min_p = pop_stats["min_p"]
max_p = pop_stats["max_p"]

artist_pop_scored = (
    artist_pop
    .withColumn(
        "pop_norm",
        F.when(F.lit(max_p) == F.lit(min_p), F.lit(0.5))
         .otherwise(
             (F.col("avg_track_popularity") - F.lit(min_p)) /
             (F.lit(max_p) - F.lit(min_p))
         )
    )
    .withColumn("song_portfolio_score", F.col("pop_norm") * 40.0)
)

In [12]:
info_raw = spark.read.option("multiLine", True) \
    .json(f"{base_path_p2}/spotify_artist_info_20251206.json")

In [13]:
from pyspark.sql.functions import col

info_clean = (
    info_raw
    .select(
        "artist_id",
        col("name").alias("artist_name")
    )
    .dropDuplicates(["artist_id"])
)

artist_pop_scored_named = (
    artist_pop_scored.alias("p")
    .join(info_clean.alias("i"), on="artist_id", how="left")
    .select(
        "artist_id",
        "artist_name",
        "avg_track_popularity",
        "song_portfolio_score"
    )
)

artist_pop_scored_named.orderBy("song_portfolio_score", ascending=False).show(truncate=False)

+----------------------+-----------------------+--------------------+--------------------+
|artist_id             |artist_name            |avg_track_popularity|song_portfolio_score|
+----------------------+-----------------------+--------------------+--------------------+
|06HL4z0CvFAxyc27GXpf02|Taylor Swift           |91.2                |40.0                |
|00x1fYSGhdqScXBRpSj3DW|Olivia Dean            |88.3                |38.72105843439912   |
|1Xyo4u8uXC1ZmMpatF05PJ|The Weeknd             |88.1                |38.63285556780595   |
|4q3ewBCX7sLwd24euuV69X|Bad Bunny              |88.0                |38.58875413450937   |
|74KM79TiuVKeVCqs8QtB0B|Sabrina Carpenter      |87.5                |38.368246968026455  |
|6qqNVTkY8uBg9cP3Jd7DAH|Billie Eilish          |87.2                |38.235942668136715  |
|4gzpq5DPGxSnKTe4SA8HAU|Coldplay               |87.2                |38.235942668136715  |
|7mW7Tv7NvywKKXqafZo0Lc|KPop Demon Hunters Cast|86.9                |38.10363836824697   |

In [17]:
artist_pop_scored_named.toPandas().to_csv("UPDATED_Recommendation_Score.csv", index=False)

In [18]:
from google.colab import files
files.download("UPDATED_Recommendation_Score.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>