In [2]:
import glob
import json
import re
from datetime import datetime
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["spotify_db"]
col = db.artist_daily_streams


files = sorted(glob.glob("kworb/spotify_artist_top10_albums_*.json"))

date_pattern = re.compile(r"spotify_artist_top10_albums_(\d{8})\.json")

for file in files:
    match = date_pattern.search(file)
    
    if match:
        date_str = match.group(1)
        date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    else:
        continue

    with open(file, "r") as f:
        records = json.load(f)

    for r in records:
        daily_raw = r.get("daily", "").replace(",", "")
        daily = int(daily_raw) if daily_raw.isdigit() else 0
        col.insert_one({
            "artist_name": r["artist_name"],
            "artist_id": r["artist_id"],
            "daily": daily,
            "date": date})

print("Data inserted successfully.")


Data inserted successfully.


In [31]:
with open("filtered_artist_ids.json", "r") as f:
    filtered_ids = json.load(f)

In [32]:
pipeline = [

    # 0. Keep only selected artists
    {"$match": {"artist_id": {"$in": filtered_ids}}},

    # 1. Group by artist + date → sum daily streams
    {
        "$group": {
            "_id": {
                "artist_id": "$artist_id",
                "artist_name": "$artist_name",
                "date": "$date"
            },
            "daily_streams": {"$sum": "$daily"}
        }
    },

    # 2. Group by artist_id → build the streams array
    {
        "$group": {
            "_id": "$_id.artist_id",
            "artist_name": {"$first": "$_id.artist_name"},
            "streams": {
                "$push": {
                    "date": "$_id.date",
                    "daily_streams": "$daily_streams"
                }
            }
        }
    },

    # 3. Sort the streams array by date
    {
        "$project": {
            "_id": 0,
            "artist_id": "$_id",
            "artist_name": 1,
            "streams": {
                "$sortArray": {
                    "input": "$streams",
                    "sortBy": { "date": 1 }
                }
            }
        }
    },

    # 4. Sort artists alphabetically by name
    {"$sort": {"artist_name": 1}}
]


In [33]:
source_col = db.artist_daily_streams
target_col = db.artist_time_series   

results = list(source_col.aggregate(pipeline))

target_col.delete_many({})  # clears old results
target_col.insert_many(results)

# print("Aggregation result stored in 'artist_time_series' collection.")

InsertManyResult([ObjectId('69321e4c4de81cf692a3fa26'), ObjectId('69321e4c4de81cf692a3fa27'), ObjectId('69321e4c4de81cf692a3fa28'), ObjectId('69321e4c4de81cf692a3fa29'), ObjectId('69321e4c4de81cf692a3fa2a'), ObjectId('69321e4c4de81cf692a3fa2b'), ObjectId('69321e4c4de81cf692a3fa2c'), ObjectId('69321e4c4de81cf692a3fa2d'), ObjectId('69321e4c4de81cf692a3fa2e'), ObjectId('69321e4c4de81cf692a3fa2f'), ObjectId('69321e4c4de81cf692a3fa30'), ObjectId('69321e4c4de81cf692a3fa31'), ObjectId('69321e4c4de81cf692a3fa32'), ObjectId('69321e4c4de81cf692a3fa33'), ObjectId('69321e4c4de81cf692a3fa34'), ObjectId('69321e4c4de81cf692a3fa35'), ObjectId('69321e4c4de81cf692a3fa36'), ObjectId('69321e4c4de81cf692a3fa37'), ObjectId('69321e4c4de81cf692a3fa38'), ObjectId('69321e4c4de81cf692a3fa39'), ObjectId('69321e4c4de81cf692a3fa3a'), ObjectId('69321e4c4de81cf692a3fa3b'), ObjectId('69321e4c4de81cf692a3fa3c'), ObjectId('69321e4c4de81cf692a3fa3d'), ObjectId('69321e4c4de81cf692a3fa3e'), ObjectId('69321e4c4de81cf692a3fa

In [34]:
results[5]

{'artist_name': 'Chance the Rapper',
 'artist_id': '1anyVhU62p31KFi8MEzkbf',
 'streams': [{'date': '2025-11-30', 'daily_streams': 2307309},
  {'date': '2025-12-01', 'daily_streams': 2307309},
  {'date': '2025-12-02', 'daily_streams': 1406376},
  {'date': '2025-12-03', 'daily_streams': 703188}],
 '_id': ObjectId('69321e4c4de81cf692a3fa2b')}

In [35]:
# Optiional for data files export
import pandas as pd

rows = []

for doc in results:  # results is a list of MongoDB documents
    artist_id = doc["artist_id"]
    artist_name = doc["artist_name"]

    for entry in doc["streams"]:
        rows.append({
            "artist_id": artist_id,
            "artist_name": artist_name,
            "date": entry["date"],
            "daily_streams": entry["daily_streams"]
        })

df = pd.DataFrame(rows)
df.head()


Unnamed: 0,artist_id,artist_name,date,daily_streams
0,17lzZA2AlOHwCwFALHttmp,2 Chainz,2025-11-30,3033624
1,17lzZA2AlOHwCwFALHttmp,2 Chainz,2025-12-01,3033624
2,17lzZA2AlOHwCwFALHttmp,2 Chainz,2025-12-02,2022416
3,17lzZA2AlOHwCwFALHttmp,2 Chainz,2025-12-03,925735
4,6lcwlkAjBPSKnFBZjjZFJs,Alex G,2025-11-30,6667881


In [36]:
df.to_csv("all_artists_streams.csv", index=False)