In [3]:
import glob
import json
import re
from datetime import datetime
from pymongo import MongoClient

# Connect to the local MongoDB instance; the per-file records are loaded into
# the `artist_daily_streams` collection of `spotify_db`.
client = MongoClient("mongodb://localhost:27017")
db = client["spotify_db"]
col = db.artist_daily_streams


def _parse_daily(value):
    """Convert a raw "daily" value (e.g. "3,663,193") to an int.

    Accepts strings with thousands separators as well as plain numbers.
    Missing, ``None`` or non-numeric values yield 0.
    """
    text = str(value if value is not None else "").replace(",", "").strip()
    return int(text) if text.isdigit() else 0


files = sorted(glob.glob("kworb/top_10_albums/spotify_artist_top10_albums_*.json"))

# The snapshot date is encoded in the file name as YYYYMMDD.
date_pattern = re.compile(r"spotify_artist_top10_albums_(\d{8})\.json")

for file in files:
    match = date_pattern.search(file)
    if not match:
        # File name carries no date stamp; nothing sensible to store.
        continue

    # Normalise YYYYMMDD -> YYYY-MM-DD (also validates that it is a real date).
    date = datetime.strptime(match.group(1), "%Y%m%d").strftime("%Y-%m-%d")

    # JSON is UTF-8; be explicit so artist names decode the same on every OS.
    with open(file, "r", encoding="utf-8") as f:
        records = json.load(f)

    docs = [
        {
            "artist_name": r["artist_name"],
            "artist_id": r["artist_id"],
            "daily": _parse_daily(r.get("daily")),
            "date": date,
        }
        for r in records
    ]

    # Replace this date's documents instead of appending to them, so that
    # re-running the cell does not duplicate rows (which would inflate any
    # later sum over `daily`).
    col.delete_many({"date": date})
    if docs:  # insert_many rejects an empty list
        col.insert_many(docs)

print("Data inserted successfully.")


Data inserted successfully.


In [6]:
# Aggregation pipeline producing one document per artist:
#   {"artist_name": <name>, "streams": [{"date": ..., "daily_streams": ...}, ...]}
# with `streams` ordered by date and the documents ordered by artist name.
#
# No leading $sort stage is used: $group does not preserve the order of its
# input, so sorting the raw documents first only costs time and memory. The
# ordering is established explicitly by $sortArray and the final $sort.
pipeline = [
    # 1) Total the `daily` values for every (artist, date) pair.
    {
        "$group": {
            "_id": {
                "artist_name": "$artist_name",
                "date": "$date"
            },
            "daily_streams": {"$sum": "$daily"}
        }
    },

    # 2) Collect the per-date totals into one array per artist.
    {
        "$group": {
            "_id": "$_id.artist_name",
            "streams": {
                "$push": {
                    "date": "$_id.date",
                    "daily_streams": "$daily_streams"
                }
            }
        }
    },

    # 3) Reshape: expose the artist name, drop `_id`, and sort each artist's
    #    array by its `date` field in ascending order.
    {
        "$project": {
            "_id": 0,
            "artist_name": "$_id",
            "streams": {
                "$sortArray": {
                    "input": "$streams",
                    "sortBy": {"date": 1}
                }
            }
        }
    },

    # 4) Deterministic output order across artists.
    {"$sort": {"artist_name": 1}}
]


[{'artist_name': '$uicideboy$',
  'streams': [{'date': '2025-11-30', 'daily_streams': 3663193},
   {'date': '2025-12-01', 'daily_streams': 3663193},
   {'date': '2025-12-02', 'daily_streams': 3493246},
   {'date': '2025-12-03', 'daily_streams': 4191579}]},
 {'artist_name': '*NSYNC',
  'streams': [{'date': '2025-11-30', 'daily_streams': 6961795},
   {'date': '2025-12-01', 'daily_streams': 6961795},
   {'date': '2025-12-02', 'daily_streams': 6352348},
   {'date': '2025-12-03', 'daily_streams': 6817970}]},
 {'artist_name': '.Feast',
  'streams': [{'date': '2025-11-30', 'daily_streams': 2757174},
   {'date': '2025-12-01', 'daily_streams': 2757174},
   {'date': '2025-12-02', 'daily_streams': 2757174},
   {'date': '2025-12-03', 'daily_streams': 2767789}]},
 {'artist_name': '070 Shake',
  'streams': [{'date': '2025-11-30', 'daily_streams': 2210947},
   {'date': '2025-12-01', 'daily_streams': 2210947},
   {'date': '2025-12-02', 'daily_streams': 2048476},
   {'date': '2025-12-03', 'daily_stream

In [8]:
# Materialise the aggregation output into its own collection so it can be
# queried without re-running the pipeline.
source_col = db.artist_daily_streams
target_col = db.artist_time_series

results = list(source_col.aggregate(pipeline))

# Only replace the stored results when the aggregation actually produced
# documents: insert_many rejects an empty list, so clearing first would wipe
# the previous results and then fail, leaving the collection empty.
if results:
    target_col.delete_many({})  # clears old results
    target_col.insert_many(results)
    print("Aggregation result stored in 'artist_time_series' collection.")
else:
    print("Aggregation returned no documents; 'artist_time_series' left unchanged.")

Aggregation result stored in 'artist_time_series' collection.
