In [None]:
import glob
import json
import re
from datetime import datetime
from pymongo import MongoClient

# Connection to the local MongoDB instance. `col` receives one document per
# record found in the kworb JSON snapshot files.
client = MongoClient("mongodb://localhost:27017")
db = client["spotify_db"]
col = db.artist_daily_streams


# Snapshot files carry their date in the file name as eight digits (YYYYMMDD).
files = sorted(glob.glob("kworb/spotify_artist_top10_albums_*.json"))

date_pattern = re.compile(r"spotify_artist_top10_albums_(\d{8})\.json")


def parse_daily(value):
    """Convert a raw ``daily`` value such as ``"1,234,567"`` to an int.

    Commas are stripped before parsing. A missing value, or anything that is
    not made up purely of decimal digits after stripping, yields 0.
    """
    daily_raw = str(value if value is not None else "").replace(",", "")
    # isdecimal() matches exactly what int() can parse; isdigit() also accepts
    # characters such as superscript digits, which make int() raise.
    return int(daily_raw) if daily_raw.isdecimal() else 0


# NOTE: documents are inserted without an explicit _id and nothing is removed
# first, so re-running this cell appends the same documents again.
for file in files:
    match = date_pattern.search(file)
    if not match:
        # File name does not contain a YYYYMMDD date; skip it.
        continue

    date_str = match.group(1)
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")

    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which can mis-decode non-ASCII text on some systems.
    with open(file, "r", encoding="utf-8") as f:
        records = json.load(f)

    docs = [
        {
            "artist_name": r["artist_name"],
            "artist_id": r["artist_id"],
            "daily": parse_daily(r.get("daily")),
            "date": date,
        }
        for r in records
    ]

    # One bulk insert per file instead of one round trip per record.
    # insert_many rejects an empty list, so skip files without records.
    if docs:
        col.insert_many(docs)

print("Data inserted successfully.")


KeyboardInterrupt: 

In [None]:
# Artist IDs used to restrict the aggregation pipeline. Read as UTF-8
# explicitly rather than relying on the platform default encoding.
with open("filtered_artist_ids.json", "r", encoding="utf-8") as f:
    filtered_ids = json.load(f)

# The pipeline passes this value to `$in`, which only accepts an array, so
# fail here with a clear message instead of inside the server-side aggregation.
if not isinstance(filtered_ids, list):
    raise TypeError(
        "filtered_artist_ids.json must contain a JSON array of artist IDs, "
        f"got {type(filtered_ids).__name__}"
    )

In [1]:
# Aggregation pipeline producing one document per artist:
#   {"artist_name": <name>, "streams": [{"date": ..., "daily_streams": ...}, ...]}
# with `streams` ordered by date and the documents ordered by artist name.
#
# Requires `filtered_ids` to be defined already (run the cell that loads
# filtered_artist_ids.json first).
#
# NOTE(review): the filter is on artist_id but the grouping is on artist_name,
# so two distinct artists sharing a name would be merged into one series.
# Confirm whether that can happen in this data set.
pipeline = [

    # ---------------------------------------------
    # 0. Filter only artists from the SQLite results
    # ---------------------------------------------
    {"$match": {"artist_id": {"$in": filtered_ids}}},

    # ---------------------------------------------
    # 1. Total streams per artist per day
    # ---------------------------------------------
    # No $sort ahead of this stage: $group does not preserve input order, and
    # the final ordering is established by $sortArray and the last $sort.
    {
        "$group": {
            "_id": {
                "artist_name": "$artist_name",
                "date": "$date"
            },
            "daily_streams": {"$sum": "$daily"}
        }
    },

    # ---------------------------------------------
    # 2. Collect the per-day totals into one array per artist
    # ---------------------------------------------
    {
        "$group": {
            "_id": "$_id.artist_name",
            "streams": {
                "$push": {
                    "date": "$_id.date",
                    "daily_streams": "$daily_streams"
                }
            }
        }
    },

    # ---------------------------------------------
    # 3. Reshape and order each artist's series by date
    # ---------------------------------------------
    # $sortArray is only available on newer MongoDB servers (5.2+).
    {
        "$project": {
            "_id": 0,
            "artist_name": "$_id",
            "streams": {
                "$sortArray": {
                    "input": "$streams",
                    "sortBy": { "date": 1 }
                }
            }
        }
    },

    # ---------------------------------------------
    # 4. Deterministic output order
    # ---------------------------------------------
    {"$sort": {"artist_name": 1}}
]

NameError: name 'filtered_ids' is not defined

In [None]:
# Run the aggregation over the raw daily documents and replace the contents
# of the `artist_time_series` collection with the result.
source_col = db.artist_daily_streams
target_col = db.artist_time_series   

results = list(source_col.aggregate(pipeline))

# insert_many rejects an empty list, so check before deleting anything:
# otherwise an empty aggregation would wipe the previous results and then fail.
if not results:
    raise RuntimeError(
        "Aggregation returned no documents; "
        "'artist_time_series' was left unchanged."
    )

target_col.delete_many({})  # clears old results
target_col.insert_many(results)

# print("Aggregation result stored in 'artist_time_series' collection.")

In [None]:
# Optional: export the aggregated collection to a JSON data file
# mongoexport \
#   --uri="mongodb://localhost:27017" \
#   --db spotify_db \
#   --collection artist_time_series \
#   --out target_col.json