In [2]:
import glob
import json
import re
from datetime import datetime
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["spotify_db"]
col = db.artist_daily_streams


files = sorted(glob.glob("kworb/spotify_artist_top10_albums_*.json"))

date_pattern = re.compile(r"spotify_artist_top10_albums_(\d{8})\.json")

for file in files:
    match = date_pattern.search(file)
    
    if match:
        date_str = match.group(1)
        date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    else:
        continue

    with open(file, "r") as f:
        records = json.load(f)

    for r in records:
        daily_raw = r.get("daily", "").replace(",", "")
        daily = int(daily_raw) if daily_raw.isdigit() else 0
        col.insert_one({
            "artist_name": r["artist_name"],
            "artist_id": r["artist_id"],
            "daily": daily,
            "date": date})

print("Data inserted successfully.")


Data inserted successfully.


In [3]:
with open("filtered_artist_ids.json", "r") as f:
    filtered_ids = json.load(f)

In [19]:
pipeline = [

    # ---------------------------------------------
    # 0. Filter only artists from the SQLite results
    # ---------------------------------------------
    {"$match": {"artist_id": {"$in": filtered_ids}}},

    # ---------------------------------------------
    # Original pipeline continues
    # ---------------------------------------------
    {"$sort": {"artist_name": 1, "date": 1}},

    {
        "$group": {
            "_id": {
                "artist_name": "$artist_name",
                "artist_id": "$artist_id",
                "date": "$date"
            },
            "daily_streams": {"$sum": "$daily"}
        }
    },

    {
        "$group": {
            "_id": "$artist_id"     
            },
            "streams": {
                "$push": {
                    "artist_name": "$_id.artist_name",
                    "date": "$_id.date",
                    "daily_streams": "$daily_streams"
                }
            }
        },

    {
        "$project": {
            "_id": 0,
            "artist_id": "$_id",
            "streams": {
                "$sortArray": {
                    "input": "$streams",
                    "sortBy": { "date": 1 }
                }
            }
        }
    },
    {"$sort": {"artist_name": 1}}
]

In [20]:
source_col = db.artist_daily_streams
target_col = db.artist_time_series   

results = list(source_col.aggregate(pipeline))

target_col.delete_many({})  # clears old results
target_col.insert_many(results)

# print("Aggregation result stored in 'artist_time_series' collection.")

OperationFailure: A pipeline stage specification object must contain exactly one field., full error: {'ok': 0.0, 'errmsg': 'A pipeline stage specification object must contain exactly one field.', 'code': 40323, 'codeName': 'Location40323'}

In [6]:
results[5]

{'artist_name': 'Chance the Rapper',
 'streams': [{'date': '2025-11-30', 'daily_streams': 2307309},
  {'date': '2025-12-01', 'daily_streams': 2307309},
  {'date': '2025-12-02', 'daily_streams': 1406376},
  {'date': '2025-12-03', 'daily_streams': 703188}],
 '_id': ObjectId('693212234de81cf692a3f9bb')}

In [15]:
# Optiional for data files export
import pandas as pd

rows = []

for doc in results:  # results is a list of MongoDB documents
    artist = doc["artist_name"]
    
    for entry in doc["streams"]:
        rows.append({
            "artist_name": artist,
            "date": entry["date"],
            "daily_streams": entry["daily_streams"]
        })

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,artist_name,date,daily_streams
0,2 Chainz,2025-11-30,3033624
1,2 Chainz,2025-12-01,3033624
2,2 Chainz,2025-12-02,2022416
3,2 Chainz,2025-12-03,925735
4,Alex G,2025-11-30,6667881


In [16]:
df.to_csv("all_artists_streams.csv", index=False)