In [4]:
import pandas as pd
import sqlite3

# Daily artist snapshot files; df1 is read from the 20251201 file and df2 from
# the 20251130 file, in that order.
snapshot_paths = (
    "Spotify/spotify_artist_info_20251201.csv",
    "Spotify/spotify_artist_info_20251130.csv",
)
df1, df2 = (pd.read_csv(path) for path in snapshot_paths)


def clean_df(df):
    """Return a cleaned copy of an artist snapshot frame.

    ``followers`` and ``popularity`` are cast to ``int``. Every ``genres``
    cell that holds a Python list is flattened into a comma-separated
    string; cells of any other type are left untouched.

    The caller's frame is not modified, so re-running the calling cell is
    safe, and an empty frame is handled without error.

    Args:
        df: DataFrame with ``followers``, ``popularity`` and ``genres``
            columns.

    Returns:
        A new DataFrame with the normalised columns.

    Raises:
        KeyError: if one of the three columns is missing.
        ValueError: if a numeric column cannot be cast to ``int``
            (for example because it contains NaN).
    """
    cleaned = df.copy()

    # Numeric values
    cleaned['followers'] = cleaned['followers'].astype(int)
    cleaned['popularity'] = cleaned['popularity'].astype(int)

    # Lists: decide per cell instead of peeking at row 0, so mixed columns
    # and empty frames work and plain strings are never split into characters.
    cleaned['genres'] = cleaned['genres'].apply(
        lambda x: ",".join(x) if isinstance(x, list) else x
    )
    return cleaned

# Clean each snapshot and tag every row with that snapshot's date label.
df1 = clean_df(df1).assign(date="2025-12-01")
df2 = clean_df(df2).assign(date="2025-11-30")

# Stack both days into one frame (fresh 0..n-1 index) so the per-day
# comparison can be expressed over a single table.
df_combined = pd.concat([df1, df2], ignore_index=True)


# Open (or create) the SQLite database file used for the analysis.
conn = sqlite3.connect("spotify_artists.db")
cursor = conn.cursor()

# Table definition; only takes effect when no "artists" table exists yet.
create_artists_sql = """
CREATE TABLE IF NOT EXISTS artists (
    artist_id TEXT,
    name TEXT,
    genres TEXT,
    followers INTEGER,
    popularity INTEGER,
    date TEXT
)
"""
cursor.execute(create_artists_sql)
conn.commit()

# Load the combined frame into the table.
# NOTE(review): if_exists="replace" drops any existing "artists" table and
# recreates it from the DataFrame, so the column list declared above does not
# survive this step; the final schema follows df_combined's columns.
df_combined.to_sql(name="artists", con=conn, if_exists="replace", index=False)


# Per day: compute the mean popularity and keep the artists strictly above it.
# Then return the artists that clear that bar on two distinct dates.
query = """
WITH avg_scores AS (
    SELECT date, AVG(popularity) AS avg_pop
    FROM artists
    GROUP BY date
),
above_avg AS (
    SELECT a.*
    FROM artists a
    JOIN avg_scores s ON a.date = s.date
    WHERE a.popularity > s.avg_pop
)
-- Find artists appearing in both days' above-average lists.
-- Count distinct dates rather than rows, so duplicate rows for an artist
-- within one day cannot fake (or hide) a two-day match.
-- Group by artist_id only, so an artist whose display name differs between
-- snapshots is still matched; MAX(name) picks one name deterministically.
SELECT artist_id, MAX(name) AS name
FROM above_avg
GROUP BY artist_id
HAVING COUNT(DISTINCT date) = 2
ORDER BY artist_id;
"""

result = pd.read_sql_query(query, conn)
print("Artists with popularity > daily average on BOTH days:")
print(result)


Artists with popularity > daily average on BOTH days:
                 artist_id                     name
0   0Il5vC8lqunSFNav4mMLxU          Natanzinho Lima
1   0LKAV3zJ8a8AIGnyc5OvfB                      SDM
2   0QntOArZgiNHoemAzwJPu5                Gur Sidhu
3   0V2oXYR7DtrZAEFeILRW2r                 Lvbel C5
4   0Y3agQaa6g2r0YmHPOO9rh                   Davido
5   0h1zs4CTlU9D2QtgPxptUD           Ricardo Arjona
6   0oAZhL6hFrM3YRr6QzjlOf               MJ Records
7   0vLuOi2k62sHujIfplInlK                  Hungria
8   10VBp06W8NIgMW4JruLCC4              Kidd Voodoo
9   17lzZA2AlOHwCwFALHttmp                 2 Chainz
10  1CcZoULzFHa8Uhwo6OlQcp                 Semicenk
11  1GMwSpFzrLd12jUX15bHB6                    BLOK3
12  1T7MiVJ2MJlR5GKi11w4VT              Pawan Singh
13  1anyVhU62p31KFi8MEzkbf        Chance the Rapper
14  1fUSLFr4WUBx7joEcGwpvG           Sorriso Maroto
15  1oH2B8tcHn4Gpl2bmmyd4A           La Santa Grifa
16  1on7ZQ2pvgeQF4vmIA09x5        Charlie Brown Jr.
17  1scVfB

In [3]:
import json

# De-duplicated artist IDs from the query result.
filtered_ids = set(result["artist_id"].tolist())

# Set iteration order for strings can change between interpreter runs (hash
# randomisation), so sort before writing to keep the file reproducible.
# key=str keeps the sort from failing if a non-string ID (e.g. None) appears.
with open("filtered_artist_ids.json", "w") as f:
    json.dump(sorted(filtered_ids, key=str), f, indent=2)

print("Saved filtered artist IDs to filtered_artist_ids.json")

Saved filtered artist IDs to filtered_artist_ids.json
