In [1]:
import sqlite3

import numpy as np
import pandas as pd


# Generate synthetic data
def generate_synthetic_data(
    num_songs=100000, num_artists=2500, db_path="music_data.db", seed=42
):
    """Generate a synthetic music catalogue and persist it to a SQLite database.

    Three tables are (re)created in the database, replacing any existing ones:

    * ``songs`` - one row per song: id, name, artist id, random audio
      features, popularity, genre, duration and the artist's name.
    * ``artists`` - one row per artist: ``artist_id`` and ``artist_name``.
    * ``artist_profiles`` - per-artist feature means as produced by
      ``aggregate_artist_profiles``.

    Args:
        num_songs: Number of song rows to generate.
        num_artists: Number of distinct artists to generate.
        db_path: Path of the SQLite database file to write.
        seed: Value passed to ``np.random.seed`` so repeated runs produce the
            same random draws.

    Returns:
        None. The generated data is only written to the database.
    """
    # Seeds NumPy's global generator; the draw order below is kept fixed so
    # the generated values stay reproducible for a given seed.
    np.random.seed(seed)

    def _artist_label(index):
        """Return a spreadsheet-style label: 0 -> A, 25 -> Z, 26 -> AA, ..."""
        label = ""
        index += 1
        while index > 0:
            index, remainder = divmod(index - 1, 26)
            label = chr(65 + remainder) + label
        return label

    # Generate artist IDs and names. Labels are built from A-Z only, because
    # chr(65 + i) stops being a letter once i >= 26 (punctuation, control
    # characters, combining marks), which makes names unreadable in charts.
    artist_ids = [f"artist_{i}" for i in range(num_artists)]
    artist_names = [f"Artist {_artist_label(i)}" for i in range(num_artists)]

    # Define some genre options for variety
    genres = ["Pop", "Rock", "Jazz", "Hip-Hop", "Classical", "Electronic", "Country"]

    # Assign each song to a random artist and genre
    song_artist_ids = np.random.choice(artist_ids, size=num_songs)
    song_genres = np.random.choice(genres, size=num_songs)

    # Generate synthetic song features
    song_features = {
        "song_id": [f"song_{i}" for i in range(num_songs)],
        "song_name": [f"Song {i}" for i in range(num_songs)],
        "artist_id": song_artist_ids,
        "danceability": np.random.rand(num_songs),
        "energy": np.random.rand(num_songs),
        "acousticness": np.random.rand(num_songs),
        "instrumentalness": np.random.rand(num_songs),
        "liveness": np.random.rand(num_songs),
        "valence": np.random.rand(num_songs),
        "speechiness": np.random.rand(num_songs),
        # randint excludes the upper bound, so popularity is in 0-99.
        "popularity": np.random.randint(0, 100, num_songs),
        "genre": song_genres,
        # Duration between 3 and 5 minutes, in milliseconds (upper bound exclusive).
        "duration_ms": np.random.randint(180000, 300000, num_songs),
    }
    songs_df = pd.DataFrame(song_features)

    # Create artist DataFrame and merge to add artist_name to songs_df
    artists_df = pd.DataFrame({"artist_id": artist_ids, "artist_name": artist_names})
    songs_df = songs_df.merge(artists_df, on="artist_id")
    artist_profiles = aggregate_artist_profiles(songs_df, artists_df)

    conn = sqlite3.connect(db_path)
    try:
        songs_df.to_sql("songs", conn, if_exists="replace", index=False)
        artists_df.to_sql("artists", conn, if_exists="replace", index=False)
        artist_profiles.to_sql(
            "artist_profiles", conn, if_exists="replace", index=False
        )
    finally:
        # Release the database handle even if one of the writes fails.
        conn.close()


def aggregate_artist_profiles(songs_df, artists_df):
    """Build one profile row per artist by averaging that artist's songs.

    Args:
        songs_df: Song-level DataFrame containing ``artist_id`` and every
            numeric feature column listed below.
        artists_df: DataFrame with ``artist_id`` and ``artist_name`` columns.

    Returns:
        DataFrame with columns ``artist_id``, ``artist_name`` followed by the
        mean of each feature. Only artists present in both inputs appear,
        because the names are attached with an inner merge.
    """
    id_columns = ["artist_id", "artist_name"]
    feature_columns = [
        "danceability",
        "energy",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "speechiness",
        "popularity",
        "duration_ms",  # duration is averaged like any other feature
    ]

    # Mean of every feature per artist, with artist_id restored as a column.
    per_artist_means = (
        songs_df.groupby("artist_id")[feature_columns].mean().reset_index()
    )

    # Attach the display name, then put the identifying columns first.
    named_profiles = per_artist_means.merge(artists_df, on="artist_id")
    return named_profiles[id_columns + feature_columns]


In [2]:
# Build the synthetic dataset with the default sizes and write the songs,
# artists and artist_profiles tables to music_data.db. The function returns
# nothing; its output lives in the database.
generate_synthetic_data()

In [6]:
# ---------------------------------------------
# Load Data from SQLite Database
# ---------------------------------------------
def load_data(db_path="music_data.db"):
    """Load the songs, artists and artist_profiles tables from SQLite.

    Args:
        db_path: Path of the SQLite database file to read.

    Returns:
        Tuple ``(songs_df, artists_df, artist_profiles)`` of DataFrames, one
        per table, each holding every column of its table.

    Raises:
        Whatever ``pd.read_sql_query`` raises when a query fails (for example
        when a table is missing); the connection is closed before the error
        propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        songs_df = pd.read_sql_query("SELECT * FROM songs", conn)
        artists_df = pd.read_sql_query("SELECT * FROM artists", conn)
        artist_profiles = pd.read_sql_query("SELECT * FROM artist_profiles", conn)
    finally:
        # Always release the database handle, even when a query fails.
        conn.close()
    return songs_df, artists_df, artist_profiles

In [7]:
# Read the songs, artists and artist_profiles tables back from music_data.db
# as DataFrames.
songs_df, artists_df, artist_profiles = load_data()

In [11]:
import os

import pandas as pd
import plotly.express as px
import streamlit as st


# ---------------------------------------------
# Pre-generate Graphs
# ---------------------------------------------
def generate_and_save_graphs(songs_df, artist_profiles, output_dir="graphs"):
    """Render the Plotly figures and save each one as an HTML file.

    Eleven figures are written to ``output_dir``; the file names match the
    ones ``display_saved_graphs`` looks up.

    Args:
        songs_df: Song-level DataFrame. The charts read the columns
            ``genre``, ``artist_name``, ``song_name``, ``duration_ms``,
            ``danceability``, ``energy``, ``acousticness``, ``valence`` and
            ``popularity``.
        artist_profiles: Artist-level DataFrame. The charts read the columns
            ``artist_name``, ``popularity``, ``danceability``, ``energy``,
            ``acousticness`` and ``valence``.
        output_dir: Directory that receives the HTML files; created if it
            does not exist.

    Returns:
        None. The only effect is the files written to ``output_dir``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    def _save(fig, filename, **marker):
        """Apply the marker styling to every trace and write the figure."""
        fig.update_traces(marker=marker)
        fig.write_html(os.path.join(output_dir, filename))

    # ---- Artist-level charts ------------------------------------------

    # Popularity Distribution
    _save(
        px.histogram(artist_profiles, x="popularity", title="Popularity Distribution"),
        "popularity_distribution.html",
        color="blue",
    )

    # Danceability vs Popularity
    _save(
        px.scatter(
            artist_profiles,
            x="danceability",
            y="popularity",
            title="Danceability vs Popularity",
            color="artist_name",
            hover_name="artist_name",
        ),
        "danceability_vs_popularity.html",
        opacity=0.7,
    )

    # Energy Distribution
    _save(
        px.box(artist_profiles, y="energy", title="Energy Distribution by Artist"),
        "energy_distribution.html",
        color="green",
    )

    # Acousticness Distribution
    _save(
        px.histogram(
            artist_profiles, x="acousticness", title="Acousticness Distribution"
        ),
        "acousticness_distribution.html",
        color="orange",
    )

    # Valence vs Energy
    _save(
        px.scatter(
            artist_profiles,
            x="valence",
            y="energy",
            title="Valence vs Energy",
            color="artist_name",
            hover_name="artist_name",
        ),
        "valence_vs_energy.html",
        opacity=0.7,
    )

    # ---- Song-level charts --------------------------------------------

    # Number of Songs by Genre
    genre_counts = songs_df["genre"].value_counts().reset_index()
    # Overwrite both column names so the chart does not depend on what
    # value_counts().reset_index() happens to call them.
    genre_counts.columns = ["Genre", "Number of Songs"]
    _save(
        px.bar(
            genre_counts,
            x="Genre",
            y="Number of Songs",
            title="Number of Songs by Genre",
        ),
        "number_of_songs_by_genre.html",
        color="purple",
    )

    # Artist Song Count
    artist_counts = songs_df["artist_name"].value_counts().reset_index()
    artist_counts.columns = ["Artist", "Number of Songs"]
    _save(
        px.bar(
            artist_counts,
            x="Artist",
            y="Number of Songs",
            title="Number of Songs by Artist",
        ),
        "number_of_songs_by_artist.html",
        color="red",
    )

    # Duration Distribution
    _save(
        px.histogram(
            songs_df,
            x="duration_ms",
            nbins=30,
            title="Song Duration Distribution",
            labels={"duration_ms": "Duration (ms)"},
        ),
        "duration_distribution.html",
        color="cyan",
    )

    # Danceability vs Energy
    _save(
        px.scatter(
            songs_df,
            x="danceability",
            y="energy",
            color="genre",
            title="Danceability vs Energy by Genre",
            hover_name="artist_name",
        ),
        "danceability_vs_energy.html",
        opacity=0.7,
    )

    # Energy vs Acousticness
    _save(
        px.scatter(
            songs_df,
            x="energy",
            y="acousticness",
            color="artist_name",
            title="Energy vs Acousticness",
            hover_name="song_name",
        ),
        "energy_vs_acousticness.html",
        opacity=0.7,
    )

    # Valence vs Popularity
    _save(
        px.scatter(
            songs_df,
            x="valence",
            y="popularity",
            color="genre",
            title="Valence vs Popularity by Genre",
            hover_name="song_name",
        ),
        "valence_vs_popularity.html",
        opacity=0.7,
    )


def display_saved_graphs():
    """Streamlit section that shows one of the pre-rendered HTML charts.

    Renders a heading and a dropdown of chart titles, then embeds the HTML
    file for the selected title from the ``graphs`` directory. If the file
    is missing, a hint to generate the graphs is written instead.
    """
    st.markdown("## 📊 Data Insights")
    st.write("Explore various attributes of the artist and song data:")

    # Chart title -> saved HTML file. Insertion order is the dropdown order.
    html_by_title = {
        "Popularity Distribution": "popularity_distribution.html",
        "Danceability vs Popularity": "danceability_vs_popularity.html",
        "Energy Distribution": "energy_distribution.html",
        "Acousticness Distribution": "acousticness_distribution.html",
        "Valence vs Energy": "valence_vs_energy.html",
        "Number of Songs by Genre": "number_of_songs_by_genre.html",
        "Number of Songs by Artist": "number_of_songs_by_artist.html",
        "Duration Distribution": "duration_distribution.html",
        "Danceability vs Energy": "danceability_vs_energy.html",
        "Energy vs Acousticness": "energy_vs_acousticness.html",
        "Valence vs Popularity": "valence_vs_popularity.html",
    }

    # Dropdown menu for selecting which chart to show
    chosen_title = st.selectbox(
        "Choose an attribute to visualize", list(html_by_title)
    )

    html_path = os.path.join("graphs", html_by_title[chosen_title])

    # Guard clause: nothing to embed until the graphs have been generated.
    if not os.path.exists(html_path):
        st.write("Graph not found. Please make sure to generate the graphs first.")
        return

    with open(html_path, "r", encoding="utf-8") as handle:
        page_markup = handle.read()
    st.components.v1.html(page_markup, height=600)


In [12]:
# Pre-render every chart to an HTML file under graphs/ so that
# display_saved_graphs can load the files later.
generate_and_save_graphs(songs_df,artist_profiles)