In [1]:
import hashlib
import pandas as pd
import numpy as np

# Load the two raw track exports (replace with the actual file paths).
# NOTE(review): these are absolute paths on one developer's machine; the cell
# cannot run elsewhere until they are pointed at a shared data directory.
data = pd.read_csv(r"C:\Users\seang\Downloads\Spotifty_songs_CSE6242 1.csv")
keith_data = pd.read_excel(r"C:\Users\seang\Downloads\KD_random_tracks.xlsx")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own 0-based labels, so every label in the shorter
# frame appears twice, which makes label-based alignment ambiguous.
data = pd.concat([data, keith_data], ignore_index=True)

# Rescale the 0-100 popularity scores to 0-1 fractions.
data['popularity'] = data['popularity'] / 100
data['artist_popularity'] = data['artist_popularity'] / 100
data['duration_minutes'] = (data['duration_ms'] / 60000).round(2)

# Collapse continuous audio features into 0/1 flags. Missing values compare
# False against the threshold and therefore become 0.
data["acousticness"] = np.where(data["acousticness"] >= .5, 1, 0)  # convert to binary
data["instrumentalness"] = np.where(data["instrumentalness"] >= .5, 1, 0)  # convert to binary
data["liveness"] = np.where(data["liveness"] >= 0.8, 1, 0)  # stricter cut-off than the two above

# Derive a stable identifier for an artist from the artist's name
def generate_artist_id(name):
    """Return the 32-character hexadecimal MD5 digest of ``str(name)``.

    The same name always maps to the same id, so ids can be recomputed
    independently wherever the artist name is available. Non-string input is
    converted with ``str`` before hashing.
    """
    digest = hashlib.md5()
    digest.update(str(name).encode())
    return digest.hexdigest()

# Step 1: Create separate dataframes for each table and save them as CSV files


def explode_comma_separated(frame, column):
    """Return a copy of ``frame`` with one row per comma-separated item of ``column``.

    Each item is stripped of surrounding whitespace; all other columns are
    repeated unchanged for every item.
    NOTE(review): a value that itself contains a comma is split into several
    items - confirm that is acceptable for the source data.
    """
    exploded = frame.copy()
    exploded[column] = exploded[column].str.split(',')
    exploded = exploded.explode(column)
    exploded[column] = exploded[column].str.strip()
    return exploded


# Albums table: one row per distinct (name, release date, image) combination
ALBUM_KEY_COLUMNS = ['album_name', 'release_date', 'album_image_url']
albums_df = data[ALBUM_KEY_COLUMNS].drop_duplicates().copy()
albums_df.insert(0, 'album_id', range(1, len(albums_df) + 1))
albums_df.to_csv('../src/assets/albums.csv', index=False)

# Tracks table
# Albums are unique on ALBUM_KEY_COLUMNS, not on the name alone, so the join
# must use the full key. Joining on 'album_name' only would emit one track row
# per album sharing that name and attach the wrong album_id to some of them.
tracks_df = (
    data[['track_id', 'track_name', 'popularity', 'duration_ms', 'explicit',
          'track_external_url', 'type', 'id', 'uri', 'track_href', 'analysis_url',
          'time_signature', 'album_name', 'release_date', 'album_image_url']]
    .drop_duplicates(subset="track_id")
    .merge(albums_df, on=ALBUM_KEY_COLUMNS)
    # The extra key columns were only needed for the join; drop them so the
    # exported column set stays the same as before.
    .drop(columns=['release_date', 'album_image_url'])
)
assert tracks_df['track_id'].is_unique, "album join must not duplicate tracks"
tracks_df.to_csv('../src/assets/tracks.csv', index=False)

# Artists table with unique hashed IDs
# NOTE(review): the popularity/followers/image/url columns are copied from the
# source row to every artist listed on it - confirm they are meaningful for
# artists other than the first one on a multi-artist track.
artists_df = explode_comma_separated(
    data[['artists', 'artist_popularity', 'artist_followers',
          'artist_image_url', 'artist_external_url']],
    'artists',
)
# Keep the first row seen for each artist name so every artist appears once
artists_df = artists_df.drop_duplicates(subset="artists")
# The id is a hash of the name, so it can be regenerated from the name alone
artists_df['artist_id'] = artists_df['artists'].apply(generate_artist_id)
artists_df = artists_df.rename(columns={'artists': 'artist_name'})
artists_df.to_csv('../src/assets/artists.csv', index=False)

# Track genres table (one row per track/genre pair)
track_genres_df = explode_comma_separated(
    data[["track_id", "track_genre"]], 'track_genre'
).drop_duplicates()
track_genres_df.to_csv('../src/assets/track_genres.csv', index=False)

# Track_Artists table (many-to-many relationship)
track_artists_df = explode_comma_separated(data[['track_id', 'artists']], 'artists')
# Swap artist names for artist IDs by joining against the artists table
track_artists_df = track_artists_df.merge(artists_df[['artist_id', 'artist_name']],
                                          left_on='artists', right_on='artist_name')
track_artists_df = track_artists_df[['track_id', 'artist_id']].drop_duplicates()
track_artists_df.to_csv('../src/assets/track_artists.csv', index=False)

# Track Features table (one row per track)
track_features_df = data[['track_id', 'danceability', 'energy', 'key', 'loudness', 'mode',
                          'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                          'valence', 'tempo', "duration_minutes"]].drop_duplicates(subset="track_id").copy()
track_features_df.to_csv('../src/assets/track_features.csv', index=False)


In [2]:
import sqlite3
import pandas as pd

# Define the paths to the CSV files exported by the table-building cell
tracks_csv = '../src/assets/tracks.csv'
albums_csv = '../src/assets/albums.csv'
artists_csv = '../src/assets/artists.csv'
track_artists_csv = '../src/assets/track_artists.csv'
track_features_csv = '../src/assets/track_features.csv'
track_genres_csv = '../src/assets/track_genres.csv'


def rebuild_table(connection, table, create_sql, frame, primary_key=None):
    """Drop ``table``, recreate it from ``create_sql`` and load ``frame`` into it.

    The rows are inserted with ``if_exists='append'`` so the declared schema
    (column types, primary and foreign keys) is kept. Using 'replace' instead
    would drop the freshly created table and rebuild it with an inferred
    schema, discarding every constraint in ``create_sql``.

    Args:
        connection: open sqlite3 connection.
        table: name of the table to rebuild.
        create_sql: CREATE TABLE statement; it must declare every column of ``frame``.
        frame: DataFrame holding the rows to insert.
        primary_key: optional column name; rows repeating a value in this
            column are dropped (first one kept) so the PRIMARY KEY constraint
            cannot abort the load.
    """
    if primary_key is not None:
        frame = frame.drop_duplicates(subset=primary_key)
    connection.execute(f"DROP TABLE IF EXISTS {table}")
    connection.execute(create_sql)
    frame.to_sql(table, connection, if_exists='append', index=False)


def most_common(values):
    """Return the most frequent value of a Series (the smallest one on ties)."""
    return values.mode().iloc[0]


# Shared by the artist and genre profiles: continuous features are averaged,
# the binary 0/1 flags take their most common state.
PROFILE_AGGREGATIONS = {
    'danceability': 'mean',
    'energy': 'mean',
    'loudness': 'mean',
    'speechiness': 'mean',
    'acousticness': most_common,
    'instrumentalness': most_common,
    'liveness': most_common,
    'valence': 'mean',
    'tempo': 'mean',
}

# Column list shared by both profile tables (matches PROFILE_AGGREGATIONS)
PROFILE_COLUMNS_SQL = '''
    danceability REAL,
    energy REAL,
    loudness REAL,
    speechiness REAL,
    acousticness INTEGER,
    instrumentalness INTEGER,
    liveness INTEGER,
    valence REAL,
    tempo REAL'''

# Connect to SQLite database (or create it)
conn = sqlite3.connect('../src/assets/music_data.db')
try:
    # Read each CSV once; the feature/link frames are reused for the profiles
    track_features_df = pd.read_csv(track_features_csv)
    track_artists_df = pd.read_csv(track_artists_csv)
    genres_df = pd.read_csv(track_genres_csv)

    # Albums Table
    rebuild_table(conn, 'albums', '''CREATE TABLE albums (
        album_id INTEGER PRIMARY KEY,
        album_name TEXT,
        release_date TEXT,
        album_image_url TEXT
    )''', pd.read_csv(albums_csv), primary_key='album_id')

    # Tracks Table (popularity is stored as a 0-1 fraction, hence REAL)
    rebuild_table(conn, 'tracks', '''CREATE TABLE tracks (
        track_id TEXT PRIMARY KEY,
        track_name TEXT,
        popularity REAL,
        duration_ms INTEGER,
        explicit BOOLEAN,
        track_external_url TEXT,
        type TEXT,
        id TEXT,
        uri TEXT,
        track_href TEXT,
        analysis_url TEXT,
        time_signature INTEGER,
        album_name TEXT,
        album_id INTEGER,
        FOREIGN KEY (album_id) REFERENCES albums (album_id)
    )''', pd.read_csv(tracks_csv), primary_key='track_id')

    # Artists Table (artist_id is the MD5 hex digest of the name, so TEXT)
    rebuild_table(conn, 'artists', '''CREATE TABLE artists (
        artist_id TEXT PRIMARY KEY,
        artist_name TEXT,
        artist_popularity REAL,
        artist_followers INTEGER,
        artist_image_url TEXT,
        artist_external_url TEXT
    )''', pd.read_csv(artists_csv), primary_key='artist_id')

    # Track Genres Table (One-to-Many relationship) with foreign keys
    rebuild_table(conn, 'track_genres', '''CREATE TABLE track_genres (
        track_id TEXT,
        track_genre TEXT,
        FOREIGN KEY (track_id) REFERENCES tracks (track_id)
    )''', genres_df)

    # Track_Artists Table (Many-to-Many relationship) with foreign keys
    rebuild_table(conn, 'track_artists', '''CREATE TABLE track_artists (
        track_id TEXT,
        artist_id TEXT,
        FOREIGN KEY (track_id) REFERENCES tracks (track_id),
        FOREIGN KEY (artist_id) REFERENCES artists (artist_id)
    )''', track_artists_df)

    # Track Features Table ("key" is quoted because KEY is an SQL keyword)
    rebuild_table(conn, 'track_features', '''CREATE TABLE track_features (
        track_id TEXT PRIMARY KEY,
        danceability REAL,
        energy REAL,
        "key" INTEGER,
        loudness REAL,
        mode INTEGER,
        speechiness REAL,
        acousticness INTEGER,
        instrumentalness INTEGER,
        liveness INTEGER,
        valence REAL,
        tempo REAL,
        duration_minutes REAL,
        FOREIGN KEY (track_id) REFERENCES tracks (track_id)
    )''', track_features_df, primary_key='track_id')

    # Artist profiles: aggregate the features of every track an artist appears on
    artist_profiles_df = (
        track_artists_df.merge(track_features_df, on='track_id')
        .groupby('artist_id')
        .agg(PROFILE_AGGREGATIONS)
        .reset_index()
    )
    rebuild_table(conn, 'artist_profiles', f'''CREATE TABLE artist_profiles (
        artist_id TEXT PRIMARY KEY,{PROFILE_COLUMNS_SQL},
        FOREIGN KEY (artist_id) REFERENCES artists (artist_id)
    )''', artist_profiles_df, primary_key='artist_id')

    # Genre profiles: the same aggregation, grouped by genre instead of artist
    genres_profiles_df = (
        genres_df.merge(track_features_df, on='track_id')
        .groupby('track_genre')
        .agg(PROFILE_AGGREGATIONS)
        .reset_index()
    )
    rebuild_table(conn, 'genre_profiles', f'''CREATE TABLE genre_profiles (
        track_genre TEXT PRIMARY KEY,{PROFILE_COLUMNS_SQL}
    )''', genres_profiles_df, primary_key='track_genre')

    # Commit changes
    conn.commit()
finally:
    # Always release the database file, even when a load step fails
    conn.close()

In [18]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import sqlite3

# Open the SQLite database that the graph generation below reads from
conn = sqlite3.connect('../src/assets/music_data.db')


def extract_year(date_str):
    """Extract the year from a release date given in one of several formats.

    Handles a bare four-digit year ("2020" or 2020), anything
    ``pd.to_datetime`` can parse ("2019-05-17", "2019-05"), and, as a last
    resort, any other value whose first four characters are digits.

    Args:
        date_str: the raw date value; may be missing (None / NaN).

    Returns:
        The year as an ``int``, or ``None`` when no year can be determined.
    """
    if pd.isna(date_str):
        return None

    text = str(date_str).strip()
    if len(text) == 4:
        # A four-character value is either a bare year or unusable; check it
        # before calling int() so text such as "abcd" yields None, not an error.
        return int(text) if text.isdecimal() else None

    try:
        parsed = pd.to_datetime(date_str)
    except Exception:
        # Best-effort parsing: any parser failure falls through to the prefix check
        parsed = None
    # to_datetime can return NaT (e.g. for an empty string), which has no usable year
    if parsed is not None and not pd.isna(parsed):
        return int(parsed.year)

    prefix = str(date_str)[:4]
    return int(prefix) if prefix.isdecimal() else None

def get_common_features(artist1_features, artist2_features, threshold=0.1):
    """Describe which averaged audio features two artists have in common.

    Args:
        artist1_features: mapping (dict or Series) holding the ``avg_*`` feature
            values of the first artist.
        artist2_features: the same mapping for the second artist.
        threshold: two values count as "common" when their absolute difference
            is strictly below this number.

    Returns:
        A comma-separated string of display labels for the shared features, or
        "Different styles" when no feature is within the threshold.
    """
    display_labels = {
        'avg_danceability': 'Danceability',
        'avg_energy': 'Energy',
        'avg_acousticness': 'Acoustic',
        'avg_instrumentalness': 'Instrumental',
        'avg_liveness': 'Live',
        'avg_valence': 'Mood'
    }

    shared = [
        label
        for column, label in display_labels.items()
        if abs(artist1_features[column] - artist2_features[column]) < threshold
    ]

    if not shared:
        return "Different styles"
    return ", ".join(shared)

def generate_all_graphs(conn, output_dir="../src/assets/graphs", top_n=25, layout_seed=42):
    """Generate all visualizations for the most popular artists as HTML files.

    Popularity values are read as stored: the export cell already rescales
    them to 0-1 fractions before they reach the database, so they must not be
    divided by 100 again here.

    Args:
        conn: open sqlite3 connection to the music database.
        output_dir: directory that receives one ``<name>.html`` file per
            figure; it is created when missing.
        top_n: number of artists (ranked by artist popularity) to analyse.
        layout_seed: seed for the network layout so repeated runs place the
            nodes identically.

    Raises:
        ValueError: if the database yields no artists to analyse.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    def save_fig(fig, filename, height=700):
        """Apply the shared dark styling and write the figure to ``<filename>.html``."""
        fig.update_layout(
            template="plotly_dark",
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            height=height
        )
        fig.write_html(output_dir / f"{filename}.html")

    # Get top artists with their features.
    # artist_id is a tie-breaker: without it, artists sharing the popularity
    # score at the cut-off are selected in an unspecified order.
    top_artists_query = """
    SELECT 
        ar.artist_id,
        ar.artist_name,
        ar.artist_popularity as artist_popularity,
        ar.artist_followers,
        ar.artist_image_url,
        ap.danceability as avg_danceability,
        ap.energy as avg_energy,
        ap.acousticness as avg_acousticness,
        ap.instrumentalness as avg_instrumentalness,
        ap.liveness as avg_liveness,
        ap.valence as avg_valence,
        COUNT(DISTINCT ta.track_id) as track_count,
        AVG(t.popularity) as avg_track_popularity
    FROM artists ar
    JOIN track_artists ta ON ar.artist_id = ta.artist_id
    JOIN tracks t ON ta.track_id = t.track_id
    JOIN artist_profiles ap ON ar.artist_id = ap.artist_id
    GROUP BY 
        ar.artist_id, ar.artist_name, ar.artist_popularity, 
        ar.artist_followers, ar.artist_image_url, ap.danceability, ap.energy,
        ap.acousticness, ap.instrumentalness, ap.liveness,
        ap.valence
    ORDER BY ar.artist_popularity DESC, ar.artist_id
    LIMIT ?
    """

    top_artists_df = pd.read_sql_query(top_artists_query, conn, params=(int(top_n),))
    if top_artists_df.empty:
        raise ValueError("No artists with tracks and profiles were found in the database")

    # Get tracks data for exactly the artists selected above. Filtering by the
    # ids already fetched (instead of re-ranking the artists table) keeps the
    # artist-level and track-level charts on the same set of artists.
    # The genre table is deliberately not joined: no chart uses the genre and
    # the join would repeat each track once per genre, inflating every count.
    top_artist_ids = top_artists_df['artist_id'].tolist()
    id_placeholders = ", ".join("?" for _ in top_artist_ids)
    tracks_query = f"""
    SELECT 
        t.track_id,
        t.track_name,
        t.popularity as popularity,
        t.duration_ms/60000.0 as duration_minutes,
        t.explicit,
        a.release_date,
        ar.artist_name,
        ar.artist_popularity as artist_popularity,
        tf.danceability,
        tf.energy,
        tf.loudness,
        tf.acousticness,
        tf.instrumentalness,
        tf.liveness,
        tf.valence,
        tf.tempo
    FROM tracks t
    JOIN track_artists ta ON t.track_id = ta.track_id
    JOIN artists ar ON ta.artist_id = ar.artist_id
    JOIN albums a ON t.album_id = a.album_id
    JOIN track_features tf ON t.track_id = tf.track_id
    WHERE ar.artist_id IN ({id_placeholders})
    """
    tracks_df = pd.read_sql_query(tracks_query, conn, params=top_artist_ids)
    # Defensive: a track must count once per artist even if a source table
    # contains repeated rows for it.
    tracks_df = tracks_df.drop_duplicates(subset=['track_id', 'artist_name'])
    tracks_df['release_year'] = tracks_df['release_date'].apply(extract_year)

    # 1. ARTIST SIMILARITY ANALYSIS

    # Calculate similarity matrix
    feature_cols = ['avg_danceability', 'avg_energy', 'avg_acousticness', 
                   'avg_instrumentalness', 'avg_liveness', 'avg_valence']
    artist_names = top_artists_df['artist_name']
    artist_features = top_artists_df[feature_cols]

    features_normalized = StandardScaler().fit_transform(artist_features)
    similarity_matrix = cosine_similarity(features_normalized)

    n_similar = 3               # neighbours listed per artist
    similarity_threshold = 0.7  # minimum cosine similarity for a network edge

    def most_similar(i):
        """Indices of the artists most similar to artist ``i``, least similar first.

        The artist itself is removed explicitly rather than assumed to sort
        last: ties, or an all-zero normalised feature row, can place it elsewhere.
        """
        ranked = [j for j in np.argsort(similarity_matrix[i]) if j != i]
        return ranked[-n_similar:]

    neighbours = [most_similar(i) for i in range(len(similarity_matrix))]

    # Create network graph
    G = nx.Graph()

    # Add nodes with attributes
    for _, artist in top_artists_df.iterrows():
        G.add_node(artist['artist_name'], 
                  popularity=artist['artist_popularity'],
                  followers=artist['artist_followers'])

    # Add edges (similarities)
    for i, similar_indices in enumerate(neighbours):
        for j in similar_indices:
            similarity = similarity_matrix[i][j]
            if similarity > similarity_threshold:
                G.add_edge(artist_names.iloc[i], artist_names.iloc[j], weight=similarity)

    # Calculate layout (seeded so the exported figure is reproducible)
    pos = nx.spring_layout(G, k=1/np.sqrt(len(G.nodes())), iterations=50, seed=layout_seed)

    # Create network visualization
    fig = go.Figure()

    # Add edges first (they should be behind the nodes)
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        # None breaks the line so consecutive edges are not joined together
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    fig.add_trace(go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='rgba(150,150,150,0.5)'),
        hoverinfo='none',
        mode='lines'
    ))

    # Add nodes with images (every selected artist is a node, so iterate the frame)
    for _, artist_data in top_artists_df.iterrows():
        x, y = pos[artist_data['artist_name']]

        # Add artist image; skipped when the database holds no usable URL
        image_url = artist_data['artist_image_url']
        if isinstance(image_url, str) and image_url:
            fig.add_layout_image(
                dict(
                    source=image_url,
                    x=x,
                    y=y,
                    xref="x",
                    yref="y",
                    sizex=0.15,  # Adjust these values to change image size
                    sizey=0.15,
                    sizing="contain",
                    opacity=1,
                    layer="above"
                )
            )

        # Add hover text through an invisible marker at the node position
        fig.add_trace(go.Scatter(
            x=[x],
            y=[y],
            mode='markers',
            marker=dict(
                size=1,
                opacity=0
            ),
            text=artist_data['artist_name'],
            hovertext=(
                f"Artist: {artist_data['artist_name']}<br>"
                f"Popularity: {artist_data['artist_popularity']:.2f}<br>"
                f"Followers: {artist_data['artist_followers']:,}<br>"
                f"Tracks: {artist_data['track_count']}"
            ),
            hoverinfo='text',
            showlegend=False
        ))

        # Add artist name below image
        fig.add_annotation(
            x=x,
            y=y-0.1,  # Adjust this value to position the text
            text=artist_data['artist_name'],
            showarrow=False,
            font=dict(
                color='white',
                size=10
            ),
            xanchor='center',
            yanchor='top'
        )

    # Update layout
    fig.update_layout(
        title="Artist Similarity Network<br>(Images: Artists, Connections: Musical Similarity)",
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-1.5, 1.5]  # Adjust these values to fit all images
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-1.5, 1.5],  # Adjust these values to fit all images
            scaleanchor="x",
            scaleratio=1
        )
    )

    save_fig(fig, "artist_similarity_network")

    # Generate similar artists table (pairs are looked up by position, so
    # artists that happen to share a name cannot be confused)
    similar_artists_table = []
    for i, similar_indices in enumerate(neighbours):
        for j in similar_indices:
            similar_artists_table.append({
                'Artist': artist_names.iloc[i],
                'Similar Artist': artist_names.iloc[j],
                'Similarity Score': similarity_matrix[i][j].round(3),
                'Features in Common': get_common_features(
                    artist_features.iloc[i],
                    artist_features.iloc[j]
                )
            })

    similar_artists_df = pd.DataFrame(
        similar_artists_table,
        columns=['Artist', 'Similar Artist', 'Similarity Score', 'Features in Common']
    )

    fig = go.Figure(data=[go.Table(
        header=dict(
            values=list(similar_artists_df.columns),
            fill_color='darkblue',
            align='left',
            font=dict(color='white', size=12)
        ),
        cells=dict(
            values=[similar_artists_df[col] for col in similar_artists_df.columns],
            fill_color='darkblue',
            align='left',
            font=dict(color='white', size=11)
        )
    )])

    fig.update_layout(title="Top Similar Artists Pairs")
    save_fig(fig, "artist_similarity_table")

    # Similarity Heatmap
    fig = px.imshow(
        similarity_matrix,
        x=top_artists_df['artist_name'],
        y=top_artists_df['artist_name'],
        title="Artist Similarity Heatmap",
        color_continuous_scale="Viridis"
    )
    fig.update_layout(xaxis_tickangle=45)
    save_fig(fig, "similarity_heatmap")

    # Feature Space Clustering
    fig = px.scatter_3d(
        top_artists_df,
        x='avg_danceability',
        y='avg_energy',
        z='avg_valence',
        color='artist_popularity',
        text='artist_name',
        title="Artist Feature Space"
    )
    save_fig(fig, "feature_space_clustering")

    # 2. Top Artists Overview

    # Artist Rankings
    fig = px.bar(
        top_artists_df,
        x='artist_name',
        y=['artist_popularity', 'avg_track_popularity'],
        title=f"Top {top_n} Artists: Popularity Metrics",
        barmode='group',
        labels={
            'artist_popularity': 'Artist Popularity',
            'avg_track_popularity': 'Average Track Popularity'
        }
    )
    fig.update_layout(xaxis_tickangle=45)
    save_fig(fig, "top_artists")

    # Success Metrics
    fig = px.scatter(
        top_artists_df,
        x='artist_popularity',
        y='artist_followers',
        size='track_count',
        text='artist_name',
        title="Artist Success Metrics",
        labels={
            'artist_followers': 'Follower Count',
            'artist_popularity': 'Popularity Score',
            'track_count': 'Number of Tracks'
        }
    )
    fig.update_layout(yaxis_type="log")
    save_fig(fig, "artist_success_metrics")

    # Audio Features Radar
    fig = go.Figure()
    for _, artist in top_artists_df.head(5).iterrows():
        fig.add_trace(go.Scatterpolar(
            r=[artist[col] for col in feature_cols],
            theta=feature_cols,
            fill='toself',
            name=artist['artist_name']
        ))
    fig.update_layout(title="Top 5 Artists Audio Features")
    save_fig(fig, "top_artists_radar")

    # 3. Musical Characteristics

    # Feature Distributions
    fig = px.violin(
        tracks_df.melt(
            id_vars=['artist_name'],
            value_vars=['danceability', 'energy', 'valence', 'acousticness']
        ),
        x='variable',
        y='value',
        title="Audio Feature Distributions",
        box=True
    )
    save_fig(fig, "feature_distributions")

    # Energy-Valence Analysis
    fig = px.scatter(
        tracks_df,
        x='energy',
        y='valence',
        color='artist_name',
        hover_data=['track_name'],
        title="Energy-Valence Distribution"
    )
    # Dashed mid-lines split the plane into four quadrants
    fig.add_hline(y=0.5, line_dash="dash")
    fig.add_vline(x=0.5, line_dash="dash")
    save_fig(fig, "energy_valence_quadrants")

    # Artist Consistency: mean vs. spread of a feature across each artist's tracks
    for feature in ['danceability', 'energy', 'valence']:
        artist_consistency = tracks_df.groupby('artist_name').agg({
            feature: ['mean', 'std'],
            'artist_popularity': 'first'
        }).reset_index()
        # Flatten the two-level column index produced by the dict aggregation
        artist_consistency.columns = ['artist_name', f'avg_{feature}', f'{feature}_std', 'artist_popularity']

        fig = px.scatter(
            artist_consistency,
            x=f'avg_{feature}',
            y=f'{feature}_std',
            text='artist_name',
            title=f"Artist {feature.title()} Consistency",
            color='artist_popularity'
        )
        save_fig(fig, f"artist_consistency_{feature}")

    # 4. Temporal Analysis

    # Release Timeline
    yearly_releases = tracks_df.groupby(['artist_name', 'release_year']).size().reset_index(name='tracks')

    fig = px.line(
        yearly_releases,
        x='release_year',
        y='tracks',
        color='artist_name',
        title="Artist Release Timeline"
    )
    save_fig(fig, "release_timeline")

    # Success Evolution
    yearly_popularity = tracks_df.groupby('release_year').agg({
        'popularity': 'mean',
        'artist_popularity': 'mean'
    }).reset_index()

    fig = px.line(
        yearly_popularity,
        x='release_year',
        y=['popularity', 'artist_popularity'],
        title="Popularity Trends Over Time"
    )
    save_fig(fig, "popularity_trends")

    # 5. Content Analysis

    # Duration Analysis
    fig = px.violin(
        tracks_df,
        x='artist_name',
        y='duration_minutes',
        title="Track Duration Distribution",
        points='all'
    )
    fig.update_layout(xaxis_tickangle=45)
    save_fig(fig, "duration_analysis")

    # Explicit Content
    # reindex guarantees both the 0 and 1 columns exist even when the selected
    # tracks are all clean or all explicit.
    explicit_stats = (
        tracks_df.groupby(['artist_name', 'explicit']).size()
        .unstack(fill_value=0)
        .reindex(columns=[0, 1], fill_value=0)
    )
    explicit_stats['total'] = explicit_stats[0] + explicit_stats[1]
    explicit_stats['explicit_ratio'] = explicit_stats[1] / explicit_stats['total']

    fig = px.bar(
        explicit_stats.reset_index(),
        x='artist_name',
        y='explicit_ratio',
        title="Explicit Content Ratio by Artist"
    )
    fig.update_layout(xaxis_tickangle=45)
    save_fig(fig, "explicit_content")

    # 6. Success Patterns
    # Success Formula: profile of the tracks in the top popularity quartile
    high_success = tracks_df[tracks_df['popularity'] > tracks_df['popularity'].quantile(0.75)]
    success_features = ['danceability', 'energy', 'valence', 'acousticness', 'liveness']

    # Calculate mean and std directly
    success_means = high_success[success_features].mean().round(3)
    success_stds = high_success[success_features].std().round(3)

    # Create success profile table
    fig = go.Figure(data=[
        go.Table(
            header=dict(
                values=['Feature', 'Mean', 'Standard Deviation'],
                fill_color='darkblue',
                align='left',
                font=dict(color='white', size=12)
            ),
            cells=dict(
                values=[
                    success_features,
                    success_means.values,
                    success_stds.values
                ],
                fill_color='darkblue',
                align='left',
                font=dict(color='white', size=11)
            )
        )
    ])

    fig.update_layout(title="Success Formula: Characteristics of Top Tracks")
    # The short table needs less room; pass the height to save_fig, which
    # would otherwise reset it to the shared default.
    save_fig(fig, "success_formula", height=400)

    # Track Popularity Distribution
    fig = px.box(
        tracks_df,
        x='artist_name',
        y='popularity',
        title="Track Popularity Distribution by Artist",
        points='all'
    )
    fig.update_layout(xaxis_tickangle=45)
    save_fig(fig, "track_popularity_dist")

# Render every figure, releasing the database connection even if one fails
try:
    generate_all_graphs(conn)
finally:
    conn.close()


In [4]:
import gzip

In [5]:
import gzip
import shutil

# Source database and the gzip archive written next to it
db_file_path = '../src/assets/music_data.db'
compressed_file_path = '../src/assets/music_data.db.gz'

# Copy the database bytes into the gzip archive
with open(db_file_path, 'rb') as db_file, gzip.open(compressed_file_path, 'wb') as compressed_file:
    shutil.copyfileobj(db_file, compressed_file)

In [6]:
# Reopen the SQLite database.
# NOTE(review): nothing visible after this line uses or closes this connection,
# and this cell relies on `sqlite3` having been imported by an earlier cell.
conn = sqlite3.connect('../src/assets/music_data.db')