In [2]:
import hashlib
import pandas as pd

# Read the raw Spotify song export into memory; every table below is derived from it.
# NOTE(review): absolute, machine-specific path — replace with the actual file path
# (ideally one relative to the project) before running elsewhere.
raw_songs_csv = r'C:\Users\seang\OneDrive\cse 6242\project\CSE6242_Team148\notebooks\Spotifty_songs_CSE6242.csv'
data = pd.read_csv(raw_songs_csv)

# Derive a deterministic ID for an artist from the text of its name
def generate_artist_id(name):
    """Return the MD5 hex digest of ``str(name)``.

    The argument is coerced with ``str()`` before hashing, so non-string
    input is hashed by its string representation. The same name always
    produces the same 32-character lowercase hexadecimal string.
    """
    name_bytes = str(name).encode()
    hasher = hashlib.md5()
    hasher.update(name_bytes)
    return hasher.hexdigest()

# Step 1: Create separate dataframes for each table and save them as CSV files

# Columns that together identify one album row. Albums are de-duplicated on this
# triple, so tracks must be joined back on the same triple (not on the name alone)
# or albums that share a name would duplicate track rows and mix up album_ids.
album_key_columns = ['album_name', 'release_date', 'album_image_url']

# Albums table: one row per distinct key triple, numbered 1..N in order of appearance
albums_df = data[album_key_columns].drop_duplicates().copy()
albums_df.insert(0, 'album_id', range(1, len(albums_df) + 1))
albums_df.to_csv('albums.csv', index=False)

# Tracks table: one row per track_id (first occurrence wins), tagged with album_id
track_columns = ['track_id', 'track_name', 'popularity', 'duration_ms', 'explicit',
                 'track_external_url', 'type', 'id', 'uri', 'track_href', 'analysis_url',
                 'time_signature', 'album_name']
tracks_df = (
    data[track_columns + ['release_date', 'album_image_url']]
    .drop_duplicates(subset='track_id')
    # Left join on the full album key: every track keeps exactly one row.
    # validate= makes pandas raise if the album key were ever not unique.
    .merge(albums_df, on=album_key_columns, how='left', validate='many_to_one')
    # These two columns were only needed to find the album; albums.csv owns them.
    .drop(columns=['release_date', 'album_image_url'])
)
tracks_df.to_csv('tracks.csv', index=False)

# Artists table with hashed IDs
# NOTE(review): the 'artists' field is split on every comma, so an artist whose own
# name contains a comma would be split into several artists — confirm against the data.
# NOTE(review): artist_popularity / artist_followers / image / url are copied from the
# source row to every name split out of that row; for multi-artist rows they presumably
# describe only one of the artists — verify before relying on them.
artist_columns = ['artists', 'artist_popularity', 'artist_followers',
                  'artist_image_url', 'artist_external_url']
artists_df = (
    data[artist_columns]
    .assign(artists=data['artists'].str.split(','))
    .explode('artists')
    .reset_index(drop=True)
)
artists_df['artists'] = artists_df['artists'].str.strip()  # Remove extra spaces

# Discard missing/blank names: generate_artist_id() hashes str(name), so a missing
# value would otherwise become a bogus artist whose ID is the hash of 'nan'.
has_artist_name = artists_df['artists'].notna() & artists_df['artists'].ne('')
# Keep the first row seen for each artist name so each artist has only one row
artists_df = artists_df[has_artist_name].drop_duplicates(subset='artists').copy()
# The ID is a deterministic hash of the name, so it is stable across runs
artists_df['artist_id'] = artists_df['artists'].apply(generate_artist_id)
artists_df = artists_df.rename(columns={'artists': 'artist_name'})
artists_df.to_csv('artists.csv', index=False)

# Track_Artists table (many-to-many relationship)
track_artists_df = (
    data[['track_id', 'artists']]
    .assign(artists=data['artists'].str.split(','))
    .explode('artists')
    .reset_index(drop=True)
)
track_artists_df['artists'] = track_artists_df['artists'].str.strip()

# Replace artist names with artist IDs. The inner join drops rows whose artist name
# is missing or blank, because artists_df no longer contains such names.
track_artists_df = (
    track_artists_df
    .merge(artists_df[['artist_id', 'artist_name']],
           left_on='artists', right_on='artist_name')
    [['track_id', 'artist_id']]
    .drop_duplicates()
)
track_artists_df.to_csv('track_artists.csv', index=False)

# Track Features table: one row of audio features per track_id (first occurrence wins)
track_features_df = data[['track_id', 'danceability', 'energy', 'key', 'loudness', 'mode',
                          'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                          'valence', 'tempo']].drop_duplicates(subset="track_id").copy()
track_features_df.to_csv('track_features.csv', index=False)


In [None]:
import sqlite3
import pandas as pd

# Define the paths to your CSV files
tracks_csv = 'tracks.csv'
albums_csv = 'albums.csv'
artists_csv = 'artists.csv'
track_artists_csv = 'track_artists.csv'
track_features_csv = 'track_features.csv'

# Columns averaged per artist to build artist_profiles. 'key' and 'mode' are not
# aggregated, so the artist_profiles table has no such columns.
profile_columns = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo', 'popularity']

# Table definitions. Each table is dropped and recreated from its definition and the
# rows are then *appended*: to_sql(if_exists='replace') would drop the table and let
# pandas recreate it without the primary/foreign keys declared here.
# artist_id is TEXT because it is an MD5 hex string; album_id is an integer counter.
# NOTE: SQLite only enforces FOREIGN KEY clauses when PRAGMA foreign_keys=ON; it is
# left off here, so the clauses document the relationships without blocking the load.
table_schemas = {
    'albums': '''CREATE TABLE albums (
    album_id INTEGER PRIMARY KEY,
    album_name TEXT,
    release_date TEXT,
    album_image_url TEXT
)''',
    'tracks': '''CREATE TABLE tracks (
    track_id TEXT PRIMARY KEY,
    track_name TEXT,
    popularity INTEGER,
    duration_ms INTEGER,
    explicit BOOLEAN,
    track_external_url TEXT,
    type TEXT,
    id TEXT,
    uri TEXT,
    track_href TEXT,
    analysis_url TEXT,
    time_signature INTEGER,
    album_id INTEGER,
    FOREIGN KEY (album_id) REFERENCES albums (album_id)
)''',
    'artists': '''CREATE TABLE artists (
    artist_id TEXT PRIMARY KEY,
    artist_name TEXT,
    artist_popularity INTEGER,
    artist_followers INTEGER,
    artist_image_url TEXT,
    artist_external_url TEXT
)''',
    'track_artists': '''CREATE TABLE track_artists (
    track_id TEXT,
    artist_id TEXT,
    FOREIGN KEY (track_id) REFERENCES tracks (track_id),
    FOREIGN KEY (artist_id) REFERENCES artists (artist_id)
)''',
    'track_features': '''CREATE TABLE track_features (
    track_id TEXT PRIMARY KEY,
    danceability REAL,
    energy REAL,
    key INTEGER,
    loudness REAL,
    mode INTEGER,
    speechiness REAL,
    acousticness REAL,
    instrumentalness REAL,
    liveness REAL,
    valence REAL,
    tempo REAL,
    FOREIGN KEY (track_id) REFERENCES tracks (track_id)
)''',
    'artist_profiles': '''CREATE TABLE artist_profiles (
    artist_id TEXT PRIMARY KEY,
    danceability REAL,
    energy REAL,
    loudness REAL,
    speechiness REAL,
    acousticness REAL,
    instrumentalness REAL,
    liveness REAL,
    valence REAL,
    tempo REAL,
    popularity REAL,
    FOREIGN KEY (artist_id) REFERENCES artists (artist_id)
)''',
}

# Read every CSV and build the aggregate BEFORE touching the database, so a missing
# or malformed file fails here without leaving a half-written database behind.
# ID columns are read as text so they can never be re-interpreted as numbers.
albums_table = pd.read_csv(albums_csv)
tracks_table = (
    pd.read_csv(tracks_csv, dtype={'track_id': str})
    # tracks.csv also carries album_name; the tracks table references albums by id only
    .drop(columns=['album_name'], errors='ignore')
    # track_id is the primary key, so keep a single row per track
    .drop_duplicates(subset='track_id')
)
artists_table = (
    pd.read_csv(artists_csv, dtype={'artist_id': str})
    .drop_duplicates(subset='artist_id')
)
track_artists_df = pd.read_csv(track_artists_csv, dtype={'track_id': str, 'artist_id': str})
track_features_df = pd.read_csv(track_features_csv, dtype={'track_id': str})

# Merge track features with artist-track mapping. 'popularity' is not an audio
# feature, so it is brought in from the tracks table.
# NOTE(review): this treats the profile's popularity as the mean popularity of the
# artist's tracks (not the artist_popularity column) — confirm that is the intent.
artist_features_df = (
    track_artists_df
    .merge(track_features_df, on='track_id')
    .merge(tracks_table[['track_id', 'popularity']], on='track_id', how='left')
)

# Aggregate features by artist: one row per artist_id holding the mean of each column
artist_profiles_df = (
    artist_features_df
    .groupby('artist_id')[profile_columns]
    .mean()
    .reset_index()
)

# Load order: referenced tables first, then the tables that point at them
tables_to_load = [
    ('albums', albums_table),
    ('tracks', tracks_table),
    ('artists', artists_table),
    ('track_artists', track_artists_df),
    ('track_features', track_features_df),
    ('artist_profiles', artist_profiles_df),
]

# Connect to SQLite database (or create it)
conn = sqlite3.connect(r'C:\Users\seang\OneDrive\cse 6242\project\CSE6242_Team148\assets\music_data.db')
try:
    cursor = conn.cursor()
    for table_name, table_frame in tables_to_load:
        # Table names come from the fixed dict above, never from user input
        cursor.execute(f'DROP TABLE IF EXISTS {table_name}')
        cursor.execute(table_schemas[table_name])
        table_frame.to_sql(table_name, conn, if_exists='append', index=False)
    # Commit changes
    conn.commit()
finally:
    # Always release the database file, even if a load step failed
    conn.close()

