In [86]:
import cudf
import numpy as np
import pandas as pd
import re
import time
import spotify
import spotipy
import configparser
from spotipy.oauth2 import SpotifyClientCredentials

In [87]:
million_song_df = pd.read_csv("data/million_song_dataset.csv")
synthetic_df = pd.read_csv("data/dataset_v2.csv")

In [9]:
synthetic_df.columns

Index(['user_id', 'age', 'education', 'gender', 'name', 'country', 'music',
       'artist_name', 'featured_artists', 'genre', 'plays',
       'artiste_popularity', 'audio_popularity', 'music_acousticness',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre', 'release_date', 'explicit', 'duration', 'music_id',
       'id_artists', 'followers'],
      dtype='object')

In [10]:
synthetic_df = synthetic_df.drop(
    columns=[
        "name",
        "plays",
        "audio_popularity",
        "artiste_popularity",
        "track_genre",
        "followers",
    ]
)

In [11]:
million_song_df.columns

Index(['track_id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')

In [13]:
msd_df = million_song_df.copy()

msd_df.rename(columns={"name": "music"}, inplace=True)

msd_df.columns

Index(['track_id', 'music', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')

In [18]:
synthetic_df.rename(columns={"music_acousticness": "acousticness"}, inplace=True)
synthetic_df.columns

Index(['user_id', 'age', 'education', 'gender', 'country', 'music',
       'artist_name', 'featured_artists', 'genre', 'acousticness',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'release_date', 'explicit', 'duration', 'music_id', 'id_artists'],
      dtype='object')

In [19]:
# print columns both datasets have in common
common_columns = synthetic_df.columns.intersection(million_song_df.columns)
print(common_columns)

Index(['acousticness', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'release_date', 'explicit'],
      dtype='object')


In [20]:
print(f"length of synthetic_df: {len(synthetic_df)}\nlength of msd_df: {len(msd_df)}")

length of synthetic_df: 35001
length of msd_df: 1204025


In [21]:
# matched cols
matched_cols = []

for col in synthetic_df.columns:
    if col in msd_df.columns:
        matched_cols.append(col)

matched_cols

# for each column in synthetic_df, by "music" replace the values of the column with the values of the column in msd_df

['music',
 'acousticness',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'release_date',
 'explicit']

In [22]:
if matched_cols[0] in synthetic_df.columns and matched_cols[0] in msd_df.columns:
    print("yes")

yes


In [23]:
synthetic_modified_df = synthetic_df.copy()

In [24]:
synthetic_modified_df.shape

(35001, 26)

In [25]:
synthetic_modified_df.drop(columns=matched_cols[1:], inplace=True)

In [26]:
synthetic_modified_df.shape

(35001, 12)

In [27]:
# Step 1: Verify column names in both DataFrames
print("=" * 100)
print("Columns in million_song_df:", million_song_df.columns)
print("=" * 100)
print("Columns in synthetic_df:", synthetic_modified_df.columns)

Columns in million_song_df: Index(['track_id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')
Columns in synthetic_df: Index(['user_id', 'age', 'education', 'gender', 'country', 'music',
       'artist_name', 'featured_artists', 'genre', 'acousticness',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'release_date', 'explicit', 'duration', 'music_id', 'id_artists'],
      dtype='object')


In [36]:
synthetic_df["music"]

0                                             Bank Account
1        Mo Money Mo Problems (feat. Mase & Puff Daddy)...
2                                             Little Talks
3                                            Wherever I Go
4                                          Hands To Myself
                               ...                        
34996                                          Fuiste Mala
34997                                        Take You Down
34998                                          We Fly High
34999                                          Radioactive
35000           Everybody (Backstreet's Back) - Radio Edit
Name: music, Length: 35001, dtype: object

In [69]:
audio_features = msd_df[
    [
        "music",
        "album",
        "acousticness",
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "time_signature",
        "release_date",
        "explicit",
    ]
]

In [72]:
# Define aggregation functions
agg_funcs = {
    # "album": lambda x: x.mode()[0] if not x.empty else None,  # Most frequent album
    "acousticness": "mean",
    "danceability": "mean",
    "energy": "mean",
    "key": lambda x: x.mode()[0],  # Most frequent key
    "loudness": "mean",
    "mode": lambda x: x.mode()[0],  # Most frequent mode
    "speechiness": "mean",
    "instrumentalness": "mean",
    "liveness": "mean",
    "valence": "mean",
    "tempo": "mean",
    "time_signature": lambda x: x.mode()[0],  # Most frequent time signature
    "release_date": "min",  # Earliest release date
    "explicit": lambda x: x.mode()[0],  # Most frequent explicit value
}

# Aggregate audio features by 'music'
aggregated_audio_features = audio_features.groupby('music').agg(agg_funcs).reset_index()

In [75]:
# Merge datasets on a common key (e.g., 'music') using an inner join
merged_data = pd.merge(synthetic_modified_df, aggregated_audio_features, on='music', how='inner')

# Check the shape of the merged dataset
print(f"Merged Shape: {merged_data.shape}")


Merged Shape: (19974, 26)


using inner joins result in a loss of 15,027 records

In [90]:
# Save the enriched dataset
merged_data.to_csv('data/enriched_synthetic_data.csv', index=False)

In [68]:
aggregated_audio_features.head()

Unnamed: 0,music,acousticness,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo
0,!,0.099392,0.464,0.634667,6.0,-11.632667,0.666667,0.0483,0.771333,0.233,0.37,125.956
1,!!! - Interlude,0.908,0.0,0.0354,7.0,-20.151,0.0,0.0,0.0,0.479,0.0,0.0
2,!!!Si!!!,0.0117,0.381,0.894,11.0,-6.007,0.0,0.053,0.877,0.116,0.558,101.124
3,!!De Repente!!,0.0498,0.658,0.8875,5.0,-7.4355,1.0,0.0398,1.6e-05,0.0504,0.945,123.594
4,!(?Galactic!#$Adventure!@#,0.782,0.615,0.74,2.0,-8.895,1.0,0.0496,0.668,0.234,0.331,132.22


In [76]:
print(
    f"Synthetic Shape: {synthetic_modified_df.shape}\nAudio Features Shape: {audio_features.shape}\nMerged Shape: {merged_data.shape}"
)

Synthetic Shape: (35001, 12)
Audio Features Shape: (1204025, 16)
Merged Shape: (19974, 26)


In [77]:
# Check for duplicates in audio features
print(
    f"Duplicate Rows in audio_features: {aggregated_audio_features.duplicated().sum()}\nDuplicate Rows in Synthetic Data: {synthetic_modified_df.duplicated().sum()}"
)

Duplicate Rows in audio_features: 0
Duplicate Rows in Synthetic Data: 0


In [83]:
#  load spotify config information
config = configparser.ConfigParser()
config.read("config.ini")

# Extract the client ID and client secret
client_id = config.get("SPOTIFY", "CLIENT_ID")
client_secret = config.get("SPOTIFY", "CLIENT_SECRET")


In [89]:
# Authenticate with Spotify
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))
print(sp)

<spotipy.client.Spotify object at 0x7f3f8533f130>


In [None]:
# Authenticate with Spotify
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

# Function to fetch audio features for a batch of tracks
def fetch_audio_features_batch(track_names):
    track_ids = []
    for track_name in track_names:
        try:
            results = sp.search(q=track_name, type='track', limit=1)
            if results['tracks']['items']:
                track_id = results['tracks']['items'][0]['id']
                track_ids.append(track_id)
        except Exception as e:
            print(f"Error fetching ID for {track_name}: {e}")
    if track_ids:
        try:
            features = sp.audio_features(track_ids)
            return features
        except Exception as e:
            print(f"Error fetching features for batch: {e}")
    return []

# Load the remaining records
remaining_records = synthetic_modified_df[~synthetic_modified_df['music'].isin(merged_data['music'])]

# Fetch audio features for remaining records in batches
batch_size = 50  # Adjust batch size as needed
audio_features_list = []
for start in range(0, len(remaining_records), batch_size):
    end = start + batch_size
    batch = remaining_records.iloc[start:end]
    track_names = batch['music'].tolist()
    features = fetch_audio_features_batch(track_names)
    audio_features_list.extend(features)
    # Save progress after each batch
    pd.DataFrame(audio_features_list).to_csv('spotify_audio_features_progress.csv', index=False)
    print(f"Processed batch {start // batch_size + 1}")

# Convert the list of audio features to a DataFrame
spotify_audio_features = pd.DataFrame(audio_features_list)

# Merge the Spotify audio features with the remaining records
remaining_records = remaining_records.merge(spotify_audio_features, left_on='music', right_on='id', how='left')

# Combine the merged data with the remaining records
final_data = pd.concat([merged_data, remaining_records], ignore_index=True)

# Save the final enriched dataset
final_data.to_csv('final_enriched_synthetic_data.csv', index=False)

print(f"Final Data Shape: {final_data.shape}")
