# Data Loading - All Datasets
## Loading all delimited data files (comma- and tab-separated) from the `../data` folder

---
## Setup

In [44]:
import pandas as pd
import numpy as np
import os
import re

# Configure how pandas renders frames in this notebook:
#   - max_columns = None -> never truncate columns
#   - max_rows    = 100  -> show at most 100 rows
#   - width       = None -> let pandas auto-detect the display width
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.width = None

# Confirm the setup ran and show where relative data paths resolve from.
print(
    "✓ Libraries imported",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

✓ Libraries imported
Working directory: /Users/lorenzogarduno/Documents/datasci:vis/wrangling and transformation


---
## Load All CSV Files

In [45]:
# Define all data sources (paths are relative to the notebook's working directory)
data_sources = {
    'billboard': "../data/billboard_24years_lyrics_spotify.csv",
    'musicoset_songs': "../data/musicoset_metadata/songs.csv",
    'musicoset_artists': "../data/musicoset_metadata/artists.csv",
    'musicoset_acoustic': "../data/musicoset_songfeatures/acoustic_features.csv",
    'musicoset_lyrics': "../data/musicoset_songfeatures/lyrics.csv"
}

# Dictionary to store all datasets (value is None when a file is missing or unreadable)
datasets = {}
# Human-readable name of the delimiter that was accepted for each loaded dataset
delimiters_used = {}

# Candidate delimiters, tried in this order: (separator passed to pandas, display name)
delimiters_to_try = [
    (',', 'comma'),
    ('\t', 'tab'),
    (r'\s+', 'whitespace (regex)'),
    ('|', 'pipe')
]

# Datasets whose delimiter is known up front and must not be auto-detected
forced_delimiters = {
    'musicoset_acoustic': ('\t', 'tab'),
}


def read_with_delimiter_detection(path, candidates):
    """Read a delimited text file, choosing the first plausible delimiter.

    A wrong delimiter does not necessarily make ``pd.read_csv`` raise: a
    tab-separated file without commas parses "successfully" with ``sep=','``
    into a single column. A parse is therefore only accepted immediately when
    it yields more than one column. If no candidate does, the first parse that
    succeeded at all is returned, so genuinely single-column files still load.

    Parameters
    ----------
    path : str
        Location of the file to read.
    candidates : iterable of (str, str)
        ``(separator, display_name)`` pairs, tried in order.

    Returns
    -------
    tuple
        ``(frame, display_name, None)`` on success, or
        ``(None, None, last_error)`` when every candidate raised.
    """
    first_success = None
    last_error = None

    for separator, display_name in candidates:
        read_kwargs = {'sep': separator}
        if len(separator) > 1:
            # Multi-character separators are regular expressions; use the python engine.
            read_kwargs['engine'] = 'python'

        try:
            frame = pd.read_csv(path, **read_kwargs)
        except Exception as exc:
            # Best effort: remember why this candidate failed and try the next one.
            last_error = exc
            continue

        if frame.shape[1] > 1:
            return frame, display_name, None
        if first_success is None:
            first_success = (frame, display_name)

    if first_success is not None:
        return first_success[0], first_success[1], None
    return None, None, last_error


print("=" * 80)
print("LOADING ALL DATASETS")
print("=" * 80)
print()

# Load each dataset, forcing the delimiter where it is known and detecting it otherwise
for name, path in data_sources.items():
    if not os.path.exists(path):
        datasets[name] = None
        print(f"✗ {name:25s}: Not found")
        continue

    forced = forced_delimiters.get(name)
    candidates = [forced] if forced is not None else delimiters_to_try
    df, delim_name, error = read_with_delimiter_detection(path, candidates)

    datasets[name] = df
    if df is None:
        attempted = f"{forced[1]} delimiter" if forced is not None else "all delimiters"
        # Surface the underlying error instead of silently discarding it.
        print(f"✗ {name:25s}: Failed with {attempted} ({type(error).__name__}: {error})")
        continue

    delimiters_used[name] = delim_name
    print(f"✓ {name:25s}: {df.shape[0]:6d} rows, {df.shape[1]:3d} columns (sep: {delim_name})")
    if df.shape[1] == 1:
        # No candidate split the file into several columns; flag it for manual review.
        print(f"  ⚠ {name}: parsed as a single column - verify the delimiter")

print()
print("=" * 80)
print(f"Total datasets loaded: {sum(1 for v in datasets.values() if v is not None)}")
print("=" * 80)

LOADING ALL DATASETS

✓ billboard                :   3397 rows,  26 columns (sep: comma)
✓ musicoset_songs          :  20405 rows,   7 columns (sep: tab)
✓ musicoset_artists        :  11518 rows,   8 columns (sep: tab)
✓ musicoset_acoustic       :  20405 rows,  14 columns (sep: tab)
✓ musicoset_lyrics         :  20404 rows,   2 columns (sep: tab)

Total datasets loaded: 5


---
## Create Variable Names

In [46]:
# Bind each entry of `datasets` to a short module-level name used by the cells below.
# A name bound to None means that dataset was not loaded.
df_billboard, df_songs, df_artists, df_acoustic, df_lyrics = (
    datasets[key]
    for key in (
        'billboard',
        'musicoset_songs',
        'musicoset_artists',
        'musicoset_acoustic',
        'musicoset_lyrics',
    )
)

# Report which names hold a loaded frame (✓) and which are None (✗).
print(
    "\nDatasets available as:",
    *(
        f"  - {label} {'✓' if frame is not None else '✗'}"
        for label, frame in (
            ("df_billboard  : Billboard 24 years data", df_billboard),
            ("df_songs      : MusicoSet songs metadata", df_songs),
            ("df_artists    : MusicoSet artists metadata", df_artists),
            ("df_acoustic   : MusicoSet acoustic features", df_acoustic),
            ("df_lyrics     : MusicoSet lyrics", df_lyrics),
        )
    ),
    sep="\n",
)


Datasets available as:
  - df_billboard  : Billboard 24 years data ✓
  - df_songs      : MusicoSet songs metadata ✓
  - df_artists    : MusicoSet artists metadata ✓
  - df_acoustic   : MusicoSet acoustic features ✓
  - df_lyrics     : MusicoSet lyrics ✓


---
## Inspect Each Dataset

### Billboard Dataset

In [47]:
# Preview the Billboard frame: shape, column names and the first five rows.
# If the frame is None, only a "not loaded" notice is printed.
if df_billboard is None:
    print("Billboard dataset not loaded")
else:
    print(
        "Billboard Dataset:",
        f"Shape: {df_billboard.shape}",
        f"\nColumns: {list(df_billboard.columns)}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(df_billboard.head(5))

Billboard Dataset:
Shape: (3397, 26)

Columns: ['ranking', 'song', 'band_singer', 'songurl', 'titletext', 'url', 'year', 'lyrics', 'uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'track_href', 'analysis_url', 'duration_ms', 'time_signature']

First 5 rows:


Unnamed: 0,ranking,song,band_singer,songurl,titletext,url,year,lyrics,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,track_href,analysis_url,duration_ms,time_signature
0,1,Breathe,Faith Hill,/wiki/Breathe_(Faith_Hill_song),Breathe,/wiki/Faith_Hill,2000,I can feel the magic floating in the air\nBein...,spotify:track:3y4LxiYMgDl4RethdzpmNe,0.529,0.496,7.0,-9.007,1.0,0.029,0.173,0.0,0.251,0.278,136.859,audio_features,3y4LxiYMgDl4RethdzpmNe,https://api.spotify.com/v1/tracks/3y4LxiYMgDl4...,https://api.spotify.com/v1/audio-analysis/3y4L...,250547.0,4.0
1,2,Smooth,Santana,/wiki/Smooth_(Santana_song),Smooth,/wiki/Santana_(band),2000,"Man, it's a hot one\nLike seven inches from th...",spotify:track:0n2SEXB2qoRQg171q7XqeW,0.609,0.923,9.0,-3.908,1.0,0.0338,0.16,5e-06,0.295,0.961,115.996,audio_features,0n2SEXB2qoRQg171q7XqeW,https://api.spotify.com/v1/tracks/0n2SEXB2qoRQ...,https://api.spotify.com/v1/audio-analysis/0n2S...,294987.0,4.0
2,2,Smooth,Rob Thomas,/wiki/Smooth_(Santana_song),Smooth,/wiki/Rob_Thomas_(musician),2000,"Man, it's a hot one\nLike seven inches from th...",spotify:track:5IALWUYK0zDSEmZgb4ICvc,0.59,0.637,9.0,-9.171,1.0,0.0301,0.00225,0.807,0.299,0.724,115.983,audio_features,5IALWUYK0zDSEmZgb4ICvc,https://api.spotify.com/v1/tracks/5IALWUYK0zDS...,https://api.spotify.com/v1/audio-analysis/5IAL...,244924.0,4.0
3,3,Maria Maria,Santana,/wiki/Maria_Maria,Maria Maria,/wiki/Santana_(band),2000,"Ladies and gents, turn up your sound systems\n...",spotify:track:3XKIUb7HzIF1Vu9usunMzc,0.777,0.601,2.0,-5.931,1.0,0.126,0.0406,0.00201,0.0348,0.68,97.911,audio_features,3XKIUb7HzIF1Vu9usunMzc,https://api.spotify.com/v1/tracks/3XKIUb7HzIF1...,https://api.spotify.com/v1/audio-analysis/3XKI...,261973.0,4.0
4,3,Maria Maria,The Product G&B,/wiki/Maria_Maria,Maria Maria,/wiki/The_Product_G%26B,2000,Turn up this sound system\nTo the sound of Car...,spotify:track:3XKIUb7HzIF1Vu9usunMzc,,,,,,,,,,,,,,,,,


### MusicoSet - Songs Metadata

In [48]:
# Preview the MusicoSet songs metadata: shape, column names and the first five rows.
# If the frame is None, only a "not loaded" notice is printed.
if df_songs is None:
    print("MusicoSet songs dataset not loaded")
else:
    print(
        "MusicoSet Songs Metadata:",
        f"Shape: {df_songs.shape}",
        f"\nColumns: {list(df_songs.columns)}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(df_songs.head(5))

MusicoSet Songs Metadata:
Shape: (20405, 7)

Columns: ['song_id', 'song_name', 'billboard', 'artists', 'popularity', 'explicit', 'song_type']

First 5 rows:


Unnamed: 0,song_id,song_name,billboard,artists,popularity,explicit,song_type
0,3e9HZxeyfWwjeyPAMmWSSQ,"thank u, next","('Thank U, Next', 'Ariana Grande')",{'66CXWjxzNUsdJxJ2JdwvnR': 'Ariana Grande'},86,True,Solo
1,5p7ujcrUXASCNwRaWNHR1C,Without Me,"('Without Me', 'Halsey')",{'26VFTg2z8YR0cCuwLzESi2': 'Halsey'},87,True,Solo
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,"('Sicko Mode', 'Travis Scott')",{'0Y5tJX1MQlPlqiwlOH1tJY': 'Travis Scott'},85,True,Solo
3,3KkXRkHbMCARz0aVfEt68P,Sunflower - Spider-Man: Into the Spider-Verse,('Sunflower (Spider-Man: Into The Spider-Verse...,"{'246dkjvS1zLTtiykXe5h60': 'Post Malone', '1zN...",92,False,Collaboration
4,1rqqCSm0Qe4I9rUvWncaom,High Hopes,"('High Hopes', 'Panic! At The Disco')",{'20JZFwl6HVl6yg8a4H3ZqK': 'Panic! At The Disco'},86,False,Solo


### MusicoSet - Artists Metadata

In [49]:
# Preview the MusicoSet artists metadata: shape, column names and the first five rows.
# If the frame is None, only a "not loaded" notice is printed.
if df_artists is None:
    print("MusicoSet artists dataset not loaded")
else:
    print(
        "MusicoSet Artists Metadata:",
        f"Shape: {df_artists.shape}",
        f"\nColumns: {list(df_artists.columns)}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(df_artists.head(5))

MusicoSet Artists Metadata:
Shape: (11518, 8)

Columns: ['artist_id', 'name', 'followers', 'popularity', 'artist_type', 'main_genre', 'genres', 'image_url']

First 5 rows:


Unnamed: 0,artist_id,name,followers,popularity,artist_type,main_genre,genres,image_url
0,66CXWjxzNUsdJxJ2JdwvnR,Ariana Grande,34554242.0,96,singer,dance pop,"['dance pop', 'pop', 'post-teen pop']",https://i.scdn.co/image/b1dfbe843b0b9f54ab2e58...
1,26VFTg2z8YR0cCuwLzESi2,Halsey,7368242.0,90,singer,dance pop,"['dance pop', 'electropop', 'etherpop', 'indie...",https://i.scdn.co/image/22a5f3d8c42bc7cb55215e...
2,0Y5tJX1MQlPlqiwlOH1tJY,Travis Scott,6313709.0,94,rapper,pop,"['pop', 'pop rap', 'rap']",https://i.scdn.co/image/dc5eba5e032c2e5bc4d42c...
3,246dkjvS1zLTtiykXe5h60,Post Malone,16737002.0,96,rapper,dfw rap,"['dfw rap', 'pop', 'rap']",https://i.scdn.co/image/f9d8b742b66609f12da023...
4,1zNqQNIdeOUZHb8zbZRFMX,Swae Lee,483032.0,89,singer,trap music,['trap music'],https://i.scdn.co/image/a177469870b41f7e17e3b5...


### MusicoSet - Acoustic Features

In [50]:
# Preview the MusicoSet acoustic features: shape, column names and the first five rows.
# If the frame is None, only a "not loaded" notice is printed.
if df_acoustic is None:
    print("MusicoSet acoustic features dataset not loaded")
else:
    print(
        "MusicoSet Acoustic Features:",
        f"Shape: {df_acoustic.shape}",
        f"\nColumns: {list(df_acoustic.columns)}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(df_acoustic.head(5))

MusicoSet Acoustic Features:
Shape: (20405, 14)

Columns: ['song_id', 'duration_ms', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo']

First 5 rows:


Unnamed: 0,song_id,duration_ms,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo
0,3e9HZxeyfWwjeyPAMmWSSQ,207320,1,1,4,0.229,0.717,0.653,0.0,0.101,-5.634,0.0658,0.412,106.966
1,5p7ujcrUXASCNwRaWNHR1C,201661,6,1,4,0.297,0.752,0.488,9e-06,0.0936,-7.05,0.0705,0.533,136.041
2,2xLMifQCjDGFmkHkpNLD9h,312820,8,1,4,0.00513,0.834,0.73,0.0,0.124,-3.714,0.222,0.446,155.008
3,3KkXRkHbMCARz0aVfEt68P,158040,2,1,4,0.556,0.76,0.479,0.0,0.0703,-5.574,0.0466,0.913,89.911
4,1rqqCSm0Qe4I9rUvWncaom,190947,5,1,4,0.193,0.579,0.904,0.0,0.064,-2.729,0.0618,0.681,82.014


### MusicoSet - Lyrics

In [51]:
# Preview the MusicoSet lyrics frame: shape, column names and the first five rows.
# If the frame is None, only a "not loaded" notice is printed.
if df_lyrics is None:
    print("MusicoSet lyrics dataset not loaded")
else:
    print(
        "MusicoSet Lyrics:",
        f"Shape: {df_lyrics.shape}",
        f"\nColumns: {list(df_lyrics.columns)}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(df_lyrics.head(5))

MusicoSet Lyrics:
Shape: (20404, 2)

Columns: ['song_id', 'lyrics']

First 5 rows:


Unnamed: 0,song_id,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,3KkXRkHbMCARz0aVfEt68P,
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."


---
## Summary

In [52]:
# Recap every entry in `datasets`: row/column counts plus the recorded delimiter,
# or a "Not loaded" marker when the entry is None.
print("=" * 80, "DATASET SUMMARY", "=" * 80, "", sep="\n")

for name, df in datasets.items():
    if df is None:
        print(f"{name:25s}: Not loaded")
        continue
    # Fall back to 'unknown' when no delimiter was recorded for this dataset.
    delim = delimiters_used.get(name, 'unknown')
    print(f"{name:25s}: {df.shape[0]:6d} rows × {df.shape[1]:3d} columns (sep: {delim})")

print("", "=" * 80, sep="\n")

DATASET SUMMARY

billboard                :   3397 rows ×  26 columns (sep: comma)
musicoset_songs          :  20405 rows ×   7 columns (sep: tab)
musicoset_artists        :  11518 rows ×   8 columns (sep: tab)
musicoset_acoustic       :  20405 rows ×  14 columns (sep: tab)
musicoset_lyrics         :  20404 rows ×   2 columns (sep: tab)

