# Downloading Music Audio

## Import Packages

In [None]:
import pandas as pd
import random
import os
import glob
import subprocess
from time import sleep

## Read in Song Data

In [None]:
# Load the song table from disk.
songs_df = pd.read_csv("../data/SONGS.csv")

# Draw a reproducible sample of 1,000 songs with Popularity above 80.
sample_songs_df = (
    songs_df
    .query("Popularity > 80")
    .sample(n=1000, random_state=42)
)

# Build one search string per sampled song and pair it with the song's
# row label, giving a list of (row label, query) tuples.
queries = list(
    (sample_songs_df["Track Name"] + " by " + sample_songs_df["Artist"] + " audio").items()
)

## Download Audio Files

In [None]:
output_dir = "../data/song_audio_files"
os.makedirs(output_dir, exist_ok=True)

# (row label, query) pairs whose download did not succeed, kept so that
# replacement songs can be chosen without scanning the printed log.
failed_downloads = []

for song_id, query in queries:
    safe_id = str(song_id)

    # A missing track name or artist turns the concatenated query into NaN
    # (a float). Record it as a failure instead of letting it abort the
    # remaining downloads.
    if not isinstance(query, str):
        print(f"❌ Failed: {safe_id} - {query}")
        failed_downloads.append((song_id, query))
        continue

    # Prefix the file name with the row label so each audio file can be
    # matched back to its row; %(title)s and %(ext)s are filled in by yt-dlp.
    output_path = os.path.join(output_dir, f"{safe_id} - %(title)s.%(ext)s")

    # The arguments are passed as a list (no shell is involved), so the query
    # needs no quote stripping or escaping.
    cmd = [
        "yt-dlp",
        f"ytsearch1:{query}",  # take the first search result only
        "--extract-audio",
        "--audio-format", "mp3",
        "--output", output_path,
    ]

    print(f"Downloading {safe_id}: {query}")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError:
        # Non-zero exit status from yt-dlp: note the failure and move on.
        print(f"❌ Failed: {safe_id} - {query}")
        failed_downloads.append((song_id, query))

    # Pause between requests.
    sleep(1)

print(f"Finished: {len(failed_downloads)} of {len(queries)} downloads failed")

## Get Additional Files for Failures

In [None]:
# Row labels already used by the first sample; they must not be drawn again.
original_ids = set(sample_songs_df.index)

# Every row of the song table that was not part of the first sample.
available_pool = songs_df.loc[~songs_df.index.isin(original_ids)]

# Draw 3 replacement songs, reproducibly, from the remaining popular ones.
new_sample_df = (
    available_pool
    .query("Popularity > 80")
    .sample(n=3, random_state=99)
)

# Same query format as the first batch: a list of (row label, query) tuples.
new_queries = list(
    (new_sample_df["Track Name"] + " by " + new_sample_df["Artist"] + " audio").items()
)

In [None]:
output_dir = "../data/song_audio_files"
os.makedirs(output_dir, exist_ok=True)

# (row label, query) pairs from the replacement batch whose download did not
# succeed, kept so they can be inspected without scanning the printed log.
new_failed_downloads = []

for song_id, query in new_queries:
    safe_id = str(song_id)

    # A missing track name or artist turns the concatenated query into NaN
    # (a float). Record it as a failure instead of letting it abort the
    # remaining downloads.
    if not isinstance(query, str):
        print(f"❌ Failed: {safe_id} - {query}")
        new_failed_downloads.append((song_id, query))
        continue

    # Prefix the file name with the row label so each audio file can be
    # matched back to its row; %(title)s and %(ext)s are filled in by yt-dlp.
    output_path = os.path.join(output_dir, f"{safe_id} - %(title)s.%(ext)s")

    # The arguments are passed as a list (no shell is involved), so the query
    # needs no quote stripping or escaping.
    cmd = [
        "yt-dlp",
        f"ytsearch1:{query}",  # take the first search result only
        "--extract-audio",
        "--audio-format", "mp3",
        "--output", output_path,
    ]

    print(f"Downloading {safe_id}: {query}")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError:
        # Non-zero exit status from yt-dlp: note the failure and move on.
        print(f"❌ Failed: {safe_id} - {query}")
        new_failed_downloads.append((song_id, query))

    # Pause between requests.
    sleep(1)

print(f"Finished: {len(new_failed_downloads)} of {len(new_queries)} downloads failed")