## Setup

In [None]:
!pip install yt-dlp pytube ffmpeg-python
!pip install librosa soundfile



In [None]:
# for youtube2wav
import os
from pytube import Search
from yt_dlp import YoutubeDL
import ffmpeg

# for wav preprocessing
from typing import List
import glob
import shutil
import librosa
import soundfile as sf
import os
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive. Later cells read and write under
# /content/drive/MyDrive/GuitarJam, so this has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Collect Data

### YouTube2Wav

In [None]:
# Step 1: Search YouTube for Videos
def search_youtube(keyword, max_results=15):
    """Return watch-page URLs for the top hits of a YouTube search.

    Args:
        keyword: Query string handed to pytube's ``Search``.
        max_results: Upper bound on the number of hits kept. The result list
            is simply sliced, so fewer URLs come back when the search itself
            yields fewer entries.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<video_id>`` strings, in
        the order the search returned them.
    """
    top_hits = Search(keyword).results[:max_results]

    links = []
    for hit in top_hits:
        links.append(f"https://www.youtube.com/watch?v={hit.video_id}")
    return links

# Step 2: Download and Convert Videos
def download_and_convert_to_wav(video_links, output_folder="./downloads"):
    """Download the audio of each link with yt-dlp and convert it to WAV.

    The best available audio stream is fetched ('bestaudio/best') and passed
    through yt-dlp's ``FFmpegExtractAudio`` post-processor with ``wav`` as the
    preferred codec. Files are named ``<video title>.<ext>`` inside
    `output_folder`.

    Args:
        video_links: Iterable of video URLs.
        output_folder: Destination directory; created (including parents)
            when missing.

    Returns:
        List of the links whose download raised an exception. Each failure is
        also printed, and the remaining links are still processed.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_folder, exist_ok=True)

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(output_folder, '%(title)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }]
    }

    failed_links = []
    with YoutubeDL(ydl_opts) as ydl:
        for link in video_links:
            try:
                ydl.download([link])
            except Exception as e:
                # Best effort: one broken video must not abort the whole batch.
                print(f"Failed to download {link}: {e}")
                failed_links.append(link)
    return failed_links

# Main Workflow
def get_yt_music(keywords: str = "guitar jamming session", max_results: int = 20):
    """Search YouTube for `keywords` and store the hits as .wav files.

    Runs `search_youtube` followed by `download_and_convert_to_wav`; the audio
    is written to ``./downloads/<keywords>``, i.e. one sub-folder per query.
    Progress is reported with `print`.

    Args:
        keywords: Search query; also used verbatim as the sub-folder name.
        max_results: Maximum number of search hits to download.
    """
    print(f"Searching YouTube for: {keywords}")
    found_links = search_youtube(keywords, max_results)
    link_listing = "\n".join(found_links)
    print(f"Found {len(found_links)} videos:", link_listing)

    destination = f"./downloads/{keywords}"
    print("\nDownloading and converting to .wav...")
    download_and_convert_to_wav(found_links, output_folder=destination)
    print("\nProcess complete!")


In [None]:
# One search-and-download run per query, up to 20 videos each; every query
# gets its own sub-folder under ./downloads.
for search_query in (
    "guitar jamming session",
    "jazz guitar trio",
    "neo soul guitar instrumental live record",
    "best acoustic guitar rock songs",
    "guitar live performance old songs",
):
    get_yt_music(search_query, 20)


Searching YouTube for: guitar jamming session




Found 18 videos: https://www.youtube.com/watch?v=HspKIEa1qwk
https://www.youtube.com/watch?v=f6dnI1WsFrA
https://www.youtube.com/watch?v=Hmn0p5JwK-w
https://www.youtube.com/watch?v=btAvJJT9kys
https://www.youtube.com/watch?v=sPqPN18BsVQ
https://www.youtube.com/watch?v=P9-kzchT3dE
https://www.youtube.com/watch?v=hu7PKBhBqLw
https://www.youtube.com/watch?v=2E0s2H-lQoI
https://www.youtube.com/watch?v=AVML8rtc7hw
https://www.youtube.com/watch?v=L1Bg94UspU0
https://www.youtube.com/watch?v=Feg3wrZP0Pk
https://www.youtube.com/watch?v=mPcGJahjsHY
https://www.youtube.com/watch?v=cUFVR5sgbt0
https://www.youtube.com/watch?v=3rkwBiMhZjM
https://www.youtube.com/watch?v=BsbcDSDuILY
https://www.youtube.com/watch?v=q7RhqX2t9A4
https://www.youtube.com/watch?v=fGFd5Gzn5Is
https://www.youtube.com/watch?v=vd38lhfJQwk

Downloading and converting to .wav...
[youtube] Extracting URL: https://www.youtube.com/watch?v=HspKIEa1qwk
[youtube] HspKIEa1qwk: Downloading webpage
[youtube] HspKIEa1qwk: Downloading ios 



Found 18 videos: https://www.youtube.com/watch?v=eGgraRbGduI
https://www.youtube.com/watch?v=eJNQmU6UmEA
https://www.youtube.com/watch?v=W2_gdQ20Mc8
https://www.youtube.com/watch?v=xkLA7VHpGSQ
https://www.youtube.com/watch?v=uTM67UhBVxk
https://www.youtube.com/watch?v=FTcUTww0gMU
https://www.youtube.com/watch?v=A-pop39QLBY
https://www.youtube.com/watch?v=OdFYaK_nUCs
https://www.youtube.com/watch?v=DKeSsbnn33c
https://www.youtube.com/watch?v=xZfTgV590kU
https://www.youtube.com/watch?v=p0UYqN-wUh8
https://www.youtube.com/watch?v=2aU_ASdsvtc
https://www.youtube.com/watch?v=qvr8hBkYEFo
https://www.youtube.com/watch?v=Jc1S1-2J2Og
https://www.youtube.com/watch?v=vCnT8prUqFY
https://www.youtube.com/watch?v=9WvexMiZ8dg
https://www.youtube.com/watch?v=Chj5zdD5azs
https://www.youtube.com/watch?v=Cp9v2tLG7Js

Downloading and converting to .wav...
[youtube] Extracting URL: https://www.youtube.com/watch?v=eGgraRbGduI
[youtube] eGgraRbGduI: Downloading webpage
[youtube] eGgraRbGduI: Downloading ios 



Found 18 videos: https://www.youtube.com/watch?v=KQWQEbvth8o
https://www.youtube.com/watch?v=1MSRHUKncU0
https://www.youtube.com/watch?v=3VNm3rGXPXs
https://www.youtube.com/watch?v=polOG2kXTkQ
https://www.youtube.com/watch?v=5zqlgMh4aYs
https://www.youtube.com/watch?v=7x3svtckdhY
https://www.youtube.com/watch?v=z-KsEPcsbls
https://www.youtube.com/watch?v=VZItGcbUTNY
https://www.youtube.com/watch?v=s4U1TmDbeB8
https://www.youtube.com/watch?v=DnU3hrvMPk0
https://www.youtube.com/watch?v=eHE_odJq8uU
https://www.youtube.com/watch?v=CTnf9wqIuIY
https://www.youtube.com/watch?v=WZqyiL3o0u0
https://www.youtube.com/watch?v=J016xjUYkWI
https://www.youtube.com/watch?v=qORwZ2opdvM
https://www.youtube.com/watch?v=w4WmIjqBl94
https://www.youtube.com/watch?v=TqQv3CENif0
https://www.youtube.com/watch?v=6yQa0gyfMLU

Downloading and converting to .wav...
[youtube] Extracting URL: https://www.youtube.com/watch?v=KQWQEbvth8o
[youtube] KQWQEbvth8o: Downloading webpage
[youtube] KQWQEbvth8o: Downloading ios 



Found 10 videos: https://www.youtube.com/watch?v=mYtC2CXImi8
https://www.youtube.com/watch?v=BUm0MLbepOw
https://www.youtube.com/watch?v=6aSjeSUwh6o
https://www.youtube.com/watch?v=8J1HLEF2MgU
https://www.youtube.com/watch?v=KOFcsCljhK8
https://www.youtube.com/watch?v=EHrkPL6YW1Y
https://www.youtube.com/watch?v=cSu4c67Fojc
https://www.youtube.com/watch?v=WbDgdy7jhIs
https://www.youtube.com/watch?v=SjZ56mFZ5bo
https://www.youtube.com/watch?v=T-U3c1nU3eM

Downloading and converting to .wav...
[youtube] Extracting URL: https://www.youtube.com/watch?v=mYtC2CXImi8
[youtube] mYtC2CXImi8: Downloading webpage
[youtube] mYtC2CXImi8: Downloading ios player API JSON
[youtube] mYtC2CXImi8: Downloading mweb player API JSON
[youtube] mYtC2CXImi8: Downloading m3u8 information
[info] mYtC2CXImi8: Downloading 1 format(s): 251
[download] Destination: ./downloads/best acoustic guitar rock songs/Acoustic Rock Cover ｜ The Best Rock Songs Of 80s 90s.webm
[download] 100% of   20.27MiB in 00:00:00 at 27.04MiB



Found 18 videos: https://www.youtube.com/watch?v=2rwfqsjimRM
https://www.youtube.com/watch?v=zWVqNrEhY4g
https://www.youtube.com/watch?v=syKZdZJeqGU
https://www.youtube.com/watch?v=uOx0mQ7wLZs
https://www.youtube.com/watch?v=09839DpTctU
https://www.youtube.com/watch?v=OuVIJlSDOs0
https://www.youtube.com/watch?v=VVwMuxqLiy8
https://www.youtube.com/watch?v=9Iibj8bpr6Q
https://www.youtube.com/watch?v=CrTMc2i6Lzc
https://www.youtube.com/watch?v=LNrAFb3I2js
https://www.youtube.com/watch?v=WZn9QZykx10
https://www.youtube.com/watch?v=bm03wqLY3Nc
https://www.youtube.com/watch?v=e26zZ83Oh6Y
https://www.youtube.com/watch?v=An2a1_Do_fc
https://www.youtube.com/watch?v=qvr8hBkYEFo
https://www.youtube.com/watch?v=UOwvmgjbwDo
https://www.youtube.com/watch?v=EgmLuloZN3A
https://www.youtube.com/watch?v=zlKLtnbU0xE

Downloading and converting to .wav...
[youtube] Extracting URL: https://www.youtube.com/watch?v=2rwfqsjimRM
[youtube] 2rwfqsjimRM: Downloading webpage
[youtube] 2rwfqsjimRM: Downloading ios 

KeyboardInterrupt: 

### Resize YT Music (to 30 sec chunks)

In [None]:
import random
# Fixed seed so the random.sample call in chunk_audio_files is repeatable
# (for the same files processed in the same order).
random.seed(42)

def reformat_filename(filename: str) -> str:
    """Turn a downloaded file name into a short, normalized ``*_.wav`` name.

    The extension is removed with ``os.path.splitext`` (so it no longer has to
    be exactly four characters long, e.g. ``.flac`` or ``.webm`` also work),
    the big-solidus character "⧸" becomes "|", spaces become underscores, the
    text is lower-cased and truncated to 50 characters, and ``"_.wav"`` is
    appended.

    Args:
        filename: Base name of the audio file, with or without extension.

    Returns:
        The normalized name, always ending in ``"_.wav"``.
    """
    stem, _ext = os.path.splitext(filename)
    normalized = stem.replace("⧸", "|").replace(" ", "_").lower()
    return normalized[:50] + "_.wav"

def chunk_audio_files(
        audio_files: List[str],
        output_dir: str = "./yt_music",
        chunk_sec: int = 30,
        max_chunks_per_video: int = 10,
        skip_edge_chunks: int = 2,
    ):
    """Split audio files into fixed-length chunks and write them as .wav files.

    Each file is loaded with librosa at 32 kHz and cut into consecutive
    `chunk_sec`-second chunks (a trailing partial chunk is discarded). The
    first and last `skip_edge_chunks` chunks are always dropped; if more than
    `max_chunks_per_video` chunks remain, that many are drawn with
    ``random.sample`` (seed the `random` module beforehand for repeatable
    picks). Output files are named ``<reformatted name>_<chunk index>.wav``.

    Args:
        audio_files: Paths of the audio files to split.
        output_dir: Destination directory; created with parents if missing.
        chunk_sec: Chunk length in seconds.
        max_chunks_per_video: Maximum number of chunks written per input file.
        skip_edge_chunks: Number of chunks dropped at each end of a file.

    Raises:
        ValueError: If `skip_edge_chunks` is negative.
    """
    if skip_edge_chunks < 0:
        raise ValueError("skip_edge_chunks must be >= 0")

    # makedirs (not mkdir) so nested destinations work and re-runs don't fail.
    os.makedirs(output_dir, exist_ok=True)

    for f in audio_files:
        audio, sr = librosa.load(f, sr=32000)
        chunk_size = sr * chunk_sec
        num_chunks = len(audio) // chunk_size
        # reformat_filename returns "<stem>_.wav"; strip the ".wav" suffix.
        stem = reformat_filename(os.path.basename(f))[:-4]

        # Same edge trimming for every file; the range is empty for files that
        # are too short to have any interior chunk.
        candidates = range(skip_edge_chunks, num_chunks - skip_edge_chunks)
        if len(candidates) > max_chunks_per_video:
            chunk_ids = random.sample(candidates, max_chunks_per_video)
        else:
            chunk_ids = candidates

        for i in chunk_ids:
            start = i * chunk_size
            end = start + chunk_size
            output_path = os.path.join(output_dir, f"{stem}_{i}.wav")
            sf.write(output_path, audio[start:end], sr)

    print(f"Total {len(glob.glob(output_dir + '/*.wav'))} audio files chunked and saved to {output_dir}")


In [None]:
folders = [
    "guitar jamming session",
    "jazz guitar trio",
    "neo soul guitar instrumental live record",
    "best acoustic guitar rock songs",
    "guitar live performance old songs"
]

# Make sure the parent of the per-query output folders exists.
os.makedirs("/content/drive/MyDrive/GuitarJam/yt_music", exist_ok=True)

for folder in folders:
    audio_path = f"./downloads/{folder}"
    # glob returns files in filesystem order; sorting fixes the processing
    # order so the seeded random sampling picks the same chunks on every run.
    audio_files = sorted(glob.glob(audio_path + "/*.wav"))
    chunk_audio_files(audio_files, output_dir = f"/content/drive/MyDrive/GuitarJam/yt_music/{folder}")


Total 164 audio files chunked and saved to /content/drive/MyDrive/GuitarJam/yt_music/guitar live performance old songs


### Check the audio files

In [None]:
from IPython.display import Audio

In [None]:
# Spot-check one chunk by ear. The path points into the per-query yt_music
# folder, so it only resolves before the "Move the Files" cell has moved the
# .wav files into yt_music_dataset/train.
Audio("/content/drive/MyDrive/GuitarJam/yt_music/guitar live performance old songs/40_fingers_-_a_guitar_night_in_ljubljana_(full_liv__132.wav")

## Re-organize Dataset Folder

### Create metadata

In [None]:
# Create the dataset root and its train/ sub-folder in one call. Unlike the
# exists-check + mkdir pair, this also creates train/ when the root is already
# there, and is a no-op when both exist.
os.makedirs("/content/drive/MyDrive/GuitarJam/yt_music_dataset/train", exist_ok=True)

In [None]:
folders = [
    "guitar jamming session",
    "jazz guitar trio",
    "neo soul guitar instrumental live record",
    "best acoustic guitar rock songs",
    "guitar live performance old songs"
]

# Build one frame per query folder and concatenate once at the end, instead of
# growing a DataFrame inside the loop.
meta_parts = []
for folder in folders:
    # Sorted so the row order of metadata.csv is the same on every run.
    filenames = sorted(glob.glob(f"/content/drive/MyDrive/GuitarJam/yt_music/{folder}/*.wav"))
    print(f"Files count in `{folder}`: {len(filenames)}")
    new_filenames = [f"train/{os.path.basename(f)}" for f in filenames]
    prompts = [f"Music with Guitar, Guitar, Guitar - {folder} | "] * len(filenames)
    meta_parts.append(pd.DataFrame({"file_name": new_filenames, "prompt": prompts}))

meta = pd.concat(meta_parts, axis=0, ignore_index=True)

meta.to_csv("/content/drive/MyDrive/GuitarJam/yt_music_dataset/metadata.csv", index=False)
print("---"*10)
print(f"Total files count: {len(meta)}")

# The same video can be returned by more than one search query; its chunks then
# share a base name across folders and map to the same train/<name> entry.
duplicate_count = int(meta["file_name"].duplicated().sum())
if duplicate_count:
    print(f"WARNING: {duplicate_count} duplicated file_name entries in metadata "
          "(same chunk name in more than one folder)")


Files count in `guitar jamming session`: 161
Files count in `jazz guitar trio`: 151
Files count in `neo soul guitar instrumental live record`: 144
Files count in `best acoustic guitar rock songs`: 100
Files count in `guitar live performance old songs`: 144
------------------------------
Total files count: 700


### Move the Files

In [None]:
source_base_path = "/content/drive/MyDrive/GuitarJam/yt_music"
target_path = "/content/drive/MyDrive/GuitarJam/yt_music_dataset/train"

# Make sure the destination exists so this cell does not depend on an earlier
# cell having created it.
os.makedirs(target_path, exist_ok=True)

# Flatten every per-query folder into the single train/ directory.
for folder in folders:
    source_path = os.path.join(source_base_path, folder)

    if not os.path.exists(source_path):
        print(f"Source folder {source_path} does not exist.")
        continue

    for file_name in os.listdir(source_path):
        if file_name.endswith(".wav"):
            file_path = os.path.join(source_path, file_name)

            # overwrite if exists (the destination is a full file path, not
            # just the directory)
            shutil.move(file_path, os.path.join(target_path, file_name))

In [None]:
# Remove the per-query folders that no longer contain any .wav file.
# shutil.rmtree is used instead of `!rm -rf .../{folder}`: the folder names
# contain spaces, so the unquoted shell expansion was split into several
# arguments and pointed rm at the wrong paths.
for folder in folders:
    folder_path = f"/content/drive/MyDrive/GuitarJam/yt_music/{folder}"
    if len(glob.glob(f"{folder_path}/*.wav")) == 0:
        # ignore_errors mirrors `rm -rf`, which is silent for missing paths.
        shutil.rmtree(folder_path, ignore_errors=True)