In [12]:
!pip install -q youtube-comment-downloader yt-dlp pandas tqdm

In [13]:
# â”€â”€ ì„¤ì • â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
TARGET_SIZE = 100_000  # target number of videos to collect
BATCH_SIZE = 100  # number of videos to fetch per query
MAX_DURATION = 60  # max duration in seconds (for shorts)

OUTPUT_DIR = "youtube_shorts_dataset"
ID_OUTPUT_PATH = f"{OUTPUT_DIR}/video_ids.csv"
# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

In [14]:
# â”€â”€ Step 0. ê¸°ì¡´ ID ë¡œë“œ (ì¤‘ë³µ ë°©ì§€) â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
import pandas as pd
import os

os.makedirs(OUTPUT_DIR, exist_ok=True)

if os.path.exists(ID_OUTPUT_PATH):
    existing_df = pd.read_csv(ID_OUTPUT_PATH)
    seen = set(existing_df["video_id"].dropna().tolist())
    print(
        f"Loaded existing IDs: {len(seen)} -> skipping already collected videos"
    )
else:
    seen = set()
    print("No existing file found -> starting fresh")

print(
    f"Target: {TARGET_SIZE} / Current: {len(seen)} / Need to add: {max(0, TARGET_SIZE - len(seen))}"
)

Loaded existing IDs: 3747 -> skipping already collected videos
Target: 100000 / Current: 3747 / Need to add: 96253


In [15]:
# â”€â”€ Step 1. Define search query list â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# Query diversity directly improves dataset diversity

GENERAL_QUERIES = [
    # viral
    "funny",
    "viral",
    "trending",
    "satisfying",
    # meme
    "meme",
    "funny meme",
    "relatable",
    # food
    "food",
    "cooking",
    "recipe",
    "asmr food",
    # animals
    "animals",
    "cute dog",
    "dog",
    "cute cat",
    "cat",
    # sports
    "sports",
    "basketball",
    "soccer",
    "gym",
    # games
    "gaming",
    "minecraft",
    "roblox",
    # beauty/fashion
    "makeup",
    "fashion",
    "outfit",
    # music
    "music",
    "dance",
    "singing",
    # education
    "did you know",
    "facts",
    "life hack",
    # etc
    "prank",
    "challenge",
    "travel",
    "nature",
    "science",
    "art",
    "diy",
    "comedy",
]

TRENDING_QUERIES = [
    # --------- treding.google.com filtering youtube
    # trending topics (2026)
    "half-life",
    "ai",
    "olympics",
    "bad bunny",
    # trending topics (2025)
    "kpop demon hunters",
    "soda pop",
    "67",
    # ---------
]

MADE_UP_QUERIES = [
    "fyp",
    "",  # for general shorts without specific keywords
]

BASE_QUERIES = GENERAL_QUERIES + TRENDING_QUERIES + MADE_UP_QUERIES
# BASE_QUERIES = MADE_UP_QUERIES

QUERIES = []
for q in BASE_QUERIES:
    QUERIES.append(f"{q} #shorts")

print(f"Total number of queries: {len(QUERIES)}")
print(
    f"{BATCH_SIZE} results per query â†’ max candidates: {len(QUERIES) * BATCH_SIZE:,}"
)
print("Sample queries:", QUERIES[:5])

Total number of queries: 49
100 results per query â†’ max candidates: 4,900
Sample queries: ['funny #shorts', 'viral #shorts', 'trending #shorts', 'satisfying #shorts', 'meme #shorts']


In [16]:
# â”€â”€ Step 2. Collect IDs by iterating through queries â”€â”€â”€â”€â”€â”€â”€â”€â”€
import yt_dlp

ydl_opts = {
    "quiet": True,
    "extract_flat": True,
    "skip_download": True,
}

newly_collected = []
need = TARGET_SIZE - len(seen)

for query in QUERIES:
    if len(newly_collected) >= need:
        break

    print(
        f"[{len(seen) + len(newly_collected)}/{TARGET_SIZE}] Searching: '{query}'"
    )

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(
                f"ytsearch{BATCH_SIZE}:{query}", download=False
            )
        entries = result.get("entries", [])
    except Exception as ex:
        print(f"  â†³ error: {ex}")
        continue

    for e in entries:
        if len(newly_collected) >= need:
            break

        vid = e.get("id")
        if not vid or vid in seen:
            continue

        duration = e.get("duration")
        if duration is not None and duration > MAX_DURATION:
            continue

        seen.add(vid)
        newly_collected.append(
            {
                "video_id": vid,
                "title": e.get("title", ""),
                "duration": duration,
                "view_count": e.get("view_count"),
                "url": f"https://www.youtube.com/shorts/{vid}",
                "query": query,
            }
        )

print(f"\nCompleted: {len(newly_collected)} new videos collected")

[3747/100000] Searching: 'funny #shorts'
[3767/100000] Searching: 'viral #shorts'
[3791/100000] Searching: 'trending #shorts'
[3833/100000] Searching: 'satisfying #shorts'
[3849/100000] Searching: 'meme #shorts'
[3853/100000] Searching: 'funny meme #shorts'
[3857/100000] Searching: 'relatable #shorts'
[3887/100000] Searching: 'food #shorts'
[3903/100000] Searching: 'cooking #shorts'
[3907/100000] Searching: 'recipe #shorts'
[3911/100000] Searching: 'asmr food #shorts'
[3911/100000] Searching: 'animals #shorts'
[3911/100000] Searching: 'cute dog #shorts'
[3915/100000] Searching: 'dog #shorts'
[4025/100000] Searching: 'cute cat #shorts'
[4025/100000] Searching: 'cat #shorts'
[4065/100000] Searching: 'sports #shorts'
[4091/100000] Searching: 'basketball #shorts'
[4107/100000] Searching: 'soccer #shorts'
[4113/100000] Searching: 'gym #shorts'
[4119/100000] Searching: 'gaming #shorts'
[4137/100000] Searching: 'minecraft #shorts'
[4147/100000] Searching: 'roblox #shorts'
[4149/100000] Search

In [17]:
# â”€â”€ Step 3. Append new data to existing CSV â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
new_df = pd.DataFrame(newly_collected)

if os.path.exists(ID_OUTPUT_PATH) and len(newly_collected) > 0:
    new_df.to_csv(ID_OUTPUT_PATH, mode="a", header=False, index=False)
    print(f"Appended {len(new_df)} new videos â†’ {ID_OUTPUT_PATH}")
elif len(newly_collected) > 0:
    new_df.to_csv(ID_OUTPUT_PATH, index=False)
    print(f"Saved {len(new_df)} new videos â†’ {ID_OUTPUT_PATH}")
else:
    print("No new videos collected")

total = pd.read_csv(ID_OUTPUT_PATH)
print(f"Total accumulated: {len(total)} videos")
total.tail()

Appended 275 new videos â†’ youtube_shorts_dataset/video_ids.csv
Total accumulated: 4022 videos


Unnamed: 0,video_id,title,duration,view_count,url,query
4017,xgHaL-A81-E,GRANNY 1 KHELE ðŸ”¥ #shortsfeed #shortslive #chai...,,,https://www.youtube.com/shorts/xgHaL-A81-E,#shorts
4018,as6SkfSMrLU,Missing Animals Caught on Video ðŸ˜± #shorts,22.0,12934.0,https://www.youtube.com/shorts/as6SkfSMrLU,#shorts
4019,MaSoCXtBW_g,1VS4 ON LIVE || WAIT FOR END END || #freefire ...,,,https://www.youtube.com/shorts/MaSoCXtBW_g,#shorts
4020,_E5Rpsh6JuY,GTA 5 Epic Water Ragdolls | Spider-Man Jumps /...,16.0,62405.0,https://www.youtube.com/shorts/_E5Rpsh6JuY,#shorts
4021,QGGiRRbFoHQ,Gabriel no longer believes in God.#foryou #sho...,56.0,6496.0,https://www.youtube.com/shorts/QGGiRRbFoHQ,#shorts
