In [2]:
import os

# Never embed the API key in the notebook: anything saved, shared or committed
# would leak it. Reuse a key already present in the environment; otherwise
# prompt for it without echoing the input. Any key that has ever been stored
# in plain text in a notebook should be revoked and rotated.
if not os.environ.get("YOUTUBE_API_KEY"):
    from getpass import getpass

    os.environ["YOUTUBE_API_KEY"] = getpass("YouTube Data API key: ")


In [3]:
!pip install google-api-python-client isodate pandas


Collecting isodate
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate
Successfully installed isodate-0.7.2


In [5]:
!pip install isodate
import os
import pandas as pd
import isodate
from googleapiclient.discovery import build

# The key is read from the environment so it never appears in the notebook.
API_KEY = os.getenv("YOUTUBE_API_KEY")
if not API_KEY:
    # Fail here with a clear message instead of building a key-less client
    # whose requests would only fail later, inside the collection loop.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; export it or set os.environ['YOUTUBE_API_KEY'] "
        "before running this cell."
    )

# Upper bound on the number of videos collected for each channel.
MAX_VIDEOS_PER_CHANNEL = 300

# Display name -> YouTube channel ID of every channel to collect.
CHANNELS = {
    "RJ Karishma": "UCJvE3uJFyBS1WhL1Tou85Hg",
    "That's So Viraj": "UCec_55-cMrTXx_gpeVIIzjg"
}

# Shared YouTube Data API v3 client used by every fetch helper below.
youtube = build(
    "youtube",
    "v3",
    developerKey=API_KEY
)


def get_channel_data(channel_id, channel_name):
    """Fetch basic channel information (the channel's ``publishedAt`` value).

    Args:
        channel_id: ID of the channel to look up.
        channel_name: Label copied unchanged into the returned record.

    Returns:
        A dict with the keys ``channel_name``, ``channel_id`` and
        ``channel_published_at`` (taken from the channel snippet).

    Raises:
        LookupError: if the response contains no item for ``channel_id``.
    """
    response = youtube.channels().list(
        part="snippet",
        id=channel_id
    ).execute()

    # A response without a usable "items" list means the lookup matched
    # nothing; report that explicitly instead of a bare KeyError/IndexError.
    items = response.get("items") or []
    if not items:
        raise LookupError(
            f"No channel found for id {channel_id!r} ({channel_name})"
        )

    return {
        "channel_name": channel_name,
        "channel_id": channel_id,
        "channel_published_at": items[0]["snippet"]["publishedAt"]
    }

def get_video_ids_for_channel(channel_id, max_videos):
    """Collect video IDs for one channel via paginated search requests.

    Pages are requested with ``order="date"`` and ``type="video"``, 50 results
    at a time, until ``max_videos`` IDs (an integer count) have been gathered
    or the response carries no ``nextPageToken``.

    Returns:
        A list of at most ``max_videos`` video ID strings, in response order.
    """
    collected = []
    page_token = None

    while len(collected) < max_videos:
        page = youtube.search().list(
            part="id",
            channelId=channel_id,
            maxResults=50,
            order="date",
            type="video",
            pageToken=page_token,
        ).execute()

        # Take only as many entries from this page as are still needed.
        still_needed = max_videos - len(collected)
        collected.extend(
            entry["id"]["videoId"] for entry in page["items"][:still_needed]
        )

        page_token = page.get("nextPageToken")
        if not page_token:
            break

    return collected


def get_video_details_from_ids(video_ids, channel_name):
    """Look up snippet, duration and statistics for the given video IDs.

    IDs are sent in comma-joined batches of 50, the per-request maximum this
    function works with. Each returned record has the keys ``channel_name``,
    ``video_id``, ``published_at``, ``duration_seconds``, ``views``, ``likes``
    and ``comments``; a counter missing from the response is recorded as 0.
    """
    batch_size = 50
    records = []

    for start in range(0, len(video_ids), batch_size):
        id_csv = ",".join(video_ids[start:start + batch_size])
        payload = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=id_csv,
        ).execute()

        for video in payload["items"]:
            # Duration arrives as an ISO 8601 string; convert it to seconds.
            length = isodate.parse_duration(video["contentDetails"]["duration"])

            # The statistics object (or single counters in it) may be absent.
            counters = video.get("statistics", {})

            record = {
                "channel_name": channel_name,
                "video_id": video["id"],
                "published_at": video["snippet"]["publishedAt"],
                "duration_seconds": length.total_seconds(),
                "views": int(counters.get("viewCount", 0)),
                "likes": int(counters.get("likeCount", 0)),
                "comments": int(counters.get("commentCount", 0)),
            }
            records.append(record)

    return records

# --- Main Data Collection Logic ---
# Accumulators: one metadata record per channel, one record per video.
all_channel_data = []
all_video_data = []

for channel_name, channel_id in CHANNELS.items():
    print(f"--- Processing channel: {channel_name} ({channel_id}) ---")

    # Step 1: channel-level metadata.
    all_channel_data.append(get_channel_data(channel_id, channel_name))
    print(f"  Collected channel metadata for {channel_name}")

    # Step 2: IDs of the channel's videos, capped at MAX_VIDEOS_PER_CHANNEL.
    channel_video_ids = get_video_ids_for_channel(channel_id, MAX_VIDEOS_PER_CHANNEL)
    print(f"  Found {len(channel_video_ids)} video IDs for {channel_name}")

    # Step 3: per-video details for exactly those IDs.
    channel_videos = get_video_details_from_ids(channel_video_ids, channel_name)
    all_video_data += channel_videos
    print(f"  Collected detailed data for {len(channel_videos)} videos from {channel_name}")

# Tabulate the collected records and persist each table as a CSV file.
channels_df = pd.DataFrame(all_channel_data)
videos_df = pd.DataFrame(all_video_data)

for frame, csv_path in ((channels_df, "channels.csv"), (videos_df, "videos.csv")):
    frame.to_csv(csv_path, index=False)

print("\n✅ DATA COLLECTION COMPLETE")
print("Channels collected:", len(channels_df))
print("Videos collected:", len(videos_df))

--- Processing channel: RJ Karishma (UCJvE3uJFyBS1WhL1Tou85Hg) ---
  Collected channel metadata for RJ Karishma
  Found 300 video IDs for RJ Karishma
  Collected detailed data for 300 videos from RJ Karishma
--- Processing channel: That's So Viraj (UCec_55-cMrTXx_gpeVIIzjg) ---
  Collected channel metadata for That's So Viraj
  Found 68 video IDs for That's So Viraj
  Collected detailed data for 68 videos from That's So Viraj

✅ DATA COLLECTION COMPLETE
Channels collected: 2
Videos collected: 368


In [None]:
from google.colab import files

# Hand each CSV produced by the collection cell to the browser for download.
for csv_name in ("channels.csv", "videos.csv"):
    files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Only the last expression of a cell is rendered automatically, so the
# channels table is shown explicitly with IPython's built-in display();
# otherwise its value would be computed and silently discarded.
display(pd.read_csv("channels.csv"))
pd.read_csv("videos.csv").head()


Unnamed: 0,channel_name,video_id,published_at,duration_seconds,views,likes,comments
0,RJ Karishma,gjuR99hUXMM,2026-01-05T05:05:17Z,16.0,4375,295,6
1,RJ Karishma,DPNadf4m05s,2025-12-29T06:39:07Z,101.0,1373919,45814,419
2,RJ Karishma,XFOY2JUu0fA,2025-12-03T06:40:20Z,67.0,965957,34581,526
3,RJ Karishma,koAxe5FII3k,2025-11-09T10:55:24Z,169.0,7828385,243301,505
4,RJ Karishma,evH59COjmxw,2025-11-02T06:59:20Z,88.0,6994130,273833,914


In [None]:
def get_video_ids(channel_id, max_videos):
    """Return up to ``max_videos`` video IDs for ``channel_id``.

    Kept as a thin alias so existing calls keep working: the pagination logic
    lives in ``get_video_ids_for_channel`` (defined in the data-collection
    cell), which this function previously duplicated line for line.
    """
    return get_video_ids_for_channel(channel_id, max_videos)


In [None]:
def get_titles(video_ids, channel_name):
    """Fetch the title and publish timestamp for each of the given video IDs.

    IDs are requested in comma-joined batches of 50. Returns a list of dicts
    with the keys ``channel_name``, ``video_id``, ``title`` and
    ``published_at``, one per item in the responses.
    """
    rows = []

    for offset in range(0, len(video_ids), 50):
        chunk = video_ids[offset:offset + 50]
        listing = youtube.videos().list(
            part="snippet",
            id=",".join(chunk),
        ).execute()

        rows.extend(
            {
                "channel_name": channel_name,
                "video_id": entry["id"],
                "title": entry["snippet"]["title"],
                "published_at": entry["snippet"]["publishedAt"],
            }
            for entry in listing["items"]
        )

    return rows


In [None]:
# Gather title rows for every configured channel into one flat list.
all_rows = []

for channel_name, channel_id in CHANNELS.items():
    print(f"Fetching titles for {channel_name}")

    # Resolve the channel's video IDs first, then fetch one row per video.
    video_ids = get_video_ids(channel_id, MAX_VIDEOS_PER_CHANNEL)
    all_rows += get_titles(video_ids, channel_name)


Fetching titles for RJ Karishma
Fetching titles for That's So Viraj


In [None]:
# Tabulate the collected title rows and write them out without the index.
titles_df = pd.DataFrame(all_rows)
titles_df.to_csv("video_titles.csv", index=False)

# Report the number of rows (one per collected title).
print("✅ Titles collected:", titles_df.shape[0])


✅ Titles collected: 368
