In [None]:
# Install yt-dlp
!pip install yt-dlp

# Import Required Libraries


In [1]:
# Import required libraries
import os
import subprocess
import requests
import json
import time
import random
import csv
import pandas as pd
import cv2
import glob
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from multiprocessing import Pool

# Attach Google Drive so that downloads and CSV outputs are saved there
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Collecting yt-dlp
  Downloading yt_dlp-2025.2.19-py3-none-any.whl.metadata (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.9/171.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.2.19-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.2.19
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Extract all video URLs from the playlist

In [None]:
# Set up the base download directory in Google Drive
BASE_DOWNLOAD_DIR = "/content/drive/My Drive/Youtube_Game_Ads"
os.makedirs(BASE_DOWNLOAD_DIR, exist_ok=True)

# Define the playlist URL
PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLCH9Nc-e33qH9KuteMYCyiAgRibMe-sVn"

# Extract the PLAYLIST_ID from the URL, dropping any query parameters that follow it
PLAYLIST_ID = PLAYLIST_URL.split("list=")[-1].split("&")[0]

# Read the YouTube Data API key from the environment instead of hardcoding it.
# SECURITY: an API key was previously embedded in this cell. Treat that key as
# leaked: revoke it and issue a new one before running the notebook again.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export it (or assign os.environ['YOUTUBE_API_KEY'] "
        "in a private cell) before running this cell."
    )

# Define game names and their corresponding folders
GAME_CATEGORIES = {
    "project makeover": "Project_Makeover",
    "last fortress": "Last_Fortress",
    "hero wars": "Hero_Wars",
    "hustle castle": "Hustle_Castle",
    "puzzles and survival": "Puzzles_and_Survival"
}

# Extract all video URLs from the playlist (handling pagination)
search_url = "https://www.googleapis.com/youtube/v3/playlistItems"
video_urls = []
next_page_token = ""

print("Extracting video URLs from playlist...")

while True:
    # Let requests build and escape the query string
    query = {
        "part": "snippet",
        "playlistId": PLAYLIST_ID,
        "maxResults": 50,
        "key": API_KEY,
    }
    if next_page_token:
        query["pageToken"] = next_page_token

    http_response = requests.get(search_url, params=query, timeout=30)
    # Fail loudly on quota/auth/network errors; otherwise an error payload has
    # no "items" and the loop would silently report zero videos.
    http_response.raise_for_status()
    response = http_response.json()

    for item in response.get("items", []):
        video_id = item["snippet"]["resourceId"]["videoId"]
        video_urls.append(f"https://www.youtube.com/watch?v={video_id}")

    next_page_token = response.get("nextPageToken", "")

    if not next_page_token:
        break  # Stop when there are no more pages

print(f"Found {len(video_urls)} videos in the playlist.")


Extracting video URLs from playlist...
Found 4581 videos in the playlist.


# Categorize videos

In [None]:
# Categorize videos based on their metadata.
# Relies on `video_urls` and `GAME_CATEGORIES` defined by the playlist-extraction cell.
categorized_videos = {category: [] for category in GAME_CATEGORIES.values()}

# Define CSV file path
CSV_FILE_PATH = "/content/drive/My Drive/Youtube_Game_Ads/categorized_videos.csv"

# Ensure directory exists
os.makedirs(os.path.dirname(CSV_FILE_PATH), exist_ok=True)

# Initialize CSV file with headers
if not os.path.exists(CSV_FILE_PATH):
    with open(CSV_FILE_PATH, mode='w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(["Category", "URL"])

# Load URLs recorded by a previous run. The CSV is append-only, so without this
# a re-run would write duplicate rows and re-query every video.
already_recorded = set()
with open(CSV_FILE_PATH, mode='r', newline='', encoding='utf-8-sig') as file:
    for row in csv.DictReader(file):
        recorded_url = row.get("URL")
        if not recorded_url:
            continue
        already_recorded.add(recorded_url)
        if row.get("Category") in categorized_videos:
            categorized_videos[row["Category"]].append(recorded_url)

if already_recorded:
    print(f"Skipping {len(already_recorded)} URLs already recorded in {CSV_FILE_PATH}")

# Process video metadata and categorize URLs.
# The CSV is opened once and flushed after every row, so matches are still
# persisted immediately without reopening the file for each one.
with open(CSV_FILE_PATH, mode='a', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)

    for url in video_urls:
        if url in already_recorded:
            continue

        print(f"Checking video metadata: {url}")

        try:
            result = subprocess.run(
                ["yt-dlp", "--skip-download", "--print-json", url],
                capture_output=True, text=True)

            # Surface yt-dlp's own error instead of a JSON parse error on empty stdout
            if result.returncode != 0:
                stderr_lines = result.stderr.strip().splitlines()
                reason = stderr_lines[-1] if stderr_lines else "no error output"
                raise RuntimeError(f"yt-dlp exited with code {result.returncode}: {reason}")

            video_data = json.loads(result.stdout)

            # Extract title and convert to lowercase (title may be missing or null)
            title = (video_data.get("title") or "").lower()

            # Check if the title matches any game category; first match wins
            for game, folder in GAME_CATEGORIES.items():
                if game in title:
                    categorized_videos[folder].append(url)
                    print(f"Matched '{game}' → Adding to {folder} folder")

                    # Write the URL to CSV immediately
                    writer.writerow([folder, url])
                    file.flush()
                    already_recorded.add(url)

                    break

        except Exception as e:
            # Best effort: one bad video must not abort the whole playlist scan
            print(f"Error processing {url}: {e}")

print(f"\nCategorized video URLs saved to: {CSV_FILE_PATH}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Checking video metadata: https://www.youtube.com/watch?v=GNtKNlU8hJ0
Checking video metadata: https://www.youtube.com/watch?v=UVRoPz1NRug
Matched 'puzzles and survival' → Adding to Puzzles_and_Survival folder
Checking video metadata: https://www.youtube.com/watch?v=mEdwcvGGTnA
Matched 'last fortress' → Adding to Last_Fortress folder
Checking video metadata: https://www.youtube.com/watch?v=FxjAyWXFyWo
Checking video metadata: https://www.youtube.com/watch?v=9Tih5oj6nTU
Checking video metadata: https://www.youtube.com/watch?v=oF9vkHVYHDM
Checking video metadata: https://www.youtube.com/watch?v=r16Yb4gx_uI
Matched 'project makeover' → Adding to Project_Makeover folder
Checking video metadata: https://www.youtube.com/watch?v=NbPMZ64jaUM
Checking video metadata: https://www.youtube.com/watch?v=Uqyl4EsxKaI
Matched 'project makeover' → Adding to Project_Makeover folder
Checking video metadata: https://www.youtube.com/watch?v=kUr

In [5]:
# Define file paths
CSV_FILE_PATH = "/content/drive/My Drive/Youtube_Game_Ads/categorized_videos.csv"
OUTPUT_CSV_PATH = "/content/drive/My Drive/Youtube_Game_Ads/restructured_videos.csv"

# Load the original CSV. It is written in append mode by the categorization
# step, so repeated runs can leave the same (Category, URL) row more than once.
df = pd.read_csv(CSV_FILE_PATH).drop_duplicates(subset=["Category", "URL"])

# Pivot the data to organize URLs under their respective game categories
df_pivot = df.groupby("Category")["URL"].apply(list).to_dict()

# Fail with a clear message rather than a bare "max() arg is an empty sequence"
if not df_pivot:
    raise ValueError(f"No categorized videos found in {CSV_FILE_PATH}")

# Find the max number of URLs in any category
max_length = max(len(urls) for urls in df_pivot.values())

# Convert dictionary to DataFrame with equal-length columns.
# Shorter categories are padded with empty strings so every column has max_length rows.
structured_df = pd.DataFrame({category: urls + [''] * (max_length - len(urls)) for category, urls in df_pivot.items()})

# Save the structured CSV
structured_df.to_csv(OUTPUT_CSV_PATH, index=False)
structured_df

Unnamed: 0,Hero_Wars,Hustle_Castle,Last_Fortress,Project_Makeover,Puzzles_and_Survival
0,https://www.youtube.com/watch?v=7LFOrJxZzK8,https://www.youtube.com/watch?v=LTcgs1ByqCU,https://www.youtube.com/watch?v=ugwCBB--k1o,https://www.youtube.com/watch?v=B8qmb3Awl2o,https://www.youtube.com/watch?v=ZF7q_M_4XoI
1,https://www.youtube.com/watch?v=IkyCFFbGgOg,https://www.youtube.com/watch?v=zB8QImfNoN4,https://www.youtube.com/watch?v=KqRXOIGSdY0,https://www.youtube.com/watch?v=Ia7tatLhbU4,https://www.youtube.com/watch?v=9g4Rmd04k-4
2,https://www.youtube.com/watch?v=PSwQiyO2mXo,https://www.youtube.com/watch?v=QcIDjIBy0pk,https://www.youtube.com/watch?v=7fE5NIoHw-g,https://www.youtube.com/watch?v=qQa4N3TKbxU,https://www.youtube.com/watch?v=4yFhPKFvVA0
3,https://www.youtube.com/watch?v=AaTARNhImm8,https://www.youtube.com/watch?v=Yl80xl0h7pI,https://www.youtube.com/watch?v=Y6eImoFNTGU,https://www.youtube.com/watch?v=z4IpdbPsYiw,https://www.youtube.com/watch?v=XMQXZh-v58E
4,https://www.youtube.com/watch?v=1APwFoGvUxk,https://www.youtube.com/watch?v=LLq_KUW4aKo,https://www.youtube.com/watch?v=acWFPPnU7pk,https://www.youtube.com/watch?v=P2dDpR32kAc,https://www.youtube.com/watch?v=8QtpRUXYEEU
...,...,...,...,...,...
504,https://www.youtube.com/watch?v=8MxMsSE7wi0,,,,
505,https://www.youtube.com/watch?v=zp0ZeTwQHP0,,,,
506,https://www.youtube.com/watch?v=wZW5CJBkRtQ,,,,
507,https://www.youtube.com/watch?v=SQorLvuka1c,,,,


# Download Videos to Folders

In [7]:
# Define CSV file path
CSV_FILE_PATH = "/content/drive/My Drive/Youtube_Game_Ads/restructured_videos.csv"

# Define base directory for downloaded videos
BASE_DOWNLOAD_DIR = "/content/drive/My Drive/Youtube_Game_Ads/Videos"
os.makedirs(BASE_DOWNLOAD_DIR, exist_ok=True)

# Define game categories and corresponding folders
GAME_FOLDERS = {
    "Hero_Wars": "Hero_Wars",
    "Hustle_Castle": "Hustle_Castle",
    "Last_Fortress": "Last_Fortress",
    "Project_Makeover": "Project_Makeover",
    "Puzzles_and_Survival": "Puzzles_and_Survival"
}

for folder in GAME_FOLDERS.values():
    os.makedirs(os.path.join(BASE_DOWNLOAD_DIR, folder), exist_ok=True)

# Read CSV and collect video URLs into a DataFrame
df = pd.read_csv(CSV_FILE_PATH)

# Number of rows to process (one row holds at most one URL per game column)
NUM_ROWS = min(50, len(df))

# URLs for which yt-dlp returned a non-zero exit code
failed_downloads = []

# Process videos row by row
for i in range(NUM_ROWS):
    print(f"\nProcessing row {i+1}/{NUM_ROWS}...")

    for category, folder in GAME_FOLDERS.items():
        if category not in df.columns:
            continue

        # Check for missing values BEFORE converting to str: padded cells are
        # read back as NaN, and str(NaN) is the non-null string "nan".
        cell = df.at[i, category]
        if pd.isna(cell):
            continue

        url = str(cell).strip()
        if not url.startswith("https://"):
            continue

        folder_path = os.path.join(BASE_DOWNLOAD_DIR, folder)
        print(f"Downloading {url} into {folder_path}...")
        command = [
            "yt-dlp",
            # Prefer a pre-merged MP4 (the frame-extraction step globs *.mp4);
            # fall back to the best pre-merged format if no MP4 is offered.
            "-f", "best[ext=mp4]/best",
            "-o", f"{folder_path}/%(title)s.%(ext)s",
            url]
        completed = subprocess.run(command)

        if completed.returncode != 0:
            failed_downloads.append(url)
            print(f"Download failed (exit code {completed.returncode}): {url}")

        # Random delay to prevent blocking
        sleep_time = random.randint(7, 15)
        print(f"Sleeping for {sleep_time} seconds before next download...")
        time.sleep(sleep_time)

# Only claim success when every yt-dlp invocation actually succeeded
if failed_downloads:
    print(f"\nFinished with {len(failed_downloads)} failed download(s):")
    for failed_url in failed_downloads:
        print(f"  {failed_url}")
else:
    print("\nAll selected ads downloaded successfully!")


Processing row 1/50...
Downloading https://www.youtube.com/watch?v=7LFOrJxZzK8 into /content/drive/My Drive/Youtube_Game_Ads/Videos/Hero_Wars...
Sleeping for 12 seconds before next download...
Downloading https://www.youtube.com/watch?v=LTcgs1ByqCU into /content/drive/My Drive/Youtube_Game_Ads/Videos/Hustle_Castle...
Sleeping for 15 seconds before next download...
Downloading https://www.youtube.com/watch?v=ugwCBB--k1o into /content/drive/My Drive/Youtube_Game_Ads/Videos/Last_Fortress...
Sleeping for 10 seconds before next download...
Downloading https://www.youtube.com/watch?v=B8qmb3Awl2o into /content/drive/My Drive/Youtube_Game_Ads/Videos/Project_Makeover...
Sleeping for 15 seconds before next download...
Downloading https://www.youtube.com/watch?v=ZF7q_M_4XoI into /content/drive/My Drive/Youtube_Game_Ads/Videos/Puzzles_and_Survival...
Sleeping for 10 seconds before next download...

Processing row 2/50...
Downloading https://www.youtube.com/watch?v=IkyCFFbGgOg into /content/drive/

# Extract 3 Frames from each Video

In [5]:
# Set directories
BASE_VIDEO_DIR = "/content/drive/My Drive/Youtube_Game_Ads/Videos"
OUTPUT_BASE_DIR = "/content/drive/My Drive/Youtube_Game_Ads/Extracted_Frames"
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

# Ensure extracted frames folders exist for each game
# (GAME_FOLDERS is defined by the download cell, which must have been run first)
for folder in GAME_FOLDERS.values():
    os.makedirs(os.path.join(OUTPUT_BASE_DIR, folder), exist_ok=True)

# Process videos for each game category
for game, folder in GAME_FOLDERS.items():
    video_dir = os.path.join(BASE_VIDEO_DIR, folder)
    output_folder = os.path.join(OUTPUT_BASE_DIR, folder)

    # Get all MP4 files in the directory
    video_files = glob.glob(os.path.join(video_dir, "*.mp4"))

    for video_path in video_files:
        # splitext strips only the extension; splitting on the first "." would
        # truncate titles that contain a dot and make different videos collide.
        video_name = os.path.splitext(os.path.basename(video_path))[0]

        cap = cv2.VideoCapture(video_path)
        saved_frames = 0
        try:
            if not cap.isOpened():
                print(f"Could not open {video_path}; skipping.")
                continue

            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                print(f"No frames reported for {video_path}; skipping.")
                continue

            # Define frame positions (10%, 50%, 90% of total video length)
            start_frame = int(frame_count * 0.10)
            mid_frame = int(frame_count * 0.50)
            end_frame = int(frame_count * 0.90)

            frame_positions = [start_frame, mid_frame, end_frame]

            for frame_no in frame_positions:
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
                ret, frame = cap.read()
                if ret:
                    frame_filename = os.path.join(output_folder, f"{video_name}_frame_{frame_no}.jpg")
                    # imwrite returns False when the image could not be written
                    if cv2.imwrite(frame_filename, frame):
                        saved_frames += 1
        finally:
            # Always free the decoder, including on the skip paths above
            cap.release()

        # Report the number actually written, which can be fewer than 3
        print(f"Extracted {saved_frames} frames from {video_name} and saved to {output_folder}")

print("All frames have been extracted and saved in their respective folders.")

Extracted 3 frames from Hero Wars Mighty Party Latest Mobile Game Ads '241' Win LOVE HELP 16+ and saved to /content/drive/My Drive/Youtube_Game_Ads/Extracted_Frames/Hero_Wars
Extracted 3 frames from Hero Wars mobile game ads '450' Shark attacking raft sleeping hero and saved to /content/drive/My Drive/Youtube_Game_Ads/Extracted_Frames/Hero_Wars
Extracted 3 frames from Mighty Party, Hero Wars, Hero Rescue mobile games ads collection #53 Get treasure chest and saved to /content/drive/My Drive/Youtube_Game_Ads/Extracted_Frames/Hero_Wars
Extracted 3 frames from Hero Wars Mighty Party Latest Mobile Game Ads '238' Bell LOVE HELP 16+ and saved to /content/drive/My Drive/Youtube_Game_Ads/Extracted_Frames/Hero_Wars
Extracted 3 frames from Hero Wars mobile game ads '394' Hero Transformation in Prison Break attempt and saved to /content/drive/My Drive/Youtube_Game_Ads/Extracted_Frames/Hero_Wars
Extracted 3 frames from Mighty Party Hero Rescue Hero Wars mobile games ads '128' HELP! and saved to /c

# Extract Dominant Colors

In [16]:
# Locations of the extracted frames and of the two color-analysis outputs,
# all under the shared project folder on Google Drive
_ADS_ROOT = "/content/drive/My Drive/Youtube_Game_Ads"
FRAME_DIR = f"{_ADS_ROOT}/Extracted_Frames"
AD_COLOR_CSV = f"{_ADS_ROOT}/Ad_Color_Analysis.csv"
GAME_COLOR_CSV = f"{_ADS_ROOT}/Game_Color_Analysis.csv"

# Container for game-level colors (initially empty)
game_colors = dict()

# Function to extract dominant colors
def get_dominant_colors(image_path, k=5):
    """Return the k dominant colors of an image as integer RGB rows.

    The image is converted from BGR to RGB, downscaled to 100x100, flattened
    to a list of pixels, and clustered with MiniBatchKMeans (fixed
    random_state, so results are repeatable for the same input).

    Args:
        image_path: Path of the image file to analyse.
        k: Number of clusters (dominant colors) to extract.

    Returns:
        The k cluster centers, cast to int (fractional parts are truncated),
        one [r, g, b] row per cluster.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; without
    # this check the failure shows up as an opaque error inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (100, 100))  # Resize for faster processing
    img = img.reshape((-1, 3))  # Reshape to list of pixels
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=1000, n_init=10)
    kmeans.fit(img)
    return kmeans.cluster_centers_.astype(int)  # Get dominant colors as RGB values

# Worker that summarises one ad; takes a single argument so it can be used with Pool.map
def process_ad(ad_data):
    """Compute a five-color palette for one ad from its extracted frames.

    Args:
        ad_data: A (ad_name, frames) pair, where frames is an iterable of
            frame image paths belonging to the ad.

    Returns:
        A list whose first element is the ad name, followed by five
        "#rrggbb" hex strings.
    """
    ad_name, frames = ad_data

    # Pool the per-frame palettes into one flat list of [r, g, b] rows
    pooled_colors = [
        rgb
        for frame_path in frames
        for rgb in get_dominant_colors(frame_path).tolist()
    ]

    # Re-cluster the pooled palette down to five colors for the whole ad
    ad_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42, batch_size=1000, n_init=10)
    ad_kmeans.fit(pooled_colors)
    centers = ad_kmeans.cluster_centers_.astype(int)

    summary = [ad_name]
    for center in centers:
        summary.append("#{:02x}{:02x}{:02x}".format(*center))
    return summary

# Maps game folder name -> {ad name -> list of frame paths}
game_ads = {}

# Process each game folder
for game_folder in os.listdir(FRAME_DIR):
    game_path = os.path.join(FRAME_DIR, game_folder)

    if os.path.isdir(game_path):
        game_ads[game_folder] = {}

        # Group frames by ad. Frame files are named "<ad name>_frame_<n>.jpg",
        # so dropping the last two "_"-separated parts recovers the ad name.
        for frame_path in glob.glob(os.path.join(game_path, "*.jpg")):
            ad_name = "_".join(os.path.basename(frame_path).split("_")[:-2])
            game_ads[game_folder].setdefault(ad_name, []).append(frame_path)

all_ad_colors = []
game_color_data = []

# One worker pool is shared by all games instead of creating a new pool per game
with Pool(processes=4) as pool:
    for game, ads in game_ads.items():
        # A game folder can exist with no frames in it; clustering an empty
        # color list raises, so skip such games explicitly.
        if not ads:
            print(f"Skipping {game}: no extracted frames found.")
            continue

        # Per-ad palettes, computed in parallel; insert the game name after the ad name
        for ad in pool.map(process_ad, list(ads.items())):
            all_ad_colors.append([ad[0], game] + ad[1:])

        # Per-game palette: gather the dominant colors of every frame of every
        # ad, then cluster the pooled colors down to five.
        frame_paths = [frame for frames in ads.values() for frame in frames]
        all_colors = []
        for colors in pool.map(get_dominant_colors, frame_paths):
            all_colors.extend(colors.tolist())

        # Cluster colors across all ads for the game
        kmeans_game = MiniBatchKMeans(n_clusters=5, random_state=42, batch_size=1000, n_init=10)
        kmeans_game.fit(all_colors)
        dominant_colors = kmeans_game.cluster_centers_.astype(int)
        color_hex = ["#{:02x}{:02x}{:02x}".format(*color) for color in dominant_colors]
        game_color_data.append([game] + color_hex)

# Save per-ad colors to CSV
ad_df = pd.DataFrame(all_ad_colors, columns=["Ad Name", "Game", "Color 1", "Color 2", "Color 3", "Color 4", "Color 5"])
ad_df.to_csv(AD_COLOR_CSV, index=False)

# Save per-game colors to CSV
game_df = pd.DataFrame(game_color_data, columns=["Game Name", "Color 1", "Color 2", "Color 3", "Color 4", "Color 5"])
game_df.to_csv(GAME_COLOR_CSV, index=False)

print(f"Dominant colors per ad saved in: {AD_COLOR_CSV}")
print(f"Aggregated colors per game saved in: {GAME_COLOR_CSV}")

Dominant colors per ad saved in: /content/drive/My Drive/Youtube_Game_Ads/Ad_Color_Analysis.csv
Aggregated colors per game saved in: /content/drive/My Drive/Youtube_Game_Ads/Game_Color_Analysis.csv


# Convert HEX Colors to Closest Color Names

In [30]:
import pandas as pd
import webcolors

# Load CSS3 color names
css3_color_names = webcolors.names("css3")

# Resolve every CSS3 name to its RGB triple once, up front; closest_color is
# applied to every cell of five columns, so this must not be redone per call.
_CSS3_NAME_RGB = [
    (name, tuple(webcolors.hex_to_rgb(webcolors.name_to_hex(name))))
    for name in css3_color_names
]

# Define function to get the closest color name
def closest_color(hex_code):
    """Return the CSS3 color name for a hex code, or the nearest one.

    An exact CSS3 match is returned when one exists. Otherwise the name with
    the smallest squared Euclidean distance in RGB space is returned; on a
    tie, the name that comes first in css3_color_names wins.

    Args:
        hex_code: Color as a hex string such as "#1a2b3c".

    Returns:
        The CSS3 color name, or None if no CSS3 names are available.

    Raises:
        ValueError: If hex_code is not a valid hex color (raised by webcolors).
    """
    try:
        return webcolors.hex_to_name(hex_code, spec="css3")
    except ValueError:
        # No exact name (or invalid input, in which case hex_to_rgb re-raises)
        r, g, b = webcolors.hex_to_rgb(hex_code)

        def squared_distance(entry):
            _, (r_c, g_c, b_c) = entry
            return (r - r_c)**2 + (g - g_c)**2 + (b - b_c)**2

        # min() returns the first minimal entry, matching a strict "<" scan
        closest_name, _ = min(_CSS3_NAME_RGB, key=squared_distance, default=(None, None))
        return closest_name

# Add a "Color N Name" column for each hex column, first for the per-ad CSV
# and then for the per-game CSV; each result is written to a *_Named.csv file.
csv_pairs = (
    (
        "/content/drive/My Drive/Youtube_Game_Ads/Ad_Color_Analysis.csv",
        "/content/drive/My Drive/Youtube_Game_Ads/Ad_Color_Analysis_Named.csv",
    ),
    (
        "/content/drive/My Drive/Youtube_Game_Ads/Game_Color_Analysis.csv",
        "/content/drive/My Drive/Youtube_Game_Ads/Game_Color_Analysis_Named.csv",
    ),
)

for input_csv_path, output_csv_path in csv_pairs:
    df = pd.read_csv(input_csv_path)
    i = 1
    while i <= 5:
        hex_column = f"Color {i}"
        df[f"Color {i} Name"] = df[hex_column].apply(closest_color)
        i += 1
    df.to_csv(output_csv_path, index=False)

# Reports the path written last (the per-game file)
print(f"Converted colors to English names and saved to: {output_csv_path}")

✅ Converted colors to English names and saved to: /content/drive/My Drive/Youtube_Game_Ads/Game_Color_Analysis_Named.csv
