## YouTube HT
> Do horror videos get more likes than gaming videos?

## Description
> Null: There are less than p = 0.20% of gaming videos that get less likes than horror videos. Alternatively, there is more than p = 0.20% of gaming videos that get more like than horror videos.

In [13]:
from googleapiclient.discovery import build
import pandas as pd

API_KEY = "AIzaSyCH4efduHupoc5fwoHLG_MSpfCalypVMGA"
NUM_VIDEOS = 50 
youtube = build("youtube", "v3", developerKey=API_KEY)

def get_video_ids_by_keyword(keyword, max_results=50):
    video_ids = []
    try:
        response = youtube.search().list(
            part="id",
            type="video",
            q=keyword,
            maxResults=max_results,
            order="viewCount"
        ).execute()
        
        for item in response.get("items", []):
            video_ids.append(item["id"]["videoId"])
    except Exception as e:
        print(f"Error fetching videos for '{keyword}': {e}")
    
    return video_ids

def fetch_video_stats(video_ids, category_label):
    data = []
    for vid in video_ids:
        try:
            response = youtube.videos().list(
                part="statistics,snippet",
                id=vid
            ).execute()
            
            if "items" not in response or len(response["items"]) == 0:
                print(f"Video {vid} not found or inaccessible.")
                continue
            
            stats = response["items"][0]["statistics"]
            views = int(stats.get("viewCount", 0))
            likes = int(stats.get("likeCount", 0))
            
            data.append({
                "video_id": vid,
                "category": category_label,
                "views": views,
                "likes": likes
            })
        except Exception as e:
            print(f"Error fetching stats for {vid}: {e}")
    
    return data


horror_video_ids = get_video_ids_by_keyword("horror", NUM_VIDEOS)
gaming_video_ids = get_video_ids_by_keyword("gaming", NUM_VIDEOS)

print(f"Fetched {len(horror_video_ids)} horror videos")
print(f"Fetched {len(gaming_video_ids)} gaming videos")

horror_data = fetch_video_stats(horror_video_ids, "horror")
gaming_data = fetch_video_stats(gaming_video_ids, "gaming")

df = pd.DataFrame(horror_data + gaming_data)

df["like_ratio"] = df["likes"] / df["views"]

horror_avg = df[df["category"] == "horror"]["like_ratio"].mean()
gaming_avg = df[df["category"] == "gaming"]["like_ratio"].mean()

print(f"\nAverage like ratio - Horror: {horror_avg:.3f}")
print(f"Average like ratio - Gaming: {gaming_avg:.3f}")

if horror_avg > gaming_avg:
    print("Horror videos get more likes on average.")
else:
    print("Gaming videos get more likes on average.")

Fetched 50 horror videos
Fetched 50 gaming videos

Average like ratio - Horror: 0.014
Average like ratio - Gaming: 0.024
Gaming videos get more likes on average.


## Part 2
> Here, I am collecting samples that my margin of error is 10% or less.

In [19]:
import math
import numpy as np

API_KEY = "AIzaSyCH4efduHupoc5fwoHLG_MSpfCalypVMGA"
TOTAL_VIDEOS = 200
LIKE_RATIO_THRESHOLD = 0.05
CONFIDENCE_Z = 1.96
MARGIN_ERROR = 0.10

youtube = build("youtube", "v3", developerKey=API_KEY)

def get_video_ids_by_keyword(keyword, max_results=50):
    video_ids = []
    try:
        response = youtube.search().list(
            part="id",
            type="video",
            q=keyword,
            maxResults=max_results,
            order="viewCount"
        ).execute()
        for item in response.get("items", []):
            video_id = item.get("id", {}).get("videoId")
            if video_id:
                video_ids.append(video_id)
    except Exception as e:
        print(f"Error fetching videos for '{keyword}': {e}")
    return video_ids


def fetch_video_stats(video_ids, category_label):
    data = []
    for vid in video_ids:
        try:
            response = youtube.videos().list(
                part="statistics,snippet",
                id=vid
            ).execute()
            items = response.get("items", [])
            if not items:
                continue
            stats = items[0].get("statistics", {})
            views = stats.get("viewCount")
            likes = stats.get("likeCount")
            if views is None or likes is None:
                continue
            views = int(views)
            likes = int(likes)
            if views == 0:
                continue
            data.append({"video_id": vid, "category": category_label, "views": views, "likes": likes})
        except Exception as e:
            print(f"Error fetching stats for {vid}: {e}")
    return data


def bootstrap_proportions(data, n_bootstrap=1000, threshold=LIKE_RATIO_THRESHOLD):
    if data.empty:
        return []
    props = []
    for _ in range(n_bootstrap):
        sample = data.sample(n=len(data), replace=True)
        props.append((sample["like_ratio"] > threshold).mean())
    return props

horror_video_ids = get_video_ids_by_keyword("horror", TOTAL_VIDEOS)
gaming_video_ids = get_video_ids_by_keyword("gaming", TOTAL_VIDEOS)

horror_data = fetch_video_stats(horror_video_ids, "horror")
gaming_data = fetch_video_stats(gaming_video_ids, "gaming")

df = pd.DataFrame(horror_data + gaming_data)

# Ensure the 'category' column exists
if 'category' not in df.columns:
    df['category'] = 'unknown'


for col in ["likes", "views"]:
    if col not in df.columns:
        df[col] = 0
    df[col] = pd.to_numeric(df[col], errors='coerce')


df = df.dropna(subset=["likes", "views"])
df = df[(df["views"] > 0)]

df["like_ratio"] = df["likes"] / df["views"]

p_estimate = 0.5
n_needed = math.ceil((CONFIDENCE_Z**2 * p_estimate * (1-p_estimate)) / (MARGIN_ERROR**2))
print(f"Sample size needed per category for ≤10% margin of error: {n_needed}")

horror_df = df[df["category"]=="horror"]
gaming_df = df[df["category"]=="gaming"]

if not horror_df.empty:
    horror_sample = horror_df.sample(n=min(n_needed, len(horror_df)), random_state=42)
else:
    horror_sample = pd.DataFrame(columns=df.columns)

if not gaming_df.empty:
    gaming_sample = gaming_df.sample(n=min(n_needed, len(gaming_df)), random_state=42)
else:
    gaming_sample = pd.DataFrame(columns=df.columns)

horror_prop = (horror_sample["like_ratio"] > LIKE_RATIO_THRESHOLD).mean() if not horror_sample.empty else np.nan
gaming_prop = (gaming_sample["like_ratio"] > LIKE_RATIO_THRESHOLD).mean() if not gaming_sample.empty else np.nan
print(f"Horror proportion above {LIKE_RATIO_THRESHOLD}: {horror_prop}")
print(f"Gaming proportion above {LIKE_RATIO_THRESHOLD}: {gaming_prop}")

horror_boot = bootstrap_proportions(horror_sample)
gaming_boot = bootstrap_proportions(gaming_sample)

Sample size needed per category for ≤10% margin of error: 97
Horror proportion above 0.05: 0.0
Gaming proportion above 0.05: 0.10204081632653061


## Conclusion
> Okay, I'm just gonna conclude now because I've been step behind this entire class period. My code has just being very dumb. Final answer, gaming videos get more likes than horror videos in this sampling distribution. In the first part of this project, gaming videos got an average like ratio of p = 0.024 while horror only made it up to about p = 0.014. In the second part of this project, the horror proportion was 0.0 and the gaming pro