## YouTube features

This notebook reads a CSV file containing a list of YouTube video links (must have a column named "pre_contest_url").


Extracts metadata for each video using yt_dlp, including:
- Video title
- Channel name
- Upload date
- Total views, Likes, Comments
- Days since upload
- Views-per-day ratio

It also extracts and analyzes the YouTube heatmap (if available), which represents the “most replayed” segments of the video.


Computes several features from the heatmap:
- 📊 Average replay score (average_value)
- 📈 Skewness of the replay distribution (skewness_value)
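- 🔀 Smoothness of the curve, measured by the mean of absolute first differences (mean_abs_difference) and the variance of first differences (variance_difference)
- 📍 Time position of the peak (most replayed moment), expressed as a percentage of total video length (primary_peak_percentage)
- 🧠 Count of replay peaks with a prominence of at least 0.1; the global maximum is always counted (number_of_maxima)
- 🕒 Time locations of all detected peaks, as percentages of total video length, reported only when more than one peak is found (additional_peak_percentages)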


In [None]:
pip install yt_dlp

In [None]:
import pandas as pd
import yt_dlp
from datetime import datetime
import numpy as np
from scipy.stats import skew
from scipy.signal import find_peaks

# === INPUT/OUTPUT SETTINGS ===
input_csv = "input_file.csv"  # Path of the CSV listing the videos to analyse
output_csv = "output_file.csv"  # Path the per-video feature table is written to

# === LOAD INPUT ===
df = pd.read_csv(input_csv)

# Stop early if the URL column is missing, before any video is processed
missing_url_column = "pre_contest_url" not in df.columns
if missing_url_column:
    raise ValueError("The input CSV must contain a column named 'pre_contest_url'.")

# === FUNCTIONS ===

# Extract video info using yt_dlp
def get_video_info(link):
    """Fetch metadata and the raw heatmap for a single video.

    Args:
        link: URL of the video, passed unchanged to yt_dlp.

    Returns:
        dict with the keys "Video Title", "Channel Name", "Upload Date",
        "Total Views", "Likes", "Comments", "Days Since Upload",
        "Views per Day Ratio", "Link" and "Heatmap". Fields that yt_dlp
        does not provide are reported as 'Unknown' (0 for the view count).
        If extraction fails for any reason, every metadata field is the
        string "Error", "Link" is preserved and "Heatmap" is None, so one
        bad URL never aborts a batch run.
    """
    def or_default(value, default):
        # info.get(key, default) only covers a *missing* key; a key that is
        # present but set to None must fall back to the same default.
        return default if value is None else value

    try:
        with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
            info = ydl.extract_info(link, download=False)

        video_title = or_default(info.get('title'), 'Unknown')
        channel_name = or_default(info.get('uploader'), 'Unknown')
        upload_date_str = or_default(info.get('upload_date'), 'Unknown')
        total_views = or_default(info.get('view_count'), 0)
        # Compare against None explicitly so a genuine count of 0 is kept
        like_count = or_default(info.get('like_count'), 'Unknown')
        comment_count = or_default(info.get('comment_count'), 'Unknown')

        # Upload date and days since upload
        if upload_date_str != 'Unknown':
            upload_date = datetime.strptime(upload_date_str, '%Y%m%d')
            days_since_upload = (datetime.now() - upload_date).days
            upload_date_text = upload_date.strftime('%Y-%m-%d')
        else:
            days_since_upload = "Unknown"
            upload_date_text = "Unknown"

        # Views per day; without a positive age the raw view count is used
        if days_since_upload != "Unknown" and days_since_upload > 0:
            views_per_day = total_views / days_since_upload
        else:
            views_per_day = total_views

        # Only a true ratio (float) is rounded; the raw count is left as-is
        if isinstance(views_per_day, float):
            views_per_day = round(views_per_day, 2)

        return {
            "Video Title": video_title,
            "Channel Name": channel_name,
            "Upload Date": upload_date_text,
            "Total Views": total_views,
            "Likes": like_count,
            "Comments": comment_count,
            "Days Since Upload": days_since_upload,
            "Views per Day Ratio": views_per_day,
            "Link": link,
            "Heatmap": info.get('heatmap', None),  # Raw heatmap, if available
        }

    except Exception as e:
        # Deliberate best-effort boundary: log the failure and emit a
        # placeholder row so the remaining videos are still processed.
        print(f"Error processing {link}: {e}")
        return {
            "Video Title": "Error",
            "Channel Name": "Error",
            "Upload Date": "Error",
            "Total Views": "Error",
            "Likes": "Error",
            "Comments": "Error",
            "Days Since Upload": "Error",
            "Views per Day Ratio": "Error",
            "Link": link,
            "Heatmap": None
        }

# Analyze heatmap
def extract_heatmap_features(heatmap, prominence=0.1):
    """Summarise a heatmap into scalar replay features.

    Args:
        heatmap: Sequence of dicts, each holding 'start_time', 'end_time'
            and 'value', or None / empty when no heatmap is available.
        prominence: Minimum prominence a local maximum needs in order to be
            counted as a peak (forwarded to scipy.signal.find_peaks).

    Returns:
        dict with the keys "average_value", "skewness_value",
        "mean_abs_difference", "variance_difference",
        "primary_peak_percentage", "number_of_maxima" and
        "additional_peak_percentages". Every value is NaN when there is no
        heatmap. "additional_peak_percentages" lists the positions of all
        counted peaks in chronological order when more than one peak is
        found, and is an empty list otherwise.
    """
    if heatmap is None or len(heatmap) == 0:
        # If no heatmap available, return NaN
        return {
            "average_value": np.nan,
            "skewness_value": np.nan,
            "mean_abs_difference": np.nan,
            "variance_difference": np.nan,
            "primary_peak_percentage": np.nan,
            "number_of_maxima": np.nan,
            "additional_peak_percentages": np.nan
        }

    values = np.array([point['value'] for point in heatmap])
    start_times = np.array([point['start_time'] for point in heatmap])
    end_times = np.array([point['end_time'] for point in heatmap])
    mid_times = (start_times + end_times) / 2

    avg_value = np.mean(values)
    skewness_value = skew(values)

    difference = np.diff(values)
    if difference.size:
        mean_abs_difference = np.mean(np.abs(difference))
        variance_difference = np.var(difference)
    else:
        # A single sample has no first differences; report NaN directly
        # instead of reducing an empty array.
        mean_abs_difference = np.nan
        variance_difference = np.nan

    # The end of the last segment is used as the total video length
    total_duration = end_times[-1]
    primary_index = int(np.argmax(values))  # first occurrence of the maximum
    primary_peak_time_percentage = (mid_times[primary_index] / total_duration) * 100

    # plateau_size=1 keeps every peak but also reports each peak's flat-top
    # edges, which are needed to recognise a plateau-shaped global maximum.
    peaks, properties = find_peaks(values, prominence=prominence, plateau_size=1)

    # Always include the global peak. For a flat-topped maximum find_peaks
    # reports the plateau midpoint while primary_index is its left edge, so
    # compare against the plateau extent to avoid counting that peak twice.
    already_detected = np.any(
        (properties["left_edges"] <= primary_index)
        & (primary_index <= properties["right_edges"])
    )
    if not already_detected:
        peaks = np.append(peaks, primary_index)

    # Keep peaks chronological even when the global peak was appended
    peaks = np.sort(peaks)
    num_maxima = len(peaks)

    if num_maxima > 1:
        additional_peak_times = ((mid_times[peaks] / total_duration) * 100).tolist()
    else:
        additional_peak_times = []

    return {
        "average_value": avg_value,
        "skewness_value": skewness_value,
        "mean_abs_difference": mean_abs_difference,
        "variance_difference": variance_difference,
        "primary_peak_percentage": primary_peak_time_percentage,
        "number_of_maxima": num_maxima,
        "additional_peak_percentages": additional_peak_times
    }

# === MAIN EXECUTION ===

# Accumulates one flat record (metadata + heatmap features) per input row
video_data = []

# Visit the input rows in file order
for index, row in df.iterrows():
    link = row["pre_contest_url"]
    video_info = get_video_info(link)
    # Take the raw heatmap out of the metadata and reduce it to features
    raw_heatmap = video_info.pop("Heatmap")
    heatmap_features = extract_heatmap_features(raw_heatmap)
    # Metadata first, then heatmap features
    combined_data = dict(video_info)
    combined_data.update(heatmap_features)
    video_data.append(combined_data)
    print(f"Processed: {link}")

# Write every record to the output CSV, leaving out the DataFrame index
result_df = pd.DataFrame(video_data)
result_df.to_csv(output_csv, index=False)

print(f"✅ All results saved to {output_csv}")