In [None]:
    # 02_feature_engineering.ipynb
    # Feature Engineering for US YouTube Trending Videos

    from pathlib import Path

    import numpy as np
    import pandas as pd

    # Reaching this line means the imports earlier in this cell ran without error.
    startup_message = "Feature engineering notebook ready."
    print(startup_message)
    

In [None]:
    # Read the raw trending-videos CSV into the working frame `df`.
    raw_csv_path = "../data/raw/USvideos.csv"
    df = pd.read_csv(raw_csv_path)

    # Engagement ratios are rebuilt here so the notebook can run standalone.
    # A tiny constant is added to the view count so that a row with zero views
    # does not cause a division by zero.
    views_plus_eps = df["views"] + 1e-6
    df["like_view_ratio"] = df["likes"] / views_plus_eps
    df["comment_view_ratio"] = df["comment_count"] / views_plus_eps

    df.head()
    

In [None]:
    # Turn publish_time into real timestamps (unparseable values become NaT),
    # then derive the calendar date and the hour of day from the parsed series.
    publish_ts = pd.to_datetime(df["publish_time"], errors="coerce")
    df["publish_time"] = publish_ts
    df["publish_date"] = publish_ts.dt.date
    df["publish_hour"] = publish_ts.dt.hour

    df.loc[:, ["publish_time", "publish_date", "publish_hour"]].head()
    

In [None]:
    # Fix trending_date format: original is 'YY.DD.MM', e.g. '17.14.11' meaning 2017-11-14
    def fix_trending_date(x: "str | None") -> "str | None":
        """Rewrite a 'YY.DD.MM' trending date as an ISO-style 'YYYY-MM-DD' string.

        Parameters
        ----------
        x : str or missing value
            Raw trending date such as '17.14.11'. Surrounding whitespace is
            ignored. None / NaN are accepted and passed through as None.

        Returns
        -------
        str or None
            '20YY-MM-DD' for a well-formed input, or None when the input is
            missing, so that a later pd.to_datetime(..., errors="coerce")
            yields NaT for that row instead of the whole apply() failing.

        Raises
        ------
        ValueError
            If the string does not consist of exactly three dot-separated parts.
        """
        # A single missing value would otherwise raise AttributeError on
        # .split() and abort the conversion of the entire column.
        if pd.isna(x):
            return None

        parts = x.strip().split(".")
        if len(parts) != 3:
            raise ValueError(f"Expected trending_date as 'YY.DD.MM', got {x!r}")

        # Note the source order: year, then DAY, then month.
        yy, dd, mm = parts
        return f"20{yy}-{mm}-{dd}"

    # Rewrite each raw trending_date with fix_trending_date, then parse the
    # result into datetimes in place of the original column; anything that
    # cannot be parsed becomes NaT because of errors="coerce".
    iso_trending_dates = df["trending_date"].apply(fix_trending_date)
    df = df.assign(trending_date=pd.to_datetime(iso_trending_dates, errors="coerce"))

    df.loc[:, ["trending_date"]].head()
    

In [None]:
    # Order rows by video, then by trending date within each video, so that
    # each row is immediately followed by that video's next trending entry.
    sort_keys = ["video_id", "trending_date"]
    df = df.sort_values(sort_keys)
    df.loc[:, ["video_id", "trending_date", "views"]].head(10)
    

In [None]:
    # Compute next-day view count per video.
    #
    # shift(-1) inside each video_id group returns the *following row* of that
    # video, however many days later it is dated. To keep "next day" literal,
    # the shifted value is kept only when the following row's trending_date is
    # exactly one day after the current row's; otherwise views_next_day is NaN.
    # Without this check, same-day duplicate rows and multi-day gaps would be
    # treated as one-day growth and distort view_growth / growth_rate.
    #
    # Relies on df being sorted by video_id and trending_date, and on
    # trending_date already being a datetime column.
    by_video = df.groupby("video_id")
    following_views = by_video["views"].shift(-1)
    following_date = by_video["trending_date"].shift(-1)

    # Comparisons involving NaT are False, so the last row of each video
    # (no following row) is masked out as well.
    is_next_calendar_day = (following_date - df["trending_date"]) == pd.Timedelta(days=1)
    df["views_next_day"] = following_views.where(is_next_calendar_day)

    df[["video_id", "trending_date", "views", "views_next_day"]].head(10)
    

In [None]:
    # Absolute growth is views_next_day minus views; relative growth divides
    # that difference by views (plus a tiny constant to avoid dividing by zero).
    current_views = df["views"]
    df["view_growth"] = df["views_next_day"].sub(current_views)
    df["growth_rate"] = df["view_growth"].div(current_views + 1e-6)

    df.loc[:, ["views", "views_next_day", "view_growth", "growth_rate"]].head(10)
    

In [None]:
    # Keep only rows with a defined growth_rate (rows lacking a views_next_day
    # value have none), working on an independent copy of the frame.
    has_growth_rate = df["growth_rate"].notna()
    df_valid = df[has_growth_rate].copy()

    # Binary target: 1 when growth_rate is at or above its 75th percentile
    # (i.e. roughly the top quarter of rows), else 0.
    threshold = df_valid["growth_rate"].quantile(0.75)
    df_valid["high_growth"] = df_valid["growth_rate"].ge(threshold).astype(int)

    print("High growth threshold (75th percentile):", threshold)
    df_valid["high_growth"].value_counts()
    

In [None]:
    # Columns written to the processed dataset, grouped by role. The
    # concatenation order below is the column order of the output file.
    key_cols = ["video_id", "trending_date"]
    publish_cols = ["publish_date", "publish_hour"]
    count_cols = ["views", "likes", "dislikes", "comment_count"]
    ratio_cols = ["like_view_ratio", "comment_view_ratio"]
    # NOTE(review): these three are derived from views_next_day (high_growth is
    # thresholded growth_rate). Presumably they are kept as targets rather than
    # model inputs — confirm in the modeling notebook to avoid label leakage.
    growth_cols = ["view_growth", "growth_rate", "high_growth"]
    category_cols = ["category_id"]

    feature_cols = (
        key_cols + publish_cols + count_cols + ratio_cols + growth_cols + category_cols
    )

    features = df_valid.loc[:, feature_cols].copy()
    features.head()
    

In [None]:
    # Save processed features.
    # DataFrame.to_csv does not create missing parent directories, so the
    # output folder is created first (no-op if it already exists); otherwise
    # this cell fails on a fresh checkout without ../data/processed.
    features_path = Path("../data/processed/features.csv")
    features_path.parent.mkdir(parents=True, exist_ok=True)

    features.to_csv(features_path, index=False)
    # as_posix() keeps the printed path in forward-slash form on every OS.
    print(f"Saved processed features to {features_path.as_posix()}")
    