# Feature Engineering: YouTube Trending + Google Trends

This notebook builds the modeling dataset used in `03_modeling...ipynb`.

**Outputs**
- `data/processed/features.csv`
- `data/processed/features_with_trends.csv`

The pipeline is designed to be reproducible and to avoid target leakage by defining the target using *next-day* views within each `video_id` time series.

In [None]:
# Core libraries
import pandas as pd
import numpy as np

from pathlib import Path

# Stats
from scipy.stats import mannwhitneyu

# Widen the DataFrame repr so wide feature tables are shown without truncation.
for option_name, option_value in {
    "display.max_columns": 200,
    "display.width": 160,
}.items():
    pd.set_option(option_name, option_value)


## 1) Paths and configuration

In [None]:
# Input/output locations, relative to the working directory.
# Edit RAW_DIR / PROCESSED_DIR if your folder layout is different.
RAW_DIR = Path("data/raw")
PROCESSED_DIR = Path("data/processed")

# Inputs
YOUTUBE_CSV = RAW_DIR.joinpath("USvideos.csv")
TRENDS_CSV = RAW_DIR.joinpath("google_trends_category.csv")  # created by 00_fetch_google_trends_final.ipynb

# Outputs
FEATURES_OUT = PROCESSED_DIR.joinpath("features.csv")
FEATURES_TRENDS_OUT = PROCESSED_DIR.joinpath("features_with_trends.csv")

# Create the output folder up front so the save cells cannot fail on a missing directory.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Fail fast if the raw YouTube export is not where we expect it.
assert YOUTUBE_CSV.exists(), f"Missing file: {YOUTUBE_CSV}. Please place USvideos.csv under data/raw/."


## 2) Load and clean YouTube trending data

In [None]:
# Read the raw YouTube trending export from YOUTUBE_CSV; parsing and type cleanup follow below.
df = pd.read_csv(YOUTUBE_CSV)

# --- Parse trending_date (format like '17.14.11' meaning YY.DD.MM in this dataset)
def fix_trending_date(x: str) -> str:
    """Convert a 'YY.DD.MM' trending-date string into 'YYYY-MM-DD' form.

    Parameters
    ----------
    x : str
        Raw value, e.g. '17.14.11' (year 2017, day 14, month 11).

    Returns
    -------
    str
        '20YY-MM-DD' for well-formed input. For anything that is not three
        dot-separated digit groups (e.g. 'nan' produced by ``astype(str)`` on
        a missing cell) an empty string is returned so that a subsequent
        ``pd.to_datetime(..., errors="coerce")`` yields NaT instead of this
        function raising ``ValueError`` while unpacking.
    """
    parts = str(x).strip().split(".")
    if len(parts) != 3 or not all(p.isdigit() for p in parts):
        # Malformed / missing value: let the caller's coercion turn it into NaT.
        return ""
    yy, dd, mm = parts
    return f"20{yy}-{mm}-{dd}"

# Turn the YY.DD.MM strings into timestamps; anything unparseable becomes NaT.
iso_trending = df["trending_date"].astype(str).map(fix_trending_date)
df["trending_date"] = pd.to_datetime(iso_trending, errors="coerce")

# --- publish_time -> publish_date / publish_hour (both derived from the UTC timestamp)
# publish_time example: '2017-11-13T17:13:01.000Z'
publish_ts = pd.to_datetime(df["publish_time"], errors="coerce", utc=True)
df["publish_time"] = publish_ts
df["publish_date"] = publish_ts.dt.date.astype(str)
df["publish_hour"] = publish_ts.dt.hour

# Force the count / id columns to numeric; non-numeric entries become NaN.
numeric_cols = ["views", "likes", "dislikes", "comment_count", "category_id"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Restrict to the columns used for features and drop rows missing any key field.
base_cols = ["video_id", "trending_date", "publish_date", "publish_hour",
             "views", "likes", "dislikes", "comment_count", "category_id"]
required_cols = ["video_id", "trending_date", "views", "category_id"]
df = df.loc[:, base_cols].dropna(subset=required_cols).copy()

# Order each video's rows chronologically for the time-series steps that follow.
df = df.sort_values(by=["video_id", "trending_date"], ignore_index=True)

df.head()


## 3) Feature engineering (ratios, next-day growth, labels)

In [None]:
# Safe ratios: a zero view count is mapped to NaN so the division yields NaN, not inf.
views_nonzero = df["views"].replace(0, np.nan)
df["like_view_ratio"] = df["likes"] / views_nonzero
df["comment_view_ratio"] = df["comment_count"] / views_nonzero

# Following observation within each video_id series.
# Relies on df being sorted by (video_id, trending_date), as done in the cleaning cell.
by_video = df.groupby("video_id")
next_views = by_video["views"].shift(-1)
next_date = by_video["trending_date"].shift(-1)

# The following row is only a valid *next-day* target when it is exactly one calendar
# day later. Videos can drop off the trending list and return, and the raw data can
# contain repeated (video_id, trending_date) rows; in both cases the plain shift would
# measure growth over a multi-day (or zero-day) gap and mislabel it as next-day growth.
is_next_day = (next_date - df["trending_date"]) == pd.Timedelta(days=1)
df["views_next_day"] = next_views.where(is_next_day)

# Absolute and relative growth (target is based on next-day)
df["view_growth"] = df["views_next_day"] - df["views"]
df["growth_rate"] = df["view_growth"] / views_nonzero

# Drop rows where next-day is missing (can't define target)
df_model = df.dropna(subset=["views_next_day", "growth_rate"]).copy()

# Category-normalized high-growth label:
# within each category, label the top quartile (>= 75th percentile) by growth_rate.
HIGH_GROWTH_QUANTILE = 0.75
q75 = df_model.groupby("category_id")["growth_rate"].transform(
    lambda s: s.quantile(HIGH_GROWTH_QUANTILE)
)
df_model["high_growth"] = (df_model["growth_rate"] >= q75).astype(int)

# Keep output columns consistent with downstream notebooks.
# NOTE(review): view_growth and growth_rate are computed from next-day views, i.e. they
# encode the target; confirm the modeling notebook does not use them as predictors.
out_cols = ["video_id", "trending_date", "publish_date", "publish_hour",
            "views", "likes", "dislikes", "comment_count",
            "like_view_ratio", "comment_view_ratio",
            "view_growth", "growth_rate", "high_growth", "category_id"]

features = df_model[out_cols].copy()

features.head(), features.shape


## 4) Save base features

In [None]:
# Write the base feature table to FEATURES_OUT (row index omitted) and report its shape.
features.to_csv(FEATURES_OUT, index=False)
print(f"Saved: {FEATURES_OUT}  shape={features.shape}")


## 5) Load Google Trends (category-level) and merge

In [None]:
# Trends file expected format: date, category_id, keyword, trend_score
assert TRENDS_CSV.exists(), f"Missing file: {TRENDS_CSV}. Run 00_fetch_google_trends_final.ipynb first."

tr = pd.read_csv(TRENDS_CSV)
tr["date"] = pd.to_datetime(tr["date"], errors="coerce")
tr["category_id"] = pd.to_numeric(tr["category_id"], errors="coerce")
tr["trend_score"] = pd.to_numeric(tr["trend_score"], errors="coerce")

# Align the key names with the feature table and drop rows without a usable key.
tr = tr.rename(columns={"date": "trending_date"})[["trending_date", "category_id", "trend_score"]].dropna(subset=["trending_date", "category_id"])
tr = tr.sort_values(["category_id", "trending_date"]).reset_index(drop=True)

# Collapse to ONE row per (category_id, trending_date). The file carries a keyword
# column, so several rows may share the same key; merging them un-aggregated would
# duplicate feature rows. With a single keyword per category the mean is a no-op.
tr_daily = (
    tr.groupby(["category_id", "trending_date"], as_index=False)["trend_score"]
    .mean()
    .sort_values(["category_id", "trending_date"])
    .reset_index(drop=True)
)


def _add_rolling_means(group: pd.DataFrame) -> pd.DataFrame:
    """Return `group` (one category, sorted by date) with 3- and 7-day trailing means.

    The windows are calendar-time based ("3D"/"7D"), so they cover the current day
    plus the preceding 2 / 6 days regardless of how many rows fall inside them.
    """
    scores = group.set_index("trending_date")["trend_score"]
    out = group.copy()
    for days in (3, 7):
        # Positional assignment: `scores` and `out` have identical row order.
        out[f"trend_score_{days}d_mean"] = (
            scores.rolling(f"{days}D", min_periods=1).mean().to_numpy()
        )
    return out


# Rolling means are computed on the per-category DAILY series *before* merging.
# Rolling over the merged feature rows would average over N videos, not N days,
# because many videos share the same (category, day) trend score.
rolled_parts = [_add_rolling_means(g) for _, g in tr_daily.groupby("category_id", sort=False)]
if rolled_parts:
    tr_daily = pd.concat(rolled_parts, ignore_index=True)
else:
    # Empty trends file: keep the expected columns so downstream cells still work.
    tr_daily = tr_daily.assign(trend_score_3d_mean=np.nan, trend_score_7d_mean=np.nan)

# Merge on (trending_date, category_id); many_to_one guarantees no row multiplication.
# Feature rows whose date has no trends entry keep NaN in all three trend columns.
features_tr = features.merge(
    tr_daily,
    on=["trending_date", "category_id"],
    how="left",
    validate="many_to_one",
)
assert len(features_tr) == len(features), "Trends merge changed the number of feature rows."

features_tr = features_tr.sort_values(["category_id", "trending_date"]).reset_index(drop=True)

features_tr.head(), features_tr.shape


## 6) Save features with trends

In [None]:
# Write the trends-enriched feature table to FEATURES_TRENDS_OUT (row index omitted) and report its shape.
features_tr.to_csv(FEATURES_TRENDS_OUT, index=False)
print(f"Saved: {FEATURES_TRENDS_OUT}  shape={features_tr.shape}")


## 7) Statistical hypothesis test

**Question:** Do Google Trends scores differ between high-growth and low-growth videos?

- **H0:** No difference in Trends scores between the two groups.
- **H1:** Trends scores differ between the groups.

We use Mann–Whitney U (non-parametric) since the distributions are typically non-normal.

In [None]:
# 7-day mean trend score for each label group, ignoring rows without a trends value.
x_high = features_tr.loc[features_tr["high_growth"] == 1, "trend_score_7d_mean"].dropna()
x_low  = features_tr.loc[features_tr["high_growth"] == 0, "trend_score_7d_mean"].dropna()

alpha = 0.05

print(f"n_high={len(x_high)}, n_low={len(x_low)}")

if len(x_high) == 0 or len(x_low) == 0:
    # With an empty group the test is undefined: depending on the SciPy version it
    # raises or returns NaN, and a NaN p-value would fall through to the
    # "fail to reject" branch and read like a real result.
    stat, p_value = np.nan, np.nan
    print("Mann–Whitney U test skipped: at least one group has no non-missing trend scores.")
else:
    # NOTE(review): the test assumes independent observations, but every video in the
    # same category on the same day shares one trend score — interpret p with care.
    stat, p_value = mannwhitneyu(x_high, x_low, alternative="two-sided")

    print(f"Mann–Whitney U statistic: {stat:.2f}")
    print(f"p-value: {p_value:.6g}")

    if p_value < alpha:
        print("Decision: Reject H0 (statistically significant difference).")
    else:
        print("Decision: Fail to reject H0 (no statistically significant difference detected).")


## 8) Quick sanity checks

In [None]:
# Label balance: mean of the 0/1 high_growth column, i.e. the fraction of positives.
high_growth_share = features_tr["high_growth"].mean()
print("High-growth rate:", high_growth_share)

# Fraction of rows that found no matching trend score in the merge.
missing_trend_share = features_tr["trend_score"].isna().mean()
print("Missing trend_score:", missing_trend_share)

# Descriptive statistics for the target rate and the trend features (one row per column).
summary_cols = ["growth_rate", "trend_score", "trend_score_3d_mean", "trend_score_7d_mean"]
features_tr.loc[:, summary_cols].describe().transpose()
