In [None]:
# 02_feature_engineering.ipynb
# Feature Engineering for YouTube Trending Videos (US dataset)

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Status message printed once the imports above have executed without raising.
print("Feature Engineering notebook ready.")

In [None]:
# Read the raw US trending-videos CSV (the path is relative to the notebook's
# working directory) and preview the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/raw/USvideos.csv")
df.head(n=5)


In [None]:
# Parse `publish_time` into datetimes and derive the calendar date and the
# hour of day from it. errors='coerce' turns unparseable values into NaT
# instead of raising. Keyword order matters: the later lambdas read the
# already-parsed `publish_time` column.
df = df.assign(
    publish_time=lambda frame: pd.to_datetime(frame['publish_time'], errors='coerce'),
    publish_date=lambda frame: frame['publish_time'].dt.date,
    publish_hour=lambda frame: frame['publish_time'].dt.hour,
)

df.loc[:, ['publish_time', 'publish_date', 'publish_hour']].head()


In [None]:
def fix_trending_date(x):
    """Convert a raw trending date from ``YY.DD.MM`` to ISO ``YYYY-MM-DD`` text.

    Example: ``"17.14.11"`` -> ``"2017-11-14"``.

    Parameters
    ----------
    x : str, date-like, or missing
        Raw value in ``YY.DD.MM`` form; surrounding whitespace is ignored.
        Missing values (``None`` / NaN / NaT) are returned unchanged.
        Objects that already expose ``strftime`` (``datetime.date``,
        ``pd.Timestamp``) are formatted directly, so applying this function
        to a column that has already been parsed does not crash.

    Returns
    -------
    str
        The date as ``"20YY-MM-DD"`` (or the input itself when it is missing).

    Raises
    ------
    ValueError
        If a string input does not consist of exactly three dot-separated parts.
    """
    # Missing values pass through; pd.to_datetime maps them to NaT later.
    if pd.isna(x):
        return x

    # Already a date/datetime (e.g. the column was overwritten with parsed
    # values and the cell is being re-run): just render it as ISO text.
    if hasattr(x, 'strftime'):
        return x.strftime('%Y-%m-%d')

    parts = str(x).strip().split('.')
    if len(parts) != 3:
        raise ValueError(f"Expected trending date as 'YY.DD.MM', got {x!r}")

    # Raw order is year, DAY, month; note that the day precedes the month.
    yy, dd, mm = parts
    return f"20{yy}-{mm}-{dd}"

# Rewrite each raw trending_date value as ISO text with fix_trending_date,
# then parse the result into datetimes in a temporary helper column.
df['trending_date_fixed'] = pd.to_datetime(
    df['trending_date'].map(fix_trending_date)
)

# Show raw and parsed values side by side to eyeball the conversion.
df.loc[:, ['trending_date', 'trending_date_fixed']].head()


In [None]:
# Swap the parsed dates into `trending_date`. pop() removes the temporary
# helper column from df and returns it, so no separate drop is needed.
df['trending_date'] = df.pop('trending_date_fixed')


In [None]:
# Order rows by video and then by trending date, so each video's rows are
# contiguous and chronological. The per-video shift in a later cell relies on
# this ordering.
df = df.sort_values(['video_id', 'trending_date'], ascending=True)
df.head(n=5)


In [None]:
# Views of the same video on the FOLLOWING calendar day.
#
# A plain per-video shift(-1) returns the next row for that video, whatever
# its date. If a video skips days on the trending list, or has two rows for
# the same day, that row is not "next day" and the growth figures derived from
# it would cover an unknown time span. We therefore keep the shifted value
# only when the next row is exactly one day later, and leave NaN otherwise
# (including each video's last row).
#
# Relies on df being sorted by ['video_id', 'trending_date'] and on
# `trending_date` already being a datetime column.
by_video = df.groupby('video_id')
next_views = by_video['views'].shift(-1)
next_trending_date = by_video['trending_date'].shift(-1)

is_following_day = (next_trending_date - df['trending_date']) == pd.Timedelta(days=1)
df['views_next_day'] = next_views.where(is_following_day)

df[['video_id', 'trending_date', 'views', 'views_next_day']].head(10)


In [None]:
# Absolute day-over-day change in views; NaN wherever views_next_day is NaN.
df['view_growth'] = df['views_next_day'].sub(df['views'])

# Relative change. The 1e-6 added to the denominator guards against a
# division by zero when a row has 0 views.
df['growth_rate'] = df['view_growth'].div(df['views'].add(1e-6))

df.loc[:, ['views', 'views_next_day', 'view_growth', 'growth_rate']].head(10)


In [None]:
# Binary target: 1 when a row's growth_rate is in the top quartile, else 0.
HIGH_GROWTH_QUANTILE = 0.75

# quantile() skips NaN, so the threshold is computed over observed rates only.
threshold = df['growth_rate'].quantile(HIGH_GROWTH_QUANTILE)

# Rows without a growth_rate (no next-day observation) have an UNKNOWN
# outcome. `NaN >= threshold` evaluates to False, which would silently label
# them 0 and pollute the negative class. Keep them as <NA> instead, using the
# nullable Int64 dtype; downstream code can drop or filter them explicitly.
has_outcome = df['growth_rate'].notna()
df['high_growth'] = (df['growth_rate'] >= threshold).astype('Int64').where(has_outcome)

# dropna=False so the number of unlabeled rows is visible in the summary.
threshold, df['high_growth'].value_counts(dropna=False)


In [None]:
# Final feature table
#
# `like_view_ratio` and `comment_view_ratio` are selected below but no earlier
# cell creates them, so a fresh "Restart & Run All" would stop here with a
# KeyError. Create them when absent (an existing column is left untouched).
# NOTE(review): the definitions (likes / views, comment_count / views) are
# inferred from the column names; confirm they match what downstream
# notebooks expect. The 1e-6 guard mirrors the growth_rate denominator.
if 'like_view_ratio' not in df.columns:
    df['like_view_ratio'] = df['likes'] / (df['views'] + 1e-6)
if 'comment_view_ratio' not in df.columns:
    df['comment_view_ratio'] = df['comment_count'] / (df['views'] + 1e-6)

FEATURE_COLUMNS = [
    'video_id',
    'trending_date',
    'publish_date',
    'publish_hour',
    'views',
    'likes',
    'dislikes',
    'comment_count',
    'like_view_ratio',
    'comment_view_ratio',
    'view_growth',
    'growth_rate',
    'high_growth',
    'category_id',
]

# .copy() makes `features` an independent frame, so later edits to it neither
# touch df nor trigger SettingWithCopyWarning.
features = df[FEATURE_COLUMNS].copy()

features.head()


In [None]:
# Write the feature table to disk for the downstream notebooks.
output_path = Path("../data/processed/features.csv")

# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

features.to_csv(output_path, index=False)
print("features.csv saved.")
