In [1]:
    # 02_feature_engineering.ipynb
    # Feature Engineering for US YouTube Trending Videos

    import pandas as pd
    import numpy as np

    print("Feature engineering notebook ready.")
    

Feature engineering notebook ready.


In [2]:
    # Load raw data
    df = pd.read_csv("../data/raw/USvideos.csv")

    # Recreate ratios in case this notebook is run standalone
    df["like_view_ratio"] = df["likes"] / (df["views"] + 1e-6)
    df["comment_view_ratio"] = df["comment_count"] / (df["views"] + 1e-6)

    df.head()
    

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,like_view_ratio,comment_view_ratio
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,0.076869,0.021318
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",0.040179,0.005252
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,0.045758,0.002563
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,0.029641,0.006253
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,0.063097,0.008359


In [3]:
    # Parse publish_time into datetime, date and hour components
    df["publish_time"] = pd.to_datetime(df["publish_time"], errors="coerce")
    df["publish_date"] = df["publish_time"].dt.date
    df["publish_hour"] = df["publish_time"].dt.hour

    df[["publish_time", "publish_date", "publish_hour"]].head()
    

Unnamed: 0,publish_time,publish_date,publish_hour
0,2017-11-13 17:13:01+00:00,2017-11-13,17
1,2017-11-13 07:30:00+00:00,2017-11-13,7
2,2017-11-12 19:05:24+00:00,2017-11-12,19
3,2017-11-13 11:00:04+00:00,2017-11-13,11
4,2017-11-12 18:01:41+00:00,2017-11-12,18


In [4]:
    # Fix trending_date format: original is 'YY.DD.MM', e.g. '17.14.11' meaning 2017-11-14
    def fix_trending_date(x: str) -> str:
        yy, dd, mm = x.split(".")
        return f"20{yy}-{mm}-{dd}"

    df["trending_date_fixed"] = df["trending_date"].apply(fix_trending_date)
    df["trending_date"] = pd.to_datetime(df["trending_date_fixed"], errors="coerce")
    df = df.drop(columns=["trending_date_fixed"])

    df[["trending_date"]].head()
    

Unnamed: 0,trending_date
0,2017-11-14
1,2017-11-14
2,2017-11-14
3,2017-11-14
4,2017-11-14


In [5]:
    # Sort by video_id and trending_date so that we can compute next-day views
    df = df.sort_values(by=["video_id", "trending_date"])
    df[["video_id", "trending_date", "views"]].head(10)
    

Unnamed: 0,video_id,trending_date,views
39169,-0CMnp02rNY,2018-06-06,475965
39375,-0CMnp02rNY,2018-06-07,605506
39584,-0CMnp02rNY,2018-06-08,705986
39795,-0CMnp02rNY,2018-06-09,754273
40007,-0CMnp02rNY,2018-06-10,792613
40208,-0CMnp02rNY,2018-06-11,800359
15457,-0NYY8cqdiQ,2018-02-01,563746
31553,-1Hm41N0dUs,2018-04-29,1566807
31773,-1Hm41N0dUs,2018-04-30,1882352
31992,-1Hm41N0dUs,2018-05-01,2058516


In [6]:
    # Compute next-day view count per video
    df["views_next_day"] = df.groupby("video_id")["views"].shift(-1)

    df[["video_id", "trending_date", "views", "views_next_day"]].head(10)
    

Unnamed: 0,video_id,trending_date,views,views_next_day
39169,-0CMnp02rNY,2018-06-06,475965,605506.0
39375,-0CMnp02rNY,2018-06-07,605506,705986.0
39584,-0CMnp02rNY,2018-06-08,705986,754273.0
39795,-0CMnp02rNY,2018-06-09,754273,792613.0
40007,-0CMnp02rNY,2018-06-10,792613,800359.0
40208,-0CMnp02rNY,2018-06-11,800359,
15457,-0NYY8cqdiQ,2018-02-01,563746,
31553,-1Hm41N0dUs,2018-04-29,1566807,1882352.0
31773,-1Hm41N0dUs,2018-04-30,1882352,2058516.0
31992,-1Hm41N0dUs,2018-05-01,2058516,


In [7]:
    # Compute absolute and relative growth
    df["view_growth"] = df["views_next_day"] - df["views"]
    df["growth_rate"] = df["view_growth"] / (df["views"] + 1e-6)

    df[["views", "views_next_day", "view_growth", "growth_rate"]].head(10)
    

Unnamed: 0,views,views_next_day,view_growth,growth_rate
39169,475965,605506.0,129541.0,0.272165
39375,605506,705986.0,100480.0,0.165944
39584,705986,754273.0,48287.0,0.068397
39795,754273,792613.0,38340.0,0.05083
40007,792613,800359.0,7746.0,0.009773
40208,800359,,,
15457,563746,,,
31553,1566807,1882352.0,315545.0,0.201394
31773,1882352,2058516.0,176164.0,0.093587
31992,2058516,,,


In [8]:
    # Drop rows where growth_rate is NaN (typically the last trending day of each video)
    df_valid = df.dropna(subset=["growth_rate"]).copy()

    # Define high_growth label as top 25% of growth_rate
    threshold = df_valid["growth_rate"].quantile(0.75)
    df_valid["high_growth"] = (df_valid["growth_rate"] >= threshold).astype(int)

    print("High growth threshold (75th percentile):", threshold)
    df_valid["high_growth"].value_counts()
    

High growth threshold (75th percentile): 0.13315463229482438


high_growth
0    25948
1     8650
Name: count, dtype: int64

In [9]:
    # Select feature columns for modeling
    feature_cols = [
        "video_id",
        "trending_date",
        "publish_date",
        "publish_hour",
        "views",
        "likes",
        "dislikes",
        "comment_count",
        "like_view_ratio",
        "comment_view_ratio",
        "view_growth",
        "growth_rate",
        "high_growth",
        "category_id",
    ]

    features = df_valid[feature_cols].copy()
    features.head()
    

Unnamed: 0,video_id,trending_date,publish_date,publish_hour,views,likes,dislikes,comment_count,like_view_ratio,comment_view_ratio,view_growth,growth_rate,high_growth,category_id
39169,-0CMnp02rNY,2018-06-06,2018-06-04,13,475965,6531,172,271,0.013722,0.000569,129541.0,0.272165,1,24
39375,-0CMnp02rNY,2018-06-07,2018-06-04,13,605506,7848,232,354,0.012961,0.000585,100480.0,0.165944,1,24
39584,-0CMnp02rNY,2018-06-08,2018-06-04,13,705986,8930,277,371,0.012649,0.000526,48287.0,0.068397,0,24
39795,-0CMnp02rNY,2018-06-09,2018-06-04,13,754273,9395,303,383,0.012456,0.000508,38340.0,0.05083,0,24
40007,-0CMnp02rNY,2018-06-10,2018-06-04,13,792613,9720,330,413,0.012263,0.000521,7746.0,0.009773,0,24


In [10]:
    # Save processed features
    features.to_csv("../data/processed/features.csv", index=False)
    print("Saved processed features to ../data/processed/features.csv")
    

Saved processed features to ../data/processed/features.csv
