## Step 1: Aggregate comments → video-hourly activity

In [1]:
# --- CELL: install_dependencies.py ---
# Install required packages for TrendSpotter pipeline (Kaggle-safe, Python 3.11)

!pip install --quiet numpy
!pip install --quiet hawkeslib
!pip install --quiet sentence-transformers # NLP embedding
!pip install --quiet giotto-tda # Topological Data Analysis (latest works)
!pip install --quiet polars # Data handling (compatible with cudf-polars)
!pip install --quiet lightgbm xgboost # ML models
!pip install --quiet prophet cmdstanpy # Forecasting (use the newer `prophet` package, not fbprophet)

print("[OK] Installed all dependencies")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.2/100.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hawkeslib (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the enriched comment table from the `data-cleaning` Kaggle input and
# preview its first rows to confirm the expected columns are present.
comments_path = "/kaggle/input/data-cleaning/comments_enriched.parquet"
df = pd.read_parquet(comments_path)
display(df.head())

Unnamed: 0,commentId,videoId,textOriginal,likeCount,publishedAt_comment,publishedAt_video,channelId,title,description,text_norm,hashtags,emoji_count,lang,date,hour,day_of_week,week_start
0,1781382,74288,PLEASE LESBIAN FLAG I BEG YOU You would rock it,0,2023-08-15 21:48:52+00:00,2023-08-15 21:22:52+00:00,14492.0,I tried hair inspired by the PAN flag 🩷💛🩵 #pan...,,PLEASE LESBIAN FLAG I BEG YOU You would rock it,[],0,en,2023-08-15,21,1,2023-08-15
1,289571,79618,Apply mashed potato juice and mixed it with curd,0,2023-10-02 13:08:22+00:00,2023-10-01 06:30:15+00:00,14727.0,5 Foundation Mistakes that Every Girl Should Know,5 Foundation Mistakes that Every Girl Should K...,Apply mashed potato juice and mixed it with curd,[],0,en,2023-10-02,13,0,2023-09-26
2,569077,51826,69 missed calls from mars👽,0,2024-05-31 12:03:12+00:00,2023-03-05 17:36:18+00:00,3314.0,How To Make Small Eyes Look Bigger,How To Make Small Eyes Look Bigger,69 missed calls from mars,[],0,en,2024-05-31,12,4,2024-05-28
3,2957962,58298,Baaa,0,2024-02-13 15:48:37+00:00,2024-02-13 14:02:42+00:00,5008.0,20sec beauty test: BLUSH PLACEMENT for YOUR FA...,,Baaa,[],0,so,2024-02-13,15,1,2024-02-13
4,673093,1265,you look like raven from phenomena raven no cap,0,2020-02-15 22:28:44+00:00,2020-01-23 21:00:00+00:00,21411.0,BLACK GIRL TRIES KYLIE JENNER MAKEUP,Today on Black Girl Tries we are trying Kylie ...,you look like raven from phenomena raven no cap,[],0,sl,2020-02-15,22,5,2020-02-11


In [4]:
# Build an hourly timestamp for every comment from its `date` and `hour`
# columns (the result is stored as a regular column, not as the index).
df["datetime"] = pd.to_datetime(df["date"].astype(str) + " " + df["hour"].astype(str) + ":00:00")

# Count hashtags per comment once, up front. Aggregating this numeric column
# with the built-in "mean" yields the same per-group value as averaging
# len(hashtags) inside a Python lambda, but avoids calling that lambda once
# per (hour, video) group. Only the columns needed for aggregation are kept.
hourly_input = df[["datetime", "videoId", "text_norm", "lang", "emoji_count"]].assign(
    hashtag_count=df["hashtags"].map(len)
)

# Aggregate by datetime (hourly) and videoId
agg = (
    hourly_input.groupby([pd.Grouper(key="datetime", freq="h"), "videoId"])
    .agg(
        # "count" tallies non-null `text_norm` values, so a comment whose
        # normalized text is missing is not counted.
        comment_count=("text_norm", "count"),
        # NOTE(review): this is the number of distinct `lang` values in the
        # group, used as a stand-in because no author id column is available.
        # It is a language-diversity measure, not a true commenter count; the
        # column name is kept so downstream consumers keep working.
        unique_commenters=("lang", "nunique"),
        avg_emoji_per_comment=("emoji_count", "mean"),
        hashtag_density=("hashtag_count", "mean"),
    )
    .reset_index()
    .sort_values(["videoId", "datetime"])
    .reset_index(drop=True)
)

# Ensure correct column order
column_order = ["datetime", "videoId", "comment_count", "unique_commenters", "avg_emoji_per_comment", "hashtag_density"]
agg = agg[column_order]

# Save for downstream
agg.to_parquet("/kaggle/working/video_hourly_activity.parquet", index=False)

print(f"[OK] Aggregated {len(df)} comments → {len(agg)} video-hour rows")

# Display first 10 rows
print("First 10 rows:")
display(agg.head(10))

# Display last 10 rows
print("Last 10 rows:")
display(agg.tail(10))

[OK] Aggregated 4725012 comments → 2143123 video-hour rows
First 10 rows:


Unnamed: 0,datetime,videoId,comment_count,unique_commenters,avg_emoji_per_comment,hashtag_density
0,2020-02-12 05:00:00,0,1,1,0.0,0.0
1,2020-02-12 09:00:00,0,4,2,0.25,0.0
2,2020-02-12 10:00:00,0,21,5,0.095238,0.0
3,2020-02-12 11:00:00,0,4,1,0.0,0.0
4,2020-02-12 12:00:00,0,7,3,0.142857,0.142857
5,2020-02-12 13:00:00,0,22,6,0.0,0.0
6,2020-02-12 14:00:00,0,4,2,0.5,0.0
7,2020-02-12 15:00:00,0,6,2,0.333333,0.0
8,2020-02-12 16:00:00,0,2,1,0.0,0.0
9,2020-02-12 17:00:00,0,2,2,0.0,0.0


Last 10 rows:


Unnamed: 0,datetime,videoId,comment_count,unique_commenters,avg_emoji_per_comment,hashtag_density
2143113,2024-07-09 19:00:00,92852,1,1,0.0,0.0
2143114,2024-07-09 21:00:00,92852,1,1,0.0,0.0
2143115,2024-07-13 04:00:00,92852,2,1,0.0,0.0
2143116,2024-08-14 15:00:00,92852,1,1,0.0,0.0
2143117,2024-08-15 10:00:00,92852,1,1,0.0,0.0
2143118,2024-08-15 15:00:00,92852,1,1,0.0,0.0
2143119,2024-08-15 16:00:00,92852,1,1,0.0,0.0
2143120,2024-09-19 15:00:00,92852,1,1,0.0,0.0
2143121,2025-02-16 17:00:00,92854,1,1,0.0,0.0
2143122,2025-02-19 04:00:00,92854,1,1,0.0,0.0


## Step 2: Load video-hourly dataset for signal computation

In [5]:
# Re-read the video-hour table that Step 1 wrote to the working directory, so
# the signal computation starts from the persisted artifact.
activity = pd.read_parquet("/kaggle/working/video_hourly_activity.parquet")

# Report the table size and how many distinct videos it covers.
print(f"[OK] Loaded {len(activity)} video-hour records for {activity['videoId'].nunique()} videos")

# Show both ends of the table (first and last five rows).
display(activity.head(), activity.tail())

[OK] Loaded 2143123 video-hour records for 39938 videos


Unnamed: 0,datetime,videoId,comment_count,unique_commenters,avg_emoji_per_comment,hashtag_density
0,2020-02-12 05:00:00,0,1,1,0.0,0.0
1,2020-02-12 09:00:00,0,4,2,0.25,0.0
2,2020-02-12 10:00:00,0,21,5,0.095238,0.0
3,2020-02-12 11:00:00,0,4,1,0.0,0.0
4,2020-02-12 12:00:00,0,7,3,0.142857,0.142857


Unnamed: 0,datetime,videoId,comment_count,unique_commenters,avg_emoji_per_comment,hashtag_density
2143118,2024-08-15 15:00:00,92852,1,1,0.0,0.0
2143119,2024-08-15 16:00:00,92852,1,1,0.0,0.0
2143120,2024-09-19 15:00:00,92852,1,1,0.0,0.0
2143121,2025-02-16 17:00:00,92854,1,1,0.0,0.0
2143122,2025-02-19 04:00:00,92854,1,1,0.0,0.0
