In [1]:
# --- CELL: install_dependencies.py ---
# Install required packages for TrendSpotter pipeline (Kaggle-safe, Python 3.11)

!pip install --quiet numpy
!pip install --quiet hawkeslib
print("[OK] Installed all dependencies")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.2/100.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hawkeslib (setup.py) ... [?25l[?25hdone
[OK] Installed all dependencies


In [2]:
import pandas as pd
import numpy as np

## Apply Hawkes Momentum

The intensity function of the Hawkes process is defined as:

$$
\lambda(t) = \mu + \sum_{t_i < t} \alpha e^{-\beta (t - t_i)}
$$

Where:

- $\mu$ = baseline activity rate  
- $\alpha$ = excitation strength (how much one event triggers future ones)  
- $\beta$ = decay rate (how quickly influence fades)  
- $t_i$ = timestamps of past events occurring before time $t$

This intensity function $\lambda(t)$ captures the instantaneous rate of events, with each prior event $t_i$ contributing an exponentially decaying excitation.

To compute the **trend momentum score** $R_t$, we normalize the intensity (or event counts weighted by influence) within discrete time buckets. This normalization (e.g., z-score, min-max, or relative growth) allows comparison across time periods and produces a dimensionless score indicating momentum.

$$
R_t = \text{Normalize}\left( \lambda(t) \text{ over time bucket } t \right)
$$

The normalization step can be, for example:

- Z-score: $R_t = \frac{\lambda_t - \mu_b}{\sigma_b}$, where $\mu_b$ and $\sigma_b$ are the mean and standard deviation of $\lambda_t$ across time buckets
- Min-Max: $R_t = \frac{\lambda_t - \min(\lambda)}{\max(\lambda) - \min(\lambda)}$
- Or relative to baseline: $R_t = \frac{\lambda_t}{\mu}$

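This score $R_t$ reflects the relative excitement or activity surge in each time bucket, adjusted for endogenous triggering and decay.

**Note:** the code cell that follows does not fit a Hawkes process. It computes a simplified momentum proxy (a weighted combination of growth rate, early/late activity ratio, recent momentum, acceleration and velocity) and stores it under `hawkes_R_*` column names for compatibility with the rest of the pipeline.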

In [3]:
# --- CELL: simplified_momentum.py ---
# Compute simplified momentum metrics without Hawkes process

import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# ====== CONFIG ======
INPUT = "/kaggle/input/aggregate-to-video-hourly-activity/video_hourly_activity.parquet"
OUTPUT = "/kaggle/working/signal_hawkes.csv"
MIN_ACTIVE_HOURS = 10  # minimum number of hourly records a video needs to be eligible
MIN_TOTAL_COMMENTS = 50  # minimum summed comment count a video needs to be eligible
MOMENTUM_WINDOW_HOURS = 168  # 7 days for momentum calculation
# NOTE(review): MOMENTUM_WINDOW_HOURS is not referenced by the code visible in
# this cell — confirm whether another cell relies on it.
# ===================

print(f"[INFO] Loading video-hourly activity from {INPUT}")
df = pd.read_parquet(INPUT)

# Check column names: accept either the camelCase or the snake_case spelling.
comment_col = "commentCount" if "commentCount" in df.columns else "comment_count"
video_id_col = "videoId" if "videoId" in df.columns else "video_id"

# The fallbacks above are only guesses; fail here with a readable message
# instead of an opaque KeyError somewhere inside the later groupby.
missing_cols = [c for c in ("datetime", video_id_col, comment_col) if c not in df.columns]
if missing_cols:
    raise KeyError(
        f"{INPUT} is missing required column(s) {missing_cols}; "
        f"available columns: {list(df.columns)}"
    )

# Normalise timestamps to timezone-aware UTC so time differences are well defined.
df["datetime"] = pd.to_datetime(df["datetime"], utc=True)

print(f"[DEBUG] Using columns: {video_id_col}, {comment_col}")

# ---- FILTER: Only videos with sufficient activity ----
print("[INFO] Filtering videos with sufficient activity...")

# One row per video: comment total, number of hourly records, first/last timestamp.
video_stats = (
    df.groupby(video_id_col)
    .agg(
        total_comments=(comment_col, "sum"),
        active_hours=(comment_col, "count"),
        first_active=("datetime", "min"),
        last_active=("datetime", "max"),
    )
    .round(2)
)
active_span = video_stats["last_active"] - video_stats["first_active"]
video_stats["duration_hours"] = active_span.dt.total_seconds() / 3600

# Filter eligible videos: enough comments, enough hourly records, at least 1h of span.
meets_thresholds = (
    video_stats["total_comments"].ge(MIN_TOTAL_COMMENTS)
    & video_stats["active_hours"].ge(MIN_ACTIVE_HOURS)
    & video_stats["duration_hours"].ge(1)
)
eligible_videos = video_stats.loc[meets_thresholds].index

print(f"[INFO] Found {len(eligible_videos)} eligible videos (from {video_stats.shape[0]} total)")

# Keep only hours of eligible videos in which at least one comment was posted.
is_eligible_row = df[video_id_col].isin(eligible_videos)
has_comments = df[comment_col] > 0
subset = df.loc[is_eligible_row & has_comments].copy()
print(f"[INFO] Working with {len(subset)} comment-active hourly records")

# ---- SIMPLIFIED MOMENTUM CALCULATION ----
print(f"[INFO] Computing simplified momentum for {len(eligible_videos)} videos...")

# Weighted composite (can be tuned). Identical for every video, so defined once
# instead of being rebuilt on each loop iteration.
weights = {
    "growth_rate": 0.3,
    "momentum_ratio": 0.25,
    "recent_momentum": 0.25,
    "acceleration": 0.1,
    "velocity_normalized": 0.1,
}


def _video_momentum_metrics(group, comment_col, weights):
    """Compute the momentum metrics of one video from its hourly comment rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single video. Must contain a ``datetime`` column and the
        ``comment_col`` column; rows may arrive in any order.
    comment_col : str
        Name of the column holding the per-hour comment count.
    weights : dict
        Weight per composite component; keys must match the component names
        ``growth_rate``, ``momentum_ratio``, ``recent_momentum``,
        ``acceleration`` and ``velocity_normalized``.

    Returns
    -------
    dict
        Summary values (totals, duration, peak), the individual momentum
        metrics, the weighted ``composite_momentum`` and one
        ``momentum_<component>`` entry per composite component. Key order is
        the column order of the resulting output table.
    """
    group = group.sort_values("datetime").reset_index(drop=True)
    counts = group[comment_col]
    n_rows = len(group)

    # Hours elapsed since the video's first recorded row.
    hours = (group["datetime"] - group["datetime"].iloc[0]).dt.total_seconds() / 3600

    total_comments = counts.sum()
    duration_hours = hours.iloc[-1]
    peak_hourly = counts.max()

    # 1. Activity Decay Rate (how fast engagement drops over time)
    #    Log-linear regression for exponential decay: y = a * exp(-b * t)
    decay_rate = 0.0
    decay_r2 = 0.0
    if n_rows >= 3:
        try:
            log_y = np.log(counts.values + 1e-6)  # Avoid log(0)
            slope, intercept, r_value, _, _ = stats.linregress(hours.values, log_y)
            decay_rate = -slope  # Positive = decaying, Negative = growing
            decay_r2 = r_value ** 2
        except Exception:
            # Best effort: a failed fit counts as "no measurable decay". Unlike
            # a bare `except:`, this still lets KeyboardInterrupt/SystemExit
            # stop a long run.
            decay_rate = 0.0
            decay_r2 = 0.0

    # 2. Early vs Late Activity Ratio (mean of second half / mean of first half)
    mid_point = n_rows // 2
    early_activity = counts.iloc[:mid_point].mean() if mid_point > 0 else 0
    late_activity = counts.iloc[mid_point:].mean() if n_rows - mid_point > 0 else 0
    # >1 = accelerating, <1 = decaying; neutral 1.0 when there is no early baseline
    momentum_ratio = late_activity / early_activity if early_activity > 0 else 1.0

    # 3. Recent Momentum (last 24 hours vs previous 24 hours)
    recent_mask = hours >= max(0, duration_hours - 24)
    prev_mask = (hours >= max(0, duration_hours - 48)) & (hours < duration_hours - 24)
    recent_activity = counts.loc[recent_mask].mean() if recent_mask.any() else 0
    # With no rows in the previous window, compare against the early-half mean.
    prev_activity = counts.loc[prev_mask].mean() if prev_mask.any() else early_activity
    recent_momentum = recent_activity / prev_activity if prev_activity > 0 else 1.0

    # 4. Velocity (comments per hour) and acceleration (last quarter vs first quarter)
    velocity = 0.0
    acceleration = 0.0
    if duration_hours > 0:
        velocity = total_comments / duration_hours

        if n_rows >= 4:
            q1_end = n_rows // 4
            q4_start = 3 * n_rows // 4

            # Both spans are floored at one hour. Previously the first-quarter
            # span was `hours + 1e-6`, which is ~0 when the quarter holds a
            # single row (its offset is 0). That inflated q1_velocity by ~1e6
            # and pinned acceleration near -1 for short series.
            q1_span = max(1, hours.iloc[q1_end - 1])
            q4_span = max(1, hours.iloc[-1] - hours.iloc[q4_start])

            q1_velocity = counts.iloc[:q1_end].sum() / q1_span
            q4_velocity = counts.iloc[q4_start:].sum() / q4_span

            if q1_velocity > 0:
                acceleration = (q4_velocity - q1_velocity) / q1_velocity

    # 5. Consistency Score (how steady is the engagement)
    if n_rows > 1:
        cv = counts.std() / (counts.mean() + 1e-6)  # Coefficient of variation
        consistency = 1 / (1 + cv)  # Higher = more consistent
    else:
        consistency = 1.0

    # 6. Composite Momentum Score: weighted sum of the components below
    momentum_components = {
        "growth_rate": max(0, -decay_rate),  # Negative decay = positive growth
        "momentum_ratio": momentum_ratio,
        "recent_momentum": recent_momentum,
        "acceleration": max(0, acceleration),  # Only positive acceleration
        "velocity_normalized": velocity / max(1, np.log(duration_hours + 1)),  # Normalize by time
    }
    composite_momentum = sum(weights[k] * momentum_components[k] for k in weights.keys())

    return {
        "total_comments": int(total_comments),
        "active_hours": int(n_rows),
        "duration_hours": float(duration_hours),
        "peak_hourly_comments": int(peak_hourly),
        "decay_rate": float(decay_rate),
        "decay_r2": float(decay_r2),
        "momentum_ratio": float(momentum_ratio),
        "recent_momentum": float(recent_momentum),
        "velocity": float(velocity),
        "acceleration": float(acceleration),
        "consistency": float(consistency),
        "composite_momentum": float(composite_momentum),
        **{f"momentum_{k}": float(v) for k, v in momentum_components.items()},
    }


momentum_results = []
grouped = subset.groupby(video_id_col)

for vid, group in tqdm(grouped, total=len(grouped)):
    # NOTE(review): int(vid) assumes numeric video ids — confirm against the input data.
    momentum_results.append({
        "videoId": int(vid),
        **_video_momentum_metrics(group, comment_col, weights),
    })

# Create DataFrame
momentum_df = pd.DataFrame(momentum_results)

# ---- ROBUST SCALING ----
print("\n[INFO] Applying robust scaling to momentum metrics...")

momentum_metrics = ["composite_momentum", "momentum_ratio", "recent_momentum", "acceleration", "velocity"]

for metric in momentum_metrics:
    scaled_name = f"{metric}_scaled"

    # Treat +/-inf like missing values; both end up as 0.0 in the scaled column.
    finite_or_nan = momentum_df[metric].replace([np.inf, -np.inf], np.nan)
    valid_values = finite_or_nan.dropna()

    if len(valid_values) == 0:
        momentum_df[scaled_name] = 0.0
        continue

    # Robust scaling (5th to 95th percentile): clip the tails, then map to [0, 10].
    p5, p95 = np.percentile(valid_values, [5, 95])
    clipped = np.clip(valid_values, p5, p95)
    lower, upper = clipped.min(), clipped.max()

    if upper > lower:
        rescaled = (finite_or_nan.clip(p5, p95) - lower) / (upper - lower) * 10.0
        momentum_df[scaled_name] = rescaled.fillna(0.0)
    else:
        # Degenerate distribution (no spread after clipping): nothing to rank.
        momentum_df[scaled_name] = 0.0

# Primary momentum score (composite, scaled)
momentum_df["hawkes_R_current_scaled"] = momentum_df["composite_momentum_scaled"]

# ---- MOMENTUM CATEGORIES ----
# Four equal-width bands over the [0, 10] scaled score; the lowest edge is inclusive
# so a score of exactly 0 is labelled "Low" rather than left uncategorised.
category_edges = [0, 2.5, 5.0, 7.5, 10]
category_labels = ["Low", "Medium", "High", "Viral"]
momentum_df["momentum_category"] = pd.cut(
    momentum_df["hawkes_R_current_scaled"],
    bins=category_edges,
    labels=category_labels,
    include_lowest=True,
)

# ---- SUMMARY ----
print("\n" + "="*50)
print("SIMPLIFIED MOMENTUM ANALYSIS SUMMARY")
print("="*50)

# Share of eligible videos that produced a momentum row. This used to be a
# hard-coded "100.0%", which would keep claiming success even if videos were
# dropped between the eligibility filter and the metric loop.
n_processed = len(momentum_df)
n_eligible = len(eligible_videos)
success_rate = 100.0 * n_processed / n_eligible if n_eligible else 0.0

print("\nDataset Overview:")
print(f"  Total videos processed: {n_processed}")
print(f"  Success rate: {success_rate:.1f}%")

# Quantiles of the scaled composite score at 0%, 20%, ..., 100%.
print("\nComposite Momentum Distribution:")
for i in range(0, 101, 20):
    val = np.percentile(momentum_df["hawkes_R_current_scaled"], i)
    print(f"  {i:3d}%: {val:.2f}")

print("\nMomentum Categories:")
print(momentum_df["momentum_category"].value_counts().sort_index())

# ---- LOAD VIDEO METADATA ----
print("[INFO] Loading video metadata from fixed path...")

video_metadata = pd.read_csv("/kaggle/input/datathon-loreal/videos.csv")
print(f"[INFO] Loaded metadata for {len(video_metadata)} videos")

# Ensure videoId is consistent type (merge keys must share a dtype to match)
video_metadata["videoId"] = video_metadata["videoId"].astype(momentum_df["videoId"].dtype)

# Keep a single metadata row per video. Without this, a videoId repeated in
# videos.csv would duplicate that video's momentum row in the left merge and
# distort the ranked listing built from the merged frame.
metadata_for_merge = (
    video_metadata[["videoId", "title", "viewCount"]]
    .drop_duplicates(subset="videoId")
)

# Merge metadata with momentum data; videos without metadata keep NaN title/viewCount.
momentum_with_meta = momentum_df.merge(
    metadata_for_merge,
    on="videoId",
    how="left",
    validate="many_to_one",  # raise instead of silently multiplying rows
)

# ---- PERCENTILE-BASED VIDEO OUTPUT WITH METADATA ----
print(f"\n" + "="*80)
print("TOP VIDEOS BY PERCENTILE (EVERY 10%) - WITH METADATA")
print("="*80)

percentiles = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Rank videos from highest to lowest scaled score; "percentile" below is a
# position in this descending ranking (0% = top of the list).
sorted_df = (
    momentum_with_meta
    .sort_values("hawkes_R_current_scaled", ascending=False)
    .reset_index(drop=True)
)
n_ranked = len(sorted_df)

listing_columns = [
    "videoId",
    "title",
    "viewCount",
    "hawkes_R_current_scaled",
    "total_comments",
    "momentum_ratio",
    "recent_momentum",
    "momentum_category",
]

for p in percentiles:
    idx = int(np.percentile(range(n_ranked), p)) if n_ranked > 0 else 0

    if n_ranked == 0:
        continue

    row_idx = min(idx, n_ranked - 1)
    # Up to 10 consecutive videos starting at that rank (fewer near the end).
    videos = sorted_df.iloc[row_idx:row_idx+10]

    print(f"\n--- PERCENTILE {p:3d}% (Starting Rank: {row_idx + 1}) ---")
    print(videos[listing_columns].to_string(index=False, max_colwidth=30))
# --- END METADATA ---

# ---- VALIDATION ----
print(f"\n" + "="*30)
print("VALIDATION CHECKS")
print("="*30)

# Correlation of the scaled score with raw comment volume.
volume_corr_matrix = momentum_df[["total_comments", "hawkes_R_current_scaled"]].corr()
correlation = volume_corr_matrix.loc["total_comments", "hawkes_R_current_scaled"]
print(f"✓ Comments vs Momentum correlation: {correlation:.3f}")

# Correlation of the scaled score with the busiest single hour.
peak_corr_matrix = momentum_df[["peak_hourly_comments", "hawkes_R_current_scaled"]].corr()
peak_correlation = peak_corr_matrix.loc["peak_hourly_comments", "hawkes_R_current_scaled"]
print(f"✓ Peak activity vs Momentum correlation: {peak_correlation:.3f}")

# Observed range of the scaled score.
print(f"✓ Scaled values range: [{momentum_df['hawkes_R_current_scaled'].min():.2f}, {momentum_df['hawkes_R_current_scaled'].max():.2f}]")

# ---- SAVE RESULTS ----
# Add compatibility columns for the pipeline: each alias is a plain copy of an
# existing scaled column (no separate max/mean/early statistic is computed here).
compatibility_aliases = {
    "hawkes_R_max": "hawkes_R_current_scaled",
    "hawkes_R_mean": "hawkes_R_current_scaled",
    "hawkes_R_early": "momentum_ratio_scaled",
}
for alias_name, source_name in compatibility_aliases.items():
    momentum_df[alias_name] = momentum_df[source_name]

momentum_df.to_csv(OUTPUT, index=False)
print(f"\n[SUCCESS] Results saved to: {OUTPUT}")
print("\n[DONE] Simplified momentum analysis complete!")

[INFO] Loading video-hourly activity from /kaggle/input/aggregate-to-video-hourly-activity/video_hourly_activity.parquet
[DEBUG] Using columns: videoId, comment_count
[INFO] Filtering videos with sufficient activity...
[INFO] Found 5383 eligible videos (from 39938 total)
[INFO] Working with 1970654 comment-active hourly records
[INFO] Computing simplified momentum for 5383 videos...


100%|██████████| 5383/5383 [00:15<00:00, 345.40it/s]



[INFO] Applying robust scaling to momentum metrics...

SIMPLIFIED MOMENTUM ANALYSIS SUMMARY

Dataset Overview:
  Total videos processed: 5383
  Success rate: 100.0%

Composite Momentum Distribution:
    0%: 0.00
   20%: 1.83
   40%: 3.80
   60%: 5.25
   80%: 6.68
  100%: 10.00

Momentum Categories:
momentum_category
Low       1428
Medium    1622
High      1737
Viral      596
Name: count, dtype: int64
[INFO] Loading video metadata from fixed path...
[INFO] Loaded metadata for 92759 videos

TOP VIDEOS BY PERCENTILE (EVERY 10%) - WITH METADATA

--- PERCENTILE   0% (Starting Rank: 1) ---
 videoId                          title  viewCount  hawkes_R_current_scaled  total_comments  momentum_ratio  recent_momentum momentum_category
   41736 #makeuptutorial #transforma...    30390.0                     10.0              59        2.470588         0.764706             Viral
   49608 Madhuri Dixit Inspired Make...   259420.0                     10.0             101        0.813759         2.6181