In [1]:
# --- CELL: fundamentals.py ---
# Compute fundamental health metrics for each video

import pandas as pd
import numpy as np
from tqdm import tqdm

# ====== CONFIG ======
INPUT  = "/kaggle/input/data-cleaning/comments_enriched.parquet"
OUTPUT = "/kaggle/working/signal_fundamentals.csv"
# ===================

print(f"[INFO] Loading enriched comments from {INPUT}")
df = pd.read_parquet(INPUT)

# Validate required columns
required_cols = ["videoId", "text_norm", "likeCount", "emoji_count", "hashtags", "lang"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Missing required columns: {missing_cols}")

print(f"[INFO] Computing fundamentals for {df['videoId'].nunique()} videos...")

# ---- COMPUTE FUNDAMENTAL METRICS ----
fundamental_metrics = []

# Group by video
video_groups = df.groupby("videoId")

for vid, group in tqdm(video_groups, total=len(video_groups)):
    # Basic counts
    total_comments = len(group)
    total_likes = group["likeCount"].sum()
    unique_commenters = group["commentId"].nunique() if "commentId" in group.columns else total_comments
    
    # Engagement metrics
    engagement_ratio = total_likes / (total_comments + 1)  # likes per comment
    
    # Diversity metrics
    commenter_depth = unique_commenters / (total_comments + 1)  # unique users per comment
    
    # Content quality metrics
    avg_emojis = group["emoji_count"].mean()
    total_emojis = group["emoji_count"].sum()
    
    # Hashtag usage
    hashtag_lists = group["hashtags"].dropna()
    total_hashtags = sum(len(tags) for tags in hashtag_lists if isinstance(tags, list))
    avg_hashtags_per_comment = total_hashtags / (total_comments + 1)
    
    # Language diversity (if available)
    if "lang" in group.columns:
        lang_diversity = group["lang"].nunique() / (total_comments + 1)
    else:
        lang_diversity = 0.0
    
    # Text quality (length and complexity)
    text_lengths = group["text_norm"].str.len().fillna(0)
    avg_text_length = text_lengths.mean()
    
    # Saturation metrics (requires viewCount from video metadata)
    # We'll compute this later if metadata is available
    
    fundamental_metrics.append({
        "videoId": int(vid),
        "total_comments": int(total_comments),
        "total_likes": int(total_likes),
        "unique_commenters": int(unique_commenters),
        "engagement_ratio": float(engagement_ratio),
        "commenter_depth": float(commenter_depth),
        "avg_emojis_per_comment": float(avg_emojis),
        "total_emojis": int(total_emojis),
        "avg_hashtags_per_comment": float(avg_hashtags_per_comment),
        "lang_diversity": float(lang_diversity),
        "avg_text_length": float(avg_text_length)
    })

# Create DataFrame
fund_df = pd.DataFrame(fundamental_metrics)

# ---- ADD VIDEO METADATA (if available) ----
print("[INFO] Adding video metadata for saturation metrics...")

try:
    # Try to load video metadata
    video_meta_paths = [
        "/kaggle/input/datathon-loreal/videos.csv",
        "/kaggle/working/videos.csv",
        "/kaggle/input/data-cleaning/videos.csv"
    ]
    
    video_meta = None
    import glob
    
    for path_pattern in video_meta_paths:
        matches = glob.glob(path_pattern)
        if matches:
            video_meta = pd.read_csv(matches[0])
            print(f"[INFO] Loaded video metadata from: {matches[0]}")
            break
    
    if video_meta is not None:
        # Ensure videoId consistency
        video_meta["videoId"] = video_meta["videoId"].astype(fund_df["videoId"].dtype)
        
        # Merge viewCount
        fund_df = fund_df.merge(
            video_meta[["videoId", "viewCount"]].rename(columns={"viewCount": "video_viewCount"}),
            on="videoId",
            how="left"
        )
        
        # Compute saturation (comments per 1000 views)
        fund_df["saturation"] = (
            fund_df["total_comments"] / (fund_df["video_viewCount"] / 1000 + 1)
        ).fillna(0)
        
        print("[INFO] Added saturation metrics")
    else:
        print("[WARN] No video metadata found, skipping saturation metrics")
        fund_df["saturation"] = 0.0
        
except Exception as e:
    print(f"[WARN] Failed to load video metadata: {e}")
    fund_df["saturation"] = 0.0

# ---- NORMALIZE METRICS ----
print("[INFO] Computing normalized health scores...")

# Engagement quality score (0-1)
# Higher engagement ratio = better, but cap at reasonable level
engagement_normalized = np.clip(fund_df["engagement_ratio"] / 10, 0, 1)  # Assume 10 likes/comment is max good
fund_df["engagement_quality"] = engagement_normalized

# Commenter depth score (0-1)
# Higher diversity = better
fund_df["depth_score"] = np.clip(fund_df["commenter_depth"], 0, 1)

# Content richness score (combines emojis, hashtags, text length)
richness_score = (
    0.4 * np.clip(fund_df["avg_emojis_per_comment"] / 5, 0, 1) +  # Max 5 emojis/comment
    0.3 * np.clip(fund_df["avg_hashtags_per_comment"] / 3, 0, 1) +  # Max 3 hashtags/comment
    0.3 * np.clip(fund_df["avg_text_length"] / 200, 0, 1)  # Max 200 chars/comment
)
fund_df["content_richness"] = richness_score

# Overall health score
fund_df["fundamental_health"] = (
    0.4 * fund_df["engagement_quality"] +
    0.3 * fund_df["depth_score"] +
    0.3 * fund_df["content_richness"]
)

# ---- SUMMARY ----
print("\n" + "="*50)
print("FUNDAMENTAL METRICS SUMMARY")
print("="*50)

print(f"Total videos processed: {len(fund_df)}")

print(f"\nKey Metrics Distribution:")
metrics = ["engagement_ratio", "commenter_depth", "saturation", "fundamental_health"]
for metric in metrics:
    if metric in fund_df.columns:
        p50 = fund_df[metric].median()
        p90 = fund_df[metric].quantile(0.9)
        print(f"  {metric}: median={p50:.3f}, 90th={p90:.3f}")

print(f"\nTop 10 Healthiest Videos:")
top_health = fund_df.nlargest(10, "fundamental_health")[[
    "videoId", "fundamental_health", "engagement_ratio", "commenter_depth", "saturation"
]]
print(top_health.to_string(index=False))

# ---- SAVE ----
fund_df.to_csv(OUTPUT, index=False)
print(f"\n[OK] Saved fundamental metrics → {OUTPUT}")

# ---- VALIDATION ----
print(f"\n" + "="*30)
print("VALIDATION")
print("="*30)

if len(fund_df) > 0:
    print(f"✓ Health score range: [{fund_df['fundamental_health'].min():.3f}, {fund_df['fundamental_health'].max():.3f}]")
    print(f"✓ Mean health score: {fund_df['fundamental_health'].mean():.3f}")
    print(f"✓ Correlation with comments: {fund_df[['total_comments', 'fundamental_health']].corr().iloc[0,1]:.3f}")
else:
    print("✗ No data to validate")

print("\n[DONE] Fundamental metrics computation complete!")

[INFO] Loading enriched comments from /kaggle/input/data-cleaning/comments_enriched.parquet
[INFO] Computing fundamentals for 39938 videos...


100%|██████████| 39938/39938 [00:46<00:00, 852.33it/s] 


[INFO] Adding video metadata for saturation metrics...
[INFO] Loaded video metadata from: /kaggle/input/datathon-loreal/videos.csv
[INFO] Added saturation metrics
[INFO] Computing normalized health scores...

FUNDAMENTAL METRICS SUMMARY
Total videos processed: 39938

Key Metrics Distribution:
  engagement_ratio: median=0.333, 90th=2.599
  commenter_depth: median=0.800, 90th=0.989
  saturation: median=0.705, 90th=3.511
  fundamental_health: median=0.270, 90th=0.419

Top 10 Healthiest Videos:
 videoId  fundamental_health  engagement_ratio  commenter_depth  saturation
   56370            0.793179         12.783784         0.986486    0.890744
   46891            0.791929         23.264039         0.999015    1.190961
   68937            0.789461         29.285541         0.998785    1.631215
   60732            0.789085         19.130926         0.997743    2.263150
   30730            0.788618         17.916988         0.998069    0.824309
   48889            0.787276         20.305699  