## Load, Clean & Merge Comment + Video Data (Chunked)

In [1]:
import glob
import pandas as pd
from pathlib import Path

def read_comments(pattern, usecols=None, chunksize=200_000):
    files = sorted(glob.glob(pattern))
    if not files:
        print(f"[ERROR] No files matched {pattern}")
        return pd.DataFrame()
    frames = []
    for f in files:
        print(f"[INFO] Reading {f}")
        for chunk in pd.read_csv(f, usecols=usecols, chunksize=chunksize):
            frames.append(chunk)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def clean_text(s: pd.Series) -> pd.Series:
    s = s.fillna("")
    s = s.str.replace(r"http\S+|www\.\S+", "", regex=True)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s

def run_pipeline(comments_glob, videos_csv, out_path, chunksize=200_000):
    comments_cols = ["commentId","videoId","textOriginal","likeCount","publishedAt"]
    videos_cols   = ["videoId","title","description","channelId","category","publishedAt"]

    comments = read_comments(comments_glob, usecols=comments_cols, chunksize=chunksize)
    print(f"[INFO] Comments loaded: {len(comments):,} rows")

    videos = pd.read_csv(videos_csv, usecols=lambda c: c in videos_cols)
    print(f"[INFO] Videos loaded: {len(videos):,} rows")

    # type conversions
    comments["likeCount"] = pd.to_numeric(comments["likeCount"], errors="coerce").fillna(0).astype(int)
    comments["publishedAt"] = pd.to_datetime(comments["publishedAt"], errors="coerce", utc=True)
    comments["textOriginal"] = clean_text(comments["textOriginal"])

    if "publishedAt" in videos.columns:
        videos["publishedAt"] = pd.to_datetime(videos["publishedAt"], errors="coerce", utc=True)

    merged = comments.merge(videos, on="videoId", how="left", suffixes=("_comment","_video"))
    print(f"[INFO] Final shape: {merged.shape}")

    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    merged.to_parquet(out_path, engine="pyarrow", index=False)
    print(f"[OK] Saved to {out_path}")

In [2]:
COMMENTS_GLOB = "/kaggle/input/datathon-loreal/comments*.csv"  # matches comments1.csv ... comments5.csv
VIDEOS_CSV    = "/kaggle/input/datathon-loreal/videos.csv"
OUT_PATH      = "/kaggle/working/comments_merged.parquet"

In [3]:
CHUNKSIZE = 200_000
run_pipeline(COMMENTS_GLOB, VIDEOS_CSV, OUT_PATH, chunksize=CHUNKSIZE)

[INFO] Reading /kaggle/input/datathon-loreal/comments1.csv
[INFO] Reading /kaggle/input/datathon-loreal/comments2.csv
[INFO] Reading /kaggle/input/datathon-loreal/comments3.csv
[INFO] Reading /kaggle/input/datathon-loreal/comments4.csv
[INFO] Reading /kaggle/input/datathon-loreal/comments5.csv
[INFO] Comments loaded: 4,725,012 rows
[INFO] Videos loaded: 92,759 rows
[INFO] Final shape: (4725012, 9)
[OK] Saved to /kaggle/working/comments_merged.parquet


In [4]:
!ls -lh /kaggle/working

total 1.4G
-rw-r--r-- 1 root root 1.4G Aug 25 05:24 comments_merged.parquet
---------- 1 root root  15K Aug 25 05:24 __notebook__.ipynb


## Normalize Text, Extract Hashtags & Emojis, Add Time Features

In [5]:
# --- deps (only first run needs to download) ---
!pip -q install ftfy langdetect emoji

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [6]:
import re
import pandas as pd
from pathlib import Path
from langdetect import detect, DetectorFactory
import ftfy
import emoji as emoji_lib  # renamed to avoid conflict

# Reproducible language detection
DetectorFactory.seed = 42

# ====== CONFIG (edit as needed) ======
INPUT  = "/kaggle/working/comments_merged.parquet"
OUTPUT = "/kaggle/working/comments_enriched.parquet"
ADD_LANG = True  # Set to False to skip language detection (faster)
# =====================================

# Pre-compile regex patterns for efficiency
URL_RE     = re.compile(r"http\S+|www\.\S+")           # Match URLs
HASHTAG_RE = re.compile(r"(#\w+)")                    # Match hashtags like #Trending
MENTION_RE = re.compile(r"@\w+")                      # Match @mentions
EMOJI_RE   = re.compile("[\U0001F600-\U0001F64F]+")   # Match emoticons (faces, etc.)

def norm(t):
    """Normalize text: fix encoding, remove URLs, clean whitespace"""
    t = ftfy.fix_text(str(t))
    t = URL_RE.sub(" <URL> ", t)
    t = MENTION_RE.sub(" <MENTION> ", t)
    t = re.sub(r"[^\w\s\#\.\!\?\U0001F600-\U0001F64F]", " ", t, flags=re.UNICODE)
    t = re.sub(r"\s+", " ", t).strip()
    return t

def extract_hashtags(t):
    """Extract all hashtags, lowercase them"""
    t = t or ""
    return [h.lower() for h in HASHTAG_RE.findall(t)]

def count_emojis(t):
    """Count number of emoji characters in text"""
    t = t or ""
    return len(EMOJI_RE.findall(t))

def detect_lang_safe(t):
    """Safely detect language with fallback"""
    try:
        t = (t or "").strip()
        if len(t) < 3:
            return "unk"
        return detect(t)
    except Exception:
        return "unk"

In [7]:
# ---- RUN ENRICHMENT ----
print(f"[INFO] Loading data from {INPUT}")
df = pd.read_parquet(INPUT)
print(f"[INFO] Loaded: {len(df):,} rows | {df.shape[1]} columns")

# Handle timestamp column (support both naming conventions)
tcol = "publishedAt_comment" if "publishedAt_comment" in df.columns else "publishedAt"
if tcol not in df.columns:
    raise ValueError(f"Timestamp column '{tcol}' not found in data")

# Handle text source
text_col = "textOriginal" if "textOriginal" in df.columns else "text"
if text_col not in df.columns:
    raise ValueError(f"Text column '{text_col}' not found in data")

# Apply text enrichment
print("[INFO] Applying text normalization and feature extraction...")
df["text_norm"] = df[text_col].apply(norm)
df["hashtags"] = df["text_norm"].apply(extract_hashtags)
df["emoji_count"] = df["text_norm"].apply(count_emojis)

if ADD_LANG:
    print("[INFO] Detecting language (this may take a few minutes)...")
    df["lang"] = df["text_norm"].apply(detect_lang_safe)

# Time features
print("[INFO] Adding time-based features...")
df[tcol] = pd.to_datetime(df[tcol], errors="coerce", utc=True)
n_dropped = df[df[tcol].isna()].shape[0]
df = df.dropna(subset=[tcol])
print(f"[INFO] Dropped {n_dropped:,} rows with invalid timestamps")

df["date"] = df[tcol].dt.date
df["hour"] = df[tcol].dt.hour
df["day_of_week"] = df[tcol].dt.dayofweek
df["week_start"] = df[tcol].dt.to_period("W-MON").apply(lambda p: p.start_time.date())

# Save enriched data
print(f"[INFO] Saving enriched data to {OUTPUT}")
Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUTPUT, engine="pyarrow", index=False)

print(f"[OK] Enrichment complete → {OUTPUT} | Final shape: {df.shape}")

# Quick preview
display(df[["text_norm", "hashtags", "emoji_count", "lang", "date", "hour"]].head())

[INFO] Loading data from /kaggle/working/comments_merged.parquet
[INFO] Loaded: 4,725,012 rows | 9 columns
[INFO] Applying text normalization and feature extraction...
[INFO] Detecting language (this may take a few minutes)...
[INFO] Adding time-based features...
[INFO] Dropped 0 rows with invalid timestamps


  df["week_start"] = df[tcol].dt.to_period("W-MON").apply(lambda p: p.start_time.date())


[INFO] Saving enriched data to /kaggle/working/comments_enriched.parquet
[OK] Enrichment complete → /kaggle/working/comments_enriched.parquet | Final shape: (4725012, 17)


Unnamed: 0,text_norm,hashtags,emoji_count,lang,date,hour
0,PLEASE LESBIAN FLAG I BEG YOU You would rock it,[],0,en,2023-08-15,21
1,Apply mashed potato juice and mixed it with curd,[],0,en,2023-10-02,13
2,69 missed calls from mars,[],0,en,2024-05-31,12
3,Baaa,[],0,so,2024-02-13,15
4,you look like raven from phenomena raven no cap,[],0,sl,2020-02-15,22
