In [None]:
import re
import pandas as pd
import numpy as np

from pathlib import Path


In [None]:
# =========================
# 프로젝트 루트 자동 탐색 + 경로
# =========================

def find_project_root() -> Path:
    """Locate the project root by walking up from the current working directory.

    The working directory itself is checked first, then each ancestor in turn.
    The first directory that contains both a ``data`` and a ``notebooks`` entry
    is returned.

    Returns:
        The first matching directory, or the current working directory when
        no candidate contains both entries.
    """
    start = Path.cwd()
    markers = ("data", "notebooks")

    # Nearest directory wins: ``start`` comes before its ancestors.
    candidates = (start, *start.parents)
    return next(
        (d for d in candidates if all((d / m).exists() for m in markers)),
        start,
    )

# Resolve the project root once; every path below is derived from it.
PROJECT_ROOT = find_project_root()

ACCUM_DIR = PROJECT_ROOT.joinpath("data", "raw", "api")
FEAT_DIR = PROJECT_ROOT.joinpath("data", "interim", "02_comment_features")
# Make sure the feature output directory exists before anything is written.
FEAT_DIR.mkdir(parents=True, exist_ok=True)

# Input priority: the Korean-filtered comments if that file exists,
# otherwise fall back to the raw accumulated comments.
IN_KO = FEAT_DIR.joinpath("comments_filtered_kr.csv")
IN_RAW = ACCUM_DIR.joinpath("comments_raw_kr.csv")
if IN_KO.exists():
    IN_PATH = IN_KO
else:
    IN_PATH = IN_RAW

# Output files: one row per comment, and one row per video.
OUT_COMMENT_LEVEL = FEAT_DIR.joinpath("comment_features_comment_level_kr.csv")
OUT_VIDEO_LEVEL = FEAT_DIR.joinpath("comment_features_video_level_kr.csv")

print("IN_PATH:", IN_PATH)
print("OUT_COMMENT_LEVEL:", OUT_COMMENT_LEVEL)
print("OUT_VIDEO_LEVEL  :", OUT_VIDEO_LEVEL)


In [None]:
# =========================
# Load the input and validate its schema
# =========================

df = pd.read_csv(IN_PATH, low_memory=False)

# Columns every downstream step relies on; fail fast if any is absent.
need = ["video_id","comment_id","author_channel_id","published_at","like_count","text"]
missing = [c for c in need if c not in df.columns]

if missing:
    raise ValueError(f"입력 파일에 필요한 컬럼이 없습니다: {missing}\nIN_PATH={IN_PATH}")

# =========================
# Type cleanup and missing-value handling
# =========================

# Fill missing comments BEFORE casting to str: ``astype(str)`` turns NaN into
# the literal text "nan", after which ``fillna`` has nothing left to fill and
# empty comments would be measured as a 3-character comment.
df["text"] = df["text"].fillna("").astype(str).str.strip()
# NOTE(review): a missing video_id still becomes the literal "nan" here and
# would be grouped as its own video in the video-level aggregation — confirm
# whether such rows should be dropped instead.
df["video_id"] = df["video_id"].astype(str).str.strip()

# =========================
# Regular-expression patterns
# =========================

# "http://", "https://" or "www." followed by a run of non-whitespace.
url_re = re.compile(r"https?://\S+|www\.\S+")
# "#" / "@" immediately followed by one or more word characters.
hashtag_re = re.compile(r"#\w+")
mention_re = re.compile(r"@\w+")
# One precomposed Hangul syllable (U+AC00–U+D7A3); standalone jamo such as
# "ㅋ" fall outside this range and are not matched.
hangul_re = re.compile(r"[가-힣]")


In [None]:
# =========================
# 패턴 카운트 및 한글 비율 계산 함수
# =========================

def count_pat(pattern, s: str) -> int:
    """Count the non-overlapping matches of ``pattern`` in ``s``.

    Args:
        pattern: A compiled regular expression.
        s: The text to scan.

    Returns:
        The number of matches, or 0 when ``s`` is not a string or is empty.
    """
    if isinstance(s, str) and s:
        return sum(1 for _ in pattern.finditer(s))
    return 0

def hangul_ratio(s: str) -> float:
    """Return the share of non-whitespace characters that match ``hangul_re``.

    Args:
        s: The text to measure.

    Returns:
        A value between 0.0 and 1.0. Non-string input and text that is empty
        or whitespace-only yield 0.0.
    """
    if not isinstance(s, str):
        return 0.0

    # Drop every whitespace character; only visible characters are counted.
    visible = "".join(s.split())
    if not visible:
        return 0.0

    hangul_count = sum(hangul_re.search(ch) is not None for ch in visible)
    return hangul_count / len(visible)


# 1. 댓글 단위 데이터 구성 (comment_features_comment_level_kr)

In [None]:
# =========================
# Build the comment-level feature table
# =========================

# ``assign`` returns a new DataFrame, so ``df`` itself is left untouched.
# Keyword order fixes the column order, and later entries can read columns
# created by earlier ones (``is_korean_like`` reads ``hangul_ratio``).
comment_level = df[
    ["video_id", "comment_id", "author_channel_id", "published_at", "like_count", "text"]
].assign(
    text_len=lambda d: d["text"].str.len(),
    url_cnt=lambda d: d["text"].apply(lambda t: count_pat(url_re, t)),
    hashtag_cnt=lambda d: d["text"].apply(lambda t: count_pat(hashtag_re, t)),
    mention_cnt=lambda d: d["text"].apply(lambda t: count_pat(mention_re, t)),
    hangul_ratio=lambda d: d["text"].apply(hangul_ratio),
    # Flag (1/0): at least 15% of the visible characters are Hangul.
    is_korean_like=lambda d: (d["hangul_ratio"] >= 0.15).astype(int),
)

# Save the result
comment_level.to_csv(OUT_COMMENT_LEVEL, index=False, encoding="utf-8-sig")
print("✅ saved comment_level:", OUT_COMMENT_LEVEL, "| rows:", len(comment_level))


# 2. 영상 단위 데이터 구성 (comment_features_video_level_kr)

댓글 단위(comment-level)로 계산된 텍스트 및 참여 지표들을 영상(video_id) 단위로 집계하여 영상 특성(feature) 생성

In [None]:
# =========================
# Aggregate comment-level features per video
# =========================

video_level = (
    comment_level
    .groupby("video_id", as_index=False)
    .agg(
        # Number of non-null comment ids per video.
        comment_count=pd.NamedAgg(column="comment_id", aggfunc="count"),
        unique_authors=pd.NamedAgg(column="author_channel_id", aggfunc=pd.Series.nunique),
        mean_like_count=pd.NamedAgg(column="like_count", aggfunc="mean"),
        mean_text_len=pd.NamedAgg(column="text_len", aggfunc="mean"),
        # Share of comments containing at least one URL / hashtag / mention.
        url_ratio=pd.NamedAgg(column="url_cnt", aggfunc=lambda cnt: cnt.gt(0).mean()),
        hashtag_ratio=pd.NamedAgg(column="hashtag_cnt", aggfunc=lambda cnt: cnt.gt(0).mean()),
        mention_ratio=pd.NamedAgg(column="mention_cnt", aggfunc=lambda cnt: cnt.gt(0).mean()),
        korean_comment_ratio=pd.NamedAgg(column="is_korean_like", aggfunc="mean"),
        mean_hangul_ratio=pd.NamedAgg(column="hangul_ratio", aggfunc="mean"),
    )
)

# =========================
# Fill missing values and save the result
# =========================

# A video whose like counts are all missing has a NaN mean; store it as 0.
video_level = video_level.fillna({"mean_like_count": 0})
video_level.to_csv(OUT_VIDEO_LEVEL, index=False, encoding="utf-8-sig")

print("✅ saved video_level:", OUT_VIDEO_LEVEL, "| rows:", len(video_level))
display(video_level.head())
