In [1]:
import re
import pandas as pd
import numpy as np
from pathlib import Path

def find_project_root() -> Path:
    """Locate the project root by walking upward from the working directory.

    A directory qualifies when it contains both a ``data`` entry and a
    ``notebooks`` entry. The current working directory is checked first,
    followed by each of its ancestors, nearest first.

    Returns:
        The first qualifying directory, or the current working directory
        itself when no candidate qualifies.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    return next(
        (
            folder
            for folder in candidates
            if (folder / "data").exists() and (folder / "notebooks").exists()
        ),
        start,  # fallback: nothing on the way up looked like the project root
    )

# Resolve the project root once; every directory below is derived from it.
PROJECT_ROOT = find_project_root()
ACCUM_DIR = PROJECT_ROOT.joinpath("data", "processed", "01_daily_accumulated")
FEAT_DIR = PROJECT_ROOT.joinpath("data", "processed", "02_comment_features")
FEAT_DIR.mkdir(parents=True, exist_ok=True)  # the feature directory is also where outputs are written

# Input (in priority order): the Korean-filtered result, otherwise the raw accumulated file.
IN_KO = FEAT_DIR.joinpath("youtube_comments_kr_v1.csv")
IN_RAW = ACCUM_DIR.joinpath("youtube_comments_raw_kr_v1.csv")
if IN_KO.exists():
    IN_PATH = IN_KO
else:
    IN_PATH = IN_RAW

# Output files: one row per comment, and one row per video.
OUT_COMMENT_LEVEL = FEAT_DIR.joinpath("youtube_comment_features_comment_level_kr_v1.csv")
OUT_VIDEO_LEVEL = FEAT_DIR.joinpath("youtube_comment_features_video_level_kr_v1.csv")

print("IN_PATH:", IN_PATH)
print("OUT_COMMENT_LEVEL:", OUT_COMMENT_LEVEL)
print("OUT_VIDEO_LEVEL  :", OUT_VIDEO_LEVEL)


IN_PATH: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\02_comment_features\youtube_comments_kr_v1.csv
OUT_COMMENT_LEVEL: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\02_comment_features\youtube_comment_features_comment_level_kr_v1.csv
OUT_VIDEO_LEVEL  : c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\02_comment_features\youtube_comment_features_video_level_kr_v1.csv


In [2]:
# Load the comment table chosen by IN_PATH.
df = pd.read_csv(IN_PATH, low_memory=False)

# Fail fast when a column used by the feature code below is absent.
need = ["video_id","comment_id","author_channel_id","published_at","like_count","text"]
missing = [c for c in need if c not in df.columns]
if missing:
    raise ValueError(f"입력 파일에 필요한 컬럼이 없습니다: {missing}\nIN_PATH={IN_PATH}")

# Fill missing text BEFORE casting to str. Casting first turns NaN into the
# literal string "nan" on object-dtype columns, which makes a later
# fillna("") a no-op and gives missing comments a bogus 3-character text.
df["text"] = df["text"].fillna("").astype(str).str.strip()
# NOTE(review): a missing video_id still becomes the string "nan" here and would
# form its own group in a later groupby — confirm whether such rows can occur.
df["video_id"] = df["video_id"].astype(str).str.strip()

# Patterns used for the per-comment counts.
url_re = re.compile(r"https?://\S+|www\.\S+")  # http(s) links or bare www. links
hashtag_re = re.compile(r"#\w+")               # '#' followed by word characters
mention_re = re.compile(r"@\w+")               # '@' followed by word characters
hangul_re = re.compile(r"[가-힣]")              # one precomposed Hangul syllable

def count_pat(pattern, s: str) -> int:
    """Count the non-overlapping matches of ``pattern`` in ``s``.

    Args:
        pattern: Object exposing ``findall`` (a compiled regular expression
            in this notebook).
        s: Text to scan.

    Returns:
        The number of matches, or 0 when ``s`` is not a string or is empty.
    """
    scannable = isinstance(s, str) and bool(s)
    return len(pattern.findall(s)) if scannable else 0

def hangul_ratio(s: str, pattern: "re.Pattern[str] | None" = None) -> float:
    """Return the share of non-whitespace characters in ``s`` that match ``pattern``.

    Args:
        s: Text to inspect.
        pattern: Compiled regular expression tested against each character
            individually. When omitted, the module-level ``hangul_re`` is used,
            so existing single-argument calls behave as before.

    Returns:
        A value between 0.0 and 1.0. Returns 0.0 when ``s`` is not a string,
        is empty, or contains only whitespace.
    """
    if not isinstance(s, str):
        return 0.0
    # One pass drops whitespace; an empty result also covers "" and blank strings,
    # so no separate strip() guard is needed.
    visible = [ch for ch in s if not ch.isspace()]
    if not visible:
        return 0.0
    # Resolve the default at call time so the global does not have to exist at def time.
    matcher = hangul_re if pattern is None else pattern
    matched = sum(1 for ch in visible if matcher.search(ch) is not None)
    return matched / len(visible)

# Per-comment feature table: the identifying columns plus features derived from the text.
# Each assign() entry sees the columns created by the entries above it.
comment_level = df[
    ["video_id", "comment_id", "author_channel_id", "published_at", "like_count", "text"]
].assign(
    text_len=lambda frame: frame["text"].str.len(),
    url_cnt=lambda frame: frame["text"].apply(lambda body: count_pat(url_re, body)),
    hashtag_cnt=lambda frame: frame["text"].apply(lambda body: count_pat(hashtag_re, body)),
    mention_cnt=lambda frame: frame["text"].apply(lambda body: count_pat(mention_re, body)),
    hangul_ratio=lambda frame: frame["text"].apply(hangul_ratio),
    # Flag a comment as Korean-like when at least 15% of its characters are Hangul.
    is_korean_like=lambda frame: frame["hangul_ratio"].ge(0.15).astype(int),
)

# Save (optional output)
comment_level.to_csv(OUT_COMMENT_LEVEL, index=False, encoding="utf-8-sig")
print("✅ saved comment_level:", OUT_COMMENT_LEVEL, "| rows:", len(comment_level))


✅ saved comment_level: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\02_comment_features\youtube_comment_features_comment_level_kr_v1.csv | rows: 4428


In [3]:
def _positive_share(counts):
    """Return the fraction of entries in ``counts`` that are greater than zero."""
    return (counts > 0).mean()


# Collapse the per-comment features into one row per video.
video_level = comment_level.groupby("video_id", as_index=False).agg(
    comment_count=pd.NamedAgg(column="comment_id", aggfunc="count"),
    unique_authors=pd.NamedAgg(column="author_channel_id", aggfunc="nunique"),
    mean_like_count=pd.NamedAgg(column="like_count", aggfunc="mean"),
    mean_text_len=pd.NamedAgg(column="text_len", aggfunc="mean"),
    # Share of comments that contain at least one URL / hashtag / mention.
    url_ratio=pd.NamedAgg(column="url_cnt", aggfunc=_positive_share),
    hashtag_ratio=pd.NamedAgg(column="hashtag_cnt", aggfunc=_positive_share),
    mention_ratio=pd.NamedAgg(column="mention_cnt", aggfunc=_positive_share),
    korean_comment_ratio=pd.NamedAgg(column="is_korean_like", aggfunc="mean"),
    mean_hangul_ratio=pd.NamedAgg(column="hangul_ratio", aggfunc="mean"),
)

# A video whose like counts are all missing has a NaN mean; store 0 instead.
video_level = video_level.fillna({"mean_like_count": 0})
video_level.to_csv(OUT_VIDEO_LEVEL, index=False, encoding="utf-8-sig")
print("✅ saved video_level:", OUT_VIDEO_LEVEL, "| rows:", len(video_level))

display(video_level.head())


✅ saved video_level: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\02_comment_features\youtube_comment_features_video_level_kr_v1.csv | rows: 34


Unnamed: 0,video_id,comment_count,unique_authors,mean_like_count,mean_text_len,url_ratio,hashtag_ratio,mention_ratio,korean_comment_ratio,mean_hangul_ratio
0,-WGFbInX6JI,190,178,4.352632,36.226316,0.0,0.0,0.0,1.0,0.732096
1,0HXwT4gefnQ,53,53,0.09434,25.471698,0.0,0.0,0.0,1.0,0.824822
2,5y1YQx1g4Mg,192,178,1.166667,26.302083,0.005208,0.0,0.005208,1.0,0.744385
3,6BRMs8EH1Co,199,175,1.638191,27.18593,0.0,0.0,0.0,1.0,0.747631
4,80kIVHdpT_w,199,193,3.678392,32.045226,0.0,0.0,0.0,1.0,0.817846
