In [1]:
import re
import pandas as pd

from pathlib import Path


In [2]:
# =========================
# 프로젝트 루트 자동 탐색 + 경로
# =========================

def find_project_root() -> Path:

    p = Path.cwd()
    for parent in [p] + list(p.parents):
        if (parent / "data").exists() and (parent / "notebooks").exists():
            return parent

    return p

PROJECT_ROOT = find_project_root()

ACCUM_DIR = PROJECT_ROOT / "data" / "raw" / "api"
OUT_DIR = PROJECT_ROOT / "data" / "interim" / "02_comment_features"
OUT_DIR.mkdir(parents=True, exist_ok=True)

IN_PATH = ACCUM_DIR / "comments_raw_kr.csv"
OUT_KO_PATH = OUT_DIR / "comments_filtered_kr.csv"

print("IN_PATH:", IN_PATH)
print("OUT_KO_PATH:", OUT_KO_PATH)



IN_PATH: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\raw\api\comments_raw_kr.csv
OUT_KO_PATH: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\interim\02_comment_features\comments_filtered_kr.csv


In [3]:
# =========================
# 한국어 판별 함수
# =========================

hangul_re = re.compile(r"[가-힣]")

def is_korean_text(text: str, threshold: float = 0.15) -> bool:
    if not isinstance(text, str) or not text.strip():
        return False
    
    chars = [c for c in text if c.strip()]
    if not chars:
        return False
    
    hangul_cnt = sum(1 for c in chars if hangul_re.search(c))

    return (hangul_cnt / len(chars)) >= threshold


In [4]:
# =========================
# 로드
# =========================

df = pd.read_csv(IN_PATH, low_memory=False)

# =========================
# 중복 컬럼 표준화: likeCount/like_count, comment_publishedAt/published_at
# =========================

# like_count: like_count 우선, 없으면 likeCount 사용
if "like_count" not in df.columns and "likeCount" in df.columns:
    df["like_count"] = df["likeCount"]
elif "like_count" in df.columns and "likeCount" in df.columns:
    df["like_count"] = df["like_count"].fillna(df["likeCount"])

# published_at: published_at 우선, 없으면 comment_publishedAt 사용
if "published_at" not in df.columns and "comment_publishedAt" in df.columns:
    df["published_at"] = df["comment_publishedAt"]
elif "published_at" in df.columns and "comment_publishedAt" in df.columns:
    df["published_at"] = df["published_at"].fillna(df["comment_publishedAt"])

# =========================
# 타입 정규화
# =========================

# 문자열형
for c in ["video_id", "comment_id", "author_channel_id"]:
    if c in df.columns:
        df[c] = df[c].astype(str).fillna("").str.strip()

df["text"] = df.get("text", "").astype(str).fillna("").str.strip()

# 숫자형
df["like_count"] = pd.to_numeric(df.get("like_count", 0), errors="coerce").fillna(0)

# 날짜형 (문자열 유지해도 되지만, 최소 정규화)
df["published_at"] = pd.to_datetime(df.get("published_at", pd.NaT), errors="coerce")

# =========================
# 한국어 필터
# =========================

df["is_korean"] = df["text"].apply(is_korean_text)
df_ko = df[df["is_korean"]].copy()



In [5]:
# =========================
# 출력 컬럼
# =========================

keep_cols = [
    "video_id",
    "comment_id",
    "author_channel_id",
    "published_at",
    "like_count",
    "text",
    # 아래는 있으면 같이 보관 (재현/추적용) — 필요 없으면 지워도 됨
    "run_id",
    "country",
    "run_ts_utc",
    "collected_date",
    "updated_at",
    "author_display_name",
    "category_name",
]
keep_cols = [c for c in keep_cols if c in df_ko.columns]
df_ko = df_ko[keep_cols]

# published_at을 문자열로 저장(Excel/CSV 호환)
if "published_at" in df_ko.columns:
    df_ko["published_at"] = df_ko["published_at"].dt.strftime("%Y-%m-%d %H:%M:%S")

df_ko.to_csv(OUT_KO_PATH, index=False, encoding="utf-8-sig")

print("✅ raw rows:", len(df))
print("✅ ko rows :", len(df_ko))
print("✅ saved columns:", df_ko.columns.tolist())

✅ raw rows: 5714
✅ ko rows : 4428
✅ saved columns: ['video_id', 'comment_id', 'author_channel_id', 'published_at', 'like_count', 'text', 'run_ts_utc', 'collected_date', 'updated_at', 'author_display_name']
