In [1]:
import pandas as pd
from datetime import timedelta
from sklearn.linear_model import LinearRegression
import numpy as np
from ast import literal_eval
from collections import Counter

# 🔧 Input path
input_path = "video_data_with_keywords_fin.csv"

# Mapping from numeric categoryID to the display name used in the output.
category_map = {
    1: "엔터테인먼트",
    2: "차량",
    3: "여행/음식",
    4: "게임",
    5: "스포츠",
    6: "라이프",
    7: "정치",
    8: "반려동물",
    9: "교육",
    10: "과학/기술"
}

# Analysis parameters.
RECENT_DAYS = 3            # length of the "recent" window, in calendar days
PREV_DAYS = 7              # length of the comparison window just before it
TOP_N = 20                 # maximum rows emitted per keyword list
FLASH_RATIO_THRESHOLD = 2  # recent/previous mean ratio needed to count as "flash"
MIN_PREV_MEAN = 1e-3       # below this previous mean a keyword is treated as new
MIN_TOTAL_FOR_TREND = 5    # minimum total count before a slope is fitted
MIN_DATES = 3              # minimum number of distinct dates to analyse a category
RESULT_COLUMNS = ['category', 'keyword', 'type', 'score', 'uploadDate']


def parse_keywords(raw):
    """Parse a stringified keyword list; return [] when it cannot be used.

    `raw` is expected to be the text of a Python list literal. Anything that
    fails to parse (including NaN cells) or parses to something other than a
    list/tuple/set is skipped rather than raising.
    """
    try:
        parsed = literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions literal_eval raises for malformed input.
        return []
    if not isinstance(parsed, (list, tuple, set)):
        # e.g. a bare string would otherwise be counted character by character.
        return []
    return list(parsed)


def build_keyword_frequency(sub_df):
    """Return a (date x keyword) count table for one category.

    Reads the 'uploadDate' and 'keywords' columns of `sub_df`. Rows are
    `datetime.date` values in ascending order, columns are keywords and cells
    are integer occurrence counts. Dates without any parsed keyword do not
    appear as rows.
    """
    date_keyword_freq = {}
    for uploaded, raw in zip(sub_df['uploadDate'], sub_df['keywords']):
        if pd.isna(uploaded):
            continue  # an unparseable date cannot be placed on the time axis
        keywords = parse_keywords(raw)
        if keywords:
            date_keyword_freq.setdefault(uploaded.date(), Counter()).update(keywords)

    return pd.DataFrame(date_keyword_freq).fillna(0).astype(int).T.sort_index()


def find_flash_keywords(cat, keyword_freq_df, recent_days=RECENT_DAYS,
                        prev_days=PREV_DAYS, top_n=TOP_N):
    """Return result rows for "flash" keywords of one category.

    The most recent `recent_days` calendar days (ending at the latest date in
    the table) are compared with the `prev_days` days immediately before them.

    * Keywords with (near-)zero mean in the previous window but a positive
      recent mean get score 'NEW' (top `top_n` by recent mean).
    * Other keywords whose recent/previous mean ratio exceeds
      FLASH_RATIO_THRESHOLD get that ratio, rounded to 2 decimals, as score
      (top `top_n` by ratio).
    """
    latest_date = keyword_freq_df.index.max()
    # .loc label slices include BOTH endpoints, hence the "- 1" terms.
    recent_start = latest_date - timedelta(days=recent_days - 1)
    prev_end = recent_start - timedelta(days=1)
    prev_start = prev_end - timedelta(days=prev_days - 1)

    recent_mean = keyword_freq_df.loc[recent_start:latest_date].mean()
    prev_window = keyword_freq_df.loc[prev_start:prev_end]
    if prev_window.empty:
        # No data at all in the comparison window: every keyword counts as new.
        prev_mean = pd.Series(0.0, index=keyword_freq_df.columns)
    else:
        prev_mean = prev_window.mean()

    is_new = prev_mean < MIN_PREV_MEAN
    rows = []

    new_keywords = (
        recent_mean[is_new & (recent_mean > 0)]
        .sort_values(ascending=False)
        .head(top_n)
    )
    for kw in new_keywords.index:
        rows.append({
            'category': cat,
            'keyword': kw,
            'type': 'flash',
            'score': 'NEW',
            'uploadDate': latest_date
        })

    # Only keywords with a meaningful previous mean get a ratio, which also
    # avoids dividing by zero.
    ratio = (recent_mean[~is_new] / prev_mean[~is_new]).sort_values(ascending=False)
    for kw, val in ratio[ratio > FLASH_RATIO_THRESHOLD].head(top_n).items():
        rows.append({
            'category': cat,
            'keyword': kw,
            'type': 'flash',
            'score': round(val, 2),
            'uploadDate': latest_date
        })
    return rows


def find_steady_keywords(cat, keyword_freq_df, top_n=TOP_N):
    """Return result rows for the keywords with the steepest upward trend.

    A linear regression of count against row position is fitted per keyword
    (keywords with a total count below MIN_TOTAL_FOR_TREND are skipped); the
    `top_n` largest slopes are returned, rounded to 4 decimals.
    """
    latest_date = keyword_freq_df.index.max()
    # NOTE: the regressor is the row position, not the calendar date, so gaps
    # between observed dates are treated as one step.
    X = np.arange(len(keyword_freq_df)).reshape(-1, 1)

    slopes = {}
    for kw in keyword_freq_df.columns:
        y = keyword_freq_df[kw].values
        if y.sum() < MIN_TOTAL_FOR_TREND:
            continue
        slopes[kw] = LinearRegression().fit(X, y).coef_[0]

    ranked = sorted(slopes.items(), key=lambda item: -item[1])[:top_n]
    return [
        {
            'category': cat,
            'keyword': kw,
            'type': 'steady',
            'score': round(val, 4),
            'uploadDate': latest_date
        }
        for kw, val in ranked
    ]


def analyze_category(cat, sub_df):
    """Return all flash and steady result rows for one category's videos.

    Returns an empty list when fewer than MIN_DATES distinct dates have
    keyword data.
    """
    keyword_freq_df = build_keyword_frequency(sub_df)
    if len(keyword_freq_df) < MIN_DATES:
        return []  # not enough history to analyse
    return (
        find_flash_keywords(cat, keyword_freq_df)
        + find_steady_keywords(cat, keyword_freq_df)
    )


# True both in a notebook kernel and when run as a script; keeps the file
# importable without touching the CSV files.
if __name__ == "__main__":
    # 1. Load the data
    df = pd.read_csv(input_path)
    df['uploadDate'] = pd.to_datetime(df['uploadDate'])

    # 2. Map category IDs to names
    df['category'] = df['categoryID'].map(category_map)

    # 3-4. Analyse each category and collect the rows
    results = []
    for cat, sub_df in df.groupby('category'):
        results.extend(analyze_category(cat, sub_df))

    # 5. Save the results (explicit columns keep the header when empty)
    result_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    result_df.to_csv("keyword_trend_by_category.csv", index=False)
    print("✅ 저장 완료: 'keyword_trend_by_category.csv'")

✅ 저장 완료: 'keyword_trend_by_category.csv'
