# In [None]:
# tfidf_lightgbm_colab.ipynb (can also be run as a plain .py file)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import joblib

# 1. Load the training data.
train_df = pd.read_csv("train.csv")

# 2. TF-IDF vectorization: character 3- to 5-grams of each URL string,
#    with the vocabulary capped at 100,000 features.
tfidf_options = {
    'analyzer': 'char',
    'ngram_range': (3, 5),
    'max_features': 100000,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Target column first, then the feature matrix fitted on every row of
# train_df (the fitted vectorizer is reused further down the script).
y = train_df['label']
X = vectorizer.fit_transform(train_df['URL'])

# 3. Model training (3-fold stratified cross-validation, scored by ROC-AUC).
#
# NOTE(review): X is built upstream by fitting the TF-IDF vectorizer on the
# whole training set before it is split here, so the validation rows of each
# fold have already influenced the vocabulary / IDF weights. The AUC values
# printed below may therefore be slightly optimistic.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
auc_scores = []

# Hyperparameters are identical for every fold, so define them once.
lgb_params = dict(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.8,
    # LightGBM silently ignores `subsample` (bagging_fraction) unless bagging
    # is enabled with a non-zero frequency; 1 = draw a new sample every
    # boosting iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n📂 Fold {fold+1}")

    # X is indexed positionally (sparse matrix rows); y is a pandas Series,
    # so use .iloc to index by position as well.
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh estimator per fold so no state carries over between folds.
    model = lgb.LGBMClassifier(**lgb_params)

    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)
    print(f"✅ Fold {fold+1} ROC-AUC: {auc:.5f}")

print("\n📈 平均 ROC-AUC:", np.mean(auc_scores))

# 4. Save the vectorizer and the final model.
#
# The `model` left over from the CV loop was fitted on the training part of
# the last fold only, i.e. it never saw that fold's validation rows. Refit an
# estimator with the same hyperparameters on the complete data so that the
# artifact saved as "final_model.pkl" uses every labelled example.
model = lgb.LGBMClassifier(**model.get_params())
model.fit(X, y)

joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(model, "final_model.pkl")
