# 1. RF 학습

In [None]:
import os
import pandas as pd
import numpy as np
import random
import joblib
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# ===== 경로 설정 =====
# 단일 데이터셋(scam.csv)만 사용
PROJECT_ROOT = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATASET_DIR = os.path.join(PROJECT_ROOT, "dataset")
SCAM_DATA_PATH = os.path.join(DATASET_DIR, "scam_spam_stt.csv")
MODEL_SAVE_PATH = os.path.join(PROJECT_ROOT, "model/rf_voicephishing_251108.pkl")

# 모델 저장 경로 폴더 보장
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# ===== 난수 시드 고정 =====
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ===== wandb 초기화 =====
wandb.init(
    project="Voicephishing",
    name="tfidf_rf_251108_5runs",
    config={
        "model": "RandomForest",
        "max_features": 5000,
        "ngram_range": (1, 2),
        "random_state": SEED,
        "n_estimators": 100,
        "max_depth": 20,
        "min_samples_split": 2,
        "min_samples_leaf": 5,
        "n_jobs": -1,
        "num_runs": 5,
        "split_ratio": "train 0.70, val 0.15, test 0.15"
    }
)

# (수정) 단일 CSV 로드 및 결측치 제거
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"

df = pd.read_csv(SCAM_DATA_PATH)
df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

X_all = df[TEXT_COLUMN].astype(str)
y_all = df[LABEL_COLUMN]

print(f"Total samples: {len(df)}")

# 메트릭 누적용
val_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}
test_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

last_model = None
last_vectorizer = None

for run_idx in range(wandb.config.num_runs):
    run_seed = SEED + run_idx

    # ===== 데이터 분할: 70% train, 30% temp =====
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_all,
        y_all,
        test_size=0.30,
        random_state=run_seed,
        stratify=y_all
    )

    # ===== temp을 15% val, 15% test로 분할 (temp의 절반) =====
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=0.50,
        random_state=run_seed,
        stratify=y_temp
    )

    print(f"[Run {run_idx+1}] Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # ===== TF-IDF 벡터화 (train에만 fit) =====
    vectorizer = TfidfVectorizer(
        max_features=wandb.config.max_features,
        ngram_range=tuple(wandb.config.ngram_range),
        min_df=2,
        max_df=0.9
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    print(f"[Run {run_idx+1}] TF-IDF Train shape: {X_train_tfidf.shape}")
    print(f"[Run {run_idx+1}] TF-IDF Val shape: {X_val_tfidf.shape}")
    print(f"[Run {run_idx+1}] TF-IDF Test shape: {X_test_tfidf.shape}")

    # ===== 랜덤 포레스트 모델 =====
    rf = RandomForestClassifier(
        n_estimators=wandb.config.n_estimators,
        max_depth=wandb.config.max_depth,
        min_samples_split=wandb.config.min_samples_split,
        min_samples_leaf=wandb.config.min_samples_leaf,
        random_state=run_seed,
        n_jobs=wandb.config.n_jobs
    )

    # ===== 학습 =====
    rf.fit(X_train_tfidf, y_train)

    # ===== 검증 (Validation) =====
    y_val_pred = rf.predict(X_val_tfidf)
    print(f"--- [Run {run_idx+1}] Validation Set Results ---")
    print(classification_report(y_val, y_val_pred))

    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred, zero_division=0)
    val_recall = recall_score(y_val, y_val_pred, zero_division=0)
    val_f1 = f1_score(y_val, y_val_pred, zero_division=0)

    print(f"[Run {run_idx+1}] Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "val_accuracy": val_accuracy,
        "val_precision": val_precision,
        "val_recall": val_recall,
        "val_f1_score": val_f1
    })

    val_metrics["accuracy"].append(val_accuracy)
    val_metrics["precision"].append(val_precision)
    val_metrics["recall"].append(val_recall)
    val_metrics["f1"].append(val_f1)

    # ===== 최종 평가 (Test) =====
    y_test_pred = rf.predict(X_test_tfidf)
    print(f"\n--- [Run {run_idx+1}] Test Set Results (Final) ---")
    print(classification_report(y_test, y_test_pred))

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=0)

    print(f"[Run {run_idx+1}] Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "test_accuracy": test_accuracy,
        "test_precision": test_precision,
        "test_recall": test_recall,
        "test_f1_score": test_f1
    })

    test_metrics["accuracy"].append(test_accuracy)
    test_metrics["precision"].append(test_precision)
    test_metrics["recall"].append(test_recall)
    test_metrics["f1"].append(test_f1)

    # 마지막 모델과 벡터라이저 저장을 위해 보관
    last_model = rf
    last_vectorizer = vectorizer

# ===== 5회 평균 결과 출력 =====
def _mean(v):
    return float(np.mean(v)) if len(v) > 0 else float("nan")

val_avg = {k: _mean(v) for k, v in val_metrics.items()}
test_avg = {k: _mean(v) for k, v in test_metrics.items()}

print("\n=== Validation Averages over 5 runs ===")
print(f"Val Accuracy: {val_avg['accuracy']:.4f}, Val Precision: {val_avg['precision']:.4f}, Val Recall: {val_avg['recall']:.4f}, Val F1: {val_avg['f1']:.4f}")

print("\n=== Test Averages over 5 runs ===")
print(f"Test Accuracy: {test_avg['accuracy']:.4f}, Test Precision: {test_avg['precision']:.4f}, Test Recall: {test_avg['recall']:.4f}, Test F1: {test_avg['f1']:.4f}")

# wandb summary에 평균 기록
wandb.summary["val_accuracy_mean"] = val_avg["accuracy"]
wandb.summary["val_precision_mean"] = val_avg["precision"]
wandb.summary["val_recall_mean"] = val_avg["recall"]
wandb.summary["val_f1_mean"] = val_avg["f1"]

wandb.summary["test_accuracy_mean"] = test_avg["accuracy"]
wandb.summary["test_precision_mean"] = test_avg["precision"]
wandb.summary["test_recall_mean"] = test_avg["recall"]
wandb.summary["test_f1_mean"] = test_avg["f1"]

# ===== 모델 저장 (마지막 라운드 모델) =====
if last_model is not None:
    joblib.dump(last_model, MODEL_SAVE_PATH)
    try:
        wandb.save(MODEL_SAVE_PATH)
    except Exception:
        pass
    print(f"모델 저장 완료: {MODEL_SAVE_PATH}")

wandb.finish() 

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkujoon13413[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Total samples: 15463
[Run 1] Train: 10824, Val: 2319, Test: 2320
[Run 1] TF-IDF Train shape: (10824, 5000)
[Run 1] TF-IDF Val shape: (2319, 5000)
[Run 1] TF-IDF Test shape: (2320, 5000)
--- [Run 1] Validation Set Results ---
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1744
         1.0       1.00      0.84      0.91       575

    accuracy                           0.96      2319
   macro avg       0.97      0.92      0.94      2319
weighted avg       0.96      0.96      0.96      2319

[Run 1] Val Accuracy: 0.9590, Val Precision: 0.9979, Val Recall: 0.8365, Val F1: 0.9101

--- [Run 1] Test Set Results (Final) ---
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      1745
         1.0       1.00      0.81      0.89       575

    accuracy                           0.95      2320
   macro avg       0.97      0.90      0.93      2320
weighted avg       0.96      0.95      0.95   



모델 저장 완료: /content/drive/MyDrive/Github/2025_Voicephishing/model/rf_voicephishing_251108.pkl


0,1
run_idx,▁▁▃▃▅▅▆▆██
test_accuracy,▅▇██▁
test_f1_score,▅▇██▁
test_precision,███▁█
test_recall,▅▇▇█▁
val_accuracy,█▄▄▆▁
val_f1_score,█▄▄▆▁
val_precision,▁▁▁██
val_recall,█▄▄▆▁

0,1
run_idx,5
test_accuracy,0.94569
test_accuracy_mean,0.95328
test_f1_mean,0.89588
test_f1_score,0.87695
test_precision,1
test_precision_mean,0.99958
test_recall,0.78087
test_recall_mean,0.81183
val_accuracy,0.94437


# 2. LGBM 학습

In [2]:
import os
import pandas as pd
import numpy as np
import random
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, accuracy_score
import lightgbm as lgb

# ===== 경로 설정 =====
# 단일 데이터셋(scam.csv)만 사용
PROJECT_ROOT = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATASET_DIR = os.path.join(PROJECT_ROOT, "dataset")
SCAM_DATA_PATH = os.path.join(DATASET_DIR, "scam_spam_stt.csv")
MODEL_SAVE_PATH = os.path.join(PROJECT_ROOT, "model/lgbm_voicephishing_251108.txt")

# 모델 저장 경로 폴더 보장
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# ===== 난수 시드 고정 =====
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ===== wandb 초기화 =====
wandb.init(
    project="Voicephishing",
    name="tfidf_lgbm_251108_5runs",
    config={
        "model": "LightGBM",
        "max_features": 5000,
        "ngram_range": (1, 2),
        "random_state": SEED,
        "learning_rate": 0.05,
        "num_boost_round": 200,
        "max_depth": 6,
        "early_stopping_rounds": 20,
        "num_runs": 5,
        "split_ratio": "train 0.70, val 0.15, test 0.15"
    }
)

# ===== 단일 CSV 로드 및 결측치 제거 =====
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"

try:
    df = pd.read_csv(SCAM_DATA_PATH)
except FileNotFoundError as e:
    print(f"오류: 데이터 파일을 찾을 수 없습니다. 경로를 확인하세요: {e}")
    wandb.finish()
    raise SystemExit(1)

df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
X_all = df[TEXT_COLUMN].astype(str)
y_all = df[LABEL_COLUMN]

print(f"Total samples: {len(df)}")

# 메트릭 누적용
val_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}
test_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

last_model = None
last_vectorizer = None

for run_idx in range(wandb.config.num_runs):
    run_seed = SEED + run_idx

    # ===== 데이터 분할: 70% train, 30% temp =====
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_all,
        y_all,
        test_size=0.30,
        random_state=run_seed,
        stratify=y_all
    )

    # ===== temp을 15% val, 15% test로 분할 (temp의 절반) =====
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=0.50,
        random_state=run_seed,
        stratify=y_temp
    )

    print(f"[Run {run_idx+1}] Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # ===== TF-IDF 벡터화 (train에만 fit) =====
    vectorizer = TfidfVectorizer(
        max_features=wandb.config.max_features,
        ngram_range=tuple(wandb.config.ngram_range),
        min_df=2,
        max_df=0.9
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    print(f"[Run {run_idx+1}] TF-IDF Train shape: {X_train_tfidf.shape}")
    print(f"[Run {run_idx+1}] TF-IDF Val shape: {X_val_tfidf.shape}")
    print(f"[Run {run_idx+1}] TF-IDF Test shape: {X_test_tfidf.shape}")

    # ===== LGBM Dataset 생성 =====
    lgb_train = lgb.Dataset(X_train_tfidf, label=y_train)
    lgb_val = lgb.Dataset(X_val_tfidf, label=y_val, reference=lgb_train)

    # ===== 하이퍼파라미터 =====
    pos_count = float(np.sum(y_train))
    neg_count = float(len(y_train) - pos_count)
    scale_pos_weight = (neg_count / pos_count) if pos_count > 0 else 1.0

    params = {
        "objective": "binary",
        "metric": ["binary_logloss"],
        "learning_rate": wandb.config.learning_rate,
        "num_leaves": 31,
        "max_depth": int(wandb.config.max_depth) if wandb.config.max_depth is not None else -1,
        "scale_pos_weight": scale_pos_weight,
        "seed": run_seed,
        "verbosity": -1
    }

    # ===== 학습 =====
    bst = lgb.train(
        params,
        lgb_train,
        num_boost_round=int(wandb.config.num_boost_round),
        valid_sets=[lgb_train, lgb_val],
        valid_names=["train", "val"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=int(wandb.config.early_stopping_rounds), verbose=False),
            lgb.log_evaluation(period=50)
        ]
    )

    # ===== 검증 (Validation) =====
    y_val_pred_prob = bst.predict(X_val_tfidf, num_iteration=bst.best_iteration)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)

    print(f"--- [Run {run_idx+1}] Validation Set Results ---")
    print(classification_report(y_val, y_val_pred))

    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred, zero_division=0)
    val_recall = recall_score(y_val, y_val_pred, zero_division=0)
    val_f1 = f1_score(y_val, y_val_pred, zero_division=0)

    print(f"[Run {run_idx+1}] Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "val_accuracy": val_accuracy,
        "val_precision": val_precision,
        "val_recall": val_recall,
        "val_f1_score": val_f1
    })

    val_metrics["accuracy"].append(val_accuracy)
    val_metrics["precision"].append(val_precision)
    val_metrics["recall"].append(val_recall)
    val_metrics["f1"].append(val_f1)

    # ===== 최종 평가 (Test) =====
    y_test_pred_prob = bst.predict(X_test_tfidf, num_iteration=bst.best_iteration)
    y_test_pred = (y_test_pred_prob > 0.5).astype(int)

    print(f"\n--- [Run {run_idx+1}] Test Set Results (Final) ---")
    print(classification_report(y_test, y_test_pred))

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=0)

    print(f"[Run {run_idx+1}] Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "test_accuracy": test_accuracy,
        "test_precision": test_precision,
        "test_recall": test_recall,
        "test_f1_score": test_f1
    })

    test_metrics["accuracy"].append(test_accuracy)
    test_metrics["precision"].append(test_precision)
    test_metrics["recall"].append(test_recall)
    test_metrics["f1"].append(test_f1)

    # 마지막 모델과 벡터라이저 저장을 위해 보관
    last_model = bst
    last_vectorizer = vectorizer

# ===== 5회 평균 결과 출력 =====
def _mean(v):
    return float(np.mean(v)) if len(v) > 0 else float("nan")

val_avg = {k: _mean(v) for k, v in val_metrics.items()}
test_avg = {k: _mean(v) for k, v in test_metrics.items()}

print("\n=== Validation Averages over 5 runs ===")
print(f"Val Accuracy: {val_avg['accuracy']:.4f}, Val Precision: {val_avg['precision']:.4f}, Val Recall: {val_avg['recall']:.4f}, Val F1: {val_avg['f1']:.4f}")

print("\n=== Test Averages over 5 runs ===")
print(f"Test Accuracy: {test_avg['accuracy']:.4f}, Test Precision: {test_avg['precision']:.4f}, Test Recall: {test_avg['recall']:.4f}, Test F1: {test_avg['f1']:.4f}")

# wandb summary에 평균 기록
wandb.summary["val_accuracy_mean"] = val_avg["accuracy"]
wandb.summary["val_precision_mean"] = val_avg["precision"]
wandb.summary["val_recall_mean"] = val_avg["recall"]
wandb.summary["val_f1_mean"] = val_avg["f1"]

wandb.summary["test_accuracy_mean"] = test_avg["accuracy"]
wandb.summary["test_precision_mean"] = test_avg["precision"]
wandb.summary["test_recall_mean"] = test_avg["recall"]
wandb.summary["test_f1_mean"] = test_avg["f1"]

# ===== 모델 저장 (마지막 라운드 모델) =====
if last_model is not None:
    last_model.save_model(MODEL_SAVE_PATH)
    try:
        wandb.save(MODEL_SAVE_PATH)
    except Exception:
        pass
    print(f"모델 저장 완료: {MODEL_SAVE_PATH}")

wandb.finish()

Total samples: 15463
[Run 1] Train: 10824, Val: 2319, Test: 2320
[Run 1] TF-IDF Train shape: (10824, 5000)
[Run 1] TF-IDF Val shape: (2319, 5000)
[Run 1] TF-IDF Test shape: (2320, 5000)
[50]	train's binary_logloss: 0.161115	val's binary_logloss: 0.169525
[100]	train's binary_logloss: 0.1002	val's binary_logloss: 0.110117
[150]	train's binary_logloss: 0.0756842	val's binary_logloss: 0.0861556
[200]	train's binary_logloss: 0.0602114	val's binary_logloss: 0.0714732
--- [Run 1] Validation Set Results ---
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1744
         1.0       0.96      0.97      0.97       575

    accuracy                           0.98      2319
   macro avg       0.98      0.98      0.98      2319
weighted avg       0.98      0.98      0.98      2319

[Run 1] Val Accuracy: 0.9836, Val Precision: 0.9637, Val Recall: 0.9704, Val F1: 0.9671

--- [Run 1] Test Set Results (Final) ---
              precision    recall  f1

0,1
run_idx,▁▁▃▃▅▅▆▆██
test_accuracy,▃▆▃▁█
test_f1_score,▃▆▃▁█
test_precision,▇▃█▁▇
test_recall,▂█▁▅█
val_accuracy,▄▁▃█▅
val_f1_score,▄▁▃█▅
val_precision,▂▁▄██
val_recall,█▇▅▄▁

0,1
run_idx,5
test_accuracy,0.98578
test_accuracy_mean,0.98302
test_f1_mean,0.96543
test_f1_score,0.97113
test_precision,0.97711
test_precision_mean,0.9742
test_recall,0.96522
test_recall_mean,0.95687
val_accuracy,0.98448


# 3. XGB 학습

In [3]:
import os
import pandas as pd
import numpy as np
import random
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, accuracy_score
import xgboost as xgb

# ===== 경로 설정 =====
# 단일 데이터셋(scam.csv)만 사용
PROJECT_ROOT = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATASET_DIR = os.path.join(PROJECT_ROOT, "dataset")
SCAM_DATA_PATH = os.path.join(DATASET_DIR, "scam_spam_stt.csv")
MODEL_SAVE_PATH = os.path.join(PROJECT_ROOT, "model/xgb_voicephishing_251108.model")

# 모델 저장 경로 폴더 보장
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# ===== 난수 시드 고정 =====
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ===== wandb 초기화 =====
wandb.init(
    project="Voicephishing",
    name="tfidf_xgb_251108_5runs",
    config={
        "model": "XGBoost",
        "max_features": 5000,
        "ngram_range": (1, 2),
        "random_state": SEED,
        # xgb params
        "eta": 0.1,
        "max_depth": 6,
        "num_boost_round": 200,
        "early_stopping_rounds": 20,
        "num_runs": 5,
        "split_ratio": "train 0.70, val 0.15, test 0.15"
    }
)

# ===== 단일 CSV 로드 및 결측치 제거 =====
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"

try:
    df = pd.read_csv(SCAM_DATA_PATH)
except FileNotFoundError as e:
    print(f"오류: 데이터 파일을 찾을 수 없습니다. 경로를 확인하세요: {e}")
    wandb.finish()
    raise SystemExit(1)

df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
X_all = df[TEXT_COLUMN].astype(str)
y_all = df[LABEL_COLUMN]

print(f"Total samples: {len(df)}")

# 메트릭 누적용
val_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}
test_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

last_model = None
last_vectorizer = None

for run_idx in range(wandb.config.num_runs):
    run_seed = SEED + run_idx

    # ===== 데이터 분할: 70% train, 30% temp =====
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_all,
        y_all,
        test_size=0.30,
        random_state=run_seed,
        stratify=y_all
    )

    # ===== temp을 15% val, 15% test로 분할 (temp의 절반) =====
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=0.50,
        random_state=run_seed,
        stratify=y_temp
    )

    print(f"[Run {run_idx+1}] Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # ===== TF-IDF 벡터화 (train에만 fit) =====
    vectorizer = TfidfVectorizer(
        max_features=wandb.config.max_features,
        ngram_range=tuple(wandb.config.ngram_range),
        min_df=2,
        max_df=0.9
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    print(f"[Run {run_idx+1}] TF-IDF Train shape: {X_train_tfidf.shape}")
    print(f"[Run {run_idx+1}] TF-IDF Val shape: {X_val_tfidf.shape}")
    print(f"[Run {run_idx+1}] TF-IDF Test shape: {X_test_tfidf.shape}")

    # ===== XGBoost DMatrix 생성 =====
    dtrain = xgb.DMatrix(X_train_tfidf, label=y_train)
    dval = xgb.DMatrix(X_val_tfidf, label=y_val)
    dtest = xgb.DMatrix(X_test_tfidf, label=y_test)

    # ===== 하이퍼파라미터 =====
    pos_count = float(np.sum(y_train))
    neg_count = float(len(y_train) - pos_count)
    scale_pos_weight = (neg_count / pos_count) if pos_count > 0 else 1.0

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": float(wandb.config.eta),
        "max_depth": int(wandb.config.max_depth),
        "scale_pos_weight": scale_pos_weight,
        "seed": run_seed,
        "verbosity": 0
    }

    evals = [(dtrain, "train"), (dval, "val")]

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=int(wandb.config.num_boost_round),
        evals=evals,
        early_stopping_rounds=int(wandb.config.early_stopping_rounds),
        verbose_eval=False
    )

    # ===== 검증 (Validation) =====
    y_val_pred_prob = bst.predict(dval, iteration_range=(0, bst.best_iteration + 1) if bst.best_iteration is not None else None)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)

    print(f"--- [Run {run_idx+1}] Validation Set Results ---")
    print(classification_report(y_val, y_val_pred))

    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred, zero_division=0)
    val_recall = recall_score(y_val, y_val_pred, zero_division=0)
    val_f1 = f1_score(y_val, y_val_pred, zero_division=0)

    print(f"[Run {run_idx+1}] Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "val_accuracy": val_accuracy,
        "val_precision": val_precision,
        "val_recall": val_recall,
        "val_f1_score": val_f1
    })

    val_metrics["accuracy"].append(val_accuracy)
    val_metrics["precision"].append(val_precision)
    val_metrics["recall"].append(val_recall)
    val_metrics["f1"].append(val_f1)

    # ===== 최종 평가 (Test) =====
    y_test_pred_prob = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1) if bst.best_iteration is not None else None)
    y_test_pred = (y_test_pred_prob > 0.5).astype(int)

    print(f"\n--- [Run {run_idx+1}] Test Set Results (Final) ---")
    print(classification_report(y_test, y_test_pred))

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=0)

    print(f"[Run {run_idx+1}] Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "test_accuracy": test_accuracy,
        "test_precision": test_precision,
        "test_recall": test_recall,
        "test_f1_score": test_f1
    })

    test_metrics["accuracy"].append(test_accuracy)
    test_metrics["precision"].append(test_precision)
    test_metrics["recall"].append(test_recall)
    test_metrics["f1"].append(test_f1)

    # 마지막 모델과 벡터라이저 저장을 위해 보관
    last_model = bst
    last_vectorizer = vectorizer

# ===== 5회 평균 결과 출력 =====
def _mean(v):
    return float(np.mean(v)) if len(v) > 0 else float("nan")

val_avg = {k: _mean(v) for k, v in val_metrics.items()}
test_avg = {k: _mean(v) for k, v in test_metrics.items()}

print("\n=== Validation Averages over 5 runs ===")
print(f"Val Accuracy: {val_avg['accuracy']:.4f}, Val Precision: {val_avg['precision']:.4f}, Val Recall: {val_avg['recall']:.4f}, Val F1: {val_avg['f1']:.4f}")

print("\n=== Test Averages over 5 runs ===")
print(f"Test Accuracy: {test_avg['accuracy']:.4f}, Test Precision: {test_avg['precision']:.4f}, Test Recall: {test_avg['recall']:.4f}, Test F1: {test_avg['f1']:.4f}")

# wandb summary에 평균 기록
wandb.summary["val_accuracy_mean"] = val_avg["accuracy"]
wandb.summary["val_precision_mean"] = val_avg["precision"]
wandb.summary["val_recall_mean"] = val_avg["recall"]
wandb.summary["val_f1_mean"] = val_avg["f1"]

wandb.summary["test_accuracy_mean"] = test_avg["accuracy"]
wandb.summary["test_precision_mean"] = test_avg["precision"]
wandb.summary["test_recall_mean"] = test_avg["recall"]
wandb.summary["test_f1_mean"] = test_avg["f1"]

# ===== 모델 저장 (마지막 라운드 모델) =====
if last_model is not None:
    last_model.save_model(MODEL_SAVE_PATH)
    try:
        wandb.save(MODEL_SAVE_PATH)
    except Exception:
        pass
    print(f"모델 저장 완료: {MODEL_SAVE_PATH}")

wandb.finish()

Total samples: 15463
[Run 1] Train: 10824, Val: 2319, Test: 2320
[Run 1] TF-IDF Train shape: (10824, 5000)
[Run 1] TF-IDF Val shape: (2319, 5000)
[Run 1] TF-IDF Test shape: (2320, 5000)
--- [Run 1] Validation Set Results ---
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1744
         1.0       0.98      0.97      0.98       575

    accuracy                           0.99      2319
   macro avg       0.99      0.98      0.98      2319
weighted avg       0.99      0.99      0.99      2319

[Run 1] Val Accuracy: 0.9884, Val Precision: 0.9790, Val Recall: 0.9739, Val F1: 0.9765

--- [Run 1] Test Set Results (Final) ---
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1745
         1.0       0.98      0.96      0.97       575

    accuracy                           0.99      2320
   macro avg       0.98      0.98      0.98      2320
weighted avg       0.99      0.99      0.99   

0,1
run_idx,▁▁▃▃▅▅▆▆██
test_accuracy,▁▇▄▅█
test_f1_score,▁▇▃▅█
test_precision,▁▂█▅█
test_recall,▂█▁▅▆
val_accuracy,▁▁▁▂█
val_f1_score,▁▁▁▂█
val_precision,▃▁▄██
val_recall,▆█▄▁▄

0,1
run_idx,5
test_accuracy,0.99009
test_accuracy_mean,0.98793
test_f1_mean,0.97547
test_f1_score,0.97984
test_precision,0.98763
test_precision_mean,0.98273
test_recall,0.97217
test_recall_mean,0.96835
val_accuracy,0.99051


# 4. LSTM 학습

In [None]:
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from tqdm import tqdm
from transformers import BertTokenizer
import wandb

# ===== 경로 설정 =====
# 단일 데이터셋(scam.csv)만 사용
PROJECT_ROOT = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATASET_DIR = os.path.join(PROJECT_ROOT, "dataset")
SCAM_DATA_PATH = os.path.join(DATASET_DIR, "spam_stt.csv")
MODEL_SAVE_PATH = os.path.join(PROJECT_ROOT, "model/lstm_251107.pt")

os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# ===== 시드 고정 =====
def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

BASE_SEED = 42
set_seed(BASE_SEED)

# ===== GPU 설정 =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ===== wandb 초기화 =====
wandb.init(
    project="Voicephishing",
    name="lstm_251107_5runs",
    config={
        "model": "LSTM",
        "embed_dim": 128,
        "hidden_dim": 256,
        "num_layers": 2,
        "dropout": 0.3,
        "batch_size": 16,
        "eval_batch_size": 32,
        "learning_rate": 2e-4,
        "epochs": 10,
        "max_len": 1024,
        "num_runs": 5,
        "split_ratio": "train 0.70, val 0.15, test 0.15"
    }
)

# ===== 1. 데이터 불러오기 =====
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"

try:
    df = pd.read_csv(SCAM_DATA_PATH)
except FileNotFoundError as e:
    print(f"오류: 데이터 파일을 찾을 수 없습니다. 경로를 확인하세요: {e}")
    wandb.finish()
    raise SystemExit(1)

df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
X_all = df[TEXT_COLUMN].astype(str)
y_all = df[LABEL_COLUMN].astype(int)

print(f"Total samples: {len(df)}")

# ===== Tokenizer =====
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
MAX_LEN = int(wandb.config.max_len)

def encode_texts(texts, tokenizer, max_len):
    input_ids, attention_masks = [], []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        input_ids.append(encoded["input_ids"])  # [1, L]
        attention_masks.append(encoded["attention_mask"])  # [1, L]
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# ===== Dataset 클래스 =====
class LSTMDataset(Dataset):
    def __init__(self, inputs, masks, labels):
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs[idx],
            "attention_mask": self.masks[idx],
            "labels": self.labels[idx]
        }

# ===== LSTM 모델 정의 =====
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2, dropout=0.3):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        packed_output, (hidden, cell) = self.lstm(embedded)
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)  # bidirectional
        output = self.fc(hidden_cat)
        return output.squeeze()

# ===== 평가 함수 =====
def evaluate(model, dataloader, criterion):
    model.eval()
    preds, targets = [], []
    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * input_ids.size(0)
            preds.extend((outputs > 0.5).long().cpu().numpy())
            targets.extend(labels.long().cpu().numpy())
    avg_loss = total_loss / len(dataloader.dataset) if len(dataloader.dataset) > 0 else 0.0
    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, zero_division=0)
    precision = precision_score(targets, preds, zero_division=0)
    recall = recall_score(targets, preds, zero_division=0)
    return avg_loss, acc, f1, precision, recall, targets, preds

# ===== 5회 반복 학습/평가 =====
val_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}
test_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

last_saved_model_path = None

for run_idx in range(int(wandb.config.num_runs)):
    run_seed = BASE_SEED + run_idx
    set_seed(run_seed)

    # ----- 데이터 분할: 70% / 30% -----
    X_train_texts, X_temp_texts, y_train, y_temp = train_test_split(
        X_all,
        y_all,
        test_size=0.30,
        random_state=run_seed,
        stratify=y_all
    )

    # ----- temp을 15% val, 15% test (temp의 절반) -----
    X_val_texts, X_test_texts, y_val, y_test = train_test_split(
        X_temp_texts,
        y_temp,
        test_size=0.50,
        random_state=run_seed,
        stratify=y_temp
    )

    print(f"[Run {run_idx+1}] Train: {len(X_train_texts)}, Val: {len(X_val_texts)}, Test: {len(X_test_texts)}")

    # ----- 인코딩 -----
    train_inputs, train_masks = encode_texts(X_train_texts.tolist(), tokenizer, MAX_LEN)
    val_inputs, val_masks = encode_texts(X_val_texts.tolist(), tokenizer, MAX_LEN)
    test_inputs, test_masks = encode_texts(X_test_texts.tolist(), tokenizer, MAX_LEN)

    train_labels_tensor = torch.tensor(y_train.tolist(), dtype=torch.float32)
    val_labels_tensor = torch.tensor(y_val.tolist(), dtype=torch.float32)
    test_labels_tensor = torch.tensor(y_test.tolist(), dtype=torch.float32)

    # ----- Dataset / DataLoader -----
    train_dataset = LSTMDataset(train_inputs, train_masks, train_labels_tensor)
    val_dataset = LSTMDataset(val_inputs, val_masks, val_labels_tensor)
    test_dataset = LSTMDataset(test_inputs, test_masks, test_labels_tensor)

    train_loader = DataLoader(train_dataset, batch_size=int(wandb.config.batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=int(wandb.config.eval_batch_size))
    test_loader = DataLoader(test_dataset, batch_size=int(wandb.config.eval_batch_size))

    # ----- 모델/옵티마이저 -----
    vocab_size = tokenizer.vocab_size
    model = LSTMClassifier(
        vocab_size=vocab_size,
        embed_dim=int(wandb.config.embed_dim),
        hidden_dim=int(wandb.config.hidden_dim),
        num_layers=int(wandb.config.num_layers),
        dropout=float(wandb.config.dropout)
    ).to(device)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=float(wandb.config.learning_rate))

    # ----- 학습 루프 -----
    num_epochs = int(wandb.config.epochs)
    best_f1 = 0.0
    best_model_path = MODEL_SAVE_PATH  # 동일 경로 사용 (각 run마다 갱신)

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0.0
        loop = tqdm(train_loader, desc=f"Run {run_idx+1} | Epoch {epoch+1}")
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            if torch.isnan(loss):
                print("Warning: NaN loss detected. Skipping batch.")
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            total_train_loss += loss.item() * input_ids.size(0)
            loop.set_postfix(train_loss=f"{loss.item():.4f}")

        avg_train_loss = total_train_loss / len(train_loader.dataset) if len(train_loader.dataset) > 0 else 0.0

        val_loss, val_acc, val_f1, val_precision, val_recall, _, _ = evaluate(model, val_loader, criterion)

        wandb.log({
            "run_idx": run_idx + 1,
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": val_loss,
            "val_accuracy": val_acc,
            "val_f1": val_f1,
            "val_precision": val_precision,
            "val_recall": val_recall
        })

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), best_model_path)
            print("Best model saved!")
            try:
                wandb.save(best_model_path)
            except Exception:
                pass

    # ----- Best 모델 로드 후 Val/Test 평가 -----
    try:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
        print(f"[Run {run_idx+1}] 베스트 모델 로드: {best_model_path}")
    except Exception as e:
        print(f"[Run {run_idx+1}] 베스트 모델 로드 실패: {e}")

    val_loss, val_acc, val_f1, val_precision, val_recall, val_targets, val_preds = evaluate(model, val_loader, criterion)
    print(f"\n--- [Run {run_idx+1}] Validation Set Results ---")
    print(classification_report(val_targets, val_preds))
    print(f"[Run {run_idx+1}] Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f}")

    test_loss, test_acc, test_f1, test_precision, test_recall, test_targets, test_preds = evaluate(model, test_loader, criterion)
    print(f"\n--- [Run {run_idx+1}] Test Set Results (Final) ---")
    print(classification_report(test_targets, test_preds))
    print(f"[Run {run_idx+1}] Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")

    wandb.log({
        "run_idx": run_idx + 1,
        "final_val_loss": val_loss,
        "final_val_accuracy": val_acc,
        "final_val_f1": val_f1,
        "final_val_precision": val_precision,
        "final_val_recall": val_recall,
        "test_loss": test_loss,
        "test_accuracy": test_acc,
        "test_f1": test_f1,
        "test_precision": test_precision,
        "test_recall": test_recall
    })

    val_metrics["accuracy"].append(val_acc)
    val_metrics["precision"].append(val_precision)
    val_metrics["recall"].append(val_recall)
    val_metrics["f1"].append(val_f1)

    test_metrics["accuracy"].append(test_acc)
    test_metrics["precision"].append(test_precision)
    test_metrics["recall"].append(test_recall)
    test_metrics["f1"].append(test_f1)

    last_saved_model_path = best_model_path

# ===== 5회 평균 결과 출력 =====
def _mean(v):
    return float(np.mean(v)) if len(v) > 0 else float("nan")

val_avg = {k: _mean(v) for k, v in val_metrics.items()}
test_avg = {k: _mean(v) for k, v in test_metrics.items()}

print("\n=== Validation Averages over 5 runs ===")
print(f"Val Accuracy: {val_avg['accuracy']:.4f}, Val Precision: {val_avg['precision']:.4f}, Val Recall: {val_avg['recall']:.4f}, Val F1: {val_avg['f1']:.4f}")

print("\n=== Test Averages over 5 runs ===")
print(f"Test Accuracy: {test_avg['accuracy']:.4f}, Test Precision: {test_avg['precision']:.4f}, Test Recall: {test_avg['recall']:.4f}, Test F1: {test_avg['f1']:.4f}")

# wandb summary에 평균 기록
wandb.summary["val_accuracy_mean"] = val_avg["accuracy"]
wandb.summary["val_precision_mean"] = val_avg["precision"]
wandb.summary["val_recall_mean"] = val_avg["recall"]
wandb.summary["val_f1_mean"] = val_avg["f1"]

wandb.summary["test_accuracy_mean"] = test_avg["accuracy"]
wandb.summary["test_precision_mean"] = test_avg["precision"]
wandb.summary["test_recall_mean"] = test_avg["recall"]
wandb.summary["test_f1_mean"] = test_avg["f1"]

# 마지막으로 저장된 베스트 모델 경로 안내
if last_saved_model_path:
    print(f"마지막 저장된 베스트 모델: {last_saved_model_path}")

wandb.finish()

Device: cuda


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkujoon13413[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Total samples: 11623


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

[Run 1] Train: 8136, Val: 1743, Test: 1744


Run 1 | Epoch 1: 100%|██████████| 509/509 [01:07<00:00,  7.59it/s, train_loss=0.0062]


Best model saved!


Run 1 | Epoch 2: 100%|██████████| 509/509 [01:09<00:00,  7.29it/s, train_loss=0.0087]


Best model saved!


Run 1 | Epoch 3: 100%|██████████| 509/509 [01:09<00:00,  7.31it/s, train_loss=0.0014]


Best model saved!


Run 1 | Epoch 4: 100%|██████████| 509/509 [01:10<00:00,  7.25it/s, train_loss=0.0003]


Best model saved!


Run 1 | Epoch 5: 100%|██████████| 509/509 [01:09<00:00,  7.34it/s, train_loss=0.0001]
Run 1 | Epoch 6: 100%|██████████| 509/509 [01:08<00:00,  7.45it/s, train_loss=0.0001]
Run 1 | Epoch 7: 100%|██████████| 509/509 [01:08<00:00,  7.38it/s, train_loss=0.0021]
Run 1 | Epoch 8: 100%|██████████| 509/509 [01:08<00:00,  7.42it/s, train_loss=0.0004]
Run 1 | Epoch 9: 100%|██████████| 509/509 [01:09<00:00,  7.36it/s, train_loss=0.0001]
Run 1 | Epoch 10: 100%|██████████| 509/509 [01:08<00:00,  7.38it/s, train_loss=0.0001]


[Run 1] 베스트 모델 로드: /content/drive/MyDrive/Github/2025_Voicephishing/model/lstm_251107.pt

--- [Run 1] Validation Set Results ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1456
           1       0.98      0.98      0.98       287

    accuracy                           0.99      1743
   macro avg       0.99      0.99      0.99      1743
weighted avg       0.99      0.99      0.99      1743

[Run 1] Val Loss: 0.0193 | Val Acc: 0.9943 | Val F1: 0.9826 | Val Precision: 0.9826 | Val Recall: 0.9826

--- [Run 1] Test Set Results (Final) ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1457
           1       0.99      0.94      0.96       287

    accuracy                           0.99      1744
   macro avg       0.99      0.97      0.98      1744
weighted avg       0.99      0.99      0.99      1744

[Run 1] Test Loss: 0.0663 | Test Acc: 0.9874 | Test F1: 0.9607 | Test 

Run 2 | Epoch 1: 100%|██████████| 509/509 [01:09<00:00,  7.33it/s, train_loss=0.1751]


Best model saved!


Run 2 | Epoch 2: 100%|██████████| 509/509 [01:10<00:00,  7.25it/s, train_loss=0.0053]


Best model saved!


Run 2 | Epoch 3: 100%|██████████| 509/509 [01:10<00:00,  7.24it/s, train_loss=0.0027]


Best model saved!


Run 2 | Epoch 4: 100%|██████████| 509/509 [01:10<00:00,  7.24it/s, train_loss=0.0005]
Run 2 | Epoch 5: 100%|██████████| 509/509 [01:08<00:00,  7.42it/s, train_loss=0.0005]
Run 2 | Epoch 6: 100%|██████████| 509/509 [01:08<00:00,  7.43it/s, train_loss=0.0004]


Best model saved!


Run 2 | Epoch 7: 100%|██████████| 509/509 [01:08<00:00,  7.38it/s, train_loss=0.0000]


Best model saved!


Run 2 | Epoch 8: 100%|██████████| 509/509 [01:09<00:00,  7.31it/s, train_loss=0.0140]
Run 2 | Epoch 9: 100%|██████████| 509/509 [01:09<00:00,  7.33it/s, train_loss=0.0000]


Best model saved!


Run 2 | Epoch 10: 100%|██████████| 509/509 [01:09<00:00,  7.28it/s, train_loss=0.0005]


[Run 2] 베스트 모델 로드: /content/drive/MyDrive/Github/2025_Voicephishing/model/lstm_251107.pt

--- [Run 2] Validation Set Results ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1456
           1       0.99      0.97      0.98       287

    accuracy                           0.99      1743
   macro avg       0.99      0.98      0.99      1743
weighted avg       0.99      0.99      0.99      1743

[Run 2] Val Loss: 0.0474 | Val Acc: 0.9937 | Val F1: 0.9806 | Val Precision: 0.9929 | Val Recall: 0.9686

--- [Run 2] Test Set Results (Final) ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1457
           1       1.00      0.97      0.98       287

    accuracy                           0.99      1744
   macro avg       1.00      0.99      0.99      1744
weighted avg       0.99      0.99      0.99      1744

[Run 2] Test Loss: 0.0334 | Test Acc: 0.9948 | Test F1: 0.9841 | Test 

Run 3 | Epoch 1: 100%|██████████| 509/509 [01:08<00:00,  7.40it/s, train_loss=0.3845]


Best model saved!


Run 3 | Epoch 2: 100%|██████████| 509/509 [01:10<00:00,  7.26it/s, train_loss=0.5664]


Best model saved!


Run 3 | Epoch 3: 100%|██████████| 509/509 [01:08<00:00,  7.38it/s, train_loss=0.0012]


Best model saved!


Run 3 | Epoch 4: 100%|██████████| 509/509 [01:09<00:00,  7.37it/s, train_loss=0.0007]


Best model saved!


Run 3 | Epoch 5: 100%|██████████| 509/509 [01:09<00:00,  7.27it/s, train_loss=0.0037]


Best model saved!


Run 3 | Epoch 6: 100%|██████████| 509/509 [01:09<00:00,  7.33it/s, train_loss=0.0000]
Run 3 | Epoch 7: 100%|██████████| 509/509 [01:08<00:00,  7.43it/s, train_loss=0.0000]
Run 3 | Epoch 8: 100%|██████████| 509/509 [01:08<00:00,  7.41it/s, train_loss=0.0000]
Run 3 | Epoch 9: 100%|██████████| 509/509 [01:08<00:00,  7.44it/s, train_loss=0.0000]
Run 3 | Epoch 10: 100%|██████████| 509/509 [01:08<00:00,  7.45it/s, train_loss=0.0001]


Best model saved!
[Run 3] 베스트 모델 로드: /content/drive/MyDrive/Github/2025_Voicephishing/model/lstm_251107.pt

--- [Run 3] Validation Set Results ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1456
           1       0.98      0.98      0.98       287

    accuracy                           0.99      1743
   macro avg       0.99      0.99      0.99      1743
weighted avg       0.99      0.99      0.99      1743

[Run 3] Val Loss: 0.0618 | Val Acc: 0.9920 | Val F1: 0.9756 | Val Precision: 0.9756 | Val Recall: 0.9756

--- [Run 3] Test Set Results (Final) ---
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1457
           1       0.96      0.99      0.98       287

    accuracy                           0.99      1744
   macro avg       0.98      0.99      0.99      1744
weighted avg       0.99      0.99      0.99      1744

[Run 3] Test Loss: 0.0582 | Test Acc: 0.9920 | Test 

Run 4 | Epoch 1: 100%|██████████| 509/509 [01:08<00:00,  7.39it/s, train_loss=0.0127]


Best model saved!


Run 4 | Epoch 2: 100%|██████████| 509/509 [01:10<00:00,  7.20it/s, train_loss=0.8922]


Best model saved!


Run 4 | Epoch 3: 100%|██████████| 509/509 [01:09<00:00,  7.32it/s, train_loss=0.0042]


Best model saved!


Run 4 | Epoch 4: 100%|██████████| 509/509 [01:10<00:00,  7.24it/s, train_loss=0.0004]
Run 4 | Epoch 5: 100%|██████████| 509/509 [01:08<00:00,  7.44it/s, train_loss=0.0062]
Run 4 | Epoch 6: 100%|██████████| 509/509 [01:08<00:00,  7.41it/s, train_loss=0.0008]


Best model saved!


Run 4 | Epoch 7: 100%|██████████| 509/509 [01:10<00:00,  7.25it/s, train_loss=0.0277]


Best model saved!


Run 4 | Epoch 8: 100%|██████████| 509/509 [01:10<00:00,  7.26it/s, train_loss=0.0000]


Best model saved!


Run 4 | Epoch 9: 100%|██████████| 509/509 [01:09<00:00,  7.31it/s, train_loss=0.0002]


Best model saved!


Run 4 | Epoch 10: 100%|██████████| 509/509 [01:09<00:00,  7.27it/s, train_loss=0.0000]


[Run 4] 베스트 모델 로드: /content/drive/MyDrive/Github/2025_Voicephishing/model/lstm_251107.pt

--- [Run 4] Validation Set Results ---
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1456
           1       0.97      0.99      0.98       287

    accuracy                           0.99      1743
   macro avg       0.99      0.99      0.99      1743
weighted avg       0.99      0.99      0.99      1743

[Run 4] Val Loss: 0.0274 | Val Acc: 0.9943 | Val F1: 0.9828 | Val Precision: 0.9727 | Val Recall: 0.9930

--- [Run 4] Test Set Results (Final) ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1457
           1       0.98      0.98      0.98       287

    accuracy                           0.99      1744
   macro avg       0.99      0.99      0.99      1744
weighted avg       0.99      0.99      0.99      1744

[Run 4] Test Loss: 0.0355 | Test Acc: 0.9937 | Test F1: 0.9808 | Test 

Run 5 | Epoch 1: 100%|██████████| 509/509 [01:08<00:00,  7.38it/s, train_loss=0.0051]


Best model saved!


Run 5 | Epoch 2: 100%|██████████| 509/509 [01:09<00:00,  7.32it/s, train_loss=0.3794]


Best model saved!


Run 5 | Epoch 3: 100%|██████████| 509/509 [01:09<00:00,  7.29it/s, train_loss=0.0003]


Best model saved!


Run 5 | Epoch 4: 100%|██████████| 509/509 [01:09<00:00,  7.31it/s, train_loss=0.0002]


Best model saved!


Run 5 | Epoch 5: 100%|██████████| 509/509 [01:09<00:00,  7.36it/s, train_loss=0.0005]


Best model saved!


Run 5 | Epoch 6: 100%|██████████| 509/509 [01:09<00:00,  7.33it/s, train_loss=0.0001]


Best model saved!


Run 5 | Epoch 7: 100%|██████████| 509/509 [01:09<00:00,  7.32it/s, train_loss=0.0005]
Run 5 | Epoch 8: 100%|██████████| 509/509 [01:08<00:00,  7.39it/s, train_loss=0.0001]
Run 5 | Epoch 9: 100%|██████████| 509/509 [01:08<00:00,  7.40it/s, train_loss=0.0004]
Run 5 | Epoch 10: 100%|██████████| 509/509 [01:08<00:00,  7.41it/s, train_loss=0.0000]


[Run 5] 베스트 모델 로드: /content/drive/MyDrive/Github/2025_Voicephishing/model/lstm_251107.pt

--- [Run 5] Validation Set Results ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1456
           1       0.99      0.97      0.98       287

    accuracy                           0.99      1743
   macro avg       0.99      0.98      0.99      1743
weighted avg       0.99      0.99      0.99      1743

[Run 5] Val Loss: 0.0479 | Val Acc: 0.9925 | Val F1: 0.9772 | Val Precision: 0.9858 | Val Recall: 0.9686

--- [Run 5] Test Set Results (Final) ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1457
           1       0.99      0.96      0.97       287

    accuracy                           0.99      1744
   macro avg       0.99      0.98      0.98      1744
weighted avg       0.99      0.99      0.99      1744

[Run 5] Test Loss: 0.0486 | Test Acc: 0.9914 | Test F1: 0.9735 | Test 

0,1
epoch,▁▂▃▃▄▆▇█▁▂▃▄▅▆▆█▁▂▃▃▅▆▆▇█▂▃▄▅▆▇█▁▂▃▄▅▆▆█
final_val_accuracy,█▆▁█▃
final_val_f1,█▆▁█▃
final_val_loss,▁▆█▂▆
final_val_precision,▄█▂▁▆
final_val_recall,▅▁▃█▁
run_idx,▁▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▃▅▅▅▅▅▆▆▆▆▆▆▆▆█████████
test_accuracy,▁█▅▇▅
test_f1,▁█▆▇▅
test_loss,█▁▆▁▄

0,1
epoch,10
final_val_accuracy,0.99254
final_val_f1,0.97715
final_val_loss,0.04786
final_val_precision,0.98582
final_val_recall,0.96864
run_idx,5
test_accuracy,0.9914
test_accuracy_mean,0.99186
test_f1,0.97345


In [None]:
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split # (추가) 5-run split을 위해
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from tqdm import tqdm
from transformers import BertTokenizer
import wandb

# ===== 경로 설정 =====
BASE_DIR = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATASET_DIR = os.path.join(BASE_DIR, "dataset")
# (수정) korccvi.csv 파일 하나만 사용
KORCCVI_DATA_PATH = os.path.join(DATASET_DIR, "spam_stt.csv")
MODEL_SAVE_DIR = os.path.join(BASE_DIR, "model") # (수정) 모델 저장 '디렉토리'

# (추가) 5-Run 설정
N_RUNS = 5
all_test_metrics = [] # 5번의 테스트 결과를 저장할 리스트

# (수정) korccvi.csv의 실제 컬럼명 사용
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'

# ===== 난수 시드 고정 함수 (유지) =====
def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# ===== GPU 설정 (유지) =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ===== Dataset 클래스 (유지) =====
class KorCCVIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=1024):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# ===== GRU 모델 정의 (유지) =====
class GRUBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers=1, bidirectional=True, dropout=0.3):
        super(GRUBinaryClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        gru_out, _ = self.gru(x)
        out = gru_out[:, -1, :]
        out = self.fc(out)
        out = self.sigmoid(out)
        return out.squeeze()

# ===== (추가) 평가 함수 (루프 밖으로 이동) =====
def evaluate(model, dataloader, criterion):
    model.eval()
    preds = []
    targets = []
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].float().to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * input_ids.size(0)
            preds.extend((outputs > 0.5).long().cpu().numpy())
            targets.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader.dataset)
    # (수정) zero_division=0 추가
    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, zero_division=0)
    precision = precision_score(targets, preds, zero_division=0)
    recall = recall_score(targets, preds, zero_division=0)
    return avg_loss, acc, f1, precision, recall, targets, preds

# ===== Tokenizer 준비 (루프 밖으로 이동) =====
# 토크나이저는 한 번만 로드
tokenizer = BertTokenizer.from_pretrained("monologg/kobert", cache_dir=os.path.join(BASE_DIR, "cache"))
vocab_size = tokenizer.vocab_size
max_length = 1024

# ===== 하이퍼파라미터 (루프 밖에서 정의) =====
embedding_dim = 256
hidden_dim = 128
num_layers = 1
bidirectional = True
dropout = 0.3
num_epochs = 10 # (RF 코드와 동일하게 10 에포크로 설정)
batch_size = 32
learning_rate = 1e-4

# ===== (추가) 5-Run 학습 루프 시작 =====
for i in range(N_RUNS):
    print(f"\n--- 5-Run 실험 : {i+1}/{N_RUNS} 번째 실행 ---")

    # (추가) 매번 다른 시드 적용
    current_seed = 42 + i
    set_seed(current_seed)
    print(f"Current Seed: {current_seed}")

    # (수정) 모델 저장 경로 (매번 다르게)
    MODEL_SAVE_PATH = os.path.join(MODEL_SAVE_DIR, f"gru_korccvi_run_{i+1}.pt")

    # ===== 1. 데이터 불러오기 및 분할 (매번 새로 수행) =====
    try:
        df = pd.read_csv(KORCCVI_DATA_PATH)
    except FileNotFoundError as e:
        print(f"오류: {KORCCVI_DATA_PATH} 파일을 찾을 수 없습니다.")
        break # 루프 중단

    df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
    df = df[[TEXT_COLUMN, LABEL_COLUMN]]

    # (수정) 70:15:15 비율로 분할 (RF 코드와 동일)
    # 1. Train+Val (85%) / Test (15%) 분할
    train_val_df, test_df = train_test_split(
        df,
        test_size=0.15,
        random_state=current_seed,
        stratify=df[LABEL_COLUMN]
    )

    # 2. Train (70%) / Val (15%) 분할 (85% 중 15/85 = 약 17.6%)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=(0.15 / 0.85), # 85% 중 15% 비율
        random_state=current_seed,
        stratify=train_val_df[LABEL_COLUMN]
    )

    train_texts = train_df[TEXT_COLUMN].tolist()
    train_labels = train_df[LABEL_COLUMN].tolist()
    val_texts = val_df[TEXT_COLUMN].tolist()
    val_labels = val_df[LABEL_COLUMN].tolist()
    test_texts = test_df[TEXT_COLUMN].tolist()
    test_labels = test_df[LABEL_COLUMN].tolist()

    print(f"Run {i+1} Data Split -> Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")

    # ===== 2. Dataset 및 DataLoader 생성 (매번 새로 수행) =====
    train_dataset = KorCCVIDataset(train_texts, train_labels, tokenizer, max_length=max_length)
    val_dataset = KorCCVIDataset(val_texts, val_labels, tokenizer, max_length=max_length)
    test_dataset = KorCCVIDataset(test_texts, test_labels, tokenizer, max_length=max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size * 2) # (val/test는 2배)
    test_loader = DataLoader(test_dataset, batch_size=batch_size * 2)

    # ===== 3. 모델, 손실함수, 옵티마이저 (매번 새로 생성) =====
    model = GRUBinaryClassifier(
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        vocab_size=vocab_size,
        num_layers=num_layers,
        bidirectional=bidirectional,
        dropout=dropout
    ).to(device)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # ===== 4. WandB 초기화 (매번 새로 수행) =====
    wandb_project = "Voicephishing"
    wandb_group = "gru_5_run_korccvi" # (추가) 5번의 실행을 그룹화
    wandb_run_name = f"gru_run_{i+1}_seed_{current_seed}"
    wandb.init(project=wandb_project, group=wandb_group, name=wandb_run_name)
    wandb.config.update({
        "run_id": i + 1,
        "seed": current_seed,
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "vocab_size": vocab_size,
        "batch_size": batch_size,
        "max_length": max_length,
        "learning_rate": learning_rate,
        "num_epochs": num_epochs,
        "dropout": dropout,
        "model_type": "GRU"
    })

    # ===== 5. 학습 루프 (매번 새로 수행) =====
    best_f1 = 0
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        loop = tqdm(train_loader, desc=f"Run {i+1} Epoch {epoch+1}", leave=False)

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].float().to(device)
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            if torch.isnan(loss):
                print(f"Warning: Run {i+1} Epoch {epoch+1} NaN loss detected. Skipping batch.")
                continue

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            total_train_loss += loss.item() * input_ids.size(0)
            loop.set_postfix(train_loss=f"{loss.item():.4f}")

        avg_train_loss = total_train_loss / len(train_loader.dataset)

        # Validation 평가
        valid_loss, acc, f1, precision, recall, _, _ = evaluate(model, val_loader, criterion)

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": valid_loss,
            "val_accuracy": acc,
            "val_f1": f1,
            "val_precision": precision,
            "val_recall": recall
        })

        print(f"Run {i+1} Epoch {epoch+1} -> "
              f"Train Loss: {avg_train_loss:.4f} | "
              f"Valid Loss: {valid_loss:.4f} | Val F1: {f1:.4f}")

        if f1 > best_f1:
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            best_f1 = f1
            print(f"Run {i+1} Best model saved! (Val F1: {best_f1:.4f})")
            wandb.save(MODEL_SAVE_PATH)

    print(f"Run {i+1} Training complete.")

    # ===== 6. 최종 평가 (Test) (매번 새로 수행) =====
    print(f"\n--- Run {i+1} 최종 테스트 평가 ---")
    try:
        model.load_state_dict(torch.load(MODEL_SAVE_PATH))
        print(f"Run {i+1} 베스트 모델 로드 성공: {MODEL_SAVE_PATH}")

        test_loss, test_acc, test_f1, test_precision, test_recall, test_targets, test_preds = evaluate(model, test_loader, criterion)

        print(f"\n--- Run {i+1} Test Set Results ---")
        print(classification_report(test_targets, test_preds, zero_division=0))
        print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | "
              f"Test F1: {test_f1:.4f} | Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")

        # WandB에 최종 테스트 결과 기록 (summary)
        wandb.summary["test_loss"] = test_loss
        wandb.summary["test_accuracy"] = test_acc
        wandb.summary["test_f1_score"] = test_f1
        wandb.summary["test_precision"] = test_precision
        wandb.summary["test_recall"] = test_recall

        # (추가) 5-Run 평균을 위한 리스트에 추가
        all_test_metrics.append({
            'run': i + 1,
            'seed': current_seed,
            'test_f1': test_f1,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_accuracy': test_acc,
            'test_loss': test_loss
        })

    except FileNotFoundError:
        print(f"오류: {MODEL_SAVE_PATH} 모델을 찾을 수 없어 테스트 평가를 스킵합니다.")
    except Exception as e:
        print(f"Run {i+1} 테스트 평가 중 오류 발생: {e}")

    wandb.finish() # (수정) 매 루프 종료 시 wandb 종료

# ===== (추가) 5-Run 결과 평균 계산 =====
print("\n\n==============================================")
print(f"모든 {N_RUNS}번의 실행이 완료되었습니다.")
print("==============================================")

if all_test_metrics:
    metrics_df = pd.DataFrame(all_test_metrics)
    metrics_df = metrics_df.set_index('run')

    avg_metrics = metrics_df.mean()
    std_metrics = metrics_df.std()

    print("\n--- 5-Run 최종 테스트 결과 (평균) ---")
    print(avg_metrics)

    print("\n--- 5-Run 최종 테스트 결과 (표준편차) ---")
    print(std_metrics)

    print("\n--- 상세 결과 (DataFrame) ---")
    print(metrics_df)

    # WandB에 최종 요약 로깅 (새로운 Run 생성)
    wandb.init(project=wandb_project, group=wandb_group, name="final_average_summary")
    summary_table = wandb.Table(dataframe=metrics_df.reset_index())
    wandb.log({"final_results_table": summary_table})

    for metric in avg_metrics.index:
        wandb.summary[f"avg_{metric}"] = avg_metrics[metric]
        wandb.summary[f"std_{metric}"] = std_metrics[metric]

    print("\n최종 평균/표준편차를 WandB에 'final_average_summary' 런으로 기록했습니다.")
    wandb.finish()

else:
    print("테스트가 정상적으로 실행되지 않아 평균을 계산할 수 없습니다.")

Device: cuda


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.



--- 5-Run 실험 : 1/5 번째 실행 ---
Current Seed: 42
Run 1 Data Split -> Train: 8135, Val: 1744, Test: 1744




Run 1 Epoch 1 -> Train Loss: 0.4530 | Valid Loss: 0.4480 | Val F1: 0.0000




Run 1 Epoch 2 -> Train Loss: 0.4492 | Valid Loss: 0.4474 | Val F1: 0.0000




Run 1 Epoch 3 -> Train Loss: 0.4488 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 1 Epoch 4 -> Train Loss: 0.4482 | Valid Loss: 0.4473 | Val F1: 0.0000




Run 1 Epoch 5 -> Train Loss: 0.4490 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 1 Epoch 6 -> Train Loss: 0.4482 | Valid Loss: 0.4473 | Val F1: 0.0000




Run 1 Epoch 7 -> Train Loss: 0.4481 | Valid Loss: 0.4486 | Val F1: 0.0000




Run 1 Epoch 8 -> Train Loss: 0.4488 | Valid Loss: 0.4483 | Val F1: 0.0000




Run 1 Epoch 9 -> Train Loss: 0.4489 | Valid Loss: 0.4496 | Val F1: 0.0000




Run 1 Epoch 10 -> Train Loss: 0.4481 | Valid Loss: 0.4484 | Val F1: 0.0000
Run 1 Training complete.

--- Run 1 최종 테스트 평가 ---
Run 1 베스트 모델 로드 성공: /content/drive/MyDrive/Github/2025_Voicephishing/model/gru_korccvi_run_1.pt

--- Run 1 Test Set Results ---
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1457
         1.0       0.16      1.00      0.28       287

    accuracy                           0.16      1744
   macro avg       0.08      0.50      0.14      1744
weighted avg       0.03      0.16      0.05      1744

Test Loss: 0.7115 | Test Acc: 0.1646 | Test F1: 0.2826 | Test Precision: 0.1646 | Test Recall: 1.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▃▂▁▂▁▁▂▂▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_f1,▁▁▁▁▁▁▁▁▁▁
val_loss,▃▂▁▁▁▁▅▄█▅
val_precision,▁▁▁▁▁▁▁▁▁▁
val_recall,▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10
test_accuracy,0.16456
test_f1_score,0.28262
test_loss,0.71145
test_precision,0.16456
test_recall,1
train_loss,0.44808
val_accuracy,0.83544
val_f1,0
val_loss,0.44843



--- 5-Run 실험 : 2/5 번째 실행 ---
Current Seed: 43
Run 2 Data Split -> Train: 8135, Val: 1744, Test: 1744




Run 2 Epoch 1 -> Train Loss: 0.4518 | Valid Loss: 0.4475 | Val F1: 0.0000




Run 2 Epoch 2 -> Train Loss: 0.4495 | Valid Loss: 0.4474 | Val F1: 0.0000




Run 2 Epoch 3 -> Train Loss: 0.4488 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 2 Epoch 4 -> Train Loss: 0.4488 | Valid Loss: 0.4483 | Val F1: 0.0000




Run 2 Epoch 5 -> Train Loss: 0.4492 | Valid Loss: 0.4481 | Val F1: 0.0000




Run 2 Epoch 6 -> Train Loss: 0.4480 | Valid Loss: 0.4480 | Val F1: 0.0000




Run 2 Epoch 7 -> Train Loss: 0.4483 | Valid Loss: 0.4476 | Val F1: 0.0000




Run 2 Epoch 8 -> Train Loss: 0.4481 | Valid Loss: 0.4476 | Val F1: 0.0000




Run 2 Epoch 9 -> Train Loss: 0.4485 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 2 Epoch 10 -> Train Loss: 0.4479 | Valid Loss: 0.4522 | Val F1: 0.0000
Run 2 Training complete.

--- Run 2 최종 테스트 평가 ---
Run 2 베스트 모델 로드 성공: /content/drive/MyDrive/Github/2025_Voicephishing/model/gru_korccvi_run_2.pt

--- Run 2 Test Set Results ---
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1457
         1.0       0.16      1.00      0.28       287

    accuracy                           0.16      1744
   macro avg       0.08      0.50      0.14      1744
weighted avg       0.03      0.16      0.05      1744

Test Loss: 0.7206 | Test Acc: 0.1646 | Test F1: 0.2826 | Test Precision: 0.1646 | Test Recall: 1.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▃▃▁▂▁▂▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_f1,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▃▂▂▂▂▁█
val_precision,▁▁▁▁▁▁▁▁▁▁
val_recall,▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10
test_accuracy,0.16456
test_f1_score,0.28262
test_loss,0.72058
test_precision,0.16456
test_recall,1
train_loss,0.44786
val_accuracy,0.83544
val_f1,0
val_loss,0.45218



--- 5-Run 실험 : 3/5 번째 실행 ---
Current Seed: 44
Run 3 Data Split -> Train: 8135, Val: 1744, Test: 1744




Run 3 Epoch 1 -> Train Loss: 0.4568 | Valid Loss: 0.4473 | Val F1: 0.0000




Run 3 Epoch 2 -> Train Loss: 0.4496 | Valid Loss: 0.4501 | Val F1: 0.0000




Run 3 Epoch 3 -> Train Loss: 0.4486 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 3 Epoch 4 -> Train Loss: 0.4496 | Valid Loss: 0.4492 | Val F1: 0.0000




Run 3 Epoch 5 -> Train Loss: 0.4481 | Valid Loss: 0.4475 | Val F1: 0.0000




Run 3 Epoch 6 -> Train Loss: 0.4480 | Valid Loss: 0.4478 | Val F1: 0.0000




Run 3 Epoch 7 -> Train Loss: 0.4487 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 3 Epoch 8 -> Train Loss: 0.4483 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 3 Epoch 9 -> Train Loss: 0.4481 | Valid Loss: 0.4473 | Val F1: 0.0000




Run 3 Epoch 10 -> Train Loss: 0.4479 | Valid Loss: 0.4494 | Val F1: 0.0000
Run 3 Training complete.

--- Run 3 최종 테스트 평가 ---
Run 3 베스트 모델 로드 성공: /content/drive/MyDrive/Github/2025_Voicephishing/model/gru_korccvi_run_3.pt

--- Run 3 Test Set Results ---
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1457
         1.0       0.16      1.00      0.28       287

    accuracy                           0.16      1744
   macro avg       0.08      0.50      0.14      1744
weighted avg       0.03      0.16      0.05      1744

Test Loss: 0.7631 | Test Acc: 0.1646 | Test F1: 0.2826 | Test Precision: 0.1646 | Test Recall: 1.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▂▂▂▁▁▂▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_f1,▁▁▁▁▁▁▁▁▁▁
val_loss,▁█▁▆▂▂▁▁▁▆
val_precision,▁▁▁▁▁▁▁▁▁▁
val_recall,▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10
test_accuracy,0.16456
test_f1_score,0.28262
test_loss,0.76313
test_precision,0.16456
test_recall,1
train_loss,0.44788
val_accuracy,0.83544
val_f1,0
val_loss,0.44941



--- 5-Run 실험 : 4/5 번째 실행 ---
Current Seed: 45
Run 4 Data Split -> Train: 8135, Val: 1744, Test: 1744




Run 4 Epoch 1 -> Train Loss: 0.4501 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 4 Epoch 2 -> Train Loss: 0.4486 | Valid Loss: 0.4473 | Val F1: 0.0000




Run 4 Epoch 3 -> Train Loss: 0.4490 | Valid Loss: 0.4476 | Val F1: 0.0000




Run 4 Epoch 4 -> Train Loss: 0.4485 | Valid Loss: 0.4476 | Val F1: 0.0000




Run 4 Epoch 5 -> Train Loss: 0.4486 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 4 Epoch 6 -> Train Loss: 0.4487 | Valid Loss: 0.4518 | Val F1: 0.0000




Run 4 Epoch 7 -> Train Loss: 0.4484 | Valid Loss: 0.4480 | Val F1: 0.0000




Run 4 Epoch 8 -> Train Loss: 0.4487 | Valid Loss: 0.4474 | Val F1: 0.0000




Run 4 Epoch 9 -> Train Loss: 0.4480 | Valid Loss: 0.4486 | Val F1: 0.0000




Run 4 Epoch 10 -> Train Loss: 0.4482 | Valid Loss: 0.4486 | Val F1: 0.0000
Run 4 Training complete.

--- Run 4 최종 테스트 평가 ---
Run 4 베스트 모델 로드 성공: /content/drive/MyDrive/Github/2025_Voicephishing/model/gru_korccvi_run_4.pt

--- Run 4 Test Set Results ---
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1457
         1.0       0.16      1.00      0.28       287

    accuracy                           0.16      1744
   macro avg       0.08      0.50      0.14      1744
weighted avg       0.03      0.16      0.05      1744

Test Loss: 0.7045 | Test Acc: 0.1646 | Test F1: 0.2826 | Test Precision: 0.1646 | Test Recall: 1.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▃▄▃▃▃▂▃▁▂
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_f1,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▂▂▁█▂▁▃▃
val_precision,▁▁▁▁▁▁▁▁▁▁
val_recall,▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10
test_accuracy,0.16456
test_f1_score,0.28262
test_loss,0.70446
test_precision,0.16456
test_recall,1
train_loss,0.44822
val_accuracy,0.83544
val_f1,0
val_loss,0.44861



--- 5-Run 실험 : 5/5 번째 실행 ---
Current Seed: 46
Run 5 Data Split -> Train: 8135, Val: 1744, Test: 1744




Run 5 Epoch 1 -> Train Loss: 0.4558 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 5 Epoch 2 -> Train Loss: 0.4487 | Valid Loss: 0.4475 | Val F1: 0.0000




Run 5 Epoch 3 -> Train Loss: 0.4496 | Valid Loss: 0.4483 | Val F1: 0.0000




Run 5 Epoch 4 -> Train Loss: 0.4481 | Valid Loss: 0.4527 | Val F1: 0.0000




Run 5 Epoch 5 -> Train Loss: 0.4490 | Valid Loss: 0.4482 | Val F1: 0.0000




Run 5 Epoch 6 -> Train Loss: 0.4485 | Valid Loss: 0.4478 | Val F1: 0.0000




Run 5 Epoch 7 -> Train Loss: 0.4482 | Valid Loss: 0.4472 | Val F1: 0.0000




Run 5 Epoch 8 -> Train Loss: 0.4478 | Valid Loss: 0.4488 | Val F1: 0.0000




Run 5 Epoch 9 -> Train Loss: 0.4477 | Valid Loss: 0.4488 | Val F1: 0.0000




Run 5 Epoch 10 -> Train Loss: 0.4486 | Valid Loss: 0.4478 | Val F1: 0.0000
Run 5 Training complete.

--- Run 5 최종 테스트 평가 ---
Run 5 베스트 모델 로드 성공: /content/drive/MyDrive/Github/2025_Voicephishing/model/gru_korccvi_run_5.pt

--- Run 5 Test Set Results ---
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1457
         1.0       0.16      1.00      0.28       287

    accuracy                           0.16      1744
   macro avg       0.08      0.50      0.14      1744
weighted avg       0.03      0.16      0.05      1744

Test Loss: 0.7030 | Test Acc: 0.1646 | Test F1: 0.2826 | Test Precision: 0.1646 | Test Recall: 1.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▂▃▁▂▂▁▁▁▂
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_f1,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▂█▂▂▁▃▃▂
val_precision,▁▁▁▁▁▁▁▁▁▁
val_recall,▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10
test_accuracy,0.16456
test_f1_score,0.28262
test_loss,0.70297
test_precision,0.16456
test_recall,1
train_loss,0.44856
val_accuracy,0.83544
val_f1,0
val_loss,0.44778




모든 5번의 실행이 완료되었습니다.

--- 5-Run 최종 테스트 결과 (평균) ---
seed              44.000000
test_f1            0.282619
test_precision     0.164564
test_recall        1.000000
test_accuracy      0.164564
test_loss          0.720518
dtype: float64

--- 5-Run 최종 테스트 결과 (표준편차) ---
seed              1.581139
test_f1           0.000000
test_precision    0.000000
test_recall       0.000000
test_accuracy     0.000000
test_loss         0.024818
dtype: float64

--- 상세 결과 (DataFrame) ---
     seed   test_f1  test_precision  test_recall  test_accuracy  test_loss
run                                                                       
1      42  0.282619        0.164564          1.0       0.164564   0.711453
2      43  0.282619        0.164564          1.0       0.164564   0.720576
3      44  0.282619        0.164564          1.0       0.164564   0.763130
4      45  0.282619        0.164564          1.0       0.164564   0.704463
5      46  0.282619        0.164564          1.0       0.164564   0.702968



최종 평균/표준편차를 WandB에 'final_average_summary' 런으로 기록했습니다.


0,1
avg_seed,44
avg_test_accuracy,0.16456
avg_test_f1,0.28262
avg_test_loss,0.72052
avg_test_precision,0.16456
avg_test_recall,1
std_seed,1.58114
std_test_accuracy,0
std_test_f1,0
std_test_loss,0.02482


# 5. KoBERT 학습

In [None]:
import os
import random
import numpy as np
import torch
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
)
from datasets import load_dataset
import wandb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# ===== 경로 설정 =====
BASE_DIR = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATA_PATH = os.path.join(BASE_DIR, "dataset/spam_total_train.csv")  # 단일 CSV
MODEL_SAVE_PATH = os.path.join(BASE_DIR, "model/model_kobert_251104_spam")
TOKENIZER_SAVE_PATH = os.path.join(BASE_DIR, "tokenizer/tokenizer_kobert_251104")
LOG_DIR = os.path.join(BASE_DIR, "logs/kobert_korccvi_251104")
OUTPUT_DIR = os.path.join(BASE_DIR, "results/kobert_korccvi_251104")

# ===== 난수 시드 고정 =====
def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

# ===== GPU 설정 =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ===== Tokenizer 및 Model =====
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir=os.path.join(BASE_DIR, "cache"))
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)
model.to(device)

# ===== WandB 초기화 =====
wandb_project = "Voicephishing"
wandb_run_name = "kobert_train"
wandb.init(project=wandb_project, name=wandb_run_name)

# ===== 평가 지표 함수 =====
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# ===== 토큰화 함수 =====
def tokenize_korccvi_data(dataset, text_column="text", label_column="Label", max_length=512):
    def tokenize_fn(batch):
        tokenized = tokenizer(
            batch[text_column],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            add_special_tokens=True
        )
        # 🔹 Label을 정수형으로 변환
        tokenized["labels"] = [int(x) for x in batch[label_column]]
        return tokenized
    return dataset.map(tokenize_fn, batched=True, num_proc=4)

# ===== 학습 함수 =====
def fine_tune_kobert():
    # CSV에서 train/validation 분리
    dataset = load_dataset("csv", data_files=DATA_PATH)["train"]
    split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    val_dataset = split_dataset["test"]

    # 컬럼 이름 확인 후 필요 시 변경
    text_column = "text"   # CSV의 텍스트 컬럼명
    label_column = "label" # CSV의 라벨 컬럼명

    # 토큰화
    tokenized_train = tokenize_korccvi_data(train_dataset, text_column=text_column, label_column=label_column)
    tokenized_val = tokenize_korccvi_data(val_dataset, text_column=text_column, label_column=label_column)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        seed=42,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir=LOG_DIR,
        logging_steps=100,
        eval_strategy="steps",
        eval_steps=100,
        save_steps=100,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="wandb"
    )

    data_collator = DataCollatorWithPadding(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    # 모델과 토크나이저 저장
    model.save_pretrained(MODEL_SAVE_PATH)
    tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)

    # WandB에 모델 파일 업로드
    wandb.save(os.path.join(MODEL_SAVE_PATH, "*"))

if __name__ == "__main__":
    fine_tune_kobert()


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkujoon13413[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/7272 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/808 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.4525,0.103352,0.982673,0.944444,0.983471,0.908397
200,0.0539,0.041168,0.990099,0.969231,0.976744,0.961832
300,0.0509,0.040938,0.988861,0.965251,0.976562,0.954198
400,0.0417,0.052626,0.987624,0.960317,1.0,0.923664
500,0.0312,0.019005,0.99505,0.984496,1.0,0.969466
600,0.0268,0.023106,0.991337,0.97318,0.976923,0.969466
700,0.0204,0.014264,0.996287,0.988417,1.0,0.977099
800,0.0236,0.01125,0.996287,0.988593,0.984848,0.992366
900,0.0097,0.022579,0.993812,0.981132,0.970149,0.992366
1000,0.0063,0.006367,0.998762,0.996169,1.0,0.992366




In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def preprocess_input(text):
    """불필요한 공백 제거 + 키워드 강조"""
    text = " ".join(text.strip().split())
    keywords = ["송금", "개인정보", "대출", "계좌번호", "투자", "명의 도용"]
    for kw in keywords:
        text = text.replace(kw, f"**{kw}**")
    return text

def evaluate_and_save():
    DATA_PATH = "/content/drive/MyDrive/Github/2025_Voicephishing/dataset/spam_total_test.csv"
    MODEL_PATH = "/content/drive/MyDrive/Github/2025_Voicephishing/model/model_kobert_251104_spam"

    # ===== 데이터셋 로드 및 split =====
    test_dataset = load_dataset("csv", data_files=DATA_PATH)["train"]

    # ===== 토크나이저 및 모델 로드 =====
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
    )
    model.to(device)
    model.eval()
    for p in model.parameters():
        p.requires_grad = False

    # ===== 평가 =====
    true_labels = []
    pred_labels = []
    records = []

    for sample in tqdm(test_dataset, desc="🧪 Evaluating"):
        input_text = preprocess_input(sample["text"])
        target_label = int(sample["label"])

        # 토큰화
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 추론
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            pred_label = logits.argmax(dim=1).item()

        true_labels.append(target_label)
        pred_labels.append(pred_label)

        records.append({
            "Input": input_text,
            "Prediction": pred_label,
            "Label": target_label,
            "Correct": int(pred_label == target_label)
        })

    # ===== 지표 계산 =====
    acc = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels)

    print(f"\n✅ Accuracy: {acc:.2%}")
    print(f"✅ F1 Score: {f1:.4f}")
    print(f"✅ Precision: {precision:.4f}")
    print(f"✅ Recall: {recall:.4f}")

    # ===== 결과 저장 =====
    result_df = pd.DataFrame(records)
    result_df["accuracy"] = acc
    result_df["f1_score"] = f1
    result_df["precision"] = precision
    result_df["recall"] = recall

    os.makedirs("dataset/eval_folder", exist_ok=True)
    result_df.to_csv("dataset/eval_folder/eval_kobert_251028.csv", index=False, encoding="utf-8-sig")

if __name__ == "__main__":
    evaluate_and_save()


🧪 Evaluating: 100%|██████████| 1732/1732 [01:24<00:00, 20.60it/s]



✅ Accuracy: 99.60%
✅ F1 Score: 0.9864
✅ Precision: 1.0000
✅ Recall: 0.9731


In [4]:
import os
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
)
import wandb

def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

PROJECT_ROOT = "/content/drive/MyDrive/Github/2025_Voicephishing/"
DATASET_DIR = os.path.join(PROJECT_ROOT, "dataset")
SCAM_DATA_PATH = os.path.join(DATASET_DIR, "scam_spam_stt.csv")
MODEL_SAVE_PATH = os.path.join(PROJECT_ROOT, "model/model_kobert_251108")
TOKENIZER_SAVE_PATH = os.path.join(PROJECT_ROOT, "tokenizer/tokenizer_kobert_251108")
LOG_DIR = os.path.join(PROJECT_ROOT, "logs/kobert_251108")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "results/kobert_251108")

os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
os.makedirs(TOKENIZER_SAVE_PATH, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

val_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}
test_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

BASE_SEED = 42

if __name__ == "__main__":
    wandb.init(
        project="Voicephishing",
        name="kobert_251108_5runs",
        config={
            "model": "KoBERT",
            "learning_rate": 2e-5,
            "train_batch_size": 16,
            "eval_batch_size": 32,
            "epochs": 5,
            "max_length": 256,
            "early_stopping": 3,
            "num_runs": 5,
            "split_ratio": "train 0.70, val 0.15, test 0.15"
        }
    )
    # Read CSV
    try:
        df = pd.read_csv(SCAM_DATA_PATH)
    except FileNotFoundError as e:
        print(f"오류: 데이터 파일을 찾을 수 없습니다. 경로를 확인하세요: {e}")
        wandb.finish()
        raise SystemExit(1)
    df = df.dropna(subset=["text", "label"])
    X_all = df["text"].astype(str).tolist()
    y_all = df["label"].astype(int).tolist()
    print(f"Total samples: {len(df)}")

    for run_idx in range(wandb.config.num_runs):
        run_seed = BASE_SEED + run_idx
        set_seed(run_seed)
        # Stratified split (70/15/15)
        X_train, X_temp, y_train, y_temp = train_test_split(
            X_all,
            y_all,
            test_size=0.30,
            random_state=run_seed,
            stratify=y_all
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp,
            y_temp,
            test_size=0.50,
            random_state=run_seed,
            stratify=y_temp
        )
        print(f"[Run {run_idx+1}] Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
        # Prepare datasets as dicts for transformers
        train_ds = [{"text": x, "label": int(y)} for x, y in zip(X_train, y_train)]
        val_ds =   [{"text": x, "label": int(y)} for x, y in zip(X_val, y_val)]
        test_ds =  [{"text": x, "label": int(y)} for x, y in zip(X_test, y_test)]
        # Tokenizer/model (reloaded per run for full reset/reproducibility)
        tokenizer = BertTokenizer.from_pretrained("monologg/kobert", cache_dir=os.path.join(PROJECT_ROOT, "cache"))
        model = BertForSequenceClassification.from_pretrained("monologg/kobert", num_labels=2)
        model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        # Tokenize
        def tokenize_fn(batch):
            return tokenizer(batch["text"],
                             max_length=wandb.config.max_length,
                             truncation=True, padding="max_length")
        import datasets as hfdatasets
        train_dataset = hfdatasets.Dataset.from_list(train_ds).map(tokenize_fn, batched=True)
        val_dataset = hfdatasets.Dataset.from_list(val_ds).map(tokenize_fn, batched=True)
        test_dataset = hfdatasets.Dataset.from_list(test_ds).map(tokenize_fn, batched=True)
        train_dataset = train_dataset.rename_column("label", "labels")
        val_dataset = val_dataset.rename_column("label", "labels")
        test_dataset = test_dataset.rename_column("label", "labels")
        train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        # Data collator
        data_collator = DataCollatorWithPadding(tokenizer)
        # Compute metrics
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            preds = np.argmax(logits, axis=1)
            acc = accuracy_score(labels, preds)
            f1 = f1_score(labels, preds, zero_division=0)
            precision = precision_score(labels, preds, zero_division=0)
            recall = recall_score(labels, preds, zero_division=0)
            return {
                "accuracy": acc,
                "f1": f1,
                "precision": precision,
                "recall": recall
            }
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            overwrite_output_dir=True,
            learning_rate=float(wandb.config.learning_rate),
            per_device_train_batch_size=int(wandb.config.train_batch_size),
            per_device_eval_batch_size=int(wandb.config.eval_batch_size),
            num_train_epochs=int(wandb.config.epochs),
            seed=run_seed,
            warmup_ratio=0.1,
            weight_decay=0.01,
            logging_dir=LOG_DIR,
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            report_to="wandb",
            disable_tqdm=True
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=wandb.config.early_stopping)]
        )
        # Train
        trainer.train()
        # ----- Evaluate on val ------
        val_metrics_run = trainer.evaluate(val_dataset)
        print(f"[Run {run_idx+1}] Validation metrics: {val_metrics_run}")
        wandb.log({f"run_idx": run_idx + 1, **{f"val_{k}":v for k,v in val_metrics_run.items() if isinstance(v, float)}})
        for k in val_metrics.keys():
            val_metrics[k].append(val_metrics_run.get(f"eval_{k}", np.nan))
        # ----- Evaluate on test ------
        test_metrics_run = trainer.evaluate(test_dataset)
        print(f"[Run {run_idx+1}] Test metrics: {test_metrics_run}")
        wandb.log({f"run_idx": run_idx + 1, **{f"test_{k}":v for k,v in test_metrics_run.items() if isinstance(v, float)}})
        for k in test_metrics.keys():
            test_metrics[k].append(test_metrics_run.get(f"eval_{k}", np.nan))
        # 마지막 run의 모델/토크나이저만 저장
        if run_idx == wandb.config.num_runs - 1:
            model.save_pretrained(MODEL_SAVE_PATH)
            tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)
            try:
                wandb.save(os.path.join(MODEL_SAVE_PATH, "*"))
            except Exception:
                pass

    # 평균 결과
    def _mean(v):
        return float(np.mean([x for x in v if x is not None and not np.isnan(x)])) if len(v) > 0 else float("nan")
    val_avg = {k: _mean(val_metrics[k]) for k in val_metrics}
    test_avg = {k: _mean(test_metrics[k]) for k in test_metrics}
    print("\n=== Validation Averages over 5 runs ===")
    print(f"Val Accuracy: {val_avg['accuracy']:.4f}, Val Precision: {val_avg['precision']:.4f}, Val Recall: {val_avg['recall']:.4f}, Val F1: {val_avg['f1']:.4f}")
    print("\n=== Test Averages over 5 runs ===")
    print(f"Test Accuracy: {test_avg['accuracy']:.4f}, Test Precision: {test_avg['precision']:.4f}, Test Recall: {test_avg['recall']:.4f}, Test F1: {test_avg['f1']:.4f}")
    wandb.summary["val_accuracy_mean"] = val_avg["accuracy"]
    wandb.summary["val_precision_mean"] = val_avg["precision"]
    wandb.summary["val_recall_mean"] = val_avg["recall"]
    wandb.summary["val_f1_mean"] = val_avg["f1"]
    wandb.summary["test_accuracy_mean"] = test_avg["accuracy"]
    wandb.summary["test_precision_mean"] = test_avg["precision"]
    wandb.summary["test_recall_mean"] = test_avg["recall"]
    wandb.summary["test_f1_mean"] = test_avg["f1"]
    print(f"마지막 저장된 모델 경로: {MODEL_SAVE_PATH}")
    wandb.finish()

Total samples: 15463
[Run 1] Train: 10824, Val: 2319, Test: 2320


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10824 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/2320 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.6741, 'grad_norm': 3.2726118564605713, 'learning_rate': 2.8908554572271393e-06, 'epoch': 0.07385524372230429}
{'loss': 0.5346, 'grad_norm': 15.02613353729248, 'learning_rate': 5.840707964601771e-06, 'epoch': 0.14771048744460857}
{'loss': 0.4389, 'grad_norm': 3.7245640754699707, 'learning_rate': 8.790560471976402e-06, 'epoch': 0.22156573116691286}
{'loss': 0.3612, 'grad_norm': 5.311728000640869, 'learning_rate': 1.1740412979351032e-05, 'epoch': 0.29542097488921715}
{'loss': 0.3234, 'grad_norm': 2.9532992839813232, 'learning_rate': 1.4690265486725665e-05, 'epoch': 0.36927621861152143}
{'loss': 0.2938, 'grad_norm': 10.70580768585205, 'learning_rate': 1.7640117994100297e-05, 'epoch': 0.4431314623338257}
{'loss': 0.2517, 'grad_norm': 5.715576648712158, 'learning_rate': 1.993434011818779e-05, 'epoch': 0.51698670605613}
{'loss': 0.244, 'grad_norm': 1.9245811700820923, 'learning_rate': 1.9606040709126725e-05, 'epoch': 0.5908419497784343}
{'loss': 0.19, 'grad_norm': 13.46003055572509

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10824 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/2320 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.624, 'grad_norm': 6.069478988647461, 'learning_rate': 2.8908554572271393e-06, 'epoch': 0.07385524372230429}
{'loss': 0.5408, 'grad_norm': 4.061139106750488, 'learning_rate': 5.840707964601771e-06, 'epoch': 0.14771048744460857}
{'loss': 0.5131, 'grad_norm': 5.253606796264648, 'learning_rate': 8.790560471976402e-06, 'epoch': 0.22156573116691286}
{'loss': 0.398, 'grad_norm': 6.973231315612793, 'learning_rate': 1.1740412979351032e-05, 'epoch': 0.29542097488921715}
{'loss': 0.3791, 'grad_norm': 6.935694694519043, 'learning_rate': 1.4690265486725665e-05, 'epoch': 0.36927621861152143}
{'loss': 0.3574, 'grad_norm': 6.574274063110352, 'learning_rate': 1.7640117994100297e-05, 'epoch': 0.4431314623338257}
{'loss': 0.355, 'grad_norm': 2.973743200302124, 'learning_rate': 1.993434011818779e-05, 'epoch': 0.51698670605613}
{'loss': 0.3044, 'grad_norm': 1.461626648902893, 'learning_rate': 1.9606040709126725e-05, 'epoch': 0.5908419497784343}
{'loss': 0.3001, 'grad_norm': 15.374034881591797, '

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10824 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/2320 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.6155, 'grad_norm': 5.288933753967285, 'learning_rate': 2.8908554572271393e-06, 'epoch': 0.07385524372230429}
{'loss': 0.5311, 'grad_norm': 4.600874423980713, 'learning_rate': 5.840707964601771e-06, 'epoch': 0.14771048744460857}
{'loss': 0.4803, 'grad_norm': 37.27423095703125, 'learning_rate': 8.790560471976402e-06, 'epoch': 0.22156573116691286}
{'loss': 0.4268, 'grad_norm': 14.416298866271973, 'learning_rate': 1.1740412979351032e-05, 'epoch': 0.29542097488921715}
{'loss': 0.3909, 'grad_norm': 4.936573028564453, 'learning_rate': 1.4690265486725665e-05, 'epoch': 0.36927621861152143}
{'loss': 0.3442, 'grad_norm': 11.154760360717773, 'learning_rate': 1.7640117994100297e-05, 'epoch': 0.4431314623338257}
{'loss': 0.3229, 'grad_norm': 3.1067349910736084, 'learning_rate': 1.993434011818779e-05, 'epoch': 0.51698670605613}
{'loss': 0.299, 'grad_norm': 6.861286640167236, 'learning_rate': 1.9606040709126725e-05, 'epoch': 0.5908419497784343}
{'loss': 0.2843, 'grad_norm': 1.28646624088287

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10824 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/2320 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.6526, 'grad_norm': 2.2791695594787598, 'learning_rate': 2.8908554572271393e-06, 'epoch': 0.07385524372230429}
{'loss': 0.5501, 'grad_norm': 8.898675918579102, 'learning_rate': 5.840707964601771e-06, 'epoch': 0.14771048744460857}
{'loss': 0.4199, 'grad_norm': 1.139517903327942, 'learning_rate': 8.790560471976402e-06, 'epoch': 0.22156573116691286}
{'loss': 0.4302, 'grad_norm': 6.712156295776367, 'learning_rate': 1.1740412979351032e-05, 'epoch': 0.29542097488921715}
{'loss': 0.3475, 'grad_norm': 8.717182159423828, 'learning_rate': 1.4690265486725665e-05, 'epoch': 0.36927621861152143}
{'loss': 0.3039, 'grad_norm': 14.5314359664917, 'learning_rate': 1.7640117994100297e-05, 'epoch': 0.4431314623338257}
{'loss': 0.3223, 'grad_norm': 6.348567962646484, 'learning_rate': 1.993434011818779e-05, 'epoch': 0.51698670605613}
{'loss': 0.2652, 'grad_norm': 4.454723834991455, 'learning_rate': 1.9606040709126725e-05, 'epoch': 0.5908419497784343}
{'loss': 0.2592, 'grad_norm': 6.517964839935303,

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10824 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/2320 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.6172, 'grad_norm': 7.3244500160217285, 'learning_rate': 2.8908554572271393e-06, 'epoch': 0.07385524372230429}
{'loss': 0.5414, 'grad_norm': 8.970958709716797, 'learning_rate': 5.840707964601771e-06, 'epoch': 0.14771048744460857}
{'loss': 0.4132, 'grad_norm': 4.182835578918457, 'learning_rate': 8.790560471976402e-06, 'epoch': 0.22156573116691286}
{'loss': 0.374, 'grad_norm': 8.921520233154297, 'learning_rate': 1.1740412979351032e-05, 'epoch': 0.29542097488921715}
{'loss': 0.3858, 'grad_norm': 5.402360916137695, 'learning_rate': 1.4690265486725665e-05, 'epoch': 0.36927621861152143}
{'loss': 0.317, 'grad_norm': 9.170680046081543, 'learning_rate': 1.7640117994100297e-05, 'epoch': 0.4431314623338257}
{'loss': 0.3294, 'grad_norm': 4.697754383087158, 'learning_rate': 1.993434011818779e-05, 'epoch': 0.51698670605613}
{'loss': 0.3549, 'grad_norm': 16.355478286743164, 'learning_rate': 1.9606040709126725e-05, 'epoch': 0.5908419497784343}
{'loss': 0.2669, 'grad_norm': 5.776561737060547,




=== Validation Averages over 5 runs ===
Val Accuracy: 0.9176, Val Precision: 0.8375, Val Recall: 0.8497, Val F1: 0.8376

=== Test Averages over 5 runs ===
Test Accuracy: 0.9228, Test Precision: 0.8435, Test Recall: 0.8626, Test F1: 0.8474
마지막 저장된 모델 경로: /content/drive/MyDrive/Github/2025_Voicephishing/model/model_kobert_251108


0,1
eval/accuracy,▄▇████▇▁▁▂▃▃▄▄▄▄▃▃▃
eval/f1,▄▇████▇▁▁▂▂▂▂▃▃▃▃▃▃
eval/loss,▆▂▁▁▁▁▂██▆▅▅▅▄▄▄▆▆▆
eval/precision,▆▇█████▁▁▂▆▆▇▄▄▄▃▃▃
eval/recall,▄█████▇▆▆▆▁▁▁▅▅▅▅▅▅
eval/runtime,███████▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁▁▁▁▁▁▁▇██████▇████
eval/steps_per_second,▁▁▁▁▁▁▁▇██████▇████
run_idx,▁▁▃▃▅▅▆▆██
test_epoch,█▁▁▁▁

0,1
eval/accuracy,0.90302
eval/f1,0.81633
eval/loss,0.24312
eval/precision,0.76923
eval/recall,0.86957
eval/runtime,2.8041
eval/samples_per_second,827.369
eval/steps_per_second,26.034
run_idx,5
test_accuracy_mean,0.92276


In [5]:
from google.colab import runtime
runtime.unassign()