In [5]:
pip install -U -q datasets peft transformers==4.45.2 bitsandbytes wandb sentencepiece mlflow accelerate

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Github/2025_Voicephishing/dataset/spam_total_train.csv')
df2 = pd.read_csv('/content/drive/MyDrive/Github/2025_Voicephishing/dataset/stt_results_train.csv')
df3 = pd.read_csv('/content/drive/MyDrive/Github/2025_Voicephishing/dataset/spam_total_val.csv')
df4 = pd.read_csv('/content/drive/MyDrive/Github/2025_Voicephishing/dataset/stt_results_val.csv')
df5 = pd.read_csv('/content/drive/MyDrive/Github/2025_Voicephishing/dataset/spam_total_test.csv')
df6 = pd.read_csv('/content/drive/MyDrive/Github/2025_Voicephishing/dataset/stt_results_test.csv')

In [None]:
import os
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    BitsAndBytesConfig, EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
from huggingface_hub import HfFolder
import mlflow
import numpy as np
import random
from datasets import load_dataset

# ===== Í≤ΩÎ°ú ÏÑ§Ï†ï =====
BASE_DIR = "/content/drive/MyDrive/Github/2025_Voicephishing/"
CACHE_DIR = os.path.join(BASE_DIR, "cache")

TRAIN_DATA_PATH = os.path.join(BASE_DIR, "dataset/scam_spam_stt_bt_all.csv")
VAL_DATA_PATH = os.path.join(BASE_DIR, "dataset/scam_spam_stt_val.csv")
MODEL_SAVE_PATH = os.path.join(BASE_DIR, "model/model_llama_scam_spam_stt")

LOG_DIR = os.path.join(BASE_DIR, "logs/voicephishing")
OUTPUT_DIR = os.path.join(BASE_DIR, "results/voicephishing")
mlflow.set_tracking_uri(f"file:{os.path.join(BASE_DIR, 'mlruns')}")
mlflow.set_experiment("llm_experiment")

# ===== ÌôòÍ≤Ω Î≥ÄÏàò ÏÑ§Ï†ï =====
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HUGGINGFACE_HUB_CACHE"] = CACHE_DIR
os.environ["TRITON_CACHE_DIR"] = os.path.join(CACHE_DIR, "triton")

# ===== GPU ÏÑ§Ï†ï =====
gpu_count = torch.cuda.device_count()
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(gpu_count))
print('Device:', torch.cuda.current_device())
print('Using', torch.cuda.device_count(), 'GPUs')

# ===== ÎÇúÏàò ÏãúÎìú Í≥†Ï†ï =====
def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True

set_seed(42)

# ===== Hugging Face Ïù∏Ï¶ù =====
hf_token = ""
os.environ["HUGGINGFACE_TOKEN"] = hf_token
HfFolder.save_token(hf_token)

  return FileStore(store_uri, store_uri)


Device: 0
Using 1 GPUs


In [3]:
# ===== Tokenizer Î∞è Model Î∂àÎü¨Ïò§Í∏∞ =====
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=CACHE_DIR, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    quantization_config=bnb_config,
    num_labels=1,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR,
)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.2,
    target_modules=["q_proj", "v_proj"]
)

model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.config.use_cache = False
model.train()

# ===== ÌÜ†ÌÅ∞Ìôî Î∞è Î∂ÑÎ¶¨ Ìï®Ïàò =====
def tokenize_voicephishing_data(dataset, max_length=1024):
    def tokenize_fn(batch):
        tokenized = tokenizer(
            batch["text"],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            add_special_tokens=True
        )
        tokenized["labels"] = batch["label"]
        return tokenized

    return dataset.map(tokenize_fn, batched=True, num_proc=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# ===== ÌïôÏäµ Ìï®Ïàò =====
def fine_tune_model():
    train_dataset = load_dataset("csv", data_files=TRAIN_DATA_PATH)["train"]
    val_dataset = load_dataset("csv", data_files=VAL_DATA_PATH)["train"]

    # ÌÜ†ÌÅ∞ÌôîÎßå ÏàòÌñâ
    tokenized_train = tokenize_voicephishing_data(train_dataset)
    tokenized_val = tokenize_voicephishing_data(val_dataset)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=2e-5,
        lr_scheduler_type="cosine",
        warmup_ratio=0.2,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        save_steps=500,
        save_total_limit=3,
        logging_dir=LOG_DIR,
        logging_steps=500,
        optim="paged_adamw_32bit",
        max_grad_norm=5,
        eval_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        ddp_find_unused_parameters=False,
        bf16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()
    model.save_pretrained(MODEL_SAVE_PATH)

# ===== Ïã§Ìñâ =====
if __name__ == "__main__":
    fine_tune_model()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/25521 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2319 [00:00<?, ? examples/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkujoon13413[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,2.9824,0.88825
1000,0.3585,0.080234
1500,0.0604,0.032207
2000,0.0309,0.021921
2500,0.0233,0.019586
3000,0.0153,0.013406
3500,0.0141,0.011472
4000,0.0099,0.009958
4500,0.0093,0.009492
5000,0.0063,0.007329


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

In [6]:
import os
import torch
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import load_dataset


# pip install --upgrade -q datasets peft transformers==4.45.2
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # Î©ÄÌã∞ GPU ÏÇ¨Ïö© Ïãú
np.random.seed(seed)
random.seed(seed)


# ===== Í≥†Ï†ï Í≤ΩÎ°ú ÏÑ§Ï†ï =====
BASE_DIR = "/content/drive/MyDrive/Github/2025_Voicephishing/"
CACHE_DIR = os.path.join(BASE_DIR, "cache")
BASE_MODEL_PATH = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_PATH = os.path.join(BASE_DIR, "model/model_llama_scam_spam_stt")
TEST_DATA_PATH = os.path.join(BASE_DIR, "dataset/scam_spam_stt_test.csv")
SAVE_PATH = os.path.join(BASE_DIR, "dataset/eval_folder/model_llama_scam_spam_stt")
SCORES_PATH = os.path.join(BASE_DIR, "dataset/eval_folder/eval_metrics.csv")

# ===== ÌôòÍ≤Ω Î≥ÄÏàò =====
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["TRITON_CACHE_DIR"] = os.path.join(CACHE_DIR, "triton")
os.environ["HUGGINGFACE_HUB_CACHE"] = "/content/drive/MyDrive/Github/2025_Voicephishing/hf_cache"

# ===== Ï†ÑÏ≤òÎ¶¨ Ìï®Ïàò =====
def preprocess_input(text):
    text = " ".join(text.strip().split())
    return text

# ===== ÌèâÍ∞Ä Ìï®Ïàò =====
def evaluate_and_save():
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, cache_dir=CACHE_DIR)
    tokenizer.pad_token = tokenizer.eos_token

    # Ï†ÄÏû•Îêú ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÌååÏùº Î°úÎìú
    test_dataset = load_dataset("csv", data_files=TEST_DATA_PATH)["train"]

    base_model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL_PATH,
        num_labels=1,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
        cache_dir=CACHE_DIR,
        low_cpu_mem_usage=True,
        device_map="cuda:0"
    )

    model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    model = model.merge_and_unload()
    model.eval()
    for p in model.parameters():
        p.requires_grad = False

    records = []
    y_true = []
    y_pred = []

    label_map = {0: "Ï†ïÏÉÅ", 1: "Î≥¥Ïù¥Ïä§ÌîºÏã±"}
    inv_label_map = {"Ï†ïÏÉÅ": 0, "Î≥¥Ïù¥Ïä§ÌîºÏã±": 1}

    for sample in tqdm(test_dataset, desc="üß™ Evaluating"):
        input_text = sample["text"]
        target_label = label_map[sample["label"]]

        preprocessed_text = preprocess_input(input_text)
        inputs = tokenizer(preprocessed_text, return_tensors="pt", padding="max_length", truncation=True, max_length=1024, return_token_type_ids=False)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits
            # predicted_label = torch.argmax(logits, dim=-1).item()
            predicted_label = 1 if logits.item() > 0 else 0

        pred_text = label_map[predicted_label]
        is_correct = (pred_text == target_label)

        y_true.append(inv_label_map[target_label])
        y_pred.append(inv_label_map[pred_text])

        records.append({
            "Input": input_text,
            "Prediction": pred_text,
            "Label": target_label,
            "Correct": int(is_correct)
        })

    # ===== ÏßÄÌëú Í≥ÑÏÇ∞ =====
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # ===== Í≤∞Í≥º Ï∂úÎ†• =====
    scores_df = pd.DataFrame([{
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }])

    print("\n‚úÖ ÌèâÍ∞Ä ÏßÄÌëú:")
    print(scores_df.to_string(index=False, float_format="%.4f"))

    # ===== Í≤∞Í≥º Ï†ÄÏû• =====
    os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
    pd.DataFrame(records).to_csv(SAVE_PATH, index=False, encoding="utf-8-sig")
    scores_df.to_csv(SCORES_PATH, index=False, encoding="utf-8-sig")

    print(f"\nüìÅ ÏòàÏ∏° Í≤∞Í≥º Ï†ÄÏû• ÏôÑÎ£å: {SAVE_PATH}")
    print(f"üìÅ ÌèâÍ∞Ä ÏßÄÌëú Ï†ÄÏû• ÏôÑÎ£å: {SCORES_PATH}")

# ===== Ïã§Ìñâ =====
if __name__ == "__main__":
    evaluate_and_save()


Generating train split: 0 examples [00:00, ? examples/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
üß™ Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2320/2320 [00:53<00:00, 43.53it/s]



‚úÖ ÌèâÍ∞Ä ÏßÄÌëú:
 Accuracy  Precision  Recall  F1 Score
   0.5302     0.2882  0.6487    0.3991

üìÅ ÏòàÏ∏° Í≤∞Í≥º Ï†ÄÏû• ÏôÑÎ£å: /content/drive/MyDrive/Github/2025_Voicephishing/dataset/eval_folder/model_llama_scam_spam_stt
üìÅ ÌèâÍ∞Ä ÏßÄÌëú Ï†ÄÏû• ÏôÑÎ£å: /content/drive/MyDrive/Github/2025_Voicephishing/dataset/eval_folder/eval_metrics.csv


# ÌÜµÌï© ÏÖÄ

In [9]:
from google.colab import runtime
runtime.unassign()