In [1]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import pandas as pd
from tqdm import tqdm
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

max_len = 128
batch_size = 64
learning_rate = 3e-5
num_epochs = 3


  from .autonotebook import tqdm as notebook_tqdm
  state_dict = torch.load(resolved_archive_file, map_location="cpu")


In [2]:
phishing_data = pd.read_csv(r'C:\Users\DS\Desktop\22883\dp\voicephishing\dataset\final_phish.csv')  
normal_data = pd.read_csv(r"C:\Users\DS\Desktop\22883\dp\voicephishing\dataset\normal_data.csv")  

phishing_data['label'] = 1
normal_data['label'] = 0

df_normal_downsampled = normal_data.sample(n=5000, random_state=42)

df = pd.concat([phishing_data, df_normal_downsampled])

df['text'] = df['text'].apply(lambda x: x.strip())

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


In [4]:
class BERTDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        text = self.dataset.iloc[idx]['text']
        label = self.dataset.iloc[idx]['label']

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        
        return input_ids, attention_mask, label

    def __len__(self):
        return len(self.dataset)


In [5]:
class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size=768, num_classes=2, dropout=0.3):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooler_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.dropout(pooler_output)
        return self.classifier(output)


In [6]:
train_dataset = BERTDataset(train_data, tokenizer, max_len)
test_dataset = BERTDataset(test_data, tokenizer, max_len)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model = BERTClassifier(bertmodel, num_classes=2).to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
t_total = len(train_dataloader) * num_epochs
warmup_steps = int(t_total * 0.1)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

loss_fn = nn.CrossEntropyLoss()


In [8]:
def train(model, train_dataloader, optimizer, loss_fn, scheduler):
    model.train()
    train_loss = 0
    train_acc = 0
    
    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        train_acc += (preds == labels).sum().item()

    avg_train_loss = train_loss / len(train_dataloader)
    avg_train_acc = train_acc / len(train_dataloader.dataset)
    return avg_train_loss, avg_train_acc


In [9]:
def evaluate(model, test_dataloader, loss_fn):
    model.eval()
    test_loss = 0
    test_acc = 0

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            test_loss += loss.item()
            
            _, preds = torch.max(outputs, dim=1)
            test_acc += (preds == labels).sum().item()

    avg_test_loss = test_loss / len(test_dataloader)
    avg_test_acc = test_acc / len(test_dataloader.dataset)
    return avg_test_loss, avg_test_acc


In [10]:
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_dataloader, optimizer, loss_fn, scheduler)
    test_loss, test_acc = evaluate(model, test_dataloader, loss_fn)
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}") 

Training: 100%|██████████| 98/98 [01:51<00:00,  1.14s/it]
Evaluating: 100%|██████████| 25/25 [00:10<00:00,  2.49it/s]


Epoch 1/3
Train Loss: 0.2003, Train Accuracy: 0.8904
Test Loss: 0.0169, Test Accuracy: 0.9955


Training: 100%|██████████| 98/98 [01:54<00:00,  1.17s/it]
Evaluating: 100%|██████████| 25/25 [00:12<00:00,  2.07it/s]


Epoch 2/3
Train Loss: 0.0062, Train Accuracy: 0.9997
Test Loss: 0.0097, Test Accuracy: 0.9974


Training: 100%|██████████| 98/98 [01:54<00:00,  1.17s/it]
Evaluating: 100%|██████████| 25/25 [00:11<00:00,  2.18it/s]

Epoch 3/3
Train Loss: 0.0036, Train Accuracy: 0.9998
Test Loss: 0.0072, Test Accuracy: 0.9981





In [None]:
import torch

def predict(model, sentence):
    model.eval()
    inputs = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
    
    probabilities = torch.softmax(output, dim=1)

    predicted = torch.argmax(probabilities, dim=1)
    confidence = probabilities[0][predicted.item()].item()
    
    label = 'Phishing' if predicted.item() == 1 else 'Normal'
    
    return label, confidence

sentence = "장난해? 감옥가기 싫으면 당장 돈 보내"
label, confidence = predict(model, sentence)
print(f"탐지 결과: {label}, 보이스피싱 확률: {confidence:.4f}")


Label: Normal, Confidence: 0.6910
