# New Section

In [17]:
# # 설치: Hugging Face Transformers와 Datasets 라이브러리 설치 (처음 한 번만)

!pip install transformers datasets

# 📌 추가 설치: BeautifulSoup4 설치 (HTML 텍스트 정제를 위해 필요)
!pip install beautifulsoup4


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [18]:
# 데이터셋 준비
from datasets import Dataset
from bs4 import BeautifulSoup  # 📌 HTML 제거용 BeautifulSoup
import re                      # 📌 정규표현식 re

# 1. 먼저 data 딕셔너리를 만든다

data = {
    'text': [
        # --- Phishing emails (11개) ---
        "Dear user, your account has been compromised. Click here to reset your password.",
        "Important update from your bank. Please verify your identity immediately.",
        "Your package could not be delivered. Provide your payment info to reschedule.",
        "Verify your account to avoid suspension. Click the link now.",
        "Update your billing information to continue using our service.",
        "Congratulations! You've won a free iPhone. Claim it today.",
        "Your PayPal account has been limited. Login to resolve the issue.",
        "Security alert: Unusual activity detected. Confirm your identity.",
        "Amazon account suspended. Immediate verification required.",
        "Your mailbox is almost full. Upgrade your storage by clicking here.",
        "We are Société Minières de Tanganyika Sprl (SMT Sprl), Democratic Republic of the Congo based GOLD and COPPER Mining Supply Company established in October 2006.",

        # --- Legitimate emails (20개) ---
        "Meeting at 3 PM today in conference room B.",
        "Happy Birthday! Wishing you a wonderful day filled with joy.",
        "Weekly newsletter: Top 10 travel destinations for 2024.",
        "Reminder: Submit your project report by Friday evening.",
        "Let's have coffee next week to discuss the new proposal.",
        "Internal team training session scheduled for next Monday.",
        "Holiday party details: Join us for food and fun!",
        "Company policy update: Please review the attached document.",
        "Client call rescheduled to Thursday at 2 PM.",
        "Expense report reminder: Submit receipts by end of month.",
        "Congratulations on your recent promotion!",
        "Wishing you all the best on your new journey!",
        "Happy work anniversary! Thank you for your dedication.",
        "We are excited to celebrate your success with you!",
        "Congratulations on completing your certification!",
        "Sending our warmest congratulations on your achievement.",
        "Cheers to your outstanding performance this quarter!",
        "Best wishes on your graduation and future endeavors!",
        "Happy anniversary with the company! We appreciate your hard work.",
        "Congratulations on the successful launch of your project!",
    ],

    'label': [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   # phishing (11개)
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0       # legitimate (20개)
    ]
}

# 2. 📌 여기서 BeautifulSoup 정제 함수를 정의한다
def clean_email_text(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 3. 📌 그리고 data['text']를 정제한다
data['text'] = [clean_email_text(t) for t in data['text']]
# 4. 정제된 데이터를 Dataset으로 변환한다

dataset = Dataset.from_dict(data)
dataset = dataset.train_test_split(test_size=0.3)
train_dataset = dataset['train']
test_dataset = dataset['test']


In [19]:
# 텍스트를 숫자로 변환 (Tokenization)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)

# 토크나이징 적용
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [20]:
# 분류 모델 불러오기 (DistilBERT 기반, 2개 클래스)
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Trainer 설정 및 훈련
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,        # 에폭 수 3으로 설정
    weight_decay=0.01,
    report_to="none",           # W&B 끄기
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 모델 훈련 시작
trainer.train()


Step,Training Loss


TrainOutput(global_step=18, training_loss=0.6247198316786025, metrics={'train_runtime': 205.9016, 'train_samples_per_second': 0.306, 'train_steps_per_second': 0.087, 'total_flos': 8345446115328.0, 'train_loss': 0.6247198316786025, 'epoch': 3.0})

In [22]:
# 새 이메일 문장 예측하기
texts = [
    "Update your payment information now to avoid service interruption.",  # 피싱 의심
    "Let's catch up over coffee next week!",                                # 정상 대화
    "Urgent: Your account has been suspended. Click here to verify.",       # 피싱 의심
    "Congratulations on your work anniversary!"                             # 정상 대화
]

# 입력 문장 토크나이징 및 예측
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=1)
print(predictions)


tensor([1, 0, 1, 0])


In [25]:
# 사용자로부터 직접 이메일 문장을 입력받아 분류하기
text = input("Please enter the email text to classify: ")

inputs = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)
prediction = outputs.logits.argmax(dim=1).item()

if prediction == 1:
    print("⚠️ The email is classified as phishing.")
else:
    print("✅ The email is classified as legitimate.")


Please enter the email text to classify: "give me your account number"
✅ The email is classified as legitimate.
