# Text Classification for Phishing Email Detection

## План работы:
1. **Загрузка и анализ данных**
2. **NLP Preprocessing**
3. **EDA для текстов (Word Clouds)**
4. **Векторизация (TF-IDF, Count)**
5. **Обучение моделей**
6. **Сравнение и интерпретация**

In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

# Download the NLTK 'punkt' and 'stopwords' resources.
# This is best-effort: a failed download must not abort the notebook, but it
# is reported instead of being silently swallowed.
try:
    for resource in ('punkt', 'stopwords'):
        # nltk.download reports failure through its boolean return value.
        if not nltk.download(resource, quiet=True):
            print(f"Warning: NLTK resource '{resource}' could not be downloaded")
except Exception as exc:
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    print(f"Warning: NLTK download failed: {exc}")

plt.style.use('default')
# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(42)

print("‚úÖ –ë–∏–±–ª–∏–æ—Ç–µ–∫–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")

In [None]:
# Load the configuration and the data.
# Column names assumed when the config file is missing, unreadable or empty.
default_config = {'data': {'text_column': 'text', 'label_column': 'label'}}

try:
    with open('../config/config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
    # Only I/O and parse failures are expected here; anything else should
    # surface instead of being hidden by a bare except.
    print(f"Warning: config not loaded ({exc}); using default column names")
    config = default_config
else:
    if isinstance(config, dict):
        print("‚úÖ Config –∑–∞–≥—Ä—É–∂–µ–Ω")
    else:
        # An empty or non-mapping YAML document carries no column names.
        config = default_config

# Try the datasets in order of preference: phishing e-mails, then SMS spam,
# then a small synthetic demo set so the notebook always has data to run on.
try:
    data = pd.read_csv('../data/phishing_emails.csv')
    print(f"‚úÖ Phishing dataset: {data.shape}")
except FileNotFoundError:
    try:
        data = pd.read_csv('../data/spam.csv', encoding='latin-1')
        # Keep only the label/text columns and give them the common names.
        data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})
        data['label'] = data['label'].map({'spam': 1, 'ham': 0})
        print(f"‚úÖ SMS Spam dataset: {data.shape}")
    except FileNotFoundError:
        print("‚ö†Ô∏è –°–æ–∑–¥–∞–µ–º –¥–µ–º–æ-–¥–∞–Ω–Ω—ã–µ...")
        # Demo data: each template message is repeated `demo_repeats` times.
        phishing = ["URGENT click here verify account", "Free money claim now", "Security alert update info"]
        legitimate = ["Hi how are you", "Meeting tomorrow 3PM", "Your order shipped"]
        demo_repeats = 50

        texts = phishing * demo_repeats + legitimate * demo_repeats
        # Label counts are derived from the lists so they cannot drift apart
        # from the texts if a template is added or removed.
        labels = [1] * (len(phishing) * demo_repeats) + [0] * (len(legitimate) * demo_repeats)
        data = pd.DataFrame({'text': texts, 'label': labels})
        print(f"‚úÖ –î–µ–º–æ-–¥–∞–Ω–Ω—ã–µ: {data.shape}")

# Honour the configured column names when the loaded frame actually has them;
# otherwise fall back to 'text' / 'label', which the SMS and demo branches
# above always produce.
data_config = config.get('data')
if not isinstance(data_config, dict):
    data_config = {}
text_col = data_config.get('text_column', 'text')
label_col = data_config.get('label_column', 'label')
if text_col not in data.columns or label_col not in data.columns:
    text_col = 'text'
    label_col = 'label'
print(f"–ö–ª–∞—Å—Å—ã: {data[label_col].value_counts().to_dict()}")

In [None]:
# NLP Preprocessing
# Patterns are compiled once at module level instead of on every call.
# URL and e-mail tokens stop at whitespace, angle brackets and quotes so that a
# match never swallows surrounding HTML markup or the words that follow it.
_URL_PATTERN = re.compile(r'(?:https?://|www\.)[^\s<>"\']+')
_EMAIL_PATTERN = re.compile(r'[^\s<>"\']+@[^\s<>"\']+')
# A tag must start like one (letter, '/', '!', '?' or '-') so that a bracketed
# placeholder such as "<[EMAIL]>" survives; [^>]* also spans line breaks.
_TAG_PATTERN = re.compile(r'<[/!?]?[a-z\-][^>]*>')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize a raw message into lowercase, letters-only text.

    Steps, in order: lowercase; replace URLs and e-mail addresses with the
    placeholders ``[URL]`` / ``[EMAIL]`` (which end up as the tokens ``URL``
    and ``EMAIL`` once punctuation is removed); replace HTML tags with a
    space so neighbouring words are not fused; replace every remaining
    non-letter character with a space; collapse runs of whitespace.

    Args:
        text: The raw message. Any non-string value (e.g. NaN from pandas)
            is treated as missing.

    Returns:
        The cleaned text, or an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _URL_PATTERN.sub('[URL]', text)
    text = _EMAIL_PATTERN.sub('[EMAIL]', text)
    text = _TAG_PATTERN.sub(' ', text)
    text = _NON_LETTER_PATTERN.sub(' ', text)

    # split()/join collapses all whitespace runs and trims both ends.
    return ' '.join(text.split())


# Apply the preprocessing to every message.
data['text_clean'] = data[text_col].apply(preprocess_text)

print("–ü—Ä–∏–º–µ—Ä—ã –æ–±—Ä–∞–±–æ—Ç–∫–∏:")
# head(2) rather than fixed positions 0 and 1, so a frame with fewer than two
# rows prints what it has instead of raising IndexError.
for _, row in data.head(2).iterrows():
    print(f"\n–ò—Å—Ö–æ–¥–Ω—ã–π: {row[text_col]}")
    print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π: {row['text_clean']}")
    print(f"–ö–ª–∞—Å—Å: {row[label_col]}")

In [None]:
# EDA: Word Clouds
# One panel per class actually present in the data, so the figure does not
# break (IndexError) with more than two classes or leave a stray empty panel
# with only one. Missing labels are ignored.
class_labels = sorted(data[label_col].dropna().unique())
n_panels = max(len(class_labels), 1)  # plt.subplots needs at least one column
fig, axes = plt.subplots(1, n_panels, figsize=(7.5 * n_panels, 6))
# With a single panel plt.subplots returns a bare Axes; normalize to an array.
axes = np.atleast_1d(axes)

for idx, label in enumerate(class_labels):
    texts = ' '.join(data[data[label_col] == label]['text_clean'])

    # Nothing to draw for a class whose cleaned text is empty.
    if texts.strip():
        wordcloud = WordCloud(
            width=600, height=300,
            background_color='white',
            max_words=50
        ).generate(texts)
        axes[idx].imshow(wordcloud, interpolation='bilinear')

    # Title and hidden axes are applied to every panel, drawn or not.
    class_name = 'Legitimate' if label == 0 else 'Phishing'
    axes[idx].set_title(f'{class_name} (Class {label})')
    axes[idx].axis('off')

plt.tight_layout()
plt.show()

# Text statistics: character and word counts of the cleaned text.
data['text_length'] = data['text_clean'].str.len()
data['word_count'] = data['text_clean'].str.split().str.len()

print("\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ –∫–ª–∞—Å—Å–∞–º:")
print(data.groupby(label_col)[['text_length', 'word_count']].mean())

In [None]:
# Train/test split: 80/20, stratified on the label, fixed seed.
X = data['text_clean']
y = data[label_col]

split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"Train classes: {y_train.value_counts().to_dict()}")

# Vectorization: both vectorizers share the same settings and differ only in
# how terms are weighted.
shared_vectorizer_params = {
    'max_features': 1000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
}
vectorizers = {
    'TF-IDF': TfidfVectorizer(**shared_vectorizer_params),
    'Count': CountVectorizer(**shared_vectorizer_params),
}

vectorized_data = {}
for name, vec in vectorizers.items():
    # The vocabulary is learned on the training texts only; the test texts
    # are merely transformed with it.
    train_matrix = vec.fit_transform(X_train)
    test_matrix = vec.transform(X_test)
    vectorized_data[name] = dict(
        vectorizer=vec,
        X_train=train_matrix,
        X_test=test_matrix,
    )
    print(f"{name}: {train_matrix.shape}")

In [None]:
# Model training: every model is evaluated on every vectorizer's features.
models = {
    # max_iter raised above the default so the solver has room to converge
    # on the sparse text features instead of stopping at the iteration cap.
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42)
}

results = []

for vec_name, vec_data in vectorized_data.items():
    print(f"\n=== {vec_name} ===")

    for model_name, model in models.items():
        # The same estimator instance is refit for each vectorizer, so after
        # this loop each model holds the fit from the last vectorizer only.
        model.fit(vec_data['X_train'], y_train)
        y_pred = model.predict(vec_data['X_test'])

        accuracy = accuracy_score(y_test, y_pred)
        # NOTE(review): f1_score with default arguments assumes binary labels
        # with positive class 1 - confirm for datasets with other encodings.
        f1 = f1_score(y_test, y_pred)

        results.append({
            'Vectorizer': vec_name,
            'Model': model_name,
            'Accuracy': accuracy,
            'F1-Score': f1
        })

        print(f"{model_name}: Acc={accuracy:.3f}, F1={f1:.3f}")

# Results table: one row per (vectorizer, model) combination.
results_df = pd.DataFrame(results)
print("\nüìä –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã:")
print(results_df.round(3))

# Best combination by F1. idxmax() returns an index *label*, so the row is
# fetched with .loc (label-based) rather than .iloc (position-based).
best_idx = results_df['F1-Score'].idxmax()
best = results_df.loc[best_idx]
print(f"\nüèÜ –õ—É—á—à–∞—è: {best['Model']} + {best['Vectorizer']}")
print(f"F1-Score: {best['F1-Score']:.3f}")

In [None]:
# Detailed analysis of the best (model, vectorizer) combination.
best_vec_name = best['Vectorizer']
best_model_name = best['Model']

# Refit the best model on its own vectorizer's features before predicting:
# the instance in `models` is shared across vectorizers and may currently
# hold a fit made on different features.
best_vec_data = vectorized_data[best_vec_name]
best_model = models[best_model_name]
best_model.fit(best_vec_data['X_train'], y_train)
y_pred_best = best_model.predict(best_vec_data['X_test'])

print(f"üîç –ê–Ω–∞–ª–∏–∑: {best_model_name} + {best_vec_name}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Legitimate', 'Phishing'],
            yticklabels=['Legitimate', 'Phishing'])
# Axis labels make the row/column orientation explicit to the reader.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Feature importance (if available): only models exposing `coef_` are
# covered; other models skip this section.
if getattr(best_model, 'coef_', None) is not None:
    vectorizer = best_vec_data['vectorizer']
    feature_names = vectorizer.get_feature_names_out()
    # A single coefficient row is used directly; with several rows the one at
    # index 1 is taken.
    coef = best_model.coef_[0] if len(best_model.coef_) == 1 else best_model.coef_[1]

    # Sort once (ascending) and read both ends of the ranking from it.
    ranked_idx = coef.argsort()

    # Top phishing indicators: the ten largest coefficients, largest first.
    top_phishing_idx = ranked_idx[-10:][::-1]
    print("\nüìà –¢–æ–ø phishing –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä—ã:")
    for idx in top_phishing_idx:
        print(f"  {feature_names[idx]}: {coef[idx]:.3f}")

    # Top legitimate indicators: the ten smallest coefficients, smallest first.
    top_legit_idx = ranked_idx[:10]
    print("\nüìâ –¢–æ–ø legitimate –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä—ã:")
    for idx in top_legit_idx:
        print(f"  {feature_names[idx]}: {coef[idx]:.3f}")

print("\n‚úÖ –ê–Ω–∞–ª–∏–∑ –∑–∞–≤–µ—Ä—à–µ–Ω!")

## Выводы

### Результаты:
...

### Следующие шаги:
...

### Production:
...