# Email Spam Detection - Model Training

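This notebook demonstrates the complete training pipeline for the spam detection model: generating synthetic data, exploring it, training the model, visualizing the results, saving the model, and testing predictions on sample emails.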

In [None]:
import sys
from pathlib import Path

# Put the project's "src" directory at the front of sys.path so the
# `spam_detector` imports further down in this cell can be resolved from it.
# NOTE(review): Path.cwd().parent assumes the kernel's working directory is a
# direct child of the project root (e.g. a notebooks/ folder) — confirm.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / "src"))  # index 0: searched before the existing entries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from spam_detector.config import settings
from spam_detector.data_generator import generate_dataset
from spam_detector.train import SpamDetectorTrainer

%matplotlib inline
sns.set_style('whitegrid')

## 1. Generate Training Data

First, let's generate synthetic email data for training.

In [None]:
# Generate dataset
# Tunable generation parameters, named so they are easy to find and change.
NUM_SAMPLES = 5000  # value passed as `num_samples`
SPAM_RATIO = 0.4    # value passed as `spam_ratio`

# settings.train_data_path is handed to the generator as its output path; the
# training cell later loads its data from that same path.
df = generate_dataset(
    num_samples=NUM_SAMPLES,
    spam_ratio=SPAM_RATIO,
    output_path=settings.train_data_path
)

print(f"Dataset shape: {df.shape}")
print("\nFirst few samples:")  # plain string: there is nothing to interpolate
df.head()

## 2. Explore the Data

In [None]:
# Label distribution
label_counts = df['label'].value_counts()
print("Label distribution:")
print(label_counts)

# Pick each bar's colour from its label instead of from its position.
# value_counts() orders the rows by descending frequency, so a positional
# ['green', 'red'] list would paint spam green whenever spam is the majority.
label_colors = {'ham': 'green', 'spam': 'red'}
bar_colors = [label_colors.get(label, 'gray') for label in label_counts.index]

# Visualize
plt.figure(figsize=(8, 5))
label_counts.plot(kind='bar', color=bar_colors)
plt.title('Distribution of Spam vs Ham Emails')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(rotation=0)  # keep the class names horizontal
plt.tight_layout()
plt.show()

In [None]:
# Text length distribution
df['text_length'] = df['text'].str.len()  # length of each email's text

plt.figure(figsize=(12, 5))

# One histogram panel per class, left to right: ham in green, spam in red.
for panel, (cls, shade) in enumerate([('ham', 'green'), ('spam', 'red')], start=1):
    plt.subplot(1, 2, panel)
    lengths = df.loc[df['label'] == cls, 'text_length']
    lengths.hist(bins=30, alpha=0.7, color=shade, label=cls.capitalize())
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    plt.title(f'{cls.capitalize()} Email Length Distribution')
    plt.legend()

plt.tight_layout()
plt.show()

## 3. Train the Model

In [None]:
# Build the trainer from the values held in `settings`
trainer_options = {
    'max_features': settings.max_features,
    'test_size': settings.test_size,
    'random_state': settings.random_state,
}
trainer = SpamDetectorTrainer(**trainer_options)

# Load the texts and labels from the training data path, then train.
# train() hands back the accuracy, the report and the confusion matrix that
# the visualization cells below consume.
texts, labels = trainer.load_data(settings.train_data_path)
accuracy, report, cm = trainer.train(texts, labels)

print(f"\nFinal Accuracy: {accuracy:.4f}")

## 4. Visualize Results

In [None]:
# Confusion Matrix
# NOTE(review): the tick labels assume the rows/columns of `cm` are ordered
# ham, spam — confirm against how the trainer builds the matrix.
class_names = ['ham', 'spam']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # format the annotations as integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:
# Classification metrics
# Turn `report` into a table with one row per report entry, then drop the
# aggregate rows so that only the remaining per-class rows are plotted.
metrics_df = pd.DataFrame(report).transpose()
metrics_df = metrics_df.drop(['accuracy', 'macro avg', 'weighted avg'])

# Create the figure and axes once and hand the axes to pandas. DataFrame.plot()
# opens a figure of its own when no `ax` is given, so a preceding bare
# plt.figure() call would be left behind as an empty figure in the output.
fig, ax = plt.subplots(figsize=(10, 6))
metrics_df[['precision', 'recall', 'f1-score']].plot(kind='bar', ax=ax, rot=0)
ax.set_title('Classification Metrics by Class')
ax.set_ylabel('Score')
ax.set_xlabel('Class')
ax.legend(loc='lower right')
ax.set_ylim([0, 1.1])  # a little headroom above the maximum score of 1.0
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Save the Model

In [None]:
# Persist the model and the vectorizer to the paths configured in `settings`
trainer.save_model(settings.model_path, settings.vectorizer_path)

print("Model training complete!")
# Report where each artifact was written
for artifact, location in (
    ("Model", settings.model_path),
    ("Vectorizer", settings.vectorizer_path),
):
    print(f"{artifact} saved to: {location}")

## 6. Test Predictions

In [None]:
# Hand-written sample emails to run through the trained pipeline
test_emails = [
    "Hi team, meeting scheduled for tomorrow at 2pm",
    "CONGRATULATIONS! You won $1,000,000! Click here NOW!",
    "Please review the attached quarterly report",
    "URGENT! Limited time offer! Buy now and save 90%!"
]

# Predicted label and per-class probabilities for every sample
predictions = trainer.pipeline.predict(test_emails)
probabilities = trainer.pipeline.predict_proba(test_emails)

for idx, message in enumerate(test_emails):
    # The largest class probability is reported as the confidence
    confidence = max(probabilities[idx])
    print(f"Email: {message}")
    print(f"Prediction: {predictions[idx]} (confidence: {confidence:.4f})\n")