In [11]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print("📊 Using matplotlib and seaborn for data visualization")


✅ All libraries imported successfully!
📊 Using matplotlib and seaborn for data visualization


In [12]:
# Try different separator and quote handling
try:
    df = pd.read_csv('../data/spamhamdata.csv', 
                     encoding='latin-1', 
                     sep=',',
                     quotechar='"',
                     on_bad_lines='skip')
    print("✅ Dataset loaded successfully")
except Exception as e:
    print(f"❌ Error: {e}")


✅ Dataset loaded successfully


In [13]:
# Data cleaning and preprocessing
# Handle different column names
if 'v1' in df.columns and 'v2' in df.columns:
    df = df[['v1', 'v2']].copy()
    df.columns = ['label', 'text']
elif 'Category' in df.columns and 'Message' in df.columns:
    df = df[['Category', 'Message']].copy()
    df.columns = ['label', 'text']

# Clean the data
df = df.dropna()
df['label'] = df['label'].str.lower()

print(f"Cleaned dataset shape: {df.shape}")
print(f"\nLabel distribution:")
print(df['label'].value_counts())
print(f"\nData types:")
print(df.dtypes)


KeyError: 'label'

In [None]:
# Exploratory Data Analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Label distribution
df['label'].value_counts().plot(kind='bar', ax=axes[0,0], color=['skyblue', 'lightcoral'])
axes[0,0].set_title('Distribution of Email Categories', fontsize=14, fontweight='bold')
axes[0,0].set_xlabel('Category')
axes[0,0].set_ylabel('Count')
axes[0,0].tick_params(axis='x', rotation=45)

# 2. Message length distribution
df['text_length'] = df['text'].str.len()
sns.boxplot(data=df, x='label', y='text_length', ax=axes[0,1])
axes[0,1].set_title('Text Length Distribution by Category', fontsize=14, fontweight='bold')
axes[0,1].set_xlabel('Category')
axes[0,1].set_ylabel('Text Length (characters)')

# 3. Word count distribution
df['word_count'] = df['text'].str.split().str.len()
sns.histplot(data=df, x='word_count', hue='label', bins=50, ax=axes[1,0])
axes[1,0].set_title('Word Count Distribution', fontsize=14, fontweight='bold')
axes[1,0].set_xlabel('Number of Words')
axes[1,0].set_ylabel('Frequency')

# 4. Pie chart of label distribution
df['label'].value_counts().plot(kind='pie', ax=axes[1,1], autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
axes[1,1].set_title('Email Category Distribution', fontsize=14, fontweight='bold')
axes[1,1].set_ylabel('')

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n📊 SUMMARY STATISTICS:")
print(f"Total emails: {len(df)}")
print(f"Spam emails: {len(df[df['label'] == 'spam'])} ({len(df[df['label'] == 'spam'])/len(df)*100:.1f}%)")
print(f"Ham emails: {len(df[df['label'] == 'ham'])} ({len(df[df['label'] == 'ham'])/len(df)*100:.1f}%)")
print(f"Average text length: {df['text_length'].mean():.1f} characters")
print(f"Average word count: {df['word_count'].mean():.1f} words")


In [None]:
# Advanced text analysis with word frequency
def analyze_word_patterns(df):
    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    
    # Extract words for spam and ham
    spam_text = ' '.join(df[df['label'] == 'spam']['text']).lower()
    ham_text = ' '.join(df[df['label'] == 'ham']['text']).lower()
    
    # Get word frequencies
    spam_words = re.findall(r'\b[a-zA-Z]{3,}\b', spam_text)
    ham_words = re.findall(r'\b[a-zA-Z]{3,}\b', ham_text)
    
    spam_counter = Counter(spam_words).most_common(20)
    ham_counter = Counter(ham_words).most_common(20)
    
    # 1. Top spam words
    if spam_counter:
        words, counts = zip(*spam_counter)
        y_pos = np.arange(len(words))
        axes[0,0].barh(y_pos, counts, color='lightcoral', alpha=0.8)
        axes[0,0].set_yticks(y_pos)
        axes[0,0].set_yticklabels(words)
        axes[0,0].set_title('Top 20 Words in SPAM Emails', fontsize=14, fontweight='bold')
        axes[0,0].set_xlabel('Frequency')
    
    # 2. Top ham words
    if ham_counter:
        words, counts = zip(*ham_counter)
        y_pos = np.arange(len(words))
        axes[0,1].barh(y_pos, counts, color='lightblue', alpha=0.8)
        axes[0,1].set_yticks(y_pos)
        axes[0,1].set_yticklabels(words)
        axes[0,1].set_title('Top 20 Words in HAM Emails', fontsize=14, fontweight='bold')
        axes[0,1].set_xlabel('Frequency')
    
    # 3. Character-level analysis
    df['exclamation_count'] = df['text'].str.count('!')
    df['question_count'] = df['text'].str.count('\?')
    df['caps_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    
    # Exclamation marks
    sns.boxplot(data=df, x='label', y='exclamation_count', ax=axes[1,0])
    axes[1,0].set_title('Exclamation Marks Distribution', fontsize=14, fontweight='bold')
    axes[1,0].set_ylabel('Number of Exclamation Marks')
    
    # Capital letters ratio
    sns.boxplot(data=df, x='label', y='caps_ratio', ax=axes[1,1])
    axes[1,1].set_title('Capital Letters Ratio', fontsize=14, fontweight='bold')
    axes[1,1].set_ylabel('Ratio of Capital Letters')
    
    plt.tight_layout()
    plt.show()
    
    # Print insights
    print("📊 TEXT ANALYSIS INSIGHTS:")
    print(f"Avg exclamation marks in spam: {df[df['label']=='spam']['exclamation_count'].mean():.2f}")
    print(f"Avg exclamation marks in ham: {df[df['label']=='ham']['exclamation_count'].mean():.2f}")
    print(f"Avg caps ratio in spam: {df[df['label']=='spam']['caps_ratio'].mean():.3f}")
    print(f"Avg caps ratio in ham: {df[df['label']=='ham']['caps_ratio'].mean():.3f}")

# Run the analysis
analyze_word_patterns(df)


In [None]:
# Text preprocessing
import sys
sys.path.append('../')
from utils.preprocess import preprocess_text

# Apply preprocessing
print("Applying text preprocessing...")
df['processed_text'] = df['text'].apply(preprocess_text)

# Show examples of preprocessing
print("\n📝 PREPROCESSING EXAMPLES:")
for i in range(5):
    print(f"\n{i+1}. Original ({df['label'].iloc[i]}):")
    print(f"   {df['text'].iloc[i][:100]}...")
    print(f"   Processed:")
    print(f"   {df['processed_text'].iloc[i][:100]}...")
    print("-" * 80)

# Check for empty processed texts
empty_texts = df[df['processed_text'] == ''].shape[0]
print(f"\n⚠️ Found {empty_texts} empty texts after preprocessing")

# Remove empty processed texts
df = df[df['processed_text'] != ''].reset_index(drop=True)
print(f"✅ Final dataset size: {len(df)} emails")


In [None]:
# Model training and evaluation setup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Prepare features and labels
print("Preparing features and labels...")
X = df['processed_text']
y = df['label'].map({'ham': 0, 'spam': 1})

print(f"Feature vector shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Class distribution: {y.value_counts().to_dict()}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training class distribution: {pd.Series(y_train).value_counts().to_dict()}")
print(f"Test class distribution: {pd.Series(y_test).value_counts().to_dict()}")


In [None]:
# Feature extraction with TF-IDF
print("Creating TF-IDF features...")

vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    lowercase=True,
    strip_accents='ascii'
)

# Fit and transform
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

print(f"Feature matrix shape (train): {X_train_vect.shape}")
print(f"Feature matrix shape (test): {X_test_vect.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Show top features
feature_names = vectorizer.get_feature_names_out()
print(f"\nSample features: {feature_names[:20]}")

# Feature sparsity
sparsity = 1 - (X_train_vect.nnz / (X_train_vect.shape[0] * X_train_vect.shape[1]))
print(f"Feature matrix sparsity: {sparsity:.3f}")


In [None]:
# Train and evaluate multiple models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
}

results = {}
model_objects = {}

print("Training and evaluating models...")
print("=" * 60)

for name, model in models.items():
    print(f"\n🤖 Training {name}...")
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train_vect, y_train, cv=5, scoring='accuracy')
    
    # Train model
    model.fit(X_train_vect, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_vect)
    y_pred_proba = model.predict_proba(X_test_vect) if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }
    model_objects[name] = model
    
    print(f"   Cross-validation: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
    print(f"   Test accuracy: {accuracy:.4f}")
    
    # Detailed classification report
    print(f"   Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam'], zero_division=0))

print("\n" + "=" * 60)
print("📊 MODEL COMPARISON SUMMARY:")
for name, metrics in results.items():
    print(f"{name}: {metrics['accuracy']:.4f} (CV: {metrics['cv_mean']:.4f})")


In [None]:
# Visualize model performance comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Accuracy comparison
models_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in models_names]
cv_means = [results[name]['cv_mean'] for name in models_names]

bars1 = axes[0,0].bar(models_names, accuracies, color=['skyblue', 'lightgreen', 'lightcoral'], alpha=0.8)
axes[0,0].set_title('Test Accuracy Comparison', fontsize=14, fontweight='bold')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].set_ylim(0.8, 1.0)

# Add accuracy values on bars
for bar, accuracy in zip(bars1, accuracies):
    axes[0,0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, 
                   f'{accuracy:.4f}', ha='center', va='bottom', fontweight='bold')

# 2. Cross-validation comparison
bars2 = axes[0,1].bar(models_names, cv_means, color=['skyblue', 'lightgreen', 'lightcoral'], alpha=0.8)
cv_stds = [results[name]['cv_std'] for name in models_names]
axes[0,1].errorbar(models_names, cv_means, yerr=cv_stds, fmt='none', color='black', capsize=5)
axes[0,1].set_title('Cross-Validation Accuracy', fontsize=14, fontweight='bold')
axes[0,1].set_ylabel('CV Accuracy')
axes[0,1].set_ylim(0.8, 1.0)

# Add CV values on bars
for bar, cv_mean in zip(bars2, cv_means):
    axes[0,1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, 
                   f'{cv_mean:.4f}', ha='center', va='bottom', fontweight='bold')

# 3. Feature importance (for Random Forest)
if 'Random Forest' in model_objects:
    rf_model = model_objects['Random Forest']
    feature_importance = rf_model.feature_importances_
    top_indices = np.argsort(feature_importance)[-15:]
    top_features = [feature_names[i] for i in top_indices]
    top_importance = feature_importance[top_indices]
    
    axes[1,0].barh(range(len(top_features)), top_importance, color='orange', alpha=0.7)
    axes[1,0].set_yticks(range(len(top_features)))
    axes[1,0].set_yticklabels(top_features)
    axes[1,0].set_title('Top 15 Features (Random Forest)', fontsize=14, fontweight='bold')
    axes[1,0].set_xlabel('Feature Importance')

# 4. Model performance metrics
metrics_data = []
for name in models_names:
    metrics_data.append([
        results[name]['accuracy'],
        results[name]['cv_mean'],
        results[name]['cv_std']
    ])

metrics_df = pd.DataFrame(metrics_data, 
                         columns=['Test Accuracy', 'CV Mean', 'CV Std'],
                         index=models_names)

# Create heatmap
sns.heatmap(metrics_df, annot=True, fmt='.4f', cmap='Blues', ax=axes[1,1])
axes[1,1].set_title('Performance Metrics Heatmap', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Select best model and create detailed confusion matrix
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_model = model_objects[best_model_name]
best_predictions = results[best_model_name]['predictions']
best_accuracy = results[best_model_name]['accuracy']

print(f"🏆 BEST MODEL: {best_model_name}")
print(f"   Accuracy: {best_accuracy:.4f}")
print(f"   CV Score: {results[best_model_name]['cv_mean']:.4f} (+/- {results[best_model_name]['cv_std']*2:.4f})")

# Create detailed confusion matrix visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Confusion matrix
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=axes[0])
axes[0].set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

# Normalized confusion matrix
cm_normalized = confusion_matrix(y_test, best_predictions, normalize='true')
sns.heatmap(cm_normalized, annot=True, fmt='.3f', cmap='Greens', 
            xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=axes[1])
axes[1].set_title(f'Normalized Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')

plt.tight_layout()
plt.show()

# Calculate additional metrics
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, best_predictions)
recall = recall_score(y_test, best_predictions)
f1 = f1_score(y_test, best_predictions)

print(f"\n📊 DETAILED METRICS FOR {best_model_name}:")
print(f"   Accuracy:  {best_accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1-Score:  {f1:.4f}")

# False positive and false negative analysis
tn, fp, fn, tp = cm.ravel()
print(f"\n🔍 CONFUSION MATRIX BREAKDOWN:")
print(f"   True Negatives (Ham → Ham):   {tn}")
print(f"   False Positives (Ham → Spam): {fp}")
print(f"   False Negatives (Spam → Ham): {fn}")
print(f"   True Positives (Spam → Spam): {tp}")


In [None]:
# Test predictions on sample emails
sample_emails = [
    "Congratulations! You've won $1000! Click here to claim now!",
    "Hi John, can we meet for lunch tomorrow at 1 PM?",
    "FREE MONEY! URGENT! Act now and get rich quick!",
    "Meeting reminder: Team standup at 10 AM in conference room.",
    "WINNER! You have been selected to receive a FREE iPhone! Claim now!",
    "Thanks for your email. I'll get back to you by end of week.",
    "HOT SINGLES in your area! Click here for instant access!",
    "Could you please review the quarterly report I sent yesterday?"
]

print("🧪 TESTING MODEL ON SAMPLE EMAILS:")
print("=" * 80)

for i, email in enumerate(sample_emails, 1):
    # Preprocess email
    processed_email = preprocess_text(email)
    
    # Vectorize
    email_vect = vectorizer.transform([processed_email])
    
    # Predict
    prediction = best_model.predict(email_vect)[0]
    
    # Get probability if available
    if hasattr(best_model, 'predict_proba'):
        probabilities = best_model.predict_proba(email_vect)[0]
        confidence = max(probabilities)
        spam_prob = probabilities[1]
    else:
        confidence = 0.8
        spam_prob = prediction
    
    result = "SPAM" if prediction == 1 else "HAM"
    
    print(f"\n{i}. Email: {email}")
    print(f"   Processed: {processed_email[:80]}...")
    print(f"   Prediction: {result} (Confidence: {confidence:.2%})")
    if hasattr(best_model, 'predict_proba'):
        print(f"   Spam Probability: {spam_prob:.3f}")
    print("-" * 80)

# Feature analysis for interpretability
if hasattr(best_model, 'coef_'):  # For Logistic Regression
    # Get feature coefficients
    feature_coef = best_model.coef_[0]
    
    # Top positive features (spam indicators)
    top_spam_indices = np.argsort(feature_coef)[-15:]
    top_spam_features = [(feature_names[i], feature_coef[i]) for i in top_spam_indices]
    
    # Top negative features (ham indicators)
    top_ham_indices = np.argsort(feature_coef)[:15]
    top_ham_features = [(feature_names[i], feature_coef[i]) for i in top_ham_indices]
    
    print("\n🔍 MODEL INTERPRETATION (Logistic Regression Coefficients):")
    print("\nTop SPAM indicators:")
    for feature, coef in reversed(top_spam_features):
        print(f"   {feature}: {coef:.4f}")
    
    print("\nTop HAM indicators:")
    for feature, coef in top_ham_features:
        print(f"   {feature}: {coef:.4f}")

print(f"\n✅ NOTEBOOK ANALYSIS COMPLETE!")
print(f"   Best model: {best_model_name} ({best_accuracy:.4f} accuracy)")
print(f"   Total emails analyzed: {len(df)}")
print(f"   Features extracted: {X_train_vect.shape[1]}")
