In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# Fix NumPy's global RNG so repeated runs draw the same random numbers; the
# same value is reused as `random_state` for the train/test split.
seed = 1234
np.random.seed(seed=seed)

# Data loading and preprocessing
# The file is read without a header row, so the column names are supplied
# here; the last one ("income") is the prediction target.
# NOTE(review): "naive" looks like a truncation of the census
# "native-country" field -- confirm before renaming, since other code may
# refer to the column by this name.
header = [
    "age", "Job-type", "fnlwgt", "edu", "edu-num", "marital-status",
    "Job", "Relationship", "race", "sex", "gain", "loss", "hrs/week",
    "naive", "income",
]
# skipinitialspace=True drops the blank that follows each comma in the raw
# file (assumed ", "-separated, as the UCI Adult data is usually distributed
# -- TODO confirm), so categorical values load as "Private" / ">50K" instead
# of " Private" / " >50K". It is a no-op when the file has no such padding.
df = pd.read_csv(
    'adult/adult.data',
    names=header,
    skipinitialspace=True,
    low_memory=False,
)

# Convert categorical variables to numerical
# Every non-numeric column of `df` (feature or target) is replaced in place
# by the integer codes produced by its own LabelEncoder. The fitted encoders
# are kept per column name so codes can later be mapped back to labels.
# The test is "not numeric" rather than `dtype == 'object'`: text columns
# loaded with a dedicated string dtype (or categorical columns) would be
# skipped by the object comparison and reach the models unencoded.
label_encoders = {}
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        label_encoders[column] = encoder

# Split features and target
# The last column of `df` is the target; every column before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=seed,
    test_size=0.2,
)

# Problem Description and Dataset Analysis
# Gather the figures first, then print the report one line at a time.
n_samples = df.shape[0]
n_features = X.shape[1]
class_shares = df['income'].value_counts(normalize=True)  # relative frequency per class
for report_line in (
    "=== Problem Description and Dataset Analysis ===",
    "Problem: Predict whether income exceeds $50K/yr based on census data",
    f"Dataset Size: {n_samples} samples",
    f"Number of Features: {n_features}",
    f"Class Distribution:\n{class_shares}\n",
):
    print(report_line)

# Initialize Naive Bayes models
# One default-configured instance of each variant, keyed by its class name.
models = {
    estimator.__name__: estimator()
    for estimator in (GaussianNB, MultinomialNB, BernoulliNB)
}

# Train and evaluate models
# For each model: fit on the training split, predict both splits, and record
# the misclassification rates, the test accuracy, and the text report.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    # Error rate = fraction of rows whose prediction differs from the label.
    results[name] = dict(
        train_error=np.mean(y_train != predicted_train),
        test_error=np.mean(y_test != predicted_test),
        test_accuracy=accuracy_score(y_test, predicted_test),
        classification_report=classification_report(y_test, predicted_test),
    )

# Parameter tuning for MultinomialNB (alpha parameter)
# Alpha is selected on a validation split carved out of the training data,
# not on the test set: choosing the alpha with the lowest *test* error and
# then reporting test metrics for that alpha makes those metrics
# optimistically biased. Test error is still recorded per alpha, but only so
# it can be shown in the plot.
alphas = np.logspace(-3, 1, 10)  # Test alpha from 0.001 to 10
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=seed
)
train_errors = []
val_errors = []
test_errors = []

for alpha in alphas:
    mnb = MultinomialNB(alpha=alpha)
    mnb.fit(X_fit, y_fit)

    # Error rate = fraction of rows whose prediction differs from the label.
    train_errors.append(np.mean(y_fit != mnb.predict(X_fit)))
    val_errors.append(np.mean(y_val != mnb.predict(X_val)))
    test_errors.append(np.mean(y_test != mnb.predict(X_test)))

# Plotting parameter tuning results
plt.figure(figsize=(10, 6))
plt.semilogx(alphas, train_errors, label='Training Error', color='blue')
plt.semilogx(alphas, val_errors, label='Validation Error', color='green')
plt.semilogx(alphas, test_errors, label='Test Error', color='orange',
             linestyle='--')  # diagnostic only; not used for selection
plt.xlabel('Alpha (Smoothing Parameter)')
plt.ylabel('Error Rate')
plt.title('MultinomialNB Performance vs Alpha')
plt.legend()
plt.grid(True)
plt.show()

# Find best alpha
# argmin returns the first minimum, so ties resolve to the smallest alpha.
best_alpha_idx = np.argmin(val_errors)
best_alpha = alphas[best_alpha_idx]
print(f"Best alpha: {best_alpha:.4f}")

# Train final model with best parameters
# Refit on the full training split with the selected smoothing value
# (`fit` returns the estimator), then predict the held-out test rows.
final_model = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)
final_test_preds = final_model.predict(X_test)

# Results Summary
# Markdown-style table: one row per model evaluated above.
for banner_line in (
    "\n=== Model Performance Summary ===",
    "| Model          | Train Error | Test Error | Test Accuracy |",
    "|----------------|-------------|------------|---------------|",
):
    print(banner_line)

for model_name, scores in results.items():
    train_err = scores['train_error']
    test_err = scores['test_error']
    test_acc = scores['test_accuracy']
    print(f"| {model_name:<14} | {train_err:.4f}      | {test_err:.4f}     | {test_acc:.4f}       |")

# Per-class report for the tuned MultinomialNB model.
final_report = classification_report(y_test, final_test_preds)
print("\n=== Final Model Detailed Performance ===")
print(f"MultinomialNB with alpha={best_alpha:.4f}")
print(final_report)

ModuleNotFoundError: No module named 'pandas'