In [3]:
# Import necessary libraries
import zipfile
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define paths and column names
zip_path = 'spambase.zip'  # Your uploaded zip filename
extract_folder = 'spambase_data'
data_path = f'{extract_folder}/spambase.data' 

column_names = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our',
    'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail',
    'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses',
    'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
    'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs',
    'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
    'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re',
    'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#',
    'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total',
    'spam' # Target column
]

# 1. Extract the data
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)
print(f"Extracted files to '{extract_folder}' folder.")

# 2. Load the data
data = pd.read_csv(data_path, header=None, names=column_names)
print("Data loaded. Shape:", data.shape)

# Display the first few rows
print("\nFirst 5 rows of the data:")
display(data.head())

Extracted files to 'spambase_data' folder.
Data loaded. Shape: (4601, 58)

First 5 rows of the data:


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [5]:
# Separate features (X) and target (y)
X = data.drop('spam', axis=1).values
y = data['spam'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set shape (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set shape (X_test, y_test): {X_test.shape}, {y_test.shape}")

Training set shape (X_train, y_train): (3220, 57), (3220,)
Testing set shape (X_test, y_test): (1381, 57), (1381,)


In [7]:
# Find the unique classes (0: not spam, 1: spam)
classes = np.unique(y_train)

# Calculate class priors P(C=c)
# priors[c] = (Count of class c) / (Total count)
priors = {c: np.mean(y_train == c) for c in classes}

# Calculate mean (μ) and standard deviation (σ) for each feature per class
# Note: ddof=1 for sample standard deviation
means = {c: np.mean(X_train[y_train == c], axis=0) for c in classes}
stds = {c: np.std(X_train[y_train == c], axis=0, ddof=1) for c in classes}

print("Calculated Class Priors:")
for c, prior in priors.items():
    print(f"  Class {c} (Spam={c}): {prior:.4f}")

print("\nFirst 5 Means for Class 0 (Not Spam):", means[0][:5])
print("First 5 Standard Deviations for Class 1 (Spam):", stds[1][:5])

Calculated Class Priors:
  Class 0 (Spam=0): 0.6161
  Class 1 (Spam=1): 0.3839

First 5 Means for Class 0 (Not Spam): [0.07296875 0.21552923 0.20525706 0.000625   0.17731351]
First 5 Standard Deviations for Class 1 (Spam): [0.31196498 0.37666884 0.48547619 2.15584132 0.72044424]


In [9]:
def likelihood(x, mean, std):
    """
    Calculates the sum of log-probabilities (log-likelihood) for a single sample x,
    assuming a Gaussian distribution for each feature.
    P(X=x | C=c) = Product over features P(X_i=x_i | C=c)
    log P(X=x | C=c) = Sum over features log P(X_i=x_i | C=c)
    """
    # Use a small constant to prevent division by zero for features with std=0
    std_safe = np.where(std == 0, 1e-6, std) 
    
    # norm.logpdf calculates log P(X_i=x_i | C=c)
    # np.sum adds them up due to Naive Bayes assumption of independence
    return np.sum(norm.logpdf(x, mean, std_safe))

def predict(X):
    """
    Predicts the class for an array of samples X using the maximum a posteriori principle.
    """
    y_pred = []
    for x in X:
        posteriors = {}
        for c in classes:
            # 1. Log Prior: log P(C=c)
            log_prior = np.log(priors[c])
            
            # 2. Log Likelihood: log P(X=x | C=c)
            log_likelihood = likelihood(x, means[c], stds[c])
            
            # 3. Log Posterior (Proportional): log P(X=x|C=c) + log P(C=c)
            posteriors[c] = log_prior + log_likelihood
        
        # Select the class with the highest log-posterior value (Maximum A Posteriori)
        y_pred.append(max(posteriors, key=posteriors.get))
        
    return np.array(y_pred)

print("Prediction functions defined.")

Prediction functions defined.


In [11]:
# Make predictions on the test set
y_pred = predict(X_test)

# Evaluate the model performance
print("--- Gaussian Naive Bayes Model Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

--- Gaussian Naive Bayes Model Evaluation ---
Accuracy: 0.8161

Confusion Matrix:
[[583 221]
 [ 33 544]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.73      0.82       804
           1       0.71      0.94      0.81       577

    accuracy                           0.82      1381
   macro avg       0.83      0.83      0.82      1381
weighted avg       0.85      0.82      0.82      1381



In [13]:
# Assuming y_train is your training target array (e.g., [0, 1, 0, 1, 1, ...])
classes = np.unique(y_train)

# --- PRIOR PROBABILITY CODE ---
# Calculate class priors P(C=c)
# np.mean(y_train == c) computes (count of c) / (total samples)
priors = {c: np.mean(y_train == c) for c in classes}
# -----------------------------

print("Calculated Class Priors:")
for c, prior in priors.items():
    print(f"  P(Class {c}): {prior:.4f}")

Calculated Class Priors:
  P(Class 0): 0.6161
  P(Class 1): 0.3839


In [15]:
# Assuming you have a single test sample 'x' and previously calculated 'priors', 'means', and 'stds'

# Define the Log Likelihood function first
def likelihood(x, mean, std):
    # Sum of log-probabilities from Gaussian PDF (Log Likelihood: log P(X=x | C=c))
    std_safe = np.where(std == 0, 1e-6, std) 
    return np.sum(norm.logpdf(x, mean, std_safe))

# --- LOG POSTERIOR PROBABILITY CODE (Proportional) ---
posteriors = {}
for c in classes:
    # 1. Log Prior: log P(C=c)
    log_prior = np.log(priors[c])

    # 2. Log Likelihood: log P(X=x | C=c)
    log_likelihood = likelihood(x, means[c], stds[c])

    # 3. Log Posterior (Proportional): Sum of Log Prior and Log Likelihood
    posteriors[c] = log_prior + log_likelihood
# ----------------------------------------------------

print(f"Log Posterior (Proportional) for Class 0: {posteriors[0]:.4f}")
print(f"Log Posterior (Proportional) for Class 1: {posteriors[1]:.4f}")

# The prediction is the class with the higher value
prediction = max(posteriors, key=posteriors.get)
print(f"Prediction: Class {prediction}")

NameError: name 'x' is not defined

In [17]:
# Assuming you have a single test sample 'x' and previously calculated 'priors', 'means', and 'stds'

# Define the Log Likelihood function first
def likelihood(x, mean, std):
    # Sum of log-probabilities from Gaussian PDF (Log Likelihood: log P(X=x | C=c))
    std_safe = np.where(std == 0, 1e-6, std) 
    return np.sum(norm.logpdf(x, mean, std_safe))

# --- LOG POSTERIOR PROBABILITY CODE (Proportional) ---
posteriors = {}
for c in classes:
    # 1. Log Prior: log P(C=c)
    log_prior = np.log(priors[c])

    # 2. Log Likelihood: log P(X=x | C=c)
    log_likelihood = likelihood(x, means[c], stds[c])

    # 3. Log Posterior (Proportional): Sum of Log Prior and Log Likelihood
    posteriors[c] = log_prior + log_likelihood
# ----------------------------------------------------

print(f"Log Posterior (Proportional) for Class 0: {posteriors[0]:.4f}")
print(f"Log Posterior (Proportional) for Class 1: {posteriors[1]:.4f}")

# The prediction is the class with the higher value
prediction = max(posteriors, key=posteriors.get)
print(f"Prediction: Class {prediction}")

NameError: name 'x' is not defined

In [19]:
def likelihood(x, mean, std):
    """Calculates the log-Likelihood for a single sample x."""
    # Safety: Use a small constant to prevent division by zero for std=0
    std_safe = np.where(std == 0, 1e-6, std) 
    return np.sum(norm.logpdf(x, mean, std_safe))

def predict(X):
    """Predicts the class for an array of samples X."""
    y_pred = []
    # 🎯 The variable 'x' is defined here as it iterates through X
    for x in X: 
        posteriors = {}
        for c in classes:
            # 1. Log Prior
            log_prior = np.log(priors[c])
            
            # 2. Log Likelihood
            log_likelihood = likelihood(x, means[c], stds[c])
            
            # 3. Log Posterior (Proportional)
            posteriors[c] = log_prior + log_likelihood
        
        # Bayesian Decision: Select class with max log-posterior
        y_pred.append(max(posteriors, key=posteriors.get))
        
    return np.array(y_pred)

In [23]:
# Assuming X_test, y_test are defined from the train_test_split
y_pred = predict(X_test)
print("Predictions made successfully.")

Predictions made successfully.


In [25]:
# Assuming 'x' is a single data point from X_test, 
# and 'c' is the current class (0 or 1) being evaluated.

# 1. Log Prior: log P(C=c)
log_prior = np.log(priors[c])

# 2. Log Likelihood: log P(X=x | C=c)
# This uses the 'likelihood' function defined earlier.
log_likelihood = likelihood(x, means[c], stds[c])

# --- 🎯 THE CALCULATION FOR LOG POSTERIOR (PROPORTIONAL) ---
# Log P(C=c | X=x) ∝ log P(X=x | C=c) + log P(C=c)
posteriors[c] = log_prior + log_likelihood
# -----------------------------------------------------------

# The result 'posteriors[c]' is the metric used for the final prediction.

NameError: name 'x' is not defined

In [28]:
# Assuming 'likelihood', 'priors', 'means', 'stds', and 'classes' are defined

def predict(X):
    """
    Predicts the class for an array of samples X.
    This function implements the Bayesian Decision Rule.
    """
    y_pred = []
    # 🎯 The variable 'x' is defined here as it iterates through the input array X
    for x in X: 
        posteriors = {}
        for c in classes:
            # 1. Log Prior
            log_prior = np.log(priors[c])
            
            # 2. Log Likelihood
            log_likelihood = likelihood(x, means[c], stds[c])
            
            # 3. Log Posterior (Proportional)
            posteriors[c] = log_prior + log_likelihood
        
        # Select the class with the maximum log-posterior value
        y_pred.append(max(posteriors, key=posteriors.get))
        
    return np.array(y_pred)

# Now, call the function using your test data:
y_pred = predict(X_test)

print("Prediction successful. The error is resolved.")

Prediction successful. The error is resolved.
