# Import Dependencies

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_classification

# Preparing a Sample Dataset
### Let's create a simple dataset with a mix of continuous, categorical, and binary features for the examples.

In [3]:
# Create a dataset with a mix of continuous and categorical data
X, y = make_classification(n_samples=100, n_features=4, random_state=42)

df = pd.DataFrame(X, columns=["Feature1", "Feature2", "Feature3", "Feature4"])
df["Target"] = y

# Convert to categorical data for MultinomialNB 
df["Feature2"] = np.digitize(df["Feature2"], bins=[-3, 0, 3])
df["Feature3"] = (df["Feature3"] > 0).astype(int)  # Binary feature for BernoulliNB

# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("Target", axis=1), df["Target"], test_size=0.3, random_state=42
)


## Gaussian Naive Bayes
### Gaussian Naive Bayes is used when features follow a Gaussian distribution (common for continuous data).

In [5]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print("Gaussian Naive Bayes:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Gaussian Naive Bayes:
Confusion Matrix:
[[ 8  3]
 [ 2 17]]
Accuracy: 0.8333333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           1       0.85      0.89      0.87        19

    accuracy                           0.83        30
   macro avg       0.82      0.81      0.82        30
weighted avg       0.83      0.83      0.83        30



## Multinomial Naive Bayes
### Multinomial Naive Bayes is typically used for categorical or count data (e.g., word counts).

In [6]:
mnb = MultinomialNB()
# Only select categorical or non-negative features
X_train_multinomial = X_train[["Feature2"]]
X_test_multinomial = X_test[["Feature2"]]
mnb.fit(X_train_multinomial, y_train)
y_pred = mnb.predict(X_test_multinomial)

print("\nMultinomial Naive Bayes:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))



Multinomial Naive Bayes:
Confusion Matrix:
[[11  0]
 [19  0]]
Accuracy: 0.36666666666666664
Classification Report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54        11
           1       0.00      0.00      0.00        19

    accuracy                           0.37        30
   macro avg       0.18      0.50      0.27        30
weighted avg       0.13      0.37      0.20        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Bernoulli Naive Bayes
### Bernoulli Naive Bayes is used for binary data.

In [8]:
bnb = BernoulliNB()
# Only select binary features
X_train_bernoulli = X_train[["Feature3"]]
X_test_bernoulli = X_test[["Feature3"]]
bnb.fit(X_train_bernoulli, y_train)
y_pred = bnb.predict(X_test_bernoulli)

print("\nBernoulli Naive Bayes:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))



Bernoulli Naive Bayes:
Confusion Matrix:
[[ 8  3]
 [ 2 17]]
Accuracy: 0.8333333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           1       0.85      0.89      0.87        19

    accuracy                           0.83        30
   macro avg       0.82      0.81      0.82        30
weighted avg       0.83      0.83      0.83        30



## Application on Spam Detection

In [9]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

# Fetch dataset
data = fetch_20newsgroups(subset='train', categories=['sci.space', 'rec.sport.hockey'], shuffle=True, random_state=42)

In [10]:
# Use a pipeline to vectorize the text and then apply MultinomialNB
pipeline = make_pipeline(CountVectorizer(), MultinomialNB())


In [11]:
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    data.data, data.target, test_size=0.3, random_state=42
)

In [12]:
pipeline.fit(X_train_text, y_train_text)
y_pred_text = pipeline.predict(X_test_text)

In [13]:
print("\nSpam Detection with Multinomial Naive Bayes:")
print("Confusion Matrix:")
print(confusion_matrix(y_test_text, y_pred_text))
print("Accuracy:", accuracy_score(y_test_text, y_pred_text))
print("Classification Report:")
print(classification_report(y_test_text, y_pred_text, target_names=data.target_names))


Spam Detection with Multinomial Naive Bayes:
Confusion Matrix:
[[185   4]
 [  1 168]]
Accuracy: 0.9860335195530726
Classification Report:
                  precision    recall  f1-score   support

rec.sport.hockey       0.99      0.98      0.99       189
       sci.space       0.98      0.99      0.99       169

        accuracy                           0.99       358
       macro avg       0.99      0.99      0.99       358
    weighted avg       0.99      0.99      0.99       358

