In [None]:
# %pip install seaborn
%pip install plotly
import plotly.graph_objects as go
# import ipywidgets as widgets
from math import sqrt
import time
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pprint as pp
import numpy as np
import pandas as pd


################################## DATA LOADING / PROCESSING ONLY ##################################
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import KBinsDiscretizer
####################################################################################################

# If Using Google Colab, enable these extensions
# from google.colab import output
# output.enable_custom_widget_manager()

## Import Data and Light EDA

Data we will be using!
- Download the raw data [here](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic). You do not need to, though; I have already included it!
- Wolberg, William, Mangasarian, Olvi, Street, Nick, and Street, W. (1995). Breast Cancer Wisconsin (Diagnostic). UCI Machine Learning Repository. https://doi.org/10.24432/C5DW2B.

In [None]:
# Load the breast-cancer dataset via scikit-learn's loader
data = load_breast_cancer()
# Feature matrix as a DataFrame, with the dataset's feature names as column labels
X = pd.DataFrame(data.data, columns=data.feature_names)
# Class labels as a Series; like X it gets a default positional index, so rows line up
y = pd.Series(data.target)

In [None]:
# Per-feature summary statistics, shown as this cell's output
X.describe()

In [None]:
# Print how many samples carry each class label
print("\nDistribution of the target variable (y):")
print(y.value_counts())

# Bar chart of the same per-class counts
y.value_counts().plot(kind='bar', title='Distribution of the target variable (Malignant=0, Benign=1)')

## Preprocess Data for NB Training

**What we Start with:**
- data = raw data
- X = all data without labels (share same index with y)
- y = labels (share same index with X)

**What we Split:**
- X_train, y_train 
- X_test, y_test 

- Training set shape: (398, 30)
- Test set shape: (171, 30)
- Unique classes: [0 1]



In [None]:
# Split the dataset into training and testing sets:
# 30% of the rows are held out for testing, and the fixed random_state
# makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

# Sanity-check the split sizes and the labels present in the training set
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Unique classes:", np.unique(y_train))

In [None]:
# Example of one continuous feature column before discretization
X_train["mean radius"]

## Discretize Continuous Data

In [None]:
# Discretize the data: every feature is cut into 5 bins ('uniform' strategy)
# and each value is replaced by its ordinal bin index.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
# Bin edges are learned from the training set only ...
X_train_discretized = discretizer.fit_transform(X_train)
# ... and the same edges are reused for the test set, so no test information leaks into the fit
X_test_discretized = discretizer.transform(X_test)


In [None]:
# First feature column after discretization: the values are now bin indices
X_train_discretized[:,0]

## Let's Tackle NB from Scratch

Bayes' Theorem and the "Missing" Denominator
Recall Bayes' theorem:
$$P(y|X) = \frac{P(X|y)P(y)}{P(X)}$$
Where:

- $P(y|X)$ is the posterior probability
- $P(X|y)$ is the likelihood
- $P(y)$ is the prior probability
- $P(X)$ is the evidence

In Naive Bayes classification, we're trying to find the class $y$ that maximizes $P(y|X)$. The key insight is that $P(X)$ is constant for all classes $y$ for a given input $X$. Therefore, when we're comparing different classes, we can ignore this term.
Mathematically, we can express this as:
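$$\hat{y} = \arg\max_{y} P(y|X) = \arg\max_{y} \frac{P(X|y)P(y)}{P(X)} = \arg\max_{y} P(X|y)P(y)$$
The last step holds because $P(y|X) \propto P(X|y)P(y)$, where the $\propto$ symbol means "proportional to": dividing every class score by the same positive constant $P(X)$ does not change which class is largest. Since we're only interested in which class maximizes this probability, and not the actual probability value, we can work with this proportional form.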
This is why our final classification rule becomes:
$$\hat{y} = \arg\max_{y} \left( \log P(y) + \sum_{i=1}^{n} \log P(x_i|y) \right)$$
Which is equivalent to:
$$\hat{y} = \arg\max_{y} \left( \log P(y) + \log P(X|y) \right)$$
Where we've also taken the logarithm to turn products into sums (for numerical stability).

The Logarithmic Solution:
We apply the logarithm to both sides of our equation:
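$$\log P(y|X) = \log P(y) + \sum_{i=1}^{n} \log P(x_i|y) - \log P(X)$$
The last term, $\log P(X)$, is the same for every class, so it does not affect which class scores highest and can be dropped.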
Key points:

1. Log of a Product: The logarithm of a product is the sum of logarithms:
$$\log(a \cdot b \cdot c) = \log(a) + \log(b) + \log(c)$$
2. Monotonicity: The logarithm is a monotonically increasing function, meaning if $a > b$, then $\log(a) > \log(b)$. This property ensures that the class with the highest probability will also have the highest log probability.
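3. Numerical Stability: Multiplying many probabilities, each between 0 and 1, quickly produces a number too small for floating point to represent (underflow). Their logarithms are ordinary negative numbers (since $\log(x) < 0$ for $0 < x < 1$), and adding them avoids underflow and preserves precision.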


The final equation for Naive Bayes classification with Laplace smoothing can be expressed as:
$$\hat{y} = \arg\max_{y} \left( \log P(y) + \sum_{i=1}^{n} \log P(x_i|y) \right)$$
Where:
$$P(y) = \frac{\text{count}(y) + \alpha}{\sum_{y'} (\text{count}(y') + \alpha)}$$
$$P(x_i|y) = \frac{\text{count}(x_i, y) + \alpha}{\text{count}(y) + \alpha |V_i|}$$
And:

- $\hat{y}$ is the predicted class
- $y$ is a class label
- $x_i$ is the i-th feature
- $n$ is the number of features
- $\alpha$ is the smoothing parameter (typically 1 for Laplace smoothing)
- $|V_i|$ is the number of unique values for the i-th feature
- $\text{count}(y)$ is the number of samples with class $y$
- $\text{count}(x_i, y)$ is the number of samples in class $y$ whose i-th feature takes the value $x_i$

In [None]:
class NaiveBayes:
    """Exercise skeleton for a Naive Bayes classifier on discretized features.

    The three milestones marked TODO below are still unimplemented: as written,
    ``fit`` only prints diagnostics and ``predict`` returns an empty array.
    """

    def __init__(self):
        # All three attributes stay None until the milestones in fit() are implemented.
        self.classes = None
        self.priors = None
        self.likelihoods = None

    def fit(self, X, y):
        """Placeholder for training: prints ``X.shape`` and the unique labels in ``y``.

        Args:
            X: Feature matrix; only its ``shape`` attribute is used at the moment.
            y: Class labels; only ``np.unique(y)`` is used at the moment.
        """
        # Milestone 1: Calculate prior probabilities
        # TODO: Calculate self.classes and self.priors
        print(X.shape)
        print(np.unique(y))
        # print("Milestone 1: Prior probabilities")
        # print(f"Classes: {self.classes}")
        # print(f"Priors: {self.priors}")

        # Milestone 2: Calculate likelihood probabilities
        # TODO: Calculate self.likelihoods with Laplace smoothing
        # print("\nMilestone 2: Likelihood probabilities")
        # print(f"Likelihoods shape: {len(self.likelihoods)}")
        # Print a sample of likelihoods for debugging

    def predict(self, X):
        """Placeholder for prediction: currently ignores ``X`` and returns an empty array.

        Returns:
            np.ndarray built from the (currently empty) ``predictions`` list.
        """
        predictions = []
        # Milestone 3: Implement prediction logic
        # TODO: Calculate class scores and make predictions
        # print("\nMilestone 3: Predictions")
        # print(f"Number of predictions: {len(predictions)}")
        return np.array(predictions)






######### Create and train the skeleton model #########
nb = NaiveBayes()
nb.fit(X_train_discretized, y_train)

######### Make predictions and calculate accuracy #########
# Kept commented out: enable once NaiveBayes.predict returns real predictions.
# nb_predictions = nb.predict(X_test_discretized)
# nb_accuracy = np.mean(nb_predictions == y_test)
# print(f"Discretized Naive Bayes Accuracy: {nb_accuracy:.4f}")






In [None]:
class NaiveBayesTest:
    """Categorical Naive Bayes with additive (Laplace) smoothing.

    Works on features that are already discrete (e.g. bin indices). After
    ``fit`` the instance exposes:

    - ``classes``: sorted array of the distinct labels seen in training.
    - ``priors``: ``{class: P(class)}`` estimated as the class frequency.
    - ``likelihoods``: ``{class: {feature_index: {value: P(value | class)}}}``.
      Each inner table has one entry per value observed for that feature
      anywhere in the training set, plus an ``'unseen'`` entry holding the
      smoothed fallback used for values never observed in training.

    Args:
        alpha: Additive smoothing strength; must be positive. ``1.0`` is
            classic Laplace smoothing.

    Raises:
        ValueError: If ``alpha`` is not positive.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha
        self.classes = None
        self.priors = None
        self.likelihoods = None

    def fit(self, X, y, verbose=False):
        """Estimate class priors and smoothed per-feature likelihood tables.

        Args:
            X: 2-D array-like of discrete feature values, one row per sample.
            y: 1-D array-like of class labels, one per row of ``X``.
            verbose: When True, pretty-print the fitted likelihood tables.
                Off by default because the nested table is large.
        """
        # Plain arrays make boolean masking and row iteration behave the same
        # whether the caller passes ndarrays, DataFrames or Series.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Prior probabilities: P(class) = class frequency in the training set
        self.classes, class_counts = np.unique(y, return_counts=True)
        self.priors = dict(zip(self.classes, class_counts / n_samples))

        # |V_i| must be counted over the WHOLE training set, not per class.
        # Otherwise a value seen only in another class gets no entry and the
        # per-class tables are normalised over different vocabularies.
        vocabularies = [np.unique(X[:, feature]) for feature in range(n_features)]

        # Likelihood probabilities: P(feature value | class)
        self.likelihoods = {c: {} for c in self.classes}
        for c, class_count in zip(self.classes, class_counts):
            X_class = X[y == c]
            for feature, vocabulary in enumerate(vocabularies):
                seen_values, seen_counts = np.unique(X_class[:, feature], return_counts=True)
                counts = dict(zip(seen_values, seen_counts))

                # (count + alpha) / (class_count + alpha * |V_i|)
                smoothed_total = class_count + self.alpha * len(vocabulary)
                table = {
                    value: (counts.get(value, 0) + self.alpha) / smoothed_total
                    for value in vocabulary
                }

                # Fallback for values that never occurred in training at all;
                # same probability as a zero-count value.
                table['unseen'] = self.alpha / smoothed_total
                self.likelihoods[c][feature] = table

        if verbose:
            pp.pprint(self.likelihoods)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are sums of log probabilities, which avoids the underflow that
        multiplying many small probabilities would cause.

        Args:
            X: 2-D array-like with the same feature layout used in ``fit``.

        Returns:
            np.ndarray of predicted labels, one per row. Ties go to the class
            that comes first in ``self.classes``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.likelihoods is None:
            raise RuntimeError("fit must be called before predict")

        predictions = []
        for sample in np.asarray(X):
            class_scores = {}
            for c in self.classes:
                score = np.log(self.priors[c])
                for feature, value in enumerate(sample):
                    table = self.likelihoods[c][feature]
                    # Use the stored smoothed fallback for unknown values
                    score += np.log(table.get(value, table['unseen']))
                class_scores[c] = score
            predictions.append(max(class_scores, key=class_scores.get))
        return np.array(predictions)
    
# Create and train the reference categorical Naive Bayes model
nb_test = NaiveBayesTest()
nb_test.fit(X_train_discretized, y_train)

# Predict on the discretized test set; accuracy is the fraction of matching labels
nb_predictions = nb_test.predict(X_test_discretized)
nb_accuracy = np.mean(nb_predictions == y_test)
print(f"Discretized Naive Bayes Accuracy: {nb_accuracy:.4f}")

# Naive Bayes: Mathematical Foundations

## 1. Prior Probability

The prior probability of a class is the probability of observing that class before seeing any features.

$P(y) = \frac{\text{count of samples in class y}}{\text{total number of samples}}$

This corresponds to the calculation of `self.priors` in the `fit` method:

```python
self.priors = {c: np.sum(y == c) / n_samples for c in self.classes}
# or...
classes, counts = np.unique(y, return_counts=True)
self.priors = dict(zip(classes, counts / n_samples))
```

## 2. Likelihood

The likelihood is the probability of observing a feature value given a particular class.

$P(x_i|y) = \frac{\text{count of feature } x_i \text{ in class y} + 1}{\text{total count in class y} + \text{number of unique values}}$

The "+1" in the numerator and "number of unique values" in the denominator represent Laplace smoothing.

This corresponds to the calculation of `self.likelihoods` in the `fit` method:

```python
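smoothed_counts = feature_counts + 1                     # "+1" in the numerator
smoothed_total = class_count + num_unique_values          # "+ number of unique values" in the denominator
self.likelihoods[c][feature] = dict(zip(feature_values, smoothed_counts / smoothed_total))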
```

## 3. Posterior Probability

The posterior probability is what we're trying to calculate for prediction. It's the probability of a class given the observed features.

$P(y|X) \propto P(y) \prod_{i=1}^{n} P(x_i|y)$

In practice, we often work with log probabilities to avoid underflow:

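$\log P(y|X) = \log P(y) + \sum_{i=1}^{n} \log P(x_i|y) + \text{const}$

(the constant is $-\log P(X)$, which is the same for every class and therefore does not affect the comparison)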

This corresponds to the calculation in the `predict` method:

```python
class_scores[c] = np.log(self.priors[c])
for feature, value in enumerate(sample):
    class_scores[c] += np.log(self.likelihoods[c][feature][value])
```

## 4. Classification

The final classification is done by choosing the class with the highest posterior probability:

$\hat{y} = \arg\max_{y} P(y|X)$

Or, using log probabilities:

$\hat{y} = \arg\max_{y} (\log P(y) + \sum_{i=1}^{n} \log P(x_i|y))$

This corresponds to the final prediction in the `predict` method:

```python
predictions.append(max(class_scores, key=class_scores.get))
```

## Mapping to Code

1. `__init__`: Initializes the attributes that will store the computed probabilities.
2. `fit`:
   - Calculates prior probabilities (`self.priors`)
   - Calculates likelihood probabilities (`self.likelihoods`)
3. `predict`:
   - Computes the log posterior probability for each class
   - Selects the class with the highest probability as the prediction

The "naive" in Naive Bayes comes from the assumption that features are conditionally independent given the class, which allows us to simply multiply (or add in log space) the individual feature likelihoods.

# Gaussian NB
**when being average is good enough...**

#### Pure Python Version *--for loopin' it*

In [None]:
def calculate_prior(y):
    """Estimate each class's prior probability as its relative frequency in ``y``.

    Args:
        y: Iterable of class labels that also supports ``len``.

    Returns:
        dict mapping each distinct label to ``count(label) / len(y)``.
    """
    prior = {
        cls: sum(1 for val in y if val == cls) / len(y)
        for cls in set(y)
    }
    print("Prior probabilities:", prior)  # DEBUGGING
    return prior

def calculate_likelihood(X, y):
    """Compute per-class, per-feature mean and variance for Gaussian Naive Bayes.

    Args:
        X: Feature table that supports boolean-mask row selection and
            ``.mean(axis=0)`` / ``.var(axis=0)`` (a pandas DataFrame in this notebook).
        y: Class labels aligned with the rows of ``X``.

    Returns:
        dict mapping each class to ``{'mean': ..., 'var': ...}``, where both
        entries are computed over that class's rows only.
    """
    likelihood = {}
    for cls in set(y):
        class_rows = X[y == cls]
        likelihood[cls] = {
            'mean': class_rows.mean(axis=0),
            'var': class_rows.var(axis=0),
        }

    mean_shapes = {cls: stats['mean'].shape for cls, stats in likelihood.items()}
    var_shapes = {cls: stats['var'].shape for cls, stats in likelihood.items()}
    print("Likelihood means shape:", mean_shapes)  # DEBUGGING
    print("Likelihood variances shape:", var_shapes)  # DEBUGGING
    return likelihood

def calculate_posterior(X, prior, likelihood):
    """Compute unnormalised Gaussian log-posteriors for every row and class.

    For each row and class the score is
    ``log P(class) + sum_features log N(value; mean, var)``.

    Args:
        X: pandas DataFrame of feature values, one row per sample.
        prior: dict mapping class -> prior probability.
        likelihood: dict mapping class -> ``{'mean': ..., 'var': ...}``, each
            indexable by the column labels of ``X``.

    Returns:
        list with one ``{class: log_posterior}`` dict per row of ``X``.

    Note:
        A feature whose variance is zero for some class makes ``math.log`` or
        the division fail; no variance smoothing is applied here.
    """
    features = list(X.columns)
    posteriors = []
    for i in range(len(X)):
        # Fetch the row once; building it anew for every feature and class
        # dominated the runtime without changing the result.
        row = X.iloc[i]
        sample_posteriors = {}
        for cls in prior:
            # Per-class statistics do not depend on the feature loop
            means = likelihood[cls]['mean']
            variances = likelihood[cls]['var']
            prior_prob = math.log(prior[cls])
            conditional_prob = 0
            for feature in features:
                mean = means[feature]
                var = variances[feature]
                value = row[feature]
                # Log of the Gaussian density N(value; mean, var)
                conditional_prob += -0.5 * math.log(2 * math.pi * var) - ((value - mean) ** 2) / (2 * var)
            sample_posteriors[cls] = prior_prob + conditional_prob
        posteriors.append(sample_posteriors)
    return posteriors

def predict(X, prior, likelihood):
    """Pick, for every row of ``X``, the class with the highest log-posterior.

    Args:
        X: Feature table passed straight to ``calculate_posterior``.
        prior: dict mapping class -> prior probability.
        likelihood: dict mapping class -> Gaussian parameters.

    Returns:
        list of predicted class labels, one per row of ``X``.
    """
    return [
        max(scores, key=scores.get)
        for scores in calculate_posterior(X, prior, likelihood)
    ]

In [None]:
# Measure training time: "training" is just estimating priors and per-class mean/variance
start_time = time.time()
prior = calculate_prior(y_train)
likelihood = calculate_likelihood(X_train, y_train)
training_time = time.time() - start_time

print(f'Training time: {training_time:.4f} seconds')

# Measure prediction time on the raw (non-discretized) test features
start_time = time.time()
y_pred = predict(X_test, prior, likelihood)
prediction_time = time.time() - start_time

print(f'Prediction time: {prediction_time:.4f} seconds')

# Measure accuracy: the comparison is element-wise, so the sum counts correct predictions
accuracy = sum(y_pred == y_test) / len(y_test)
print(f'Accuracy: {accuracy:.4f}')

# Quick look at how many predictions were made and what the first few are
print("Predictions shape:", len(y_pred))
print("First 10 predictions:", y_pred[:10])

#### Vectorized Version *--Zoom Zoom*

In [None]:
def calculate_prior(y):
    """Return class prior probabilities as an array of relative frequencies.

    Args:
        y: Array-like of class labels.

    Returns:
        np.ndarray of ``count / len(y)``, ordered by sorted unique label.
    """
    counts = np.unique(y, return_counts=True)[1]
    return counts / len(y)

def calculate_likelihood(X, y):
    """Compute per-class feature means and variances with a single group-by.

    Args:
        X: pandas DataFrame of features.
        y: Grouping key (class labels) accepted by ``DataFrame.groupby``.

    Returns:
        Tuple ``(means, variances)`` of arrays with one row per group and one
        column per feature, taken from ``groupby(...).mean()`` / ``.var()``.
    """
    by_class = X.groupby(y)
    return by_class.mean().values, by_class.var().values

def calculate_posterior(X, prior, means, variances):
    """Compute Gaussian log-posteriors for all samples, one column per class.

    Args:
        X: 2-D feature table (DataFrame or array), one row per sample.
        prior: Sequence of class prior probabilities.
        means: Per-class feature means, iterated in step with ``prior``.
        variances: Per-class feature variances, iterated in step with ``prior``.

    Returns:
        np.ndarray of shape ``(n_samples, len(prior))`` where column ``k`` is
        the log-posterior for the k-th entry of ``prior`` / ``means`` /
        ``variances``.
    """
    n_samples, _ = X.shape
    posteriors = np.zeros((n_samples, len(prior)))

    for col, (mu, sigma2, class_prior) in enumerate(zip(means, variances, prior)):
        # Normalisation term of the Gaussian: identical for every sample
        normaliser = -0.5 * np.sum(np.log(2. * np.pi * sigma2))
        # Variance-scaled squared distance of each sample from the class mean
        scaled_distance = 0.5 * np.sum(((X - mu) ** 2) / sigma2, axis=1)
        posteriors[:, col] = np.log(class_prior) + (normaliser - scaled_distance)

    return posteriors

def predict(X, prior, means, variances):
    """Return, for every row of ``X``, the column index of the largest log-posterior.

    Args:
        X: 2-D feature table passed to ``calculate_posterior``.
        prior: Sequence of class prior probabilities.
        means: Per-class feature means.
        variances: Per-class feature variances.

    Returns:
        np.ndarray of integer indices into the class axis.
        NOTE(review): an index equals the class label only when labels are 0..k-1.
    """
    return np.argmax(calculate_posterior(X, prior, means, variances), axis=1)

## Measuring Speed of Implementations

In [None]:
# Measure training time
# (assuming cells run in order, these names now refer to the vectorized definitions)
start_time = time.time()
prior = calculate_prior(y_train)
means, variances = calculate_likelihood(X_train, y_train)
training_time = time.time() - start_time
print(f'Training time: {training_time:.4f} seconds')

# Measure prediction time
start_time = time.time()
y_pred = predict(X_test, prior, means, variances)
prediction_time = time.time() - start_time
print(f'Prediction time: {prediction_time:.4f} seconds')


# Accuracy: fraction of test samples whose predicted label matches the true label
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy:.4f}')

# Quick look at the prediction array
print("Predictions shape:", y_pred.shape)
print("First 10 predictions:", y_pred[:10])

## Understanding our Predictions in Context.
- True Positives (TP): Correctly predicted positive cases.
- False Positives (FP): Incorrectly predicted positive cases (Type I error).
- False Negatives (FN): Incorrectly predicted negative cases (Type II error).
- True Negatives (TN): Correctly predicted negative cases.

False Positives (FP):

- These occur when the model predicts a sample as positive (malignant) but it is actually negative (benign).
- In medical diagnosis, a false positive means a patient is incorrectly diagnosed with cancer, leading to unnecessary stress and potentially invasive follow-up tests.

False Negatives (FN):

- These occur when the model predicts a sample as negative (benign) but it is actually positive (malignant).
- In medical diagnosis, a false negative means a patient with cancer is not diagnosed, leading to a lack of necessary treatment and potentially worsening health outcomes.

Optimizing for False Positives vs. False Negatives:

- Minimizing False Positives: This is crucial when the cost of unnecessary follow-up actions (e.g., additional tests, treatments) is high. It ensures that fewer healthy patients undergo unnecessary stress and procedures.
- Minimizing False Negatives: This is critical when missing a diagnosis has severe consequences. In cancer detection, missing a malignant case can lead to delayed treatment and worse prognosis.

Optimizing the Classifier:

- Threshold Adjustment: Changing the decision threshold can help balance between false positives and false negatives. Lowering the threshold might reduce false negatives but increase false positives, and vice versa.
- Cost-Sensitive Learning: Incorporating the costs of false positives and false negatives into the learning process to train a model that minimizes the overall cost.

## Confusion Matrix and Interpreting Our Results

In [None]:
# Confusion matrix of true vs. predicted test labels.
# y_pred is whatever the most recently executed prediction cell produced.
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nConfusion Matrix:")
print(conf_matrix)

# Annotated heatmap of the same matrix, with the dataset's class names on both axes
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Text report of per-class classification metrics, labelled with the class names
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))