# Create the dataset

In [None]:
import pandas as pd

# Build the 14-row PlayTennis dataset. Each row is
# (Outlook, Temperature, Humidity, Wind, PlayTennis).
data = [
    ["Sunny",    "Hot",  "High",   "Weak",   "No"],
    ["Sunny",    "Hot",  "High",   "Strong", "No"],
    ["Overcast", "Hot",  "High",   "Weak",   "Yes"],
    ["Rainy",    "Mild", "High",   "Weak",   "Yes"],
    ["Rainy",    "Cool", "Normal", "Weak",   "Yes"],
    ["Rainy",    "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny",    "Mild", "High",   "Weak",   "No"],
    ["Sunny",    "Cool", "Normal", "Weak",   "Yes"],
    ["Rainy",    "Mild", "Normal", "Weak",   "Yes"],
    ["Sunny",    "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High",   "Strong", "Yes"],
    ["Overcast", "Hot",  "Normal", "Weak",   "Yes"],
    ["Rainy",    "Mild", "High",   "Strong", "No"],
]

# The last column is the class label; the first four are categorical features.
columns = ["Outlook", "Temperature", "Humidity", "Wind", "PlayTennis"]
df = pd.DataFrame(data, columns=columns)

print(df)

# Save as CSV (no index column). The write must really happen: the
# confirmation below is only truthful if the file exists afterwards.
csv_path = "play_tennis.csv"
df.to_csv(csv_path, index=False)
print(f"Saved to {csv_path}")

     Outlook Temperature Humidity    Wind PlayTennis
0      Sunny         Hot     High    Weak         No
1      Sunny         Hot     High  Strong         No
2   Overcast         Hot     High    Weak        Yes
3      Rainy        Mild     High    Weak        Yes
4      Rainy        Cool   Normal    Weak        Yes
5      Rainy        Cool   Normal  Strong         No
6   Overcast        Cool   Normal  Strong        Yes
7      Sunny        Mild     High    Weak         No
8      Sunny        Cool   Normal    Weak        Yes
9      Rainy        Mild   Normal    Weak        Yes
10     Sunny        Mild   Normal  Strong        Yes
11  Overcast        Mild     High  Strong        Yes
12  Overcast         Hot   Normal    Weak        Yes
13     Rainy        Mild     High  Strong         No
Saved to play_tennis.csv


In [2]:
# naive_bayes_solution.py
# NumPy-only Naive Bayes (categorical) solution for the PlayTennis dataset.

import numpy as np


def load_play_tennis_csv(path: str):
    """
    Load a PlayTennis-style CSV file into NumPy string arrays.

    The first line is treated as the header. The last column is the label
    and every other column is a categorical feature. Blank lines are skipped.

    Args:
        path: Path to a UTF-8, comma-separated file with a header row.

    Returns:
        A 4-tuple ``(X, y, feature_names, label_name)``:
        ``X`` is an ``(N, D)`` string array of feature values, ``y`` is an
        ``(N,)`` string array of labels, ``feature_names`` is the list of
        the first ``D`` header fields, and ``label_name`` is the last header
        field. A file containing only the header yields ``N == 0``.

    Raises:
        ValueError: If the header row is missing, or a data row does not
            have the same number of fields as the header.
    """
    import csv  # local import: only this loader needs it

    # Read header and rows in a single pass instead of opening the file twice.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header:
            raise ValueError(f"{path}: missing header row")

        rows = []
        for row in reader:
            if not row:  # skip blank lines
                continue
            if len(row) != len(header):
                raise ValueError(
                    f"{path}: line {reader.line_num} has {len(row)} fields, "
                    f"expected {len(header)}"
                )
            rows.append(row)

    # The explicit reshape keeps the array 2-D even with zero or one data row.
    data = np.array(rows, dtype=str).reshape(len(rows), len(header))

    X = data[:, :-1]
    y = data[:, -1]
    feature_names = header[:-1]
    label_name = header[-1]
    return X, y, feature_names, label_name


def fit_naive_bayes_categorical(X: np.ndarray, y: np.ndarray):
    """
    Estimate a categorical Naive Bayes model from raw frequency counts.

    Args:
        X: 2-D array of categorical feature values, one row per sample.
        y: 1-D array of class labels aligned with the rows of ``X``.

    Returns:
        A dict with the keys ``n_samples``, ``n_features``, ``classes``
        (sorted unique labels), ``class_counts`` (label -> count),
        ``priors`` (label -> count / n_samples), ``feature_values`` (one
        sorted array of unique values per feature) and ``cond_counts``,
        where ``cond_counts[c][j][v]`` is the number of samples with
        ``X[:, j] == v`` and ``y == c``.
    """
    n_samples, n_features = X.shape
    classes, class_counts_arr = np.unique(y, return_counts=True)

    # Per-class totals and the matching prior probabilities.
    class_counts = {}
    for label, total in zip(classes, class_counts_arr):
        class_counts[label] = int(total)
    priors = {}
    for label in classes:
        priors[label] = class_counts[label] / n_samples

    # Distinct values observed in each feature column.
    feature_values = []
    for j in range(n_features):
        feature_values.append(np.unique(X[:, j]))

    # For each class, count every known value of every feature among the
    # rows belonging to that class (zero counts are stored explicitly).
    cond_counts = {}
    for label in classes:
        rows_of_class = X[y == label]
        cond_counts[label] = [
            {
                value: int(np.count_nonzero(rows_of_class[:, j] == value))
                for value in feature_values[j]
            }
            for j in range(n_features)
        ]

    return {
        "n_samples": n_samples,
        "n_features": n_features,
        "classes": classes,
        "class_counts": class_counts,
        "priors": priors,
        "feature_values": feature_values,
        "cond_counts": cond_counts,
    }


def conditional_prob(model, class_label: str, feature_index: int, feature_value: str, smoothing: bool = False):
    """
    Estimate P(X_j = feature_value | y = class_label) from the fitted counts.

    Without smoothing this is the plain relative frequency
    ``count / n_c`` (0.0 when the class has no samples). With smoothing it
    is the add-one estimate ``(count + 1) / (n_c + K_j)``, where ``K_j`` is
    the number of distinct values recorded for feature ``j``. A value never
    seen during fitting is treated as having a count of zero.
    """
    class_total = model["class_counts"][class_label]
    n_distinct = len(model["feature_values"][feature_index])
    value_counts = model["cond_counts"][class_label][feature_index]
    hits = value_counts.get(feature_value, 0)

    if smoothing:
        # Laplace (add-one) estimate.
        return (hits + 1) / (class_total + n_distinct)

    if class_total > 0:
        return hits / class_total
    return 0.0


def conditional_fraction(model, class_label: str, feature_index: int, feature_value: str, smoothing: bool = False):
    """
    Give P(X_j = feature_value | y = class_label) as a
    ``(numerator, denominator)`` pair, for slide-style fraction display.

    Without smoothing the pair is ``(count, n_c)``; with add-one smoothing
    it is ``(count + 1, n_c + K_j)``, where ``K_j`` is the number of
    distinct values recorded for feature ``j``. Values never seen during
    fitting count as zero.
    """
    class_total = model["class_counts"][class_label]
    n_distinct = len(model["feature_values"][feature_index])
    value_counts = model["cond_counts"][class_label][feature_index]
    hits = value_counts.get(feature_value, 0)

    if smoothing:
        return hits + 1, class_total + n_distinct
    return hits, class_total


def prior_fraction(model, class_label: str):
    """Give P(y=class_label) as a (class count, total sample count) pair."""
    numerator = model["class_counts"][class_label]
    denominator = model["n_samples"]
    return numerator, denominator


def score_naive_bayes(model, x_star: np.ndarray, class_label: str, smoothing: bool = False, use_log: bool = False):
    """
    Compute the unnormalized Naive Bayes score of ``class_label`` for one
    sample: P(c) * Π_j P(x_j | c).

    With ``use_log=True`` the sum of logarithms is returned instead, which
    avoids floating-point underflow; a zero conditional probability then
    gives ``-inf`` immediately.
    """
    prior = model["priors"][class_label]

    # Lazy stream of per-feature likelihoods, evaluated in feature order.
    likelihoods = (
        conditional_prob(model, class_label, j, v, smoothing=smoothing)
        for j, v in enumerate(x_star)
    )

    if not use_log:
        product = prior
        for p in likelihoods:
            product *= p
        return product

    log_total = np.log(prior)
    for p in likelihoods:
        if p <= 0.0:
            # log(0) is undefined; the class is impossible for this sample.
            return -np.inf
        log_total += np.log(p)
    return log_total


def predict_one(model, x_star: np.ndarray, smoothing: bool = False, use_log: bool = False):
    """
    Classify a single sample.

    Returns ``(pred, scores)``: the label with the highest score and a dict
    mapping every class to its score. On ties, the class that comes first
    in ``model["classes"]`` wins.
    """
    scores = {}
    for label in model["classes"]:
        scores[label] = score_naive_bayes(
            model, x_star, label, smoothing=smoothing, use_log=use_log
        )
    best_label = max(scores, key=scores.get)
    return best_label, scores


def predict(model, X: np.ndarray, smoothing: bool = False, use_log: bool = False):
    """Classify every row of X and return the predicted labels as a string array."""
    labels = [
        predict_one(model, row, smoothing=smoothing, use_log=use_log)[0]
        for row in X
    ]
    return np.array(labels, dtype=str)


def fmt_frac(num: int, den: int):
    """Render a numerator/denominator pair as the text "num/den"."""
    return "{}/{}".format(num, den)


def print_slide_example(model, feature_names):
    """
    Print the worked lecture-slide example for the query
    x* = (Sunny, Cool, High, Strong).

    Shows, without smoothing: the class priors, the per-feature
    conditionals for each class, the unnormalized scores with the resulting
    prediction, and the same comparison using log-scores. The class labels
    "Yes" and "No" are hard-coded.
    """
    labels = ("Yes", "No")
    x_star = np.array(["Sunny", "Cool", "High", "Strong"], dtype=str)

    print("=== Slide-style intermediate probabilities (no smoothing) ===")
    print("x* =", x_star.tolist())
    print()

    # Class priors as count / total.
    for c in labels:
        num, den = prior_fraction(model, c)
        print(f"P({c}) = {fmt_frac(num, den)} = {num/den:.6f}")
    print()

    # One section of per-feature conditionals for each class.
    for c in labels:
        print(f"--- Conditionals given {c} ---")
        for j, v in enumerate(x_star):
            num, den = conditional_fraction(model, c, j, v, smoothing=False)
            if den > 0:
                p = num / den
            else:
                p = 0.0
            print(f"P({v} | {c})  [feature={feature_names[j]}] = {fmt_frac(num, den)} = {p:.6f}")
        print()

    # Plain product scores, as shown on the slide.
    score_yes, score_no = (
        score_naive_bayes(model, x_star, c, smoothing=False, use_log=False)
        for c in labels
    )
    winner = "Yes" if score_yes > score_no else "No"

    print("=== Naive Bayes unnormalized scores (no smoothing) ===")
    print(f"score(Yes) = {score_yes:.10f}  (≈ {score_yes:.4f})")
    print(f"score(No)  = {score_no:.10f}  (≈ {score_no:.4f})")
    print("Prediction:", winner)
    print()

    # The same comparison carried out in log space.
    log_yes, log_no = (
        score_naive_bayes(model, x_star, c, smoothing=False, use_log=True)
        for c in labels
    )
    log_winner = "Yes" if log_yes > log_no else "No"

    print("=== Log-scores (no smoothing) ===")
    print(f"logscore(Yes) = {log_yes:.10f}")
    print(f"logscore(No)  = {log_no:.10f}")
    print("Prediction (log):", log_winner)
    print()


def print_zero_count_example(model):
    """
    Print P(Overcast | No) with and without add-one smoothing, to show the
    zero-count problem and how Laplace smoothing handles it.

    Outlook is taken to be feature index 0.
    """
    j_outlook, v, c = 0, "Overcast", "No"

    # Gather the fraction and the probability for both smoothing settings.
    estimates = {}
    for use_smoothing in (False, True):
        fraction = conditional_fraction(model, c, j_outlook, v, smoothing=use_smoothing)
        probability = conditional_prob(model, c, j_outlook, v, smoothing=use_smoothing)
        estimates[use_smoothing] = (fraction, probability)

    (num0, den0), p0 = estimates[False]
    (num1, den1), p1 = estimates[True]

    print("=== Zero-count and Laplace smoothing example ===")
    print(f"P({v} | {c}) without smoothing = {fmt_frac(num0, den0)} = {p0:.6f}")
    print(f"P({v} | {c}) with add-one smoothing = {fmt_frac(num1, den1)} = {p1:.6f}")
    print()


def main(path: str = "play_tennis.csv"):
    """
    Run the full PlayTennis Naive Bayes demonstration.

    Loads the dataset, fits the categorical model, prints the lecture-slide
    example and the zero-count/smoothing example, then reports accuracy on
    the training data as a sanity check.

    Args:
        path: Location of the dataset CSV. Defaults to "play_tennis.csv" in
            the current working directory, so existing ``main()`` calls
            behave as before.
    """
    X, y, feature_names, label_name = load_play_tennis_csv(path)

    print(f"Loaded dataset with N={X.shape[0]} rows, {X.shape[1]} features. Label column: {label_name}")
    print("Feature names:", feature_names)
    print("Class labels:", np.unique(y).tolist())
    print()

    model = fit_naive_bayes_categorical(X, y)

    # Reproduce the lecture-slide example
    print_slide_example(model, feature_names)

    # Show zero-count and smoothing example
    print_zero_count_example(model)

    # (Optional) predict all training samples (not required, but useful sanity check)
    preds = predict(model, X, smoothing=False, use_log=True)
    acc = np.mean(preds == y)
    print("Training accuracy (no smoothing, log):", f"{acc:.3f}")


if __name__ == "__main__":
    main()


Loaded dataset with N=14 rows, 4 features. Label column: PlayTennis
Feature names: ['Outlook', 'Temperature', 'Humidity', 'Wind']
Class labels: ['No', 'Yes']

=== Slide-style intermediate probabilities (no smoothing) ===
x* = ['Sunny', 'Cool', 'High', 'Strong']

P(Yes) = 9/14 = 0.642857
P(No) = 5/14 = 0.357143

--- Conditionals given Yes ---
P(Sunny | Yes)  [feature=Outlook] = 2/9 = 0.222222
P(Cool | Yes)  [feature=Temperature] = 3/9 = 0.333333
P(High | Yes)  [feature=Humidity] = 3/9 = 0.333333
P(Strong | Yes)  [feature=Wind] = 3/9 = 0.333333

--- Conditionals given No ---
P(Sunny | No)  [feature=Outlook] = 3/5 = 0.600000
P(Cool | No)  [feature=Temperature] = 1/5 = 0.200000
P(High | No)  [feature=Humidity] = 4/5 = 0.800000
P(Strong | No)  [feature=Wind] = 3/5 = 0.600000

=== Naive Bayes unnormalized scores (no smoothing) ===
score(Yes) = 0.0052910053  (≈ 0.0053)
score(No)  = 0.0205714286  (≈ 0.0206)
Prediction: No

=== Log-scores (no smoothing) ===
logscore(Yes) = -5.2417470151
logscor