In [None]:
"""
Naive Bayes Classifier from scratch (categorical)
Dataset: Weather / Play Tennis
"""

import math
from collections import defaultdict
from typing import Any, Dict, List, Tuple

import pandas as pd


# ---------------------------------------------------------------------
# 1. Create the Weather / Play Tennis dataset
# ---------------------------------------------------------------------
def load_weather_dataset() -> pd.DataFrame:
    """
    Build the 14-row weather / play tennis table as a DataFrame.

    Every value is a plain string. The four feature columns are
    Outlook, Temp, Humidity and Windy ("t" / "f"); the label column
    is Play ("yes" / "no").
    """
    header = ["Outlook", "Temp", "Humidity", "Windy", "Play"]

    # One tuple per observation, in the same order as `header`.
    records = [
        ("Rainy", "Hot", "High", "f", "no"),
        ("Rainy", "Hot", "High", "t", "no"),
        ("Overcast", "Hot", "High", "f", "yes"),
        ("Sunny", "Mild", "High", "f", "yes"),
        ("Sunny", "Cool", "Normal", "f", "yes"),
        ("Sunny", "Cool", "Normal", "t", "no"),
        ("Overcast", "Cool", "Normal", "t", "yes"),
        ("Rainy", "Mild", "High", "f", "no"),
        ("Rainy", "Cool", "Normal", "f", "yes"),
        ("Sunny", "Mild", "Normal", "f", "yes"),
        ("Rainy", "Mild", "Normal", "t", "yes"),
        ("Overcast", "Mild", "High", "t", "yes"),
        ("Overcast", "Hot", "Normal", "f", "yes"),
        ("Sunny", "Mild", "High", "t", "no"),
    ]

    table = pd.DataFrame(records, columns=header)
    return table


# ---------------------------------------------------------------------
# 2. Training: priors and likelihoods
# ---------------------------------------------------------------------
def compute_priors(df: pd.DataFrame, target_col: str) -> Dict[Any, float]:
    """
    Estimate the class prior P(y) as a relative frequency.

    Each distinct value in `target_col` is mapped to its occurrence
    count divided by the total number of rows in `df`.
    """
    n_rows = len(df)
    return df[target_col].value_counts().div(n_rows).to_dict()


def compute_likelihoods(
    df: pd.DataFrame, feature_cols: List[str], target_col: str
) -> Dict[Any, Dict[str, Dict[Any, float]]]:
    """
    Estimate the conditional probabilities P(x_j | y) by relative frequency.

    The result is a three-level mapping:
        likelihoods[class_value][feature_name][feature_value] = probability

    A feature value that never occurs together with a class has no entry
    under that class (it is not stored as 0).
    """

    def _conditional_table(subset: pd.DataFrame) -> Dict[str, Dict[Any, float]]:
        # Per-feature value frequencies within the rows of a single class.
        n_rows = len(subset)
        return {
            name: (subset[name].value_counts() / n_rows).to_dict()
            for name in feature_cols
        }

    labels = df[target_col]
    return {
        label: _conditional_table(df[labels == label])
        for label in labels.unique()
    }


# ---------------------------------------------------------------------
# 3. Prediction (single instance + helper for whole DataFrame)
# ---------------------------------------------------------------------
def _log_space_posteriors(
    x: Dict[str, Any],
    priors: Dict[Any, float],
    likelihoods: Dict[Any, Dict[str, Dict[Any, float]]],
    feature_cols: List[str],
    smoothing: float,
) -> Dict[Any, float]:
    """
    Compute normalised posteriors by summing logs instead of multiplying.

    Used as a fallback when the plain product of probabilities is 0.0 for
    every class. A class with any non-positive factor (a true zero
    probability) gets posterior 0.0. If that holds for every class, all
    posteriors are 0.0.
    """
    neg_inf = float("-inf")
    log_scores: Dict[Any, float] = {}

    for c, prior in priors.items():
        factors = [prior]
        factors.extend(
            likelihoods[c][feat].get(x[feat], smoothing) for feat in feature_cols
        )
        if all(f > 0 for f in factors):
            log_scores[c] = sum(math.log(f) for f in factors)
        else:
            # log(0) is undefined; a zero factor means a zero score.
            log_scores[c] = neg_inf

    peak = max(log_scores.values(), default=neg_inf)
    if peak == neg_inf:
        # Every class genuinely scored zero; nothing to normalise.
        return {c: 0.0 for c in log_scores}

    # Subtract the largest log-score before exponentiating (log-sum-exp)
    # so that at least one weight equals exp(0) = 1 and nothing underflows
    # to an all-zero total.
    weights = {c: math.exp(ls - peak) for c, ls in log_scores.items()}
    norm = sum(weights.values())
    return {c: w / norm for c, w in weights.items()}


def predict_single(
    x: Dict[str, Any],
    priors: Dict[Any, float],
    likelihoods: Dict[Any, Dict[str, Dict[Any, float]]],
    feature_cols: List[str],
    smoothing: float = 1e-9,
) -> Tuple[Any, Dict[Any, float]]:
    """
    Predict class for a single instance x using Naive Bayes.

    x: dict like {"Outlook": "Rainy", "Temp": "Mild", ...}; it must
       contain every name in feature_cols (KeyError otherwise).
    priors: P(y) per class.
    likelihoods: likelihoods[class][feature][value] = P(value | class).
    feature_cols: the features to use, in order.
    smoothing: probability substituted for a feature value that has no
       entry under a class. This is a fixed fallback value, not
       count-based Laplace smoothing.

    Returns:
        predicted_class, posterior_probs_dict

    The score of each class is prior * product of likelihoods, and the
    posteriors are the scores divided by their sum. If that sum is exactly
    0.0 (for example, many features make every product underflow), the
    posteriors are recomputed in log space so that underflow alone does not
    wipe them out. If every class truly has a zero factor, all posteriors
    are 0.0 and the first class in `priors` is returned.

    Raises ValueError if `priors` is empty.
    """
    scores = {}

    for c, prior in priors.items():
        score = prior
        for feat in feature_cols:
            # P(value | c), or the smoothing fallback if the value is unseen
            score *= likelihoods[c][feat].get(x[feat], smoothing)
        scores[c] = score

    # normalise to get proper posterior probabilities
    total = sum(scores.values())
    if total != 0:
        posteriors = {c: s / total for c, s in scores.items()}
    else:
        # Either floating-point underflow or genuinely zero scores; the
        # log-space path distinguishes the two.
        posteriors = _log_space_posteriors(
            x, priors, likelihoods, feature_cols, smoothing
        )

    # Ties resolve to the class that comes first in `priors`.
    predicted_class = max(posteriors, key=posteriors.get)
    return predicted_class, posteriors


def predict_dataframe(
    df: pd.DataFrame,
    priors: Dict[Any, float],
    likelihoods: Dict[Any, Dict[str, Dict[Any, float]]],
    feature_cols: List[str],
) -> List[Any]:
    """
    Classify every row of `df` and return the labels in row order.

    Each row is reduced to a {feature: value} dict over `feature_cols`
    and passed to predict_single; only the predicted class is kept.
    """

    def _label_for(row: pd.Series) -> Any:
        instance = {name: row[name] for name in feature_cols}
        label, _posteriors = predict_single(
            instance, priors, likelihoods, feature_cols
        )
        return label

    return [_label_for(row) for _idx, row in df.iterrows()]


# ---------------------------------------------------------------------
# 4. Main: train, evaluate, and test some queries
# ---------------------------------------------------------------------
def main():
    """
    Run the end-to-end demo: load data, train, report, and answer queries.

    Prints the dataset, the class distribution, the estimated priors and
    likelihoods, the accuracy on the training rows, and the prediction
    with posteriors for three hand-written queries.
    """
    target_col = "Play"
    feature_cols = ["Outlook", "Temp", "Humidity", "Windy"]
    divider = "-" * 60

    # --- data -----------------------------------------------------------
    df = load_weather_dataset()
    print("Weather dataset:")
    print(df)
    print("\nClass distribution:")
    print(df["Play"].value_counts(), "\n")

    # --- training -------------------------------------------------------
    priors = compute_priors(df, target_col)
    likelihoods = compute_likelihoods(df, feature_cols, target_col)

    print("Priors P(Play):")
    for label, probability in priors.items():
        print(f"  P(Play={label}) = {probability:.4f}")
    print()

    print("Likelihoods P(feature | Play):")
    for label, per_feature in likelihoods.items():
        print(f"\nClass = {label}")
        for name in feature_cols:
            print(f"  {name}: {per_feature[name]}")
    print("\n" + divider)

    # --- accuracy on the training rows ----------------------------------
    expected = df[target_col].tolist()
    predicted = predict_dataframe(df, priors, likelihoods, feature_cols)

    n_rows = len(df)
    hits = sum(1 for truth, guess in zip(expected, predicted) if truth == guess)
    accuracy = hits / n_rows
    print(f"Training accuracy: {accuracy * 100:.2f}% ({hits}/{n_rows})")
    print(divider)

    # --- ad-hoc queries (like in the article) ---------------------------
    queries = [
        dict(Outlook="Rainy", Temp="Mild", Humidity="Normal", Windy="t"),
        dict(Outlook="Overcast", Temp="Cool", Humidity="Normal", Windy="t"),
        dict(Outlook="Sunny", Temp="Hot", Humidity="High", Windy="t"),
    ]

    print("Query predictions:")
    number = 0
    for query in queries:
        number += 1
        label, posteriors = predict_single(query, priors, likelihoods, feature_cols)
        print(f"Query {number}: {query} -> predicted Play = '{label}' "
              f"with posteriors {posteriors}")


# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Weather dataset:
     Outlook  Temp Humidity Windy Play
0      Rainy   Hot     High     f   no
1      Rainy   Hot     High     t   no
2   Overcast   Hot     High     f  yes
3      Sunny  Mild     High     f  yes
4      Sunny  Cool   Normal     f  yes
5      Sunny  Cool   Normal     t   no
6   Overcast  Cool   Normal     t  yes
7      Rainy  Mild     High     f   no
8      Rainy  Cool   Normal     f  yes
9      Sunny  Mild   Normal     f  yes
10     Rainy  Mild   Normal     t  yes
11  Overcast  Mild     High     t  yes
12  Overcast   Hot   Normal     f  yes
13     Sunny  Mild     High     t   no

Class distribution:
Play
yes    9
no     5
Name: count, dtype: int64 

Priors P(Play):
  P(Play=yes) = 0.6429
  P(Play=no) = 0.3571

Likelihoods P(feature | Play):

Class = no
  Outlook: {'Rainy': 0.6, 'Sunny': 0.4}
  Temp: {'Hot': 0.4, 'Mild': 0.4, 'Cool': 0.2}
  Humidity: {'High': 0.8, 'Normal': 0.2}
  Windy: {'t': 0.6, 'f': 0.4}

Class = yes
  Outlook: {'Overcast': 0.4444444444444444, 'Sunny