# Understanding Bayes Nets

We have:

- Nodes: Variables (features like Age, Stress_Level and Mental_Health_Condition)
- Edges: Dependencies between variables (Stress_Level -> Mental_Health_Condition)

Bayes Nets encode the joint probability distribution of variables, allowing inference given evidence.

# Setup

In [4]:
from inspect import getsource
from IPython.display import display
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import itertools
import math
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
import time
import json
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
random.seed(42)

In [3]:
mental_data = pd.read_csv("../data/final_mental_data.csv")

In [4]:
mental_data.head()

Unnamed: 0,Age,Gender,Occupation,Country,Mental_Health_Condition,Severity,Consultation_History,Stress_Level,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Work_Stress_Index,Physical_Activity_Stress_Index
0,0,2,6,1,0,2,1,2,2,1,2,1,2
1,0,0,0,5,1,1,0,1,2,2,0,1,2
2,1,2,6,6,1,0,0,1,0,2,0,1,2
3,0,1,5,0,0,1,0,2,0,1,1,1,2
4,2,0,3,1,1,1,0,2,1,2,2,2,2


In [5]:
mental_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   Age                             1500 non-null   int64
 1   Gender                          1500 non-null   int64
 2   Occupation                      1500 non-null   int64
 3   Country                         1500 non-null   int64
 4   Mental_Health_Condition         1500 non-null   int64
 5   Severity                        1500 non-null   int64
 6   Consultation_History            1500 non-null   int64
 7   Stress_Level                    1500 non-null   int64
 8   Sleep_Hours                     1500 non-null   int64
 9   Work_Hours                      1500 non-null   int64
 10  Physical_Activity_Hours         1500 non-null   int64
 11  Work_Stress_Index               1500 non-null   int64
 12  Physical_Activity_Stress_Index  1500 non-null   int64
dtypes: 

In [6]:
train_data, test_data = train_test_split(
    mental_data,
    test_size=0.4,
    random_state=42,
    stratify=mental_data["Mental_Health_Condition"]
)

In [7]:
categorical_columns = mental_data.columns
for col in categorical_columns:
    train_data[col] = train_data[col].astype("category")
    test_data[col] = test_data[col].astype("category")

# Helper Code

In [8]:
def extend(s, var, val):
    """Create a copy of dictionary `s` and add a new key-value pair where `var` is set to `val`. Return the updated copy."""
    return {**s, var: val}

In [11]:
class ProbDist:
    """
    Represents a discrete probability distribution for a single random variable. 
    You can initialize it with a variable name and an optional frequency dictionary.
    Probabilities are normalized automatically if frequencies are provided.

    Example:
    >>> P = ProbDist('Flip'); P['H'], P['T'] = 0.25, 0.75; P['H']
    0.25
    >>> P = ProbDist('X', {'lo': 125, 'med': 375, 'hi': 500})
    >>> P['lo'], P['med'], P['hi']
    (0.125, 0.375, 0.5)
    """
    def __init__(self, varname='?', freqs=None):
        """
        Initialize the distribution. If `freqs` is given, it must be a dictionary 
        with values as keys and their frequencies as values. The distribution is normalized.
        """
        self.prob = {}
        self.varname = varname
        self.values = []
        if freqs:
            for (v, p) in freqs.items():
                self[v] = p
            self.normalize()

    def __getitem__(self, val):
        """Retrieve the probability of `val` if it exists, otherwise return 0."""
        try:
            return self.prob[val]
        except KeyError:
            return 0

    def __setitem__(self, val, p):
        """Assign probability `p` to the value `val`."""
        if val not in self.values:
            self.values.append(val)
        self.prob[val] = p

    def normalize(self):
        """
        Ensure that the probabilities of all values sum up to 1. 
        If the sum of values is 0, a ZeroDivisionError is raised.
        """
        total = sum(self.prob.values())
        if not np.isclose(total, 1.0):
            for val in self.prob:
                self.prob[val] /= total
        return self

    def show_approx(self, numfmt='{:.3g}'):
        """
        Display the probabilities rounded to a specified format, sorted by their keys. 
        Useful for readability in doctests.
        """
        return ', '.join([('{}: ' + numfmt).format(v, p)
                          for (v, p) in sorted(self.prob.items())])

    def __repr__(self):
        """Return a string representation of the distribution."""
        return "P({})".format(self.varname)

In [14]:
def probability_sampling(probabilities):
    """
    Perform random sampling based on the given probability distribution. 
    Returns an outcome based on the probabilities.
    """
    total = sum(probabilities.values())
    r = random.uniform(0, total)
    cumulative = 0
    for outcome, prob in probabilities.items():
        cumulative += prob
        if r <= cumulative:
            return outcome
    return None  # This should not occur if probabilities are normalized.

In [15]:
class MultiClassBayesNode:
    """
    Represents a node in a Bayesian network for multi-class variables. 
    It contains the variable, its parents, and the conditional probability table (CPT).
    """
    def __init__(self, X, parents, cpt):
        """
        Initialize the node with:
        - `X`: Variable name.
        - `parents`: List of parent variable names.
        - `cpt`: A dictionary representing the conditional probability table.
        """
        if isinstance(parents, str):
            parents = parents.split()
        self.variable = X
        self.parents = parents
        self.cpt = cpt
        self.children = []

    def p(self, value, event):
        """
        Compute the conditional probability of `X=value` given the parent values in `event`.
        """
        parent_values = tuple(event.get(p, None) for p in self.parents)
        probabilities = self.cpt.get(parent_values, {})
        return probabilities.get(value, 0)  # Defaults to 0 if `value` is not found.

    def sample(self, event):
        """
        Sample a value for the variable given parent values in `event`. 
        Sampling is based on the conditional probability distribution.
        """
        parent_values = tuple(event.get(p, None) for p in self.parents)
        probabilities = self.cpt.get(parent_values, {})
        return probability_sampling(probabilities)

    def __repr__(self):
        """Return a string representation of the node."""
        return repr((self.variable, ' '.join(self.parents)))

In [16]:
class BayesNet:
    """
    Represents a Bayesian network consisting of nodes (variables) and their dependencies.
    Supports multi-class nodes.
    """
    def __init__(self, node_specs=None):
        """
        Initialize the network. Nodes must be added in topological order 
        (parents must be added before their children).
        """
        self.nodes = []
        self.variables = []
        node_specs = node_specs or []
        for node_spec in node_specs:
            self.add(node_spec)

    def add(self, node_spec):
        """
        Add a node to the network. Accepts either a pre-constructed node 
        or the specifications for a new node.
        """
        if isinstance(node_spec, MultiClassBayesNode):
            node = node_spec
        else:
            node = MultiClassBayesNode(*node_spec)

        assert node.variable not in self.variables
        assert all((parent in self.variables) for parent in node.parents)

        self.nodes.append(node)
        self.variables.append(node.variable)

        # Register this node as a child for its parent nodes
        for parent in node.parents:
            self.variable_node(parent).children.append(node)

    def variable_node(self, var):
        """Retrieve the node corresponding to the variable `var`."""
        for n in self.nodes:
            if n.variable == var:
                return n
        raise Exception(f"No such variable: {var}")

    def variable_values(self, var):
        """Retrieve the domain of `var` (default is `[True, False]`)."""
        return [True, False]

    def __repr__(self):
        """Return a string representation of the network."""
        return f"BayesNet({self.nodes!r})"

In [17]:
def make_factor(var, e, bn):
    """
    Generate a factor for the variable `var` based on its joint distribution 
    in the network `bn` under evidence `e`.
    """
    node = bn.variable_node(var)
    variables = [X for X in [var] + node.parents if X not in e]
    cpt = {event_values(e1, variables): node.p(e1[var], e1)
           for e1 in all_events(variables, bn, e)}
    return Factor(variables, cpt)

In [18]:
class Factor:
    """Represents a factor in a joint distribution."""
    def __init__(self, variables, cpt):
        """
        Initialize the factor with:
        - `variables`: List of variables involved in the factor.
        - `cpt`: Conditional probability table.
        """
        self.variables = variables
        self.cpt = cpt

    def normalize(self):
        """
        Normalize the factor and return a `ProbDist` for the remaining variable.
        This is only valid if the factor has one variable left.
        """
        assert len(self.variables) == 1
        return ProbDist(self.variables[0], {k: v for ((k,), v) in self.cpt.items()})

    def p(self, e):
        """Retrieve the probability for the event `e` from the factor's CPT."""
        return self.cpt[event_values(e, self.variables)]

In [19]:
def enumerate_all(variables, e, bn):
    """
    Calculate the sum of all entries in the joint probability distribution 
    for `variables` consistent with the evidence `e` in network `bn`.
    """
    if not variables:
        return 1.0
    Y, rest = variables[0], variables[1:]
    Ynode = bn.variable_node(Y)
    if Y in e:
        return Ynode.p(e[Y], e) * enumerate_all(rest, e, bn)
    else:
        return sum(Ynode.p(y, e) * enumerate_all(rest, extend(e, Y, y), bn)
                   for y in bn.variable_values(Y))

In [20]:
def enumeration_ask(X, e, bn):
    """
    Compute the conditional probability distribution for the query variable `X` 
    given evidence `e` in the Bayesian network `bn`.
    """
    assert X not in e, "Query variable must not overlap with the evidence."
    Q = ProbDist(X)
    for xi in bn.variable_values(X):
        Q[xi] = enumerate_all(bn.variables, extend(e, X, xi), bn)
    return Q.normalize()

In [28]:
def event_values(event, variables):
    """
    Generate a tuple containing the values of the specified variables from the event.
    
    Examples:
    >>> event_values({'A': 10, 'B': 9, 'C': 8}, ['C', 'A'])
    (8, 10)
    >>> event_values((1, 2), ['C', 'A'])
    (1, 2)
    """
    if isinstance(event, tuple) and len(event) == len(variables):
        return event
    else:
        return tuple(event[var] for var in variables)

# Design the Network Structure

Find dependencies.

Mental_Health_Condition is the target variable.

It depends on:
- Gender, Physical Activity Hours.

Other variables are conditionally dependent based on logical relationships:
- Physical Activity Hours depend on:
    - Country
- Gender depends on:
    - Occupation
- Country depends on:
    - Stress Level
- Stress Level depends on:
    - Age

# Estimate Conditional Probabilities

If a node is a root node, then estimate probability directly from the data. Estimate conditional probabilities based on parent for non-root nodes.

In [70]:
def compute_cpt(data, target, parents, alpha=1):
    """
    Compute CPT with Laplace smoothing.
    
    Args:
        
        data: pandas DataFrame (training data)
        target: str, target variable
        parents: list of parent variable names
        alpha: smoothing parameter (default=1)
    
    Returns:
        cpt: dict { parent_values_tuple: { target_value: probability } }
    """
    target_values = data[target].cat.categories

    if not parents:
        # Marginal distribution of target
        counts = defaultdict(lambda: alpha)
        for val in data[target]:
            counts[val] += 1
        total = sum(counts.values())
        cpt = {(): {tv: counts[tv]/total for tv in counts}}
        return cpt

    # Determine possible parent combinations
    from itertools import product
    parent_values_list = [data[p].cat.categories for p in parents]
    parent_combinations = list(product(*parent_values_list)) if parents else [()]

    # Initialize counts with alpha
    counts = {pc: defaultdict(lambda: alpha) for pc in parent_combinations}

    # Count occurrences
    for _, row in data.iterrows():
        pv = tuple(row[p] for p in parents) if parents else ()
        tv = row[target]
        counts[pv][tv] += 1

    # Compute probabilities
    cpt = {}
    for pc in parent_combinations:
        total = sum(counts[pc].values())
        cpt[pc] = {tv: (counts[pc][tv] / total) for tv in counts[pc]}
        
    return cpt

In [86]:
cpt_mental_health = compute_cpt(train_data, "Mental_Health_Condition", ["Physical_Activity_Hours", "Gender"])

In [71]:
# Record Training Time
def compute_all_cpts(train_data):
    """
    Compute all CPTs and record the time taken.
    """
    start_time = time.time()  # Start timing

    cpt_mental_health = compute_cpt(train_data, "Mental_Health_Condition", ["Physical_Activity_Hours", "Gender"])
    cpt_activity = compute_cpt(train_data, "Physical_Activity_Hours", ["Country"])
    cpt_gender = compute_cpt(train_data, "Gender", ["Occupation"])
    cpt_country = compute_cpt(train_data, "Country", ["Age"])
    cpt_occupation = compute_cpt(train_data, "Occupation", [])
    cpt_age = compute_cpt(train_data, "Age", [])

    end_time = time.time()  # End timing
    training_time = end_time - start_time
    print(f"Training Time (CPT Computation): {training_time:.4f} seconds")

    return {
        "cpt_mental_health": cpt_mental_health,
        "cpt_activity": cpt_activity,
        "cpt_gender": cpt_gender,
        "cpt_country": cpt_country,
        "cpt_occupation": cpt_occupation,
        "cpt_age": cpt_age
    }, training_time

In [78]:
cpts, training_time = compute_all_cpts(train_data)

Training Time (CPT Computation): 0.1261 seconds


# Implement Inference

Use the chain rule of probability to predict target given evidence.

In [72]:
mental_health_node = MultiClassBayesNode("Mental_Health_Condition", ["Physical_Activity_Hours", "Gender"], cpts["cpt_mental_health"])
activity_node = MultiClassBayesNode("Physical_Activity_Hours", ["Country"], cpts["cpt_activity"])
gender_node = MultiClassBayesNode("Gender", ["Occupation"], cpts["cpt_gender"])
country_node = MultiClassBayesNode("Country", ["Age"], cpts["cpt_country"])
occupation_node = MultiClassBayesNode("Occupation", [], cpts["cpt_occupation"])
age_node = MultiClassBayesNode("Age", [], cpts["cpt_age"])

In [73]:
mental_health_bn = BayesNet([
    age_node,
    occupation_node,
    gender_node,
    country_node,
    activity_node,
    mental_health_node
])

In [74]:
print(mental_health_bn)

BayesNet([('Age', ''), ('Occupation', ''), ('Gender', 'Occupation'), ('Country', 'Age'), ('Physical_Activity_Hours', 'Country'), ('Mental_Health_Condition', 'Physical_Activity_Hours Gender')])


# Evaluate the Model

In [75]:
def predict_bayes_net(bn, evidence, query_var):
    """
    Predict the most likely value of a query variable given evidence using the Bayesian Network.
    
    Args:
        bn: Bayesian network.
        evidence: Dictionary of evidence variables and their values.
        query_var: Variable to predict.
    
    Returns:
        The most likely value of the query variable.
    """
    result = enumeration_ask(query_var, evidence, bn)
    return max(result.prob, key=lambda k: result.prob[k])

In [1]:
def evaluate_bayes_net_with_time(bn, test_data, query_var):
    """
    Evaluate the Bayesian Network and measure prediction time.

    Args:
        bn: Bayesian Network instance.
        test_data: Pandas DataFrame (test dataset).
        query_var: The target variable to predict.

    Returns:
        metrics: Dictionary containing accuracy and timing metrics.
    """
    y_true = []
    y_pred = []

    # Start Timing Predictions
    start_time = time.time()

    for _, row in test_data.iterrows():
        evidence = {
            "Gender": row["Gender"],
            "Physical_Activity_Hours": row["Physical_Activity_Hours"],
            "Country": row["Country"],
            "Age": row["Age"],
            "Occupation": row["Occupation"],
        }
        prediction = predict_bayes_net(bn, evidence, query_var)
        y_true.append(row[query_var])
        y_pred.append(prediction)

    end_time = time.time()
    prediction_time = end_time - start_time

    # Calculate Metrics
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, zero_division=0)

    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", report)

    metrics = {
        "accuracy": acc,
        "prediction_time": prediction_time
    }

    print(f"Prediction Time: {prediction_time:.4f} seconds")
    return metrics

In [77]:
# Evaluate the Bayesian Network and Record Prediction Time
print("\n--- Evaluating Bayesian Network ---\n")
metrics = evaluate_bayes_net_with_time(mental_health_bn, test_data, "Mental_Health_Condition")

# Final Output
print("\n--- Final Results ---")
print(f"Training Time: {training_time:.4f} seconds")
print(f"Prediction Time: {metrics['prediction_time']:.4f} seconds")
print(f"Accuracy: {metrics['accuracy']:.2%}")


--- Evaluating Bayesian Network ---

Confusion Matrix:
 [[ 11 249]
 [ 10 330]]

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.04      0.08       260
           1       0.57      0.97      0.72       340

    accuracy                           0.57       600
   macro avg       0.55      0.51      0.40       600
weighted avg       0.55      0.57      0.44       600

Prediction Time: 0.1032 seconds

--- Final Results ---
Training Time: 0.1091 seconds
Prediction Time: 0.1032 seconds
Accuracy: 56.83%


# For Later

In [80]:
def cpts_to_json(cpts):
    serializable_cpts = {}
    for var, cpt in cpts.items():
        serializable_cpts[var] = {
            str(parent_comb): {str(k): v for k, v in target_probs.items()}
            for parent_comb, target_probs in cpt.items()
        }
    return serializable_cpts

In [83]:
# Save the CPTs to a JSON file
cpt_json = cpts_to_json(cpts)
with open("mental_cpts.json", "w") as f:
    json.dump(cpt_json, f, indent=4)
print("CPTs saved successfully to mental_cpts.json!")

CPTs saved successfully to mental_cpts.json!


In [85]:
cpts

{'cpt_mental_health': {(0, 0): {1: 0.6, 0: 0.4},
  (0, 1): {0: 0.5081967213114754, 1: 0.4918032786885246},
  (0, 2): {1: 0.6271186440677966, 0: 0.3728813559322034},
  (0, 3): {1: 0.5283018867924528, 0: 0.4716981132075472},
  (1, 0): {1: 0.5490196078431373, 0: 0.45098039215686275},
  (1, 1): {1: 0.5915492957746479, 0: 0.4084507042253521},
  (1, 2): {1: 0.6363636363636364, 0: 0.36363636363636365},
  (1, 3): {1: 0.5211267605633803, 0: 0.4788732394366197},
  (2, 0): {0: 0.3939393939393939, 1: 0.6060606060606061},
  (2, 1): {0: 0.4318181818181818, 1: 0.5681818181818182},
  (2, 2): {1: 0.5408163265306123, 0: 0.45918367346938777},
  (2, 3): {0: 0.47368421052631576, 1: 0.5263157894736842}},
 'cpt_activity': {(0,): {2: 0.40559440559440557,
   0: 0.26573426573426573,
   1: 0.32867132867132864},
  (1,): {2: 0.43478260869565216,
   1: 0.34782608695652173,
   0: 0.21739130434782608},
  (2,): {2: 0.3656716417910448, 0: 0.3283582089552239, 1: 0.30597014925373134},
  (3,): {2: 0.375, 0: 0.243055555555

In [84]:
cpt_json

{'cpt_mental_health': {'(0, 0)': {'1': 0.6, '0': 0.4},
  '(0, 1)': {'0': 0.5081967213114754, '1': 0.4918032786885246},
  '(0, 2)': {'1': 0.6271186440677966, '0': 0.3728813559322034},
  '(0, 3)': {'1': 0.5283018867924528, '0': 0.4716981132075472},
  '(1, 0)': {'1': 0.5490196078431373, '0': 0.45098039215686275},
  '(1, 1)': {'1': 0.5915492957746479, '0': 0.4084507042253521},
  '(1, 2)': {'1': 0.6363636363636364, '0': 0.36363636363636365},
  '(1, 3)': {'1': 0.5211267605633803, '0': 0.4788732394366197},
  '(2, 0)': {'0': 0.3939393939393939, '1': 0.6060606060606061},
  '(2, 1)': {'0': 0.4318181818181818, '1': 0.5681818181818182},
  '(2, 2)': {'1': 0.5408163265306123, '0': 0.45918367346938777},
  '(2, 3)': {'0': 0.47368421052631576, '1': 0.5263157894736842}},
 'cpt_activity': {'(0,)': {'2': 0.40559440559440557,
   '0': 0.26573426573426573,
   '1': 0.32867132867132864},
  '(1,)': {'2': 0.43478260869565216,
   '1': 0.34782608695652173,
   '0': 0.21739130434782608},
  '(2,)': {'2': 0.3656716417