In [31]:
# Step 1: Import the dependencies
import numpy as np
import pandas as pd
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.inference import VariableElimination
from sklearn.preprocessing import KBinsDiscretizer

In [32]:

# Step 2: Load the dataset into a Pandas DataFrame
hd = pd.read_csv("heart.csv")
hd = hd.replace('?', np.nan)   # handle missing values
print("Original dataset sample:")
print(hd.head())

Original dataset sample:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   67    1   1       136   145    1        0      149      1      4.2      0   
1   57    1   2       114   554    0        0      161      0      5.0      0   
2   43    1   3       155   292    0        0      166      0      4.8      1   
3   71    0   0        99   419    1        0      128      0      0.9      1   
4   36    1   0       145   252    0        1      128      0      2.6      0   

   ca  thal  target  
0   4     0       0  
1   3     0       1  
2   1     0       0  
3   3     0       0  
4   3     2       0  


In [34]:

# Step 3: Discretize continuous features into bins
continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
disc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')

# Fit and transform continuous columns
hd[continuous_cols] = disc.fit_transform(hd[continuous_cols])

# Convert discretized values to int
hd[continuous_cols] = hd[continuous_cols].astype(int)




In [35]:

# Step 4: Define Bayesian Model structure
model = DiscreteBayesianNetwork([
    ('age','target'), ('sex','target'), ('cp','target'),
    ('trestbps','target'), ('chol','target'), ('fbs','target'),
    ('restecg','target'), ('thalach','target'), ('exang','target'),
    ('oldpeak','target'), ('slope','target'), ('ca','target'),
    ('thal','target')
])

In [37]:
# Step 5: Convert all columns to string (categorical labels)
for col in hd.columns:
    hd[col] = hd[col].astype(str)

# Train the model with Maximum Likelihood Estimator
model.fit(hd, estimator=MaximumLikelihoodEstimator)

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'sex': 'C', 'cp': 'C', 'trestbps': 'C', 'chol': 'C', 'fbs': 'C', 'restecg': 'C', 'thalach': 'C', 'exang': 'C', 'oldpeak': 'C', 'slope': 'C', 'ca': 'C', 'thal': 'C', 'target': 'C'}


<pgmpy.models.DiscreteBayesianNetwork.DiscreteBayesianNetwork at 0x230088f7bb0>

In [None]:
for cpd in model.get_cpds():
    print(cpd)


+--------+----------+
| age(0) | 0.186667 |
+--------+----------+
| age(1) | 0.213333 |
+--------+----------+
| age(2) | 0.19     |
+--------+----------+
| age(3) | 0.203333 |
+--------+----------+
| age(4) | 0.206667 |
+--------+----------+


In [None]:

# Step 6: Use Variable Elimination for inference
hd_infer = VariableElimination(model)

In [30]:

# Step 7: Query the model with evidence
print("\n1. Probability of heart disease given evidence = restecg:1")
q1 = hd_infer.query(variables=['target'], evidence={'restecg': '1'})
print(q1)

print("\n2. Probability of heart disease given evidence = cp:2")
q2 = hd_infer.query(variables=['target'], evidence={'cp': '2'})
print(q2)



1. Probability of heart disease given evidence = restecg:1
+-----------+---------------+
| target    |   phi(target) |
| target(0) |        0.5000 |
+-----------+---------------+
| target(1) |        0.5000 |
+-----------+---------------+

2. Probability of heart disease given evidence = cp:2
+-----------+---------------+
| target    |   phi(target) |
| target(0) |        0.5000 |
+-----------+---------------+
| target(1) |        0.5000 |
+-----------+---------------+
