In [1]:
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from pgmpy.models import BayesianNetwork
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned diabetes dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="diabetes_data_cleaned.csv")

In [3]:
# Preview the first five rows to check the column names and value encoding.
df.head(n=5)

Unnamed: 0,Age > 45,Gender,Polyuria,sudden weight loss,visual blurring,Itching,Obesity,class
0,1,0,0,0,1,1,1,0
1,1,0,0,1,0,0,1,1
2,1,1,0,0,1,0,1,0
3,1,0,1,0,1,0,1,0
4,1,1,1,1,1,1,1,1


In [4]:
# Pairwise correlation between every pair of columns (DataFrame.corr default
# method, i.e. Pearson).
corr = df.corr()

# Colour the whole table on one fixed, symmetric scale.  Without these
# arguments the gradient is normalised per column (axis=0) between that
# column's own min and max, so the neutral midpoint of the diverging
# 'coolwarm' map does not correspond to r = 0 and the same colour means
# different values in different columns.
corr.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)

Unnamed: 0,Age > 45,Gender,Polyuria,sudden weight loss,visual blurring,Itching,Obesity,class
Age > 45,1.0,-0.268894,-0.28184,-0.208092,-0.052496,-0.005396,-0.449233,0.021715
Gender,-0.268894,1.0,0.447207,0.235095,0.088289,0.126567,0.665922,0.159551
Polyuria,-0.28184,0.447207,1.0,0.068754,-0.004516,0.169294,0.436568,0.08946
sudden weight loss,-0.208092,0.235095,0.068754,1.0,0.291191,0.109005,0.2513,0.330809
visual blurring,-0.052496,0.088289,-0.004516,0.291191,1.0,0.001894,-0.013384,0.243069
Itching,-0.005396,0.126567,0.169294,0.109005,0.001894,1.0,0.072173,0.083529
Obesity,-0.449233,0.665922,0.436568,0.2513,-0.013384,0.072173,1.0,0.112599
class,0.021715,0.159551,0.08946,0.330809,0.243069,0.083529,0.112599,1.0


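Correlations between factors (|r| > 0.25), read from the matrix above:
- Age > 45/Gender (-0.269)
- Age > 45/Polyuria (-0.282)
- Age > 45/Obesity (-0.449)
- Gender/Polyuria (0.447)
- Gender/Obesity (0.666)
- Polyuria/Obesity (0.437)
- Sudden weight loss/Visual Blurring (0.291)
- Sudden weight loss/Obesity (0.251)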

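Correlations between factor and class (|r| > 0.25), read from the matrix above:
- Sudden weight loss/Class (0.331)

All other factors fall below the threshold: Visual Blurring (0.243), Gender (0.160), Obesity (0.113), Polyuria (0.089), Itching (0.084), Age > 45 (0.022).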

In [5]:
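# NOTE: disabled, hand-specified network kept for reference only. It names the
# node 'Age', whereas the loaded data's column is 'Age > 45' -- rename the node
# before re-enabling this cell.
# model = BayesianNetwork([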
#     ('Age', 'Itching'),
#     ('Age', 'visual blurring'),
#     ('Gender', 'class'),
#     ('Gender', 'Polyuria'),
#     ('Gender', 'sudden weight loss'),
#     ('Itching', 'visual blurring'),
#     ('sudden weight loss', 'Polyuria'),
#     ('visual blurring', 'class'),
#     ('Polyuria', 'class'),
#     ('sudden weight loss', 'class'),
#     ('Obesity', 'class')
# ])

# model.fit(df, estimator=MaximumLikelihoodEstimator)

In [6]:
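# NOTE: disabled example query for the hand-specified network above. 'Age': 40
# is a raw age, but the loaded data only has the binary 'Age > 45' column --
# update the evidence before re-enabling this cell.
# infer = VariableElimination(model)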

# evidence = {
#     'Age': 40, 
#     'Gender':  1, 
#     'Polyuria': 0,
#     'sudden weight loss': 0,
#     'visual blurring': 0,
#     'Itching': 1,
#     'Obesity': 1
# }

# # Query the model for the probability of Diabetes
# result = infer.query(variables=['class'], evidence=evidence)
# print(result)

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Learn the network structure from the training split by greedy hill climbing,
# scoring candidate graphs with BIC.
hc = HillClimbSearch(train_data)
bic_score = BicScore(train_data)

# 'class' is the prediction target and must only ever be a child node.
# Forbid every edge leaving it during the search itself, so the returned graph
# is the one the score actually selected under that constraint.  Reversing
# edges after the search would produce a graph that was never scored and can
# change which conditional independencies the graph encodes.
target = 'class'
forbidden_edges = [
    (target, column) for column in train_data.columns if column != target
]

best_model_structure = hc.estimate(
    scoring_method=bic_score,
    black_list=forbidden_edges,
)

# Fail loudly if the constraint was not honoured, rather than silently
# continuing with a structure in which 'class' has children.
outgoing = [edge for edge in best_model_structure.edges() if edge[0] == target]
assert not outgoing, f"unexpected edges leaving {target!r}: {outgoing}"

print("Adjusted Model Structure:")
for edge in best_model_structure.edges():
    print(edge)

  0%|          | 10/1000000 [00:00<2:45:44, 100.55it/s]

Adjusted Model Structure:
('Age > 45', 'class')
('Gender', 'Obesity')
('Polyuria', 'Itching')
('sudden weight loss', 'class')
('visual blurring', 'sudden weight loss')
('visual blurring', 'Obesity')
('visual blurring', 'class')
('Obesity', 'Polyuria')
('Obesity', 'Age > 45')
('Obesity', 'sudden weight loss')





In [9]:
# Build the Bayesian network from the learned structure.
model = BayesianNetwork(best_model_structure.edges())

# Constructing from the edge list alone silently drops any variable the search
# left without edges.  Re-add every node so that each column still has a CPD
# and can be supplied as evidence at inference time.
model.add_nodes_from(best_model_structure.nodes())

# Estimate the conditional probability tables from the training split by
# maximum likelihood.
model.fit(train_data, estimator=MaximumLikelihoodEstimator)

In [10]:
# Exact-inference engine (variable elimination) over the fitted network.
inference = VariableElimination(model=model)

In [11]:
# Query for the probability of 'class' given some evidence
# Example query: posterior distribution of 'class' for one fully observed
# patient profile (every non-target column is given as evidence).
evidence = {
    'Age > 45': 1,
    'Gender': 1,
    'Polyuria': 0,
    'sudden weight loss': 0,
    'visual blurring': 0,
    'Itching': 1,
    'Obesity': 1,
}

prob_class = inference.query(evidence=evidence, variables=['class'])

# Heading and factor table are emitted on separate lines.
print("Probability of class given the evidence:", prob_class, sep="\n")

Probability of class given the evidence:
+----------+--------------+
| class    |   phi(class) |
| class(0) |       0.7290 |
+----------+--------------+
| class(1) |       0.2710 |
+----------+--------------+


In [12]:
# Evaluate the fitted network on the held-out rows: for each row, use every
# column except 'class' as evidence and take the most probable 'class' state
# (MAP query) as the prediction.
predictions = []
for _, row in test_data.iterrows():
    evidence = row.to_dict()
    # Remove the actual outcome from the evidence
    actual_outcome = evidence.pop('class', None)
    predicted_outcome = inference.map_query(
        variables=['class'],
        evidence=evidence,
        # One progress bar per row floods the cell output with hundreds of
        # "Finding Elimination Order" lines; silence it.
        show_progress=False,
    )['class']
    predictions.append((predicted_outcome, actual_outcome))

# Fraction of held-out rows whose predicted class equals the recorded class.
correct_predictions = sum(1 for pred, actual in predictions if pred == actual)
# An empty test split would otherwise raise ZeroDivisionError.
accuracy = correct_predictions / len(predictions) if predictions else float('nan')
print(f'Accuracy: {accuracy:.2f}')

Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it

Accuracy: 0.66



