In [18]:
# install pgmpy
!pip install pgmpy



In [4]:
import pandas as pd

# Read the CSV (path is relative to the notebook's working directory)
# into the DataFrame used by every later cell.
csv_path = '../data/covid_preprocessed.csv'
df = pd.read_csv(csv_path)

In [5]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import HillClimbSearch, BicScore
from pgmpy.estimators import ParameterEstimator

# Hand-specified DAG; every pair is a directed edge (parent, child).
# Chain: AGE_GROUP -> PNEUMONIA -> ICU -> CLASIFFICATION_FINAL, with SEX,
# OBESITY and DIABETES as additional direct parents of the target.
# 'CLASIFFICATION_FINAL' is spelled exactly as the dataset column is named;
# do not "correct" it.
edges = [
    ('AGE_GROUP', 'PNEUMONIA'),
    ('PNEUMONIA', 'ICU'),
    ('ICU', 'CLASIFFICATION_FINAL'),
    ('SEX', 'CLASIFFICATION_FINAL'),
    ('OBESITY', 'CLASIFFICATION_FINAL'),
    ('DIABETES', 'CLASIFFICATION_FINAL'),
]
model = BayesianNetwork(edges)

# Show the structure that was just defined
print("Defined Bayesian Network structure:", model.edges(), sep="\n")

Defined Bayesian Network structure:
[('AGE_GROUP', 'PNEUMONIA'), ('PNEUMONIA', 'ICU'), ('ICU', 'CLASIFFICATION_FINAL'), ('SEX', 'CLASIFFICATION_FINAL'), ('OBESITY', 'CLASIFFICATION_FINAL'), ('DIABETES', 'CLASIFFICATION_FINAL')]


In [6]:
from pgmpy.estimators import MaximumLikelihoodEstimator

# Estimate every node's conditional probability table from the full
# DataFrame by Maximum Likelihood Estimation.
model.fit(df, estimator=MaximumLikelihoodEstimator)

# Print each learned table, one after another
print("\nConditional Probability Distributions (CPDs):")
fitted_cpds = model.get_cpds()
for cpd in fitted_cpds:
    print(cpd)


Conditional Probability Distributions (CPDs):
+--------------+-----------+
| AGE_GROUP(0) | 0.0645409 |
+--------------+-----------+
| AGE_GROUP(1) | 0.416066  |
+--------------+-----------+
| AGE_GROUP(2) | 0.370065  |
+--------------+-----------+
| AGE_GROUP(3) | 0.128401  |
+--------------+-----------+
| AGE_GROUP(4) | 0.0209274 |
+--------------+-----------+
+--------------+---------------------+-----+---------------------+--------------------+
| AGE_GROUP    | AGE_GROUP(0)        | ... | AGE_GROUP(3)        | AGE_GROUP(4)       |
+--------------+---------------------+-----+---------------------+--------------------+
| PNEUMONIA(1) | 0.07636385129144749 | ... | 0.34333546249944297 | 0.4308694859642727 |
+--------------+---------------------+-----+---------------------+--------------------+
| PNEUMONIA(2) | 0.9236361487085525  | ... | 0.6566645375005571  | 0.5691305140357273 |
+--------------+---------------------+-----+---------------------+--------------------+
+-----------+-----

In [9]:
from pgmpy.inference import VariableElimination

# Build an exact-inference engine over the fitted network
inference = VariableElimination(model)

# Example query: distribution of CLASIFFICATION_FINAL given observed
# AGE_GROUP and SEX values
example_evidence = {'AGE_GROUP': 0, 'SEX': 1}
query_result = inference.query(
    variables=['CLASIFFICATION_FINAL'],
    evidence=example_evidence,
)
print("\nInference result:", query_result, sep="\n")


Inference result:
+-------------------------+-----------------------------+
| CLASIFFICATION_FINAL    |   phi(CLASIFFICATION_FINAL) |
| CLASIFFICATION_FINAL(0) |                      0.6519 |
+-------------------------+-----------------------------+
| CLASIFFICATION_FINAL(1) |                      0.3481 |
+-------------------------+-----------------------------+


In [10]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining columns
target_col = 'CLASIFFICATION_FINAL'
y = df[target_col]
X = df.drop(columns=[target_col])

# 80/20 split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the target so each split is a single frame again
# (the network is fitted on a frame that includes the target column)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [11]:
# Re-estimate the CPDs by Maximum Likelihood, this time from the training
# split only. This refits the same `model` object that was fitted on the
# full DataFrame earlier; presumably the earlier CPDs are replaced - the
# tables printed below come from this fit.
model.fit(train_data, estimator=MaximumLikelihoodEstimator)

print("\nLearned CPDs:")
learned_cpds = model.get_cpds()
for cpd in learned_cpds:
    print(cpd)




Learned CPDs:
+--------------+-----------+
| AGE_GROUP(0) | 0.0646067 |
+--------------+-----------+
| AGE_GROUP(1) | 0.416055  |
+--------------+-----------+
| AGE_GROUP(2) | 0.370171  |
+--------------+-----------+
| AGE_GROUP(3) | 0.128236  |
+--------------+-----------+
| AGE_GROUP(4) | 0.0209308 |
+--------------+-----------+
+--------------+---------------------+-----+--------------------+---------------------+
| AGE_GROUP    | AGE_GROUP(0)        | ... | AGE_GROUP(3)       | AGE_GROUP(4)        |
+--------------+---------------------+-----+--------------------+---------------------+
| PNEUMONIA(1) | 0.07585430659089232 | ... | 0.3442252630796118 | 0.43057295819569424 |
+--------------+---------------------+-----+--------------------+---------------------+
| PNEUMONIA(2) | 0.9241456934091077  | ... | 0.6557747369203882 | 0.5694270418043057  |
+--------------+---------------------+-----+--------------------+---------------------+
+-----------+---------------------+---------------

In [13]:
from pgmpy.inference import VariableElimination

# Recreate the inference engine so it is built from the model as it stands
# after the refit on the training split
inference = VariableElimination(model)

# Example evidence for a single query against the refitted model
evidence = dict(AGE_GROUP=1, SEX=1)
query_result = inference.query(
    variables=['CLASIFFICATION_FINAL'],
    evidence=evidence,
)
print("\nInference result for test evidence:", query_result, sep="\n")


Inference result for test evidence:
+-------------------------+-----------------------------+
| CLASIFFICATION_FINAL    |   phi(CLASIFFICATION_FINAL) |
| CLASIFFICATION_FINAL(0) |                      0.6523 |
+-------------------------+-----------------------------+
| CLASIFFICATION_FINAL(1) |                      0.3477 |
+-------------------------+-----------------------------+


In [14]:
# Function to predict probabilities for each test case
def _query_positive_probability(inference_engine, target, evidence):
    """Run one inference query and return the entry at index 1 of its values.

    Any exception raised by the query is reported on stdout and a neutral
    probability of 0.5 is returned instead, so one bad row does not abort
    the whole prediction run.
    """
    try:
        prob = inference_engine.query(variables=[target], evidence=evidence)
        # Index 1 is taken to be the "target == 1" state.
        # NOTE(review): assumes the target's states are ordered (0, 1), as in
        # the inference tables printed earlier in this notebook - confirm.
        return prob.values[1]
    except Exception as e:
        print(f"Error with evidence {evidence}: {e}")
        return 0.5  # Assign a neutral probability in case of error


def predict_probabilities(test_data, inference_engine, target):
    """Return one predicted probability per row of ``test_data``.

    For every row, all columns except ``target`` are passed as evidence to
    ``inference_engine.query`` and the value at index 1 of the result is
    collected. Rows that share exactly the same evidence values reuse the
    result of a single query, so the number of inference calls is bounded by
    the number of distinct evidence combinations rather than the row count.

    Parameters
    ----------
    test_data : pandas.DataFrame
        One case per row. Every column other than ``target`` is used as
        evidence, so the frame should contain only network variables.
    inference_engine : object
        Anything exposing ``query(variables=[...], evidence={...})`` whose
        result has an indexable ``values`` attribute (e.g. the
        ``VariableElimination`` instance built in this notebook).
    target : str
        Name of the queried variable; it is excluded from the evidence.

    Returns
    -------
    list
        Probabilities in row order. An entry is 0.5 when the query for that
        row's evidence raised; the error is printed once per distinct
        failing evidence combination.
    """
    # Loop-invariant: the evidence columns are the same for every row
    evidence_cols = [col for col in test_data.columns if col != target]

    cache = {}  # evidence value tuple -> probability (or 0.5 fallback)
    probabilities = []
    for _, row in test_data.iterrows():
        # Prepare evidence by removing the target variable
        evidence = {col: row[col] for col in evidence_cols}
        key = tuple(evidence.values())

        try:
            probability = cache[key]
        except KeyError:
            # First time this evidence combination is seen: query and remember
            probability = _query_positive_probability(inference_engine, target, evidence)
            cache[key] = probability
        except TypeError:
            # Unhashable evidence value: cannot memoise, query directly
            probability = _query_positive_probability(inference_engine, target, evidence)

        probabilities.append(probability)
    return probabilities

# Keep only the columns that are nodes of the Bayesian Network, so that no
# column unknown to the model is passed as evidence
network_nodes = list(model.nodes())
filtered_test_data = test_data[network_nodes]

# One predicted probability per test row
test_probabilities = predict_probabilities(
    filtered_test_data,
    inference,
    'CLASIFFICATION_FINAL',
)

# Turn probabilities into hard 0/1 labels: 1 when at or above the threshold
threshold = 0.5
test_predictions = [int(prob >= threshold) for prob in test_probabilities]

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

# Label-based metrics use the thresholded 0/1 predictions
accuracy = accuracy_score(y_test, test_predictions)
precision = precision_score(y_test, test_predictions)
recall = recall_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions)
# Log-loss is computed from the raw predicted probabilities instead
logloss = log_loss(y_test, test_probabilities)

# Assemble the report and emit it in one go, two decimals per metric
report_lines = [
    "\nEvaluation Metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1:.2f}",
    f"Log-Loss: {logloss:.2f}",
]
print("\n".join(report_lines))


Evaluation Metrics:
Accuracy: 0.63
Precision: 0.54
Recall: 0.11
F1-Score: 0.18
Log-Loss: 0.65
