In [9]:
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.estimators import ExpectationMaximization

from pgmpy.estimators import ExpectationMaximization as EM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [10]:
# Read the dataset from the CSV file referenced by a relative path.
df = pd.read_csv("breast-cancer.csv")

# Preview the first rows (5 is the default for head()).
df.head(5)

Unnamed: 0,Class,Age,Menopause,Tumor_size,Inv_nodes,Node_caps,Deg_malig,Breast,Breast_quad,Irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [11]:
# Directed edges of the network as (parent, child) pairs, listed in the order
# in which they are handed to the constructor.
network_edges = [
    ('Age', 'Menopause'),
    ('Age', 'Deg_malig'),
    ('Irradiat', 'Deg_malig'),
    ('Irradiat', 'Breast'),
    ('Menopause', 'Deg_malig'),
    ('Tumor_size', 'Deg_malig'),
    ('Tumor_size', 'Class'),
    ('Inv_nodes', 'Class'),
    ('Inv_nodes', 'Node_caps'),
    ('Node_caps', 'Deg_malig'),
    ('Tumor_size', 'Inv_nodes'),
    ('Node_caps', 'Class'),
    ('Breast', 'Breast_quad'),
    ('Breast_quad', 'Node_caps'),
    ('Class', 'Deg_malig'),
]

# Build the Bayesian network structure; parameters are learned in later cells.
model = BayesianNetwork(network_edges)

In [12]:
# Expectation-Maximization estimator bound to the network structure and the data.
# NOTE(review): this receives the full `df` (the train/test split only happens in
# a later cell), and the CPDs returned below are displayed but not stored here.
estimator = EM(model=model, data=df)

# Run the estimation; the returned list of CPDs is shown as the cell output.
estimator.get_parameters()

  0%|          | 0/100 [00:00<?, ?it/s]


[<TabularCPD representing P(Inv_nodes:7 | Tumor_size:11) at 0x7c9385ba3850>,
 <TabularCPD representing P(Node_caps:3 | Breast_quad:6, Inv_nodes:7) at 0x7c9385bad110>,
 <TabularCPD representing P(Menopause:3 | Age:6) at 0x7c9385b94c90>,
 <TabularCPD representing P(Age:6) at 0x7c93852a7590>,
 <TabularCPD representing P(Deg_malig:3 | Age:6, Class:2, Irradiat:2, Menopause:3, Node_caps:3, Tumor_size:11) at 0x7c9385bae690>,
 <TabularCPD representing P(Breast_quad:6 | Breast:2) at 0x7c9385b973d0>,
 <TabularCPD representing P(Breast:2 | Irradiat:2) at 0x7c9385bbe0d0>,
 <TabularCPD representing P(Class:2 | Inv_nodes:7, Node_caps:3, Tumor_size:11) at 0x7c9385bad790>,
 <TabularCPD representing P(Tumor_size:11) at 0x7c9385ba2950>,
 <TabularCPD representing P(Irradiat:2) at 0x7c9385bbc350>]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)

# Learn the CPDs from the training rows only, using a Bayesian estimator with a
# BDeu prior and an equivalent sample size of 10.
model.fit(
    data=train_data,
    estimator=BayesianEstimator,
    prior_type='BDeu',
    equivalent_sample_size=10,
)

In [14]:
# Print every CPD currently attached to the model, each preceded by a header
# line naming the variable it belongs to.
for fitted_cpd in model.get_cpds():
    print(f"CPD de {fitted_cpd.variable}:")
    print(fitted_cpd)

CPD de Age:
+------------+-----------+
| Age(30-39) | 0.121849  |
+------------+-----------+
| Age(40-49) | 0.323529  |
+------------+-----------+
| Age(50-59) | 0.340336  |
+------------+-----------+
| Age(60-69) | 0.184874  |
+------------+-----------+
| Age(70-79) | 0.0294118 |
+------------+-----------+
CPD de Menopause:
+--------------------+-----+---------------------+
| Age                | ... | Age(70-79)          |
+--------------------+-----+---------------------+
| Menopause(ge40)    | ... | 0.8095238095238094  |
+--------------------+-----+---------------------+
| Menopause(lt40)    | ... | 0.09523809523809522 |
+--------------------+-----+---------------------+
| Menopause(premeno) | ... | 0.09523809523809522 |
+--------------------+-----+---------------------+
CPD de Deg_malig:
+--------------+-----+--------------------------+
| Age          | ... | Age(70-79)               |
+--------------+-----+--------------------------+
| Class        | ... | Class(recurrence-events

In [15]:
# Make the test set compatible with the fitted model: any categorical value that
# never occurred in the training data is replaced by the most frequent training
# value of that column.

# Work on an independent copy. `test_data` is a selection of `df` produced by
# train_test_split, and assigning columns on such a selection can trigger
# pandas' SettingWithCopyWarning.
test_data = test_data.copy()

# States observed in the training data, per object-typed column.
unique_states = {
    column: train_data[column].unique()
    for column in train_data.select_dtypes(include=['object']).columns
}

for column, states in unique_states.items():
    # Fallback value: the most frequent state in the training data.
    most_frequent_state = train_data[column].mode()[0]
    # NaN is excluded from the known states so that a missing test value is
    # replaced by the fallback, as the element-wise `x in states` check did.
    seen_in_training = test_data[column].isin(pd.Series(states).dropna())
    # Keep known values, substitute the fallback everywhere else (vectorised).
    test_data[column] = test_data[column].where(seen_in_training, most_frequent_state)

In [16]:
# Predict a 0/1 label for every row of the updated test data.

# Decision threshold on the probability of recurrence.
# Original note: "best threshold".
# NOTE(review): how this value was selected is not shown in this notebook section.
threshold = 0.2

# Evidence handed to the network: every column except the target.
test_features = test_data.drop(columns='Class')

# A single call scores all rows. The result has one row per input row (same
# index) and one column per state of the missing variable, named
# "<variable>_<state>".
predicted_probabilities = model.predict_probability(test_features)

# Pick the positive class by column name rather than by position, so the result
# does not depend on the order in which the states are stored.
recurrence_probability = predicted_probabilities['Class_recurrence-events']

# 1 = recurrence predicted (probability strictly above the threshold), 0 otherwise.
predicted_labels = (recurrence_probability > threshold).astype(int).tolist()

In [17]:
# Convert predicted labels back to class names.
# The name `predicted_labels` is rebound from 0/1 values to strings here, so an
# entry that is already the positive class name is accepted as well: running
# this cell a second time then leaves the predictions unchanged instead of
# turning every one of them into 'no-recurrence-events'.
predicted_labels = [
    'recurrence-events' if label in (1, 'recurrence-events') else 'no-recurrence-events'
    for label in predicted_labels
]

In [18]:
# Score the predictions against the labels stored in the test data.
true_labels = test_data['Class']

# Accuracy takes no positive class; the other three metrics are computed with
# 'recurrence-events' as the positive label.
positive_class = {'pos_label': 'recurrence-events'}
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1 = (
    scorer(true_labels, predicted_labels, **positive_class)
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the four metrics, one per line, under a common header.
print("Evaluation Metrics:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")

Evaluation Metrics:
Accuracy: 0.5862068965517241
Precision: 0.4482758620689655
Recall: 0.6190476190476191
F1 Score: 0.52
