# Evaluate Bayesian Networks for Classification
by Jaime Blackwell

### Import Statements

In [1]:
# from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from pgmpy.estimators import HillClimbSearch, BicScore, TreeSearch, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
import sys
import os

# Make the CDRL ("Causal Discovery with RL") sources from trustworthyAI importable.
sys.path.append('/Users/jaime/repos/sit723/trustworthyAI/research/Causal Discovery with RL/src')


# Fail early, with the offending path in the message, if the dataset is missing.
input_file = '/Users/jaime/repos/sit723/causal-datasets/Real_Dataset/real_dataset_processed.csv'
if not os.path.isfile(input_file):
    raise ValueError("Input file does not exist: {}".format(input_file))

# Read the file that was just validated rather than repeating the path literal,
# so the existence check and the load can never point at different files.
data = pd.read_csv(input_file)
# NOTE(review): this writes the raw dataset (not an adjacency matrix) to a file
# named DAG.npy -- confirm that this is what the downstream code expects.
np.save("/Users/jaime/repos/sit723/causal-datasets/Real_Dataset/DAG.npy", data)

# The command-line arguments are put in place before `main` is imported;
# presumably `main` parses sys.argv at import time -- TODO confirm.
sys.argv = [
    'main.py',
    '--input_file', input_file,
    '--output_dir', '/Users/jaime/repos/sit723/output',
    '--seed', '42'
]

import main

  from .autonotebook import tqdm as notebook_tqdm


R packages including CAM have been already installed.
importing R packages CAM and mboost


### Load Datasets

In [2]:
# Datasets to evaluate: display name -> numeric id taken from the dataset's
# archive.ics.uci.edu URL.
#
# Currently disabled entries, kept for reference:
#   'Adult': 2            https://archive.ics.uci.edu/dataset/2/adult                                   14 features, 48,842 instances
#   'Breast Cancer': 17   https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic     30 features, 569 instances
#   'Magic': 159          https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope                 10 features, 19,020 instances
#   'Raisin': 850         https://archive.ics.uci.edu/dataset/850/raisin                                7 features, 900 instances
datasets = dict(
    # https://archive.ics.uci.edu/dataset/545/rice+cammeo+and+osmancik -- 7 features, 3810 instances
    Rice=545,
    # https://archive.ics.uci.edu/dataset/101/tic+tac+toe+endgame -- 9 features, 958 instances
    TicTacToe=101,
)

### Discretize the Data

In [3]:
def label_encode_cols(X, cols):
    """Label-encode selected columns of a DataFrame without touching the input.

    Parameters
    ----------
    X : pandas.DataFrame
        Source frame; it is copied, never modified.
    cols : iterable
        Names of the columns to encode. Each one gets its own ``LabelEncoder``.

    Returns
    -------
    tuple
        ``(encoded_copy, encoders)`` where ``encoders`` maps each column name
        to the ``LabelEncoder`` that was fitted on it.
    """
    encoded = X.copy()
    fitted = {}
    for name in cols:
        fitted[name] = LabelEncoder()
        # Fit on the column and overwrite it with the integer codes.
        encoded[name] = fitted[name].fit_transform(encoded[name])
    return encoded, fitted


def preprocess_data(X, y):
    """Discretize/encode the features, encode the target, and split 80/20.

    Numeric feature columns are standardised and then binned into 5
    equal-width ordinal bins; ``object`` columns are label-encoded. An
    ``object`` target is label-encoded as well; any other target is passed
    through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.DataFrame
        Target frame; only its first column's dtype is inspected.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split`` (20% test, ``random_state=42``, stratified on
        the original ``y``).
    """
    continuous_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(include=['object']).columns

    transformers = []

    if len(continuous_cols) > 0:
        continuous_transformer = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'))
        ])
        transformers.append(('num', continuous_transformer, continuous_cols))

    if len(categorical_cols) > 0:
        # The fitted encoders are not needed afterwards, only the encoded frame.
        X, _ = label_encode_cols(X, categorical_cols)

    # NOTE(review): the preprocessor is fitted on the full dataset before the
    # train/test split below, so the bin edges also see the test rows.
    preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
    X_transformed = preprocessor.fit_transform(X)

    # The transformed (numeric) columns come first; every remaining column is
    # passed through, appended in its original order. Naming the passthrough
    # columns from X itself keeps the labels correct even when some of them
    # are neither numeric nor object (e.g. bool or category).
    continuous_set = set(continuous_cols)
    passthrough_cols = [col for col in X.columns if col not in continuous_set]
    X_transformed_df = pd.DataFrame(X_transformed, columns=continuous_cols.tolist() + passthrough_cols)

    # .iloc makes the positional lookup explicit; plain [0] on the
    # label-indexed dtypes Series is deprecated in recent pandas.
    if y.dtypes.iloc[0] == 'object':
        label_encoder = LabelEncoder()
        y_transformed = pd.DataFrame(label_encoder.fit_transform(y.values.ravel()), columns=y.columns)
    else:
        y_transformed = y

    return train_test_split(X_transformed_df, y_transformed, test_size=0.2, random_state=42, stratify=y)

In [4]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted Bayesian network on a classification task.

    For every test row the target is predicted with a maximum-a-posteriori
    query, using as evidence only the features that are nodes of ``model``.

    Rows for which inference raises, or whose prediction is not one of the
    classes present in ``y_test``, are excluded from the score; the accuracy
    is therefore computed over the remaining rows only. Inference failures
    are reported on stdout instead of being dropped silently.

    Side effects: writes ``y_test.csv`` and ``y_pred.csv`` (the rows that were
    actually scored) to the working directory and prints the number of scored
    predictions.

    Parameters
    ----------
    model : fitted network accepted by ``VariableElimination``
    X_test : pandas.DataFrame
        Test features.
    y_test : pandas.DataFrame
        Test targets; the first column is taken as the target variable.

    Returns
    -------
    float
        Accuracy over the rows that produced a usable prediction.
    """
    infer = VariableElimination(model)
    target_var = y_test.columns[0]  # Assumes only one target variable
    model_nodes = set(model.nodes())
    y_pred = []
    failures = []  # (row index, exception) for every row whose query raised

    for index, row in X_test.iterrows():
        evidence = {k: v for k, v in row.to_dict().items() if k in model_nodes}
        try:
            q = infer.map_query(variables=[target_var], evidence=evidence, show_progress=False)  # Maximum a posteriori
            y_pred.append(q[target_var])
        except Exception as exc:
            # Best effort: keep evaluating the remaining rows, but remember
            # the failure so it can be reported below.
            failures.append((index, exc))
            y_pred.append(None)

    if failures:
        first_index, first_exc = failures[0]
        print(f"Warning: inference failed for {len(failures)} of {len(X_test)} rows "
              f"(first failure at index {first_index!r}: {first_exc!r}); "
              "these rows are excluded from the accuracy.")

    # Keep only predictions that are an actual class of the test target.
    y_test_classes = y_test[target_var].unique()
    valid_indices = [i for i, pred in enumerate(y_pred)
                     if pred is not None and pred in y_test_classes]
    y_pred = [y_pred[i] for i in valid_indices]
    y_test = y_test.iloc[valid_indices].values.ravel()  # Ensure y_test is a 1D array
    y_pred = pd.Series(y_pred).values  # Ensure y_pred is a 1D array

    pd.DataFrame(y_test, columns=[target_var]).to_csv('y_test.csv', index=False)    # Save y_test and y_pred to CSV
    y_pred_df = pd.DataFrame(y_pred, columns=['Class'])
    y_pred_df.to_csv('y_pred.csv', index=False)
    print(f"Length of y_pred: {len(y_pred)}")

    return accuracy_score(y_test, y_pred)

def evaluate_naive_bayes(model, X_test, y_test):
    """Return the accuracy of a fitted classifier on the test split.

    ``model`` only needs a ``predict`` method; its predictions for ``X_test``
    are compared against ``y_test`` with ``accuracy_score``.
    """
    return accuracy_score(y_test, model.predict(X_test))


In [5]:
def train_bn(model, data):
    """Copy a learned graph structure into a ``BayesianNetwork`` and fit it.

    Parameters
    ----------
    model : graph-like object exposing ``nodes()`` and ``edges()``
        The learned structure to copy.
    data : pandas.DataFrame
        Training data used to estimate the parameters.

    Returns
    -------
    BayesianNetwork
        Network with the structure of ``model``, fitted on ``data`` using
        ``MaximumLikelihoodEstimator``.
    """
    network = BayesianNetwork()

    # Structure: nodes are copied explicitly before the edges, so nodes that
    # take part in no edge are still present in the network.
    network.add_nodes_from(model.nodes())
    network.add_edges_from(model.edges())

    # Parameters: estimated from the training data.
    network.fit(data, estimator=MaximumLikelihoodEstimator)
    return network

def get_results_table(datasets):
    """Benchmark four classifiers on each dataset and tabulate the accuracies.

    For every dataset the features/targets are fetched with ``fetch_ucirepo``,
    preprocessed and split, and then four models are trained on the training
    split and scored on the test split: Bayesian networks whose structure is
    learned by Hill Climbing (BIC score), Tree Search and NOTEARS, plus a
    Gaussian Naive Bayes baseline.

    Side effects: writes ``<name>_train_data.csv`` for every dataset and
    prints the per-dataset accuracies.

    Parameters
    ----------
    datasets : dict
        Maps a display name to the id passed to ``fetch_ucirepo``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, one column per method
        (``Hill Climbing``, ``Tree Search``, ``Naive Bayes``, ``NOTEARS``).
    """
    results_dict = {}
    for name, dataset_id in datasets.items():  # avoid shadowing the builtin `id`
        data = fetch_ucirepo(id=dataset_id)

        X = data.data.features
        y = data.data.targets

        X_train, X_test, y_train, y_test = preprocess_data(X, y)  # Preprocess data
        train_data = pd.concat([X_train, y_train], axis=1)
        train_data.to_csv(f"{name}_train_data.csv")     # export training data post processing

        # STRUCTURE LEARNING
        hc = HillClimbSearch(train_data)        # Hill Climbing
        best_model_hc = hc.estimate(scoring_method=BicScore(train_data))

        ts = TreeSearch(train_data)     # Tree Search
        best_model_ts = ts.estimate()

        nb = GaussianNB()       # Naive Bayes

        sm = from_pandas(train_data, w_threshold=0.8)   # NOTEARS structure model

        # TRAIN (train_bn both builds and fits each Bayesian network)
        bn_hc = train_bn(best_model_hc, train_data)
        bn_ts = train_bn(best_model_ts, train_data)
        bn_nt = train_bn(sm, train_data)    # Train BN for NOTEARS bayesian network

        # The target is a one-column DataFrame; hand the classifier a 1-D
        # array so it does not have to convert it (and warn about doing so).
        nb.fit(X_train, y_train.values.ravel())

        # EVALUATE
        accuracy_hc = evaluate_model(bn_hc, X_test, y_test)
        accuracy_ts = evaluate_model(bn_ts, X_test, y_test)
        accuracy_nb = evaluate_naive_bayes(nb, X_test, y_test)
        accuracy_nt = evaluate_model(bn_nt, X_test, y_test)

        # Collate results dict
        dataset_dict = {'Hill Climbing': accuracy_hc
                      ,'Tree Search':  accuracy_ts
                      ,'Naive Bayes': accuracy_nb
                      ,'NOTEARS': accuracy_nt
                      }
        print(f"{name}: {dataset_dict}")
        results_dict[name] = dataset_dict

    results_df = pd.DataFrame.from_dict(results_dict, orient='index')  # Put results into dataframe
    return results_df

In [6]:
# Run the benchmark over every configured dataset; the result has one row per
# dataset and one accuracy column per method.
accuracy_df = get_results_table(datasets)

  0%|          | 9/1000000 [00:00<4:44:15, 58.63it/s]
Building tree: 100%|██████████| 28/28.0 [00:00<00:00, 3313.22it/s]
  y = column_or_1d(y, warn=True)


Length of y_pred: 762
Length of y_pred: 762
Length of y_pred: 762
Rice: {'Hill Climbing': 0.8543307086614174, 'Tree Search': 0.8451443569553806, 'Naive Bayes': 0.8727034120734908, 'NOTEARS': 0.5196850393700787}


  0%|          | 10/1000000 [00:00<5:47:33, 47.95it/s]
Building tree: 100%|██████████| 45/45.0 [00:00<00:00, 2353.56it/s]
  y = column_or_1d(y, warn=True)


Length of y_pred: 192
Length of y_pred: 192
Length of y_pred: 192
TicTacToe: {'Hill Climbing': 0.703125, 'Tree Search': 0.703125, 'Naive Bayes': 0.7083333333333334, 'NOTEARS': 0.6666666666666666}


In [7]:
# Show the accuracy table as plain text, then as a LaTeX tabular.
print(accuracy_df)  # Plain-text rendering of the DataFrame
print(accuracy_df.style.to_latex())   # LaTeX source produced by the pandas Styler

           Hill Climbing  Tree Search  Naive Bayes   NOTEARS
Rice            0.854331     0.845144     0.872703  0.519685
TicTacToe       0.703125     0.703125     0.708333  0.666667
\begin{tabular}{lrrrr}
 & Hill Climbing & Tree Search & Naive Bayes & NOTEARS \\
Rice & 0.854331 & 0.845144 & 0.872703 & 0.519685 \\
TicTacToe & 0.703125 & 0.703125 & 0.708333 & 0.666667 \\
\end{tabular}

