# <center>IEE 520: Fall 2019</center>

# <center> Bayesian Network (11/07/19)</center>

## <center>Klim Drobnyh (klim.drobnyh@asu.edu)</center>

In [None]:
# For compatibility with Python 2
from __future__ import print_function

# To load datasets
from sklearn import datasets

# To import the models (Decision Tree Classifier and Regressor)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# To display a tree
from sklearn.tree import plot_tree

# To measure accuracy
from sklearn import metrics

from sklearn.model_selection import cross_validate, KFold, train_test_split, GridSearchCV

# To support plots
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import numpy as np
import pandas as pd

import seaborn as sn

import math

# To display all the plots inline
%matplotlib inline

Custom function to plot trees, taken from scikit-learn/sklearn/tree/export.py

In [None]:
# Default size for every figure created below: 10 wide by 5 tall (inches)
plt.rcParams["figure.figsize"] = (10, 5)

In [None]:
# To import the scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer


class DummyScaler:
    """No-op scaler exposing the same fit/transform surface as the sklearn scalers.

    It lets "no scaling" be selected through the same code path as a real scaler.
    """

    def fit(self, data):
        """Accept ``data`` without learning anything.

        Returns ``self`` (instead of ``None``) so that calls can be chained as
        ``scaler.fit(data).transform(data)``, like the scikit-learn scalers.
        """
        return self

    def transform(self, data):
        """Return ``data`` unchanged (the very same object, not a copy)."""
        return data

    def fit_transform(self, data):
        """Equivalent to ``fit(data).transform(data)``; returns ``data`` unchanged."""
        return self.fit(data).transform(data)

def create_scaler_dummy():
    """Return a fresh no-op scaler."""
    return DummyScaler()


def create_scaler_standard():
    """Return a fresh, unfitted ``StandardScaler``."""
    return StandardScaler()


def create_scaler_minmax():
    """Return a fresh, unfitted ``MinMaxScaler``."""
    return MinMaxScaler()


def create_scaler_binarizer():
    """Return a fresh ``Binarizer`` with its default settings."""
    return Binarizer()


# Backward-compatible alias: the factory was originally published under this
# misspelled name, so keep it callable for any existing caller.
crete_scaler_binarizer = create_scaler_binarizer

## <center>Toy dataset</center>

### <center>Prepare the dataset</center>

The Iris flower data set or Fisher's Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper "The use of multiple measurements in taxonomic problems" as an example of linear discriminant analysis.

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

In [None]:
# Load Iris as a (features, target) pair. `return_X_y` is passed by keyword
# because current scikit-learn releases declare it keyword-only; the old
# positional form `load_iris(True)` is rejected there.
X, y = datasets.load_iris(return_X_y=True)

def discretize_x(x, bins=2):
    """Replace every column of ``x`` by its quantile-bin index.

    Parameters
    ----------
    x : 2-D array-like
        Numeric data, one feature per column.
    bins : int, default 2
        Number of quantile bins requested per column. Because duplicate bin
        edges are dropped (``duplicates='drop'``), a column with many repeated
        values may end up with fewer distinct bins.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape holding the bin indices. The input is
        left untouched (the previous version overwrote the caller's array).
    """
    # np.array copies by default, so the caller's data is never modified.
    binned = np.array(x)
    for i in range(binned.shape[1]):
        binned[:, i] = pd.qcut(binned[:, i], bins, labels=False, duplicates='drop')
    return binned

# Print the shape before and after binning into 10 quantile bins per feature;
# discretize_x replaces values column by column, so the shape should not change.
print(X.shape)
X = discretize_x(X, 10)
print(X.shape)

In [None]:
# Hold out 20% of the samples for testing (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=520
)
# The network is learned over features and target jointly, so append the
# target as the last column of each data matrix.
data_train = np.column_stack([X_train, y_train])
data_test = np.column_stack([X_test, y_test])

Let's trim the data to have just first 2 variables (length and width of the sepals).
Also, let's remove repeating instances (just to make visualization more tractable).

### <center>Bayesian Network</center>

In [None]:
# pomegranate must be installed first, for example with:
#   conda install pomegranate
from pomegranate import BayesianNetwork

In [None]:
import matplotlib.patches as mpatch
def plot_graph(structure, labels):
    """Draw a network structure as a directed graph and show the figure.

    Parameters
    ----------
    structure : sequence of sequences of int
        ``structure[t]`` lists the indices of the nodes that have an edge
        pointing *into* node ``t`` (i.e. its parents).
    labels : sequence of str
        Display name of each node; ``labels[i]`` names node ``i``.
    """
    import networkx as nx

    # Shared by the node and edge drawing calls: the edge drawer needs to know
    # how large the nodes are so arrow heads stop at the node border instead
    # of being hidden underneath it.
    node_size = 1000

    G = nx.DiGraph()
    # Add every label explicitly so that isolated nodes are drawn as well.
    G.add_nodes_from(labels)
    G.add_edges_from(
        (labels[parent], labels[child])
        for child, parents in enumerate(structure)
        for parent in parents
    )

    # A layout must be computed up front because nodes, labels and edges are
    # drawn by separate calls.
    pos = nx.shell_layout(G)
    nx.draw_networkx_nodes(G, pos, node_color='black', node_size=node_size)
    nx.draw_networkx_labels(G, pos, font_size=12, font_color='w')
    nx.draw_networkx_edges(
        G,
        pos,
        edge_color='r',
        node_size=node_size,
        arrowstyle=mpatch.ArrowStyle.CurveB(head_length=3.0, head_width=0.5),
    )
    plt.show()

In [None]:
# Structure-learning algorithms to choose from:
# 'chow-liu', 'greedy', 'exact', 'exact-dp'
model = BayesianNetwork.from_samples(
    data_train,
    algorithm='exact-dp',
    max_parents=2,
    reduce_dataset=True,
    n_jobs=-1,
)
# Node names: X0..X{n-1} for the features, followed by 'y' for the target column.
plot_graph(
    model.structure,
    ['X{}'.format(i) for i in range(X.shape[1])] + ['y'],
)

In [None]:
# Show the distribution attached to the first state of the fitted network
print(model.states[0].distribution)

In [None]:
# Append an all-NaN target column so the network is asked to fill in y.
y_train_hat = model.predict(
    np.hstack([X_train, np.full((len(X_train), 1), np.nan)])
)
y_test_hat = model.predict(
    np.hstack([X_test, np.full((len(X_test), 1), np.nan)])
)

In [None]:
# Each predicted row is a full sample; column 4 is the target that was masked.
y_train_hat = np.array([row[4] for row in y_train_hat])
y_test_hat = np.array([row[4] for row in y_test_hat])

In [None]:
# Training accuracy followed by the confusion matrix as an annotated heatmap.
print('Accuracy (train):', metrics.accuracy_score(y_train, y_train_hat))
cm = metrics.confusion_matrix(y_train, y_train_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

In [None]:
# Test accuracy followed by the confusion matrix as an annotated heatmap.
print('Accuracy (test):', metrics.accuracy_score(y_test, y_test_hat))
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

## <center>Real-world dataset</center>

### <center>Load the dataset</center>

Let's consider the UCI Mushroom dataset:
https://archive.ics.uci.edu/ml/datasets/mushroom

The variables in the data set are:

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d

In [None]:
# Column names for the mushroom data file, in file order. The file has no
# header row, so these are supplied to read_csv; the first column ('class')
# is the prediction target, the remaining ones are the features.
labels = [
    'class',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat'
]

In [None]:
# Read the header-less CSV, treating '?' as a missing value, and discard the
# 'stalk-root' column right away.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
    header=None,
    names=labels,
    na_values='?',
).drop(columns=['stalk-root'])
#data = data.sample(frac=0.3, random_state=520)

In [None]:
# Preview the first rows of the loaded table
print(data.head())

In [None]:
def categorize_df(df):
    """Encode every column of ``df`` as float category codes.

    For each column the name and its categories are printed, then the values
    are replaced by their category code stored as a float. Entries that
    received the code -1 (no category) are turned into NaN. The input frame is
    not modified; an encoded copy is returned.
    """
    encoded = df.copy()
    for name in df.columns:
        as_category = encoded[name].astype('category')
        print(name)
        print(as_category.cat.categories)
        encoded[name] = np.array(as_category.cat.codes, dtype='float')
    # -1 is the code assigned to values without a category; mark them missing.
    encoded[encoded == -1.0] = np.nan
    return encoded

In [None]:
# Encode all columns as float category codes (categorize_df also prints each
# column's name and categories while it runs)
data_num = categorize_df(data)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
data_train, data_test = train_test_split(data_num, test_size=0.2, random_state=520)

In [None]:
# Feature matrices: every column except the first one ('class').
X_train = data_train.iloc[:, 1:].to_numpy()
X_test = data_test.iloc[:, 1:].to_numpy()

In [None]:
# Target vectors: the encoded 'class' column as standalone arrays.
y_train = data_train['class'].to_numpy(copy=True)
y_test = data_test['class'].to_numpy(copy=True)

In [None]:
# Build the prediction inputs: full data matrices whose target column
# (column 0, 'class') is blanked out with NaN.
# to_numpy(copy=True) is used instead of `.copy().values` because, with pandas
# Copy-on-Write enabled, `.values` hands back a read-only array and the
# in-place assignment below would raise "assignment destination is read-only".
data_train_predict = data_train.to_numpy(copy=True)
data_train_predict[:, 0] = np.nan
data_test_predict = data_test.to_numpy(copy=True)
data_test_predict[:, 0] = np.nan

In [None]:
# (rows, columns) of the training matrix handed to the structure learner
data_train.to_numpy().shape

In [None]:
# Learn the network structure over all columns (target included) of the
# training matrix.
model = BayesianNetwork.from_samples(
    data_train.to_numpy(),
    algorithm='chow-liu',
    max_parents=2,
    reduce_dataset=True,
    n_jobs=-1,
)
# Node names: 'y' for the first column (the class), then X0, X1, ... for the rest.
plot_graph(
    model.structure,
    ['y'] + ['X{}'.format(i) for i in range(data_train.shape[1] - 1)],
)

In [None]:
# Run the network on the matrices whose column 0 ('class') was set to NaN.
# NOTE(review): the following cell reads index 0 of every returned row, so the
# result is presumably one completed row per input sample — confirm against
# the pomegranate documentation.
y_train_hat = model.predict(data_train_predict, n_jobs=-1, check_input=False)
y_test_hat = model.predict(data_test_predict, n_jobs=-1, check_input=False)

In [None]:
# Keep only entry 0 (the masked 'class' column) of every predicted row.
y_train_hat = np.array([row[0] for row in y_train_hat])
y_test_hat = np.array([row[0] for row in y_test_hat])

In [None]:
# True encoded class values of the training rows
print(y_train)

In [None]:
# Class values predicted by the Bayesian network for the same training rows
print(y_train_hat)

In [None]:
# Bayesian network, training split: accuracy plus annotated confusion matrix.
print('Accuracy (train):', metrics.accuracy_score(y_train, y_train_hat))
cm = metrics.confusion_matrix(y_train, y_train_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (BN, train)')
plt.show()

In [None]:
# Bayesian network, test split: accuracy plus annotated confusion matrix.
print('Accuracy (test):', metrics.accuracy_score(y_test, y_test_hat))
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (BN, test)')
plt.show()

In [None]:
# Keep only the feature columns that contain no missing values.
# The previous hard-coded `np.delete(..., 10, axis=1)` pointed at 'stalk-root'
# in the original 22-feature layout, but that column is already deleted when
# the data is loaded, so index 10 silently discarded
# 'stalk-surface-above-ring' instead. Selecting by content removes the
# dependence on column positions.
has_nan = np.isnan(X_train).any(axis=0) | np.isnan(X_test).any(axis=0)
X_train2 = X_train[:, ~has_nan]
X_test2 = X_test[:, ~has_nan]

In [None]:
from sklearn.tree import DecisionTreeClassifier
# 5-fold cross-validated grid search over tree depth (1, 3, ..., 39) and the
# minimum number of samples needed to split a node (2 or 4).
model_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=520),
    param_grid={
        "max_depth": list(range(1, 40, 2)),
        "min_samples_split": [2, 4],
    },
    cv=5,
)
model_tree.fit(X_train2, y_train)

print('The parameters found by CV search:')
print(model_tree.best_params_)

# Predict both splits with the fitted search object.
y_train_hat = model_tree.predict(X_train2)
y_test_hat = model_tree.predict(X_test2)

In [None]:
# Decision tree, training split: accuracy plus annotated confusion matrix.
print('Accuracy (train):', metrics.accuracy_score(y_train, y_train_hat))
cm = metrics.confusion_matrix(y_train, y_train_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (Tree, train)')
plt.show()

In [None]:
# Decision tree, test split: accuracy plus annotated confusion matrix.
print('Accuracy (test):', metrics.accuracy_score(y_test, y_test_hat))
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (Tree, test)')
plt.show()

In [None]:
# Predict a different variable: column 5 of the data matrix ('odor').
# to_numpy(copy=True) is used instead of `.copy().values` because, with pandas
# Copy-on-Write enabled, `.values` hands back a read-only array and the
# in-place NaN assignment below would raise.
data_test_predict = data_test.to_numpy(copy=True)
# Keep the true values (read from the untouched frame) before masking them.
data_test_y = data_test.to_numpy()[:, 5]
data_test_predict[:, 5] = np.nan

In [None]:
# Run the network on the test matrix whose column 5 was set to NaN, then keep
# only entry 5 of every returned row as the prediction for that column.
data_test_y_hat = model.predict(data_test_predict, n_jobs=-1, check_input=False)
data_test_y_hat = np.array([x[5] for x in data_test_y_hat])

In [None]:
# Predicted values for column 5, followed by the true values, for a quick visual comparison
print(data_test_y_hat)
print(data_test_y)

In [None]:
# Bayesian network predicting column 5 on the test split: accuracy plus
# annotated confusion matrix.
print('Accuracy (test):', metrics.accuracy_score(data_test_y, data_test_y_hat))
cm = metrics.confusion_matrix(data_test_y, data_test_y_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (BN, different variable)')
plt.show()