# Bayesian Networks

## Contents

- [1. Imports](#Imports)
- [2. Generate data with binary values](#Generate-data-with-binary-values)
- [3. Learn structure and fit training data](#Learn-structure-and-fit-training-data)
- [4. Predict labels for test part](#Predict-labels-for-test-part)
- [5. Common structure](#Common-structure)

[Back to Chemfin](../Chemfin.ipynb)

### Imports

Required packages: [pomegranate](http://pomegranate.readthedocs.io/en/latest/), [numpy](http://www.numpy.org/), [pandas](https://pandas.pydata.org/), [scikit-learn](http://scikit-learn.org/), [networkx](https://networkx.org/) (version $\geq$ 2.0)

[Back to contents](#Contents)

In [1]:
import sys
sys.path.append('../src/')
import copy

import numpy as np
import pandas as pd
import os

from pomegranate import BayesianNetwork

import networkx as nx

import bayesian_networks as bn
import bn_predict

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from io_work import stringSplitByNumbers

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



### Generate data with binary values

The binarized datasets are saved as CSV files.

[Back to contents](#Contents)

In [2]:
# ---- Configuration ----------------------------------------------------------
# Directories and name prefixes. NOTE(review): `model_dirname`,
# `model_filename_prefix`, `filename_cv` and `geN` are not read anywhere in
# this cell; they are kept because other cells may rely on these globals.
data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks'
model_filename_prefix = 'model_td'

# Input archives (main dataset and the second test set).
filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

# Output CSV files, one per input archive.
filename_save = 'bn_dataset.csv'
filename_save2 = 'bn_test2.csv'

# Threshold parameter forwarded to bn.thresholdMatrix for both datasets.
left_fraction = 30
geN = 20

# ---- Main dataset -----------------------------------------------------------
# Load the matrix, threshold it (presumably into binary values, as the section
# title says -- see bn.thresholdMatrix to confirm) and store it as CSV.
data = bn.loadMatrix(
    data_dirname + filename_dataset,
    one_node=1,
    ignore_negative=0,
)
data, tau = bn.thresholdMatrix(
    data,
    left_fraction=left_fraction,
    one_node=1,
)
# Remember the resulting columns so the second dataset can be aligned to them.
colnames = data.columns.values
data.to_csv(data_dirname + filename_save)

# ---- Second test dataset ----------------------------------------------------
# Same pipeline, but with cut_const_cols=0 (presumably so that no column is
# dropped before the selection below -- TODO confirm), then restricted to and
# ordered by the columns of the main dataset.
data = bn.loadMatrix(
    data_dirname + filename_dataset2,
    one_node=1,
    ignore_negative=0,
)
data, tau = bn.thresholdMatrix(
    data,
    left_fraction=left_fraction,
    one_node=1,
    cut_const_cols=0,
)
data = data[colnames]
data.to_csv(data_dirname + filename_save2)

### Learn structure and fit training data

[Back to contents](#Contents)

In [3]:
# ---- Configuration ----------------------------------------------------------
data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
# Passed as `filename_base` to the model-producing routine below.
model_filename_prefix = 'model_bn_'

#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'
filename_dataset = 'bn_dataset.csv'

# ---- Cross-validation split -------------------------------------------------
# The .npz archive stores both index sets; only `train_indices` is used in
# this cell, `test_indices` is loaded alongside it.
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# ---- Training data ----------------------------------------------------------
# First CSV column is the row index.
data = pd.read_csv(data_dirname + filename_dataset, index_col=0)

# ---- Structure learning and fitting -----------------------------------------
# Produces the models for the given training index sets and writes them under
# `model_dirname` (as JSON, judging by the function name).
bn.produceModelsForValidationToJSON(
    data,
    train_indices,
    model_dirname,
    filename_base=model_filename_prefix,
)

Learning...
Model estimated in 503.63689 clock, 506.40727 time
Total Time (s): 216.6157
Model fitted in 215.90355 clock, 216.61671 time
Learning...
Model estimated in 533.53843 clock, 535.83321 time
Total Time (s): 218.7484
Model fitted in 218.44699 clock, 218.74921 time
Learning...
Model estimated in 568.39078 clock, 570.66406 time
Total Time (s): 238.6007
Model fitted in 237.33557 clock, 238.60185 time
Learning...
Model estimated in 564.56009 clock, 568.97127 time
Total Time (s): 243.7072
Model fitted in 241.54433 clock, 243.70808 time
Learning...
Model estimated in 561.62279 clock, 564.95737 time
Total Time (s): 256.9784
Model fitted in 255.57348 clock, 256.97914 time
Learning...
Model estimated in 487.37145 clock, 489.19692 time
Total Time (s): 198.9727
Model fitted in 199.81884 clock, 198.97379 time
Learning...
Model estimated in 485.24295 clock, 485.12608 time
Total Time (s): 215.2760
Model fitted in 215.21316 clock, 215.27730 time
Learning...
Model estimated in 464.49096 clock, 

### Predict labels for test part

Prediction is a very slow operation, so we parallelize it with the multiprocessing package and evaluate only on the validation datasets.

[Back to contents](#Contents)

In [None]:
# Number of parallel jobs handed to bn_predict.launcher.
N_jobs = 12

data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
model_filename_prefix = 'model_bn'
results_dirname = '../results/large_bayesian_networks/'

#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

filename_dataset = 'bn_dataset.csv'
filename_dataset2 = 'bn_test2.csv'


def _hide_label_column(features, labels):
    """Return `features` cast to int with an all-NaN column prepended.

    The NaN column takes the place of the label column (column 0 of the
    dataset) and has one row per entry of `labels`. Because NaN is a float,
    the stacked result is a float array.
    """
    unknown_labels = np.full((labels.size, 1), np.nan)
    return np.hstack([unknown_labels, features.astype(int)])


df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']

data = pd.read_csv(data_dirname + filename_dataset, index_col=0)
colnames = data.columns.values
data2 = pd.read_csv(data_dirname + filename_dataset2, index_col=0)
# Align the second dataset to the column set/order of the main one.
data2 = data2[colnames]

# Model files: '<prefix>*.json', ordered with the project's number-aware key.
all_model_filenames = sorted(
    [
        name for name in os.listdir(model_dirname)
        if name.startswith(model_filename_prefix) and name.endswith('.json')
    ],
    key=stringSplitByNumbers,
)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("We found %d models in %s directory" % (len(all_model_filenames), model_dirname))

# The i-th model is evaluated on the i-th test fold. Fail before any (very
# slow) prediction starts instead of hitting an IndexError hours later.
if len(all_model_filenames) > len(test_indices):
    raise ValueError(
        "Found %d model files but only %d test folds in %s"
        % (len(all_model_filenames), len(test_indices), filename_cv)
    )

# Loop-invariant pieces, extracted once: column 0 holds the labels, the
# remaining columns hold the features.
y = data.iloc[:, 0].values
X = data.iloc[:, 1:].values
y2 = data2.iloc[:, 0].values
X2 = data2.iloc[:, 1:].values
postfix_dataset = filename_dataset.split('.')[0]
postfix_dataset2 = filename_dataset2.split('.')[0]

for index, model_filename in enumerate(all_model_filenames):
    print(model_filename)

    # --- held-out fold of the main dataset ---
    fold = test_indices[index]
    y_test = y[fold].copy()
    X_test = _hide_label_column(X[fold, :], y_test)
    bn_predict.launcher(N_jobs, model_filename, X_test, y_test, model_dirname, results_dirname)
    # NOTE(review): the third argument is positional here but passed as
    # `postfix=` below; presumably it is the same parameter -- confirm against
    # bn_predict.gatherResults before unifying the two calls.
    bn_predict.gatherResults(index, results_dirname, postfix_dataset)

    # --- the whole second test dataset ---
    # The input matrix is rebuilt for every model so each launcher call gets
    # a fresh array, as before.
    y_test = y2
    X_test = _hide_label_column(X2, y_test)
    bn_predict.launcher(N_jobs, model_filename, X_test, y_test, model_dirname, results_dirname)
    bn_predict.gatherResults(index, results_dirname, postfix=postfix_dataset2)

    # Progress is reported 1-based.
    print("============= index %d ready =========" % (index + 1))

We found 25 models in ../models/large_bayesian_networks/ directory
model_bn_0.json
process 1 with 13 samples (1-13)
process 2 with 13 samples (14-26)
process 3 with 13 samples (27-39)
process 4 with 13 samples (40-52)
process 5 with 13 samples (53-65)
process 6 with 13 samples (66-78)
process 7 with 13 samples (79-91)
process 8 with 13 samples (92-104)
process 9 with 13 samples (105-117)
process 10 with 13 samples (118-130)
process 11 with 12 samples (131-142)
process 12 with 12 samples (143-154)
Thread 8: 7.69% , accuracy=0.00000, averaged time: 868.471 s/sample
Thread 0: 7.69% , accuracy=1.00000, averaged time: 909.141 s/sample
Thread 11: 8.33% , accuracy=0.00000, averaged time: 913.363 s/sample
Thread 2: 7.69% , accuracy=1.00000, averaged time: 976.711 s/sample
Thread 7: 7.69% , accuracy=1.00000, averaged time: 1053.015 s/sample
Thread 6: 7.69% , accuracy=1.00000, averaged time: 1060.823 s/sample
Thread 1: 7.69% , accuracy=1.00000, averaged time: 1061.784 s/sample
Thread 3: 7.69% , 

In [None]:
# Directory with the prediction result files (same path as in the prediction cell).
results_dirname = '../results/large_bayesian_networks/'

In [9]:
data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
model_filename_prefix = 'model_bn'
results_dirname = '../results/large_bayesian_networks/'

filename_cv = 'cv_indices.npz'
#filename_cv = 'physical_cv_indices_nc.npz'

filename_dataset = 'bn_dataset.csv'
filename_dataset2 = 'bn_test2.csv'

results_filename_prefix = 'validation_results_on_model_'


def _score_result_file(path):
    """Return (accuracy, weighted F1) for one prediction-result CSV.

    The file is read with its first column as the index. Every column except
    the last one is headed by a numeric class label and holds a per-class
    score (presumably a probability -- TODO confirm); the last column holds
    the true label. The predicted label of a row is the class whose column
    has the largest value in that row.
    """
    df = pd.read_csv(path, index_col=0)
    # Headers may be written as floats (e.g. "3.0"); parse as float, then
    # cast to integer class labels.
    labels = np.array(list(map(float, df.columns.values[:-1]))).astype('i')
    true_labels = df.iloc[:, -1].values
    class_scores = df.iloc[:, :-1].values
    predicted_labels = labels[np.argmax(class_scores, axis=1)]
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    return accuracy, f1


# Lists are built explicitly (not with filter()) so that len() and indexing
# below also work on Python 3, where filter() returns a lazy iterator.
all_result_filenames = sorted(
    [
        name for name in os.listdir(results_dirname)
        if name.startswith(results_filename_prefix) and name.endswith('.csv')
    ],
    key=stringSplitByNumbers,
)

# NOTE: this counts result files, not models, despite the wording of the
# message.
print("We found %d models in %s directory" % (len(all_result_filenames), results_dirname))

# Split the result files by the dataset whose file name they contain.
all_result_filenames_test = [
    name for name in all_result_filenames if filename_dataset in name
]
all_result_filenames_test2 = [
    name for name in all_result_filenames if filename_dataset2 in name
]

# One row per position in the sorted lists, two columns: [test, test2].
accs, f1s = [], []
for i, name_test in enumerate(all_result_filenames_test):
    file_pair = (name_test, all_result_filenames_test2[i])
    pair_scores = [_score_result_file(results_dirname + name) for name in file_pair]
    accs.append([accuracy for accuracy, _ in pair_scores])
    f1s.append([f1 for _, f1 in pair_scores])

# Median over the rows, separately for each of the two datasets.
accs = np.median(accs, axis=0)
f1s = np.median(f1s, axis=0)
print('accuracy [test/test2]: %.5f/%.5f' % (accs[0], accs[1]))
print('f1 [test/test2]: %.5f/%.5f' % (f1s[0], f1s[1]))

We found 50 models in ../results/large_bayesian_networks/ directory
accuracy [test/test2]: 0.79310/0.75000
f1 [test/test2]: 0.77882/0.81999


In [None]:
accuracy [test/test2]: 0.78505/0.72727
f1 [test/test2]: 0.76482/0.81597

### Common structure

[Back to contents](#Contents)

In [2]:
# Local import: the standard-library pickle module replaces
# nx.write_gpickle, which was removed in NetworkX 3.0. write_gpickle was a
# thin wrapper around pickle.dump, so the file format is unchanged.
import pickle

data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
model_dirname_save = '../models/large_bayesian_networks/intersection/'
model_filename_prefix = 'model_bn'
save_filename = 'common_tree.pkl'

filename_dataset = 'bn_dataset.csv'

# Model files: '<prefix>*.json', ordered with the project's number-aware key.
all_model_filenames = sorted(
    [
        name for name in os.listdir(model_dirname)
        if name.startswith(model_filename_prefix) and name.endswith('.json')
    ],
    key=stringSplitByNumbers,
)

# Running intersection of the undirected structures of all models.
G = None
# Only referenced by the commented-out tree_data export below.
root = 0

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("We found %d models in %s directory" % (len(all_model_filenames), model_dirname))
if not all_model_filenames:
    # Without any model there is nothing to intersect; fail with a clear
    # message instead of an obscure error on G = None further down.
    raise ValueError(
        "No '%s*.json' model files found in %s" % (model_filename_prefix, model_dirname)
    )

for model_filename in all_model_filenames:
    model = BayesianNetwork.from_json(model_dirname + model_filename)

    # Node k -> model.structure[k] (per the pomegranate documentation, the
    # parents of variable k), read as an undirected adjacency list.
    dictTree = {k: linked for k, linked in enumerate(model.structure)}
    tree = nx.from_dict_of_lists(dictTree, create_using=nx.Graph())

    # `tree` is a fresh graph on every iteration and nx.intersection returns
    # a new graph, so no defensive copy is needed.
    G = tree if G is None else nx.intersection(G, tree)

# Keep only the largest connected component of the common structure.
# nx.connected_component_subgraphs was removed in NetworkX 2.4; this is the
# documented equivalent and also works on 2.0.
largest_component = max(nx.connected_components(G), key=len)
G = G.subgraph(largest_component).copy()

# Re-create the component as a directed graph. Each edge is oriented in the
# order G.edges reports its end points.
tree = nx.from_edgelist(G.edges, create_using=nx.DiGraph())
#tree = tree.reverse()
#json_data = nx.readwrite.json_graph.tree_data(tree, root)
with open(model_dirname_save + save_filename, 'wb') as fout:
    pickle.dump(tree, fout, pickle.HIGHEST_PROTOCOL)
# To load the graph back:
# with open(model_dirname_save + save_filename, 'rb') as fin:
#     G = pickle.load(fin)

We found 25 models in ../models/large_bayesian_networks/ directory
