# Bayesian Networks

## Contents

- [1. Imports](#Imports)
- [2. Generate data with binary values](#Generate-data-with-binary-values)
- [3. Learn structure and fit training data](#Learn-structure-and-fit-training-data)
- [4. Predict labels for test part](#Predict-labels-for-test-part)
- [5. Common structure](#Common-structure)

[Back to Chemfin](../Chemfin.ipynb)

### Imports

Required packages: [pomegranate](http://pomegranate.readthedocs.io/en/latest/), [numpy](http://www.numpy.org/), [pandas](https://pandas.pydata.org/), [scikit-learn](http://scikit-learn.org/), [networkx](http://dschult-networkx.readthedocs.io/en/latest/) (version $\geq$ 2.0)

[Back to contents](#Contents)

In [2]:
import sys
sys.path.append('../src/')
import copy

import numpy as np
import pandas as pd
import os

from pomegranate import BayesianNetwork

import networkx as nx

import bayesian_networks as bn
import bn_predict

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from io_work import stringSplitByNumbers

### Generate data with binary values

The output is saved as CSV files.

[Back to contents](#Contents)

In [None]:
# Directories and file names used by this cell.
data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks'
model_filename_prefix = 'model_td'

# Raw inputs (.npz archives) ...
filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

# ... and the CSV files written from them.
filename_save = 'bn_dataset.csv'
filename_save2 = 'bn_test2.csv'

# Parameters; left_fraction is forwarded to bn.thresholdMatrix below.
left_fraction = 30
geN = 20

# Main dataset: load, threshold, write to CSV.
# NOTE(review): thresholdMatrix presumably binarizes the matrix and returns the
# threshold(s) used as `tau` -- confirm against bayesian_networks.py.
dataset_path = data_dirname + filename_dataset
data = bn.loadMatrix(dataset_path, one_node=1, ignore_negative=0)
data, tau = bn.thresholdMatrix(data, left_fraction=left_fraction, one_node=1)
colnames = data.columns.values
data.to_csv(data_dirname + filename_save)

# Second test set: same pipeline, but with cut_const_cols=0, and the columns are
# then selected/reordered to exactly those of the main dataset so both CSV files
# share one layout.
dataset2_path = data_dirname + filename_dataset2
data = bn.loadMatrix(dataset2_path, one_node=1, ignore_negative=0)
data, tau = bn.thresholdMatrix(
    data, left_fraction=left_fraction, one_node=1, cut_const_cols=0
)
data = data[colnames]
data.to_csv(data_dirname + filename_save2)

### Learn structure and fit training data

[Back to contents](#Contents)

In [None]:
# Directories and file names used by this cell.
data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
model_filename_prefix = 'model_bn_'

filename_cv = 'cv_indices.npz'
filename_dataset = 'bn_dataset.csv'

# Cross-validation index arrays stored in the .npz archive.
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# Thresholded dataset written by the data-generation cell; the first CSV column
# holds the row index.
data = pd.read_csv(data_dirname + filename_dataset, index_col=0)

# Fit the models on the training indices and write them into model_dirname,
# using model_filename_prefix as the base of the output file names.
# NOTE(review): presumably one JSON model per training split -- confirm in
# bayesian_networks.produceModelsForValidationToJSON.
bn.produceModelsForValidationToJSON(
    data,
    train_indices,
    model_dirname,
    filename_base=model_filename_prefix
)

### Predict labels for test part

This is a very slow operation, so we parallelize the code with the multiprocessing package and consider only the validation datasets.

[Back to contents](#Contents)

In [None]:
# Predict the label column with every saved model, once on that model's
# cross-validation test split and once on the whole second test set.

# Passed to bn_predict.launcher (presumably the number of worker processes).
N_jobs = 2

data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
model_filename_prefix = 'model_bn'
results_dirname = '../results/large_bayesian_networks/'

filename_cv = 'cv_indices.npz'
filename_dataset = 'bn_dataset.csv'
filename_dataset2 = 'bn_test2.csv'

df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']

data = pd.read_csv(data_dirname + filename_dataset, index_col=0)
colnames = data.columns.values
data2 = pd.read_csv(data_dirname + filename_dataset2, index_col=0)
# Give the second test set the same columns, in the same order, as the main one.
data2 = data2[colnames]

# Column 0 is the label, the remaining columns are the features. These arrays do
# not depend on the model, so they are extracted once instead of per iteration.
y = data.iloc[:, 0].values
X = data.iloc[:, 1:].values
y2 = data2.iloc[:, 0].values
X2 = data2.iloc[:, 1:].values.astype(int)

# Model files: "<prefix>*.json" inside model_dirname, ordered with the
# stringSplitByNumbers key.
all_model_filenames = sorted(
    (name for name in os.listdir(model_dirname)
     if name.startswith(model_filename_prefix) and name.endswith('.json')),
    key=stringSplitByNumbers
)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print("We found %d models in %s directory" % (len(all_model_filenames), model_dirname))

# The k-th model (in sorted order) is evaluated on the k-th entry of test_indices.
for index, model_filename in enumerate(all_model_filenames):
    print(model_filename)

    # --- cross-validation test split of the main dataset ---
    fold = test_indices[index]
    y_test = y[fold].copy()
    X_test = X[fold, :].astype(int)
    # A NaN column is put in the label's position (column 0); fresh arrays are
    # built for every call so nothing is shared between launcher invocations.
    X_test = np.hstack([np.full((y_test.size, 1), np.nan), X_test])
    bn_predict.launcher(N_jobs, model_filename, X_test, y_test, model_dirname, results_dirname)
    # NOTE(review): the third argument is positional here but passed as
    # `postfix=` below -- presumably the same parameter; confirm in bn_predict.
    bn_predict.gatherResults(index, results_dirname, filename_dataset.split('.')[0])

    # --- whole second test set ---
    y_test = y2
    X_test = np.hstack([np.full((y_test.size, 1), np.nan), X2])
    bn_predict.launcher(N_jobs, model_filename, X_test, y_test, model_dirname, results_dirname)
    bn_predict.gatherResults(index, results_dirname, postfix=filename_dataset2.split('.')[0])

    print("============= index %d ready =========" % (index + 1))

### Common structure

[Back to contents](#Contents)

In [12]:
# Build the edge set shared by all saved Bayesian-network structures, keep its
# largest connected component and pickle it as a directed graph.

data_dirname = '../data/'
model_dirname = '../models/large_bayesian_networks/'
model_dirname_save = '../models/large_bayesian_networks/intersection/'
model_filename_prefix = 'model_bn'
save_filename = 'common_tree.pkl'

filename_dataset = 'bn_dataset.csv'

# Model files: "<prefix>*.json" inside model_dirname, ordered with the
# stringSplitByNumbers key.
all_model_filenames = sorted(
    (name for name in os.listdir(model_dirname)
     if name.startswith(model_filename_prefix) and name.endswith('.json')),
    key=stringSplitByNumbers
)

G = None
root = 0  # only referenced by the optional tree_data export noted at the end

# Parenthesised single-argument print works on both Python 2 and Python 3.
print("We found %d models in %s directory" % (len(all_model_filenames), model_dirname))

for model_filename in all_model_filenames:
    model = BayesianNetwork.from_json(model_dirname + model_filename)

    # Node k is linked to every node listed in model.structure[k]; the direction
    # is dropped here because the graph is undirected.
    dictTree = dict(enumerate(model.structure))
    tree = nx.from_dict_of_lists(dictTree, create_using=nx.Graph())

    # Running intersection of the edge sets of all models seen so far.
    G = tree if G is None else nx.intersection(G, tree)

if G is None:
    raise ValueError(
        "No '%s*.json' model files found in %s" % (model_filename_prefix, model_dirname)
    )

# Largest connected component of the shared structure.
# nx.connected_component_subgraphs was removed in networkx 2.4; this form is
# equivalent and works on every 2.x release.
largest_component = max(nx.connected_components(G), key=len)
G = G.subgraph(largest_component).copy()

# Re-create the component as a directed graph; each edge keeps the orientation
# in which G.edges reports it.
tree = nx.from_edgelist(G.edges, create_using=nx.DiGraph())

# Optional JSON export of the tree (kept from the original notebook):
#tree = tree.reverse()
#json_data = nx.readwrite.json_graph.tree_data(tree, root)

# NOTE(review): write_gpickle/read_gpickle were removed in networkx 3.0; this
# cell targets networkx 2.x.
nx.write_gpickle(tree, model_dirname_save + save_filename)
# To load it back:
# G = nx.read_gpickle(model_dirname_save+save_filename)

We found 25 models in ../models/large_bayesian_networks/ directory
