# Autoencoder

## Contents
- [1. Imports](#Imports)
- [2. Build models](#Build-models)
- [3. Approximation error](#Approximation-error)
- [4. Encode data with computed autoencoders](#Encode-data-with-computed-autoencoders)
- [5. Investigate how size of last encoding layer affects results: training models](#Investigate-how-size-of-last-encoding-layer-affects-results:-training-models)
- [6. Investigate how size of last encoding layer affects results: encode data](#Investigate-how-size-of-last-encoding-layer-affects-results:-encode-data)
- [7. Investigate how size of last encoding layer affects results: predict with logistic regression](#Investigate-how-size-of-last-encoding-layer-affects-results:-predict-with-logistic-regression)
- [8. Logistic regression classifier with encoded data](#Logistic-regression-classifier-with-encoded-data)
- [9. Gaussian Naive Bayes classifier with encoded data](#Gaussian-Naive-Bayes-classifier-with-encoded-data)
- [10. Hybrid Bayesian classifier with bnlearn](#Hybrid-Bayesian-classifier-with-bnlearn)




[Back to Chemfin](../Chemfin.ipynb)


### Imports
The first code cells set the number of threads and import all necessary modules.

Requires [numpy](http://www.numpy.org/), [scikit-learn](http://scikit-learn.org/), [pyTorch](http://pytorch.org/), [Rpy2](https://rpy2.readthedocs.io).

[Back to contents](#Contents)

In [6]:
import ctypes

# Handle to the MKL runtime shared library (must be resolvable by the loader).
mkl_rt = ctypes.CDLL('libmkl_rt.so')
mkl_get_max_threads = mkl_rt.mkl_get_max_threads
# Report the thread limit before it is changed.
print(mkl_get_max_threads())


def mkl_set_num_threads(cores):
    """Call MKL's ``mkl_set_num_threads`` with ``cores`` passed by reference.

    cores: integer number of threads requested from MKL.
    """
    requested = ctypes.c_int(cores)
    mkl_rt.mkl_set_num_threads(ctypes.byref(requested))


mkl_set_num_threads(4)
# Report the thread limit again to confirm the new setting.
print(mkl_get_max_threads())

4
4


In [7]:
# IPython %env magic: set the MKL_NUM_THREADS and OMP_NUM_THREADS environment variables to 4.
%env MKL_NUM_THREADS=4
%env OMP_NUM_THREADS=4


env: MKL_NUM_THREADS=4
env: OMP_NUM_THREADS=4


In [1]:
# Shared imports and global seeding for the cells below.
#%env OMP_NUM_THREADS=8

import sys
# Make the project modules in ../src/ importable (autoencoder, computational_utils, ...).
sys.path.append('../src/')
import copy

import numpy as np
import os
import torch
# NOTE(review): this value is overridden two lines below by set_num_threads(4).
torch.set_num_threads(8)
from torch import nn
torch.set_num_threads(4)
from torch.autograd import Variable
from torch.utils.data import DataLoader


# Project module providing the autoencoder training / encoding helpers used below.
import autoencoder as ae

# Fixed seed for torch's random number generator; the trailing semicolon
# suppresses the notebook's echo of the call's return value.
random_state = 150
torch.manual_seed(random_state);


from computational_utils import reshape
import bayesian_networks as bn

from io_work import stringSplitByNumbers

import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# NOTE(review): `reshape` is already imported above; this repeated import is redundant.
from computational_utils import reshape
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB


### Build models

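The next cell contains a script that builds autoencoder models for the cross-validation splits whose indices are stored in the `data/` directory (`cv_indices.npz`, or `physical_cv_indices_nc.npz` as currently selected by `filename_cv`).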

Parameters to control are:

- sizes: list of integers which specifies output sizes for each encoding layer
- batch_size: number of samples to be used for computing new update at each epoch
- nEpoch: number of epochs for each layer
- num_workers: number of parallel processes to work

[Back to contents](#Contents)

In [None]:
#truncated_features = True

# Locations of the input data, the CV index file and the model output directory.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
filename_dataset = 'dataset.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

# Disabled "truncated features" branch, kept as an inert string literal.
'''
if truncated_features:
    model_filename_prefix = 'model_ae_truncated_'

    left_fraction = 30
    geN = 20

    data = bn.loadMatrix(data_dirname+filename_dataset, one_node=1, ignore_negative=0)
    data, tau = bn.thresholdMatrix(data, left_fraction=left_fraction, one_node=1)
    T, labels = data.iloc[:, 1:].values, data.iloc[:, 0].values
    print 'truncated'
else:
'''
model_filename_prefix = 'model_ae_'

# Training configuration: encoder layer sizes, epochs per layer, batching.
sizes = [400, 100, 25]
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 1

# Load the samples and unfold each one into a single row.
dataset = np.load(data_dirname + filename_dataset)
T = dataset['data']
labels = dataset['label']
T = reshape(T, [len(T), -1])
# Scale every row to unit Euclidean norm (in place).
row_norms = np.linalg.norm(T, axis=1, keepdims=True)
T /= row_norms
print('full')

# Train/test index sets for the cross-validation splits.
cv_split = np.load(data_dirname + filename_cv)
train_indices = cv_split['train_indices']
test_indices = cv_split['test_indices']

ae.buildAutoencoderModels(
    T, train_indices, test_indices, sizes, model_dirname, nEpoch,
    batch_size, num_workers, model_filename_prefix
)

full
(1) Errors on training set (1708 samples): 
min=8.428e-02 / mean=1.438e-01 / median=1.384e-01 / max=3.616e-01
(1) Errors on validation set (154 samples): 
min=9.462e-02 / mean=2.482e-01 / median=2.193e-01 / max=7.921e-01
(2) Errors on training set (1708 samples): 
min=5.450e-02 / mean=9.692e-02 / median=9.392e-02 / max=2.032e-01
(2) Errors on validation set (154 samples): 
min=5.876e-02 / mean=2.430e-01 / median=2.078e-01 / max=7.188e-01
(3) Errors on training set (1708 samples): 
min=5.335e-02 / mean=1.079e-01 / median=1.035e-01 / max=2.370e-01
(3) Errors on validation set (154 samples): 
min=5.860e-02 / mean=2.960e-01 / median=2.574e-01 / max=8.762e-01
(1) Errors on training set (1751 samples): 
min=8.201e-02 / mean=1.444e-01 / median=1.389e-01 / max=5.689e-01
(1) Errors on validation set (145 samples): 
min=1.042e-01 / mean=2.228e-01 / median=1.963e-01 / max=7.212e-01
(2) Errors on training set (1751 samples): 
min=4.348e-02 / mean=9.191e-02 / median=8.939e-02 / max=2.129e-01
(

### Approximation error

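In this cell the data are encoded and then decoded with the previously trained models. Summary statistics (min / mean / median / max) of the resulting approximation error (relative residual in the $l_2$ norm) are printed for the training and validation sets of every model.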

[Back to contents](#Contents)

In [10]:
# Where to find the dataset, the CV indices and the trained models.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
model_filename_prefix = 'model_ae_'

filename_dataset = 'dataset.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

# Encoder layer sizes the stored models were built with.
sizes = [400, 100, 25]

# Load the samples, unfold each one into a row and scale rows to unit Euclidean norm.
dataset = np.load(data_dirname + filename_dataset)
T = dataset['data']
labels = dataset['label']
T = reshape(T, [len(T), -1])
row_norms = np.linalg.norm(T, axis=1, keepdims=True)
T /= row_norms
N = T.shape[1]

# Train/test index sets for the cross-validation splits.
cv_split = np.load(data_dirname + filename_cv)
train_indices = cv_split['train_indices']
test_indices = cv_split['test_indices']

ae.checkRelRes(T, train_indices, test_indices, sizes, model_dirname, model_filename_prefix)

model_ae_0
Training set:
min=5.335e-02 / mean=1.079e-01 / median=1.035e-01 / max=2.370e-01
Validation set:
min=5.860e-02 / mean=2.960e-01 / median=2.574e-01 / max=8.762e-01
model_ae_1
Training set:
min=4.946e-02 / mean=1.085e-01 / median=1.057e-01 / max=2.953e-01
Validation set:
min=6.976e-02 / mean=2.467e-01 / median=1.992e-01 / max=9.702e-01
model_ae_2
Training set:
min=5.090e-02 / mean=1.095e-01 / median=1.042e-01 / max=2.588e-01
Validation set:
min=5.927e-02 / mean=2.508e-01 / median=2.077e-01 / max=6.981e-01
model_ae_3
Training set:
min=4.767e-02 / mean=1.058e-01 / median=1.021e-01 / max=3.599e-01
Validation set:
min=8.045e-02 / mean=2.671e-01 / median=2.188e-01 / max=7.499e-01
model_ae_4
Training set:
min=4.777e-02 / mean=1.074e-01 / median=1.027e-01 / max=2.567e-01
Validation set:
min=5.558e-02 / mean=2.779e-01 / median=2.314e-01 / max=7.432e-01
model_ae_5
Training set:
min=5.573e-02 / mean=1.097e-01 / median=1.062e-01 / max=3.270e-01
Validation set:
min=5.468e-02 / mean=2.639e-

### Encode data with computed autoencoders

This cell encodes the datasets with the models trained in the previous steps and stores the encoded data under the `autoencoded_` file names.

[Back to contents](#Contents)

In [11]:
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
#model_filename_prefix = 'model_aeGE20_'
model_filename_prefix = 'model_ae_'

# Source datasets and the file names their encoded versions are written to.
filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
save_filename = 'autoencoded_' + filename_dataset
save_filename2 = 'autoencoded_' + filename_dataset2

sizes = [400, 100, 25]
num_workers = 1

# Encode the main dataset first, then the second test set, with the same models.
encoding_jobs = (
    (filename_dataset, save_filename),
    (filename_dataset2, save_filename2),
)
for source_name, target_name in encoding_jobs:
    df = np.load(data_dirname + source_name)
    ae.encodeDataset(
        df, sizes, model_dirname, model_filename_prefix,
        data_dirname + target_name, num_workers, return_result=0
    )

encoded with model_ae_0. Comp.time=0.23902
encoded with model_ae_1. Comp.time=0.23884
encoded with model_ae_2. Comp.time=0.24471
encoded with model_ae_3. Comp.time=0.24480
encoded with model_ae_4. Comp.time=0.24264
encoded with model_ae_5. Comp.time=0.23970
encoded with model_ae_6. Comp.time=0.24314
encoded with model_ae_7. Comp.time=0.24486
encoded with model_ae_8. Comp.time=0.23966
encoded with model_ae_9. Comp.time=0.24009
encoded with model_ae_10. Comp.time=0.23975
encoded with model_ae_11. Comp.time=0.24006
encoded with model_ae_12. Comp.time=0.23938
encoded with model_ae_13. Comp.time=0.24465
encoded with model_ae_14. Comp.time=0.24483
encoded with model_ae_15. Comp.time=0.24414
encoded with model_ae_16. Comp.time=0.23968
encoded with model_ae_17. Comp.time=0.23913
encoded with model_ae_18. Comp.time=0.23983
encoded with model_ae_19. Comp.time=0.23939
encoded with model_ae_20. Comp.time=0.23964
encoded with model_ae_21. Comp.time=0.23965
encoded with model_ae_22. Comp.time=0.2394

### Investigate how size of last encoding layer affects results: training models

Here we use only one repeat of 5-fold CV

[Back to contents](#Contents)

In [None]:
data_dirname = '../data/'
model_dirname = '../models/autoencoder/last_layer/'
model_filename_prefix = 'model_ae_'

filename_dataset = 'dataset.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

# Base architecture; the last encoding layer is varied over 1..sizes[-1].
sizes = [400, 100, 25]
sizes_ll = range(1, sizes[-1]+1)
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 2
# Only the first n_splits CV index sets are used.
n_splits = 5

# Load the samples, unfold each one into a row and scale rows to unit Euclidean norm.
dataset = np.load(data_dirname + filename_dataset)
T = dataset['data']
labels = dataset['label']
T = reshape(T, [len(T), -1])
row_norms = np.linalg.norm(T, axis=1, keepdims=True)
T /= row_norms

# Train/test index sets for the cross-validation splits.
cv_split = np.load(data_dirname + filename_cv)
train_indices = cv_split['train_indices']
test_indices = cv_split['test_indices']

ae.investigateLastLayerTrain(
    T, train_indices[:n_splits], test_indices[:n_splits], sizes, sizes_ll,
    model_dirname, nEpoch, batch_size, num_workers, model_filename_prefix
)


(1) Errors on training set (1708 samples): 
min=8.584e-02 / mean=1.455e-01 / median=1.397e-01 / max=4.386e-01
(1) Errors on validation set (154 samples): 
min=9.600e-02 / mean=2.503e-01 / median=2.243e-01 / max=8.443e-01
(2) Errors on training set (1708 samples): 
min=4.957e-02 / mean=9.409e-02 / median=9.081e-02 / max=2.073e-01
(2) Errors on validation set (154 samples): 
min=5.442e-02 / mean=2.427e-01 / median=2.100e-01 / max=7.229e-01
(1) Errors on training set (1751 samples): 
min=8.259e-02 / mean=1.439e-01 / median=1.392e-01 / max=4.110e-01
(1) Errors on validation set (145 samples): 
min=1.063e-01 / mean=2.231e-01 / median=1.967e-01 / max=7.348e-01
(2) Errors on training set (1751 samples): 
min=4.329e-02 / mean=9.175e-02 / median=8.890e-02 / max=2.614e-01
(2) Errors on validation set (145 samples): 
min=6.902e-02 / mean=2.036e-01 / median=1.663e-01 / max=5.968e-01
(1) Errors on training set (1797 samples): 
min=8.300e-02 / mean=1.413e-01 / median=1.361e-01 / max=3.367e-01
(1) Er

## Varying last layer size: approximation error

In this cell the data are encoded and then decoded with the previously trained models. Summary statistics of the resulting approximation error (relative residual in the $l_1$/$l_2$ norm) are printed for every model.

In [6]:
data_dirname = '../data/'
model_dirname = '../models/autoencoder/last_layer/'
model_filename_hard_prefix = 'll='
model_filename_base = 'model_ae_'

results_filename = 'relres_ae_ll'

filename_dataset = 'dataset.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

sizes = [400, 100, 25]
sizes_ll = range(1, sizes[-1]+1)
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 2
n_splits = 5

# Load the samples, unfold each one into a row and scale rows to unit Euclidean norm.
dataset = np.load(data_dirname + filename_dataset)
T = dataset['data']
labels = dataset['label']
T = reshape(T, [len(T), -1])
row_norms = np.linalg.norm(T, axis=1, keepdims=True)
T /= row_norms
N = T.shape[1]

# Train/test index sets for the cross-validation splits.
cv_split = np.load(data_dirname + filename_cv)
train_indices = cv_split['train_indices']
test_indices = cv_split['test_indices']

# optimizer parameters (needed to initialize AE class before parameters loading)
learning_rate = 0.0025
betas = (0.9, 0.999)
eps = 1e-5


def optimizer(params):
    """Build the Adam optimizer for ``params`` with the settings defined above."""
    return torch.optim.Adam(params, lr=learning_rate, betas=betas, eps=eps)


# One ReLU followed by the same Sigmoid instance repeated len(sizes) times.
nls = [nn.ReLU()] + [nn.Sigmoid()] * len(sizes)

# Stored models: names starting with 'll=' that contain 'model_ae_', ordered by
# stringSplitByNumbers. Models are picked by position below.
# NOTE(review): this presumably relies on exactly n_splits files per last-layer size.
model_fname_list = sorted(
    (
        fname for fname in os.listdir(model_dirname)
        if fname.startswith(model_filename_hard_prefix) and model_filename_base in fname
    ),
    key=stringSplitByNumbers
)

relres_train = []
relres_test = []
for i_sz, last_size in enumerate(sizes_ll):
    sizes_local = [N] + sizes[:-1] + [last_size]
    autoencoder = ae.AutoEncoder(sizes_local, nls, optimizer=optimizer, loss=nn.SmoothL1Loss)
    fold_train = []
    fold_test = []
    for i_cv in range(n_splits):
        model_fname = model_fname_list[i_sz * n_splits + i_cv]
        print('%s %s' % (model_fname, sizes_local))
        autoencoder.load_state_dict(torch.load(model_dirname + model_fname))
        # Median of the second value returned by getStats, first on the
        # training part and then on the validation part of this fold.
        _, train_stats = ae.getStats(autoencoder, T[train_indices[i_cv]])
        fold_train.append(np.median(train_stats))
        _, test_stats = ae.getStats(autoencoder, T[test_indices[i_cv]])
        fold_test.append(np.median(test_stats))
    # One number per last-layer size: the median over the folds.
    relres_train.append(np.median(fold_train))
    relres_test.append(np.median(fold_test))
np.savez_compressed(
    model_dirname + results_filename,
    relres_train=relres_train, relres_test=relres_test
)

ll=1_model_ae_0 [1600, 400, 100, 1]
min=1.956e-01 / mean=4.959e-01 / median=4.828e-01 / max=1.012e+00
min=1.945e-01 / mean=5.290e-01 / median=4.840e-01 / max=1.124e+00
ll=1_model_ae_1 [1600, 400, 100, 1]
min=1.779e-01 / mean=4.931e-01 / median=4.729e-01 / max=1.029e+00
min=1.969e-01 / mean=4.885e-01 / median=4.604e-01 / max=1.048e+00
ll=1_model_ae_2 [1600, 400, 100, 1]
min=1.753e-01 / mean=4.867e-01 / median=4.686e-01 / max=1.081e+00
min=2.070e-01 / mean=5.321e-01 / median=4.819e-01 / max=1.214e+00
ll=1_model_ae_3 [1600, 400, 100, 1]
min=1.429e-01 / mean=4.804e-01 / median=4.620e-01 / max=1.162e+00
min=1.778e-01 / mean=5.350e-01 / median=5.059e-01 / max=1.117e+00
ll=1_model_ae_4 [1600, 400, 100, 1]
min=1.765e-01 / mean=4.887e-01 / median=4.689e-01 / max=1.046e+00
min=1.894e-01 / mean=5.182e-01 / median=4.969e-01 / max=1.205e+00
ll=2_model_ae_0 [1600, 400, 100, 2]
min=1.121e-01 / mean=3.802e-01 / median=3.629e-01 / max=8.718e-01
min=1.326e-01 / mean=5.024e-01 / median=4.598e-01 / max=1.

### Investigate how size of last encoding layer affects results: encode data

Here we use only one repeat of 5-fold CV

[Back to contents](#Contents)

In [14]:
data_dirname = '../data/'
model_prefix = 'll='
model_dirname = '../models/autoencoder/last_layer/'
model_filename_prefix = 'model_ae_'
filename_dataset = 'dataset.npz'
save_filename_postfix = 'autoencoded_ll_' + filename_dataset

sizes = [400, 100, 25]
num_workers = 2

df = np.load(data_dirname + filename_dataset)

# Encode the dataset once per last-layer size 1..sizes[-1]; both the model
# prefix and the output file name carry the 'll=<size>_' tag.
for last_size in range(1, sizes[-1] + 1):
    sizes_ll = sizes[:-1] + [last_size]
    size_tag = '%s%d_' % (model_prefix, last_size)
    model_full_prefix = size_tag + model_filename_prefix
    save_filename = size_tag + save_filename_postfix
    ae.encodeDataset(
        df, sizes_ll, model_dirname, model_full_prefix,
        data_dirname + save_filename, num_workers, return_result=0
    )
    print(model_dirname + model_full_prefix)

encoded with ll=1_model_ae_0. Comp.time=0.23442
encoded with ll=1_model_ae_1. Comp.time=0.23787
encoded with ll=1_model_ae_2. Comp.time=0.23444
encoded with ll=1_model_ae_3. Comp.time=0.23111
encoded with ll=1_model_ae_4. Comp.time=0.23116
../models/autoencoder/last_layer/ll=1_model_ae_
encoded with ll=2_model_ae_0. Comp.time=0.23084
encoded with ll=2_model_ae_1. Comp.time=0.23157
encoded with ll=2_model_ae_2. Comp.time=0.23165
encoded with ll=2_model_ae_3. Comp.time=0.23075
encoded with ll=2_model_ae_4. Comp.time=0.23135
../models/autoencoder/last_layer/ll=2_model_ae_
encoded with ll=3_model_ae_0. Comp.time=0.23144
encoded with ll=3_model_ae_1. Comp.time=0.23334
encoded with ll=3_model_ae_2. Comp.time=0.24248
encoded with ll=3_model_ae_3. Comp.time=0.23241
encoded with ll=3_model_ae_4. Comp.time=0.23321
../models/autoencoder/last_layer/ll=3_model_ae_
encoded with ll=4_model_ae_0. Comp.time=0.23172
encoded with ll=4_model_ae_1. Comp.time=0.23176
encoded with ll=4_model_ae_2. Comp.time=

### Investigate how size of last encoding layer affects results: predict with logistic regression

Here we use only one repeat of 5-fold CV

[Back to contents](#Contents)

In [15]:
# Logistic regression on data encoded with every last-layer size (1..ll_max).
# For each size and each of the first n_splits CV folds the cell records
# [fit, predict-train, predict-test] timings and [train, test] accuracy / weighted F1,
# then saves everything to dirname_results + filename_results.
data_dirname = '../data/'
model_prefix = 'll='
model_dirname = '../models/autoencoder/last_layer/'
model_filename_prefix = 'model_ae_'
filename_dataset_base = 'autoencoded_ll_dataset.npz'

ll_max = 25
n_splits = 5

dirname_results = '../results/'
filename_results = 'll_autoencoder+LR'

#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

tms = []
accuracies = []
f1s = []

for last_size in range(1, ll_max + 1):
    print('last layer size=%d' % last_size)
    filename_data = model_prefix + str(last_size) + '_' + filename_dataset_base
    df = np.load(data_dirname + filename_data)
    X, y = df['data'], df['label']
    y = reshape(y, [-1, 1])
    # scikit-learn estimators expect 1-D labels; passing the (n, 1) column to
    # fit() triggers the column_or_1d DataConversionWarning.
    y_flat = np.ravel(y)

    tms_l = []
    accuracies_l = []
    f1s_l = []
    for k in range(n_splits):
        train_index = train_indices[k]
        test_index = test_indices[k]
        y_train = y_flat[train_index]
        y_test = y_flat[test_index]

        classifier = LogisticRegression(
            penalty='l1', dual=False, tol=0.0001, C=1000.0, fit_intercept=True,
            intercept_scaling=1, class_weight=None, random_state=None,
            solver='saga', max_iter=1000, multi_class='multinomial', verbose=0,
            warm_start=False, n_jobs=1
        )

        # NOTE: time.clock exists only on Python 2 / Python < 3.8, which this notebook targets.
        tic = time.clock()
        classifier.fit(X[k][train_index], y_train)
        toc = time.clock()
        tms_loc = [toc - tic]

        tic = time.clock()
        predict_train = classifier.predict(X[k][train_index])
        toc = time.clock()
        tms_loc.append(toc - tic)

        tic = time.clock()
        predict_test = classifier.predict(X[k][test_index])
        toc = time.clock()
        tms_loc.append(toc - tic)

        # Each entry is [train score, test score].
        accuracies_l.append([
            accuracy_score(y_train, predict_train),
            accuracy_score(y_test, predict_test),
        ])
        f1s_l.append([
            f1_score(y_train, predict_train, average='weighted'),
            f1_score(y_test, predict_test, average='weighted'),
        ])
        tms_l.append(tms_loc)

    tms.append(tms_l)
    accuracies.append(accuracies_l)
    f1s.append(f1s_l)

np.savez_compressed(
    dirname_results+filename_results, tms=tms,
    acc=accuracies, f1=f1s
)
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# Median over the CV folds (axis 1) for every last-layer size.
print("accuracies")
print(np.median(accuracies, axis=1))
print("f1 measure")
print(np.median(f1s, axis=1))

last layer size=1


  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)


last layer size=2
last layer size=3
last layer size=4




last layer size=5
last layer size=6
last layer size=7
last layer size=8
last layer size=9
last layer size=10
last layer size=11
last layer size=12
last layer size=13
last layer size=14
last layer size=15
last layer size=16
last layer size=17
last layer size=18
last layer size=19
last layer size=20
last layer size=21
last layer size=22
last layer size=23
last layer size=24
last layer size=25
accuracies
[[0.12107367 0.12781955]
 [0.1264637  0.12337662]
 [0.17247287 0.15037594]
 [0.25352113 0.14953271]
 [0.37706667 0.24137931]
 [0.52738654 0.34579439]
 [0.6672     0.47663551]
 [0.73656755 0.47787611]
 [0.83894917 0.59398496]
 [0.86293547 0.6137931 ]
 [0.91096272 0.69172932]
 [0.93318104 0.66896552]
 [0.95784543 0.68224299]
 [0.96630497 0.73793103]
 [0.97161937 0.75172414]
 [0.98330551 0.73451327]
 [0.98720089 0.77876106]
 [0.98942682 0.7699115 ]
 [0.99443517 0.80530973]
 [0.99648712 0.81308411]
 [0.99648712 0.82068966]
 [0.99824356 0.81168831]
 [0.99733333 0.85981308]
 [0.99843505 0.84112

In [24]:
# Reload the saved last-layer logistic-regression scores and print their
# medians over the CV folds (axis 1) for every last-layer size.
dirname_results = '../results/'
filename_results = 'll_autoencoder+LR.npz'
df = np.load(dirname_results + filename_results)
accuracies = np.array(df['acc'])
f1s = np.array(df['f1'])
for title, scores in (("accuracies", accuracies), ("f1 measure", f1s)):
    print(title)
    print(np.median(scores, axis=1))

accuracies
[[0.12107367 0.12781955]
 [0.1264637  0.12337662]
 [0.17247287 0.15037594]
 [0.25352113 0.14953271]
 [0.37706667 0.24137931]
 [0.52738654 0.34579439]
 [0.6672     0.47663551]
 [0.73656755 0.47787611]
 [0.83894917 0.59398496]
 [0.86293547 0.6137931 ]
 [0.91096272 0.69172932]
 [0.93318104 0.66896552]
 [0.95784543 0.68224299]
 [0.96630497 0.73793103]
 [0.97161937 0.75172414]
 [0.98330551 0.73451327]
 [0.98720089 0.77876106]
 [0.98942682 0.7699115 ]
 [0.99443517 0.80530973]
 [0.99648712 0.81308411]
 [0.99648712 0.82068966]
 [0.99824356 0.81168831]
 [0.99733333 0.85981308]
 [0.99843505 0.8411215 ]
 [0.99824356 0.84137931]]
f1 measure
[[0.02788155 0.03126521]
 [0.0402193  0.0362217 ]
 [0.08511826 0.06173296]
 [0.19812087 0.08686081]
 [0.33370047 0.20412151]
 [0.5060822  0.30047471]
 [0.65639345 0.44307966]
 [0.73144748 0.45301972]
 [0.83671805 0.56881358]
 [0.86108373 0.58891698]
 [0.91020162 0.67017544]
 [0.92950254 0.65865185]
 [0.9572594  0.67040498]
 [0.96622075 0.72686118]
 [

### Logistic regression classifier with encoded data

[Back to contents](#Contents)

In [16]:
# Logistic regression on the autoencoded data, one model per CV split.
# For every split the cell records [fit, predict-train, predict-test] timings,
# [train, test, test2] accuracy / weighted F1, the test confusion matrix and the
# predicted class probabilities with the true label appended as the last column.
data_dirname = '../data/'
dirname_results = '../results/'
filename_results = 'autoencoder+LR'
data_filename = 'autoencoded_dataset.npz'
data_test2_filename = 'autoencoded_test2.npz'

#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'
df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']
test_indices = test_indices.tolist()
train_indices = train_indices.tolist()

df = np.load(data_dirname+data_test2_filename)
X_test2, y_test2 = df['data'], df['label']
y_test2 = reshape(y_test2, [-1, 1])

df = np.load(data_dirname+data_filename)
X, y = df['data'], df['label']
y = reshape(y, [-1, 1])
# scikit-learn estimators expect 1-D labels; passing the (n, 1) column to
# fit() triggers the column_or_1d DataConversionWarning.
y_flat = np.ravel(y)
colnames = ['identity'] + ['V%d' % (i) for i in range(X.shape[-1])]

tms = []
predict_train_all = []
predict_test_all = []
predict_test2_all = []

confusion_matrices = []
accuracies = []
f1s = []

# correct label in the end
predicted_probas_test = []
predicted_probas_test2 = []
n_folds = len(train_indices)
for k in range(n_folds):
    print("CV %d / %d" % (k+1, n_folds))
    train_index = train_indices[k]
    test_index = test_indices[k]
    y_train = y_flat[train_index]
    y_test = y_flat[test_index]

    classifier = LogisticRegression(
        penalty='l1', dual=False, tol=0.0001, C=1000.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=None,
        solver='saga', max_iter=1000, multi_class='multinomial', verbose=0,
        warm_start=False, n_jobs=1
    )

    # NOTE: time.clock exists only on Python 2 / Python < 3.8, which this notebook targets.
    tic = time.clock()
    classifier.fit(X[k][train_index], y_train)
    toc = time.clock()
    tms_loc = [toc - tic]

    tic = time.clock()
    predict_train = classifier.predict(X[k][train_index])
    toc = time.clock()
    tms_loc.append(toc - tic)

    tic = time.clock()
    predict_test = classifier.predict(X[k][test_index])
    toc = time.clock()
    tms_loc.append(toc - tic)

    predict_test2 = classifier.predict(X_test2[k])

    # Each entry is [train score, test score, test2 score].
    acc_loc = [
        accuracy_score(y_train, predict_train),
        accuracy_score(y_test, predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y_train, predict_train, average='weighted'),
        f1_score(y_test, predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]
    confusion_matrices.append(confusion_matrix(y_test, predict_test))

    # Class probabilities with the true label appended as the last column.
    predicted_probas_test.append(
        np.hstack([classifier.predict_proba(X[k][test_index]), reshape(y_test, [-1, 1])])
    )
    predicted_probas_test2.append(
        np.hstack([classifier.predict_proba(X_test2[k]), y_test2])
    )

    accuracies.append(acc_loc)
    f1s.append(f1_loc)
    tms.append(tms_loc)
    predict_train_all.append( predict_train )
    predict_test_all.append( predict_test )
    predict_test2_all.append( predict_test2 )
    # The results file is rewritten after every fold with everything collected so far.
    np.savez_compressed(
        dirname_results+filename_results, tms=tms, predict_train=predict_train_all,
        predict_test=predict_test_all, predict_test2=predict_test2_all, test_indices=test_indices,
        train_indices=train_indices, y_test2=y_test2.T, y=y, confusion_matrices=confusion_matrices,
        acc=accuracies, f1=f1s, predicted_probas_test=predicted_probas_test,
        predicted_probas_test2=predicted_probas_test2
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# Median over the CV splits (axis 0) of the [train, test, test2] scores.
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))

CV 1 / 25


  'recall', 'true', average, warn_for)


CV 2 / 25
CV 3 / 25
CV 4 / 25
CV 5 / 25
CV 6 / 25
CV 7 / 25
CV 8 / 25
CV 9 / 25
CV 10 / 25
CV 11 / 25
CV 12 / 25
CV 13 / 25
CV 14 / 25
CV 15 / 25
CV 16 / 25
CV 17 / 25
CV 18 / 25
CV 19 / 25
CV 20 / 25
CV 21 / 25
CV 22 / 25
CV 23 / 25
CV 24 / 25
CV 25 / 25
accuracies
[0.99839056 0.83185841 0.72727273]
f1 measure
[0.99839192 0.819094   0.78400072]


In [26]:
# Reload the saved logistic-regression scores and print their medians over the
# CV splits (axis 0).
dirname_results = '../results/'
filename_results = 'autoencoder+LR.npz'

df = np.load(dirname_results + filename_results)
accuracies = np.array(df['acc'])
f1s = np.array(df['f1'])
for title, scores in (("accuracies", accuracies), ("f1 measure", f1s)):
    print(title)
    print(np.median(scores, axis=0))

accuracies
[0.99839056 0.83185841 0.72727273]
f1 measure
[0.99839192 0.819094   0.78400072]


### Gaussian Naive Bayes classifier with encoded data

[Back to contents](#Contents)

In [17]:
# Gaussian Naive Bayes on the autoencoded data, one model per CV split.
# For every split the cell records [fit, predict-train, predict-test] timings,
# [train, test, test2] accuracy / weighted F1, the test confusion matrix and the
# predicted class probabilities with the true label appended as the last column.
data_dirname = '../data/'
dirname_results = '../results/'
filename_results = 'autoencoder+NB'
data_filename = 'autoencoded_dataset.npz'
data_test2_filename = 'autoencoded_test2.npz'

#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'
df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+data_test2_filename)
X_test2, y_test2 = df['data'], df['label']
y_test2 = reshape(y_test2, [-1, 1])

df = np.load(data_dirname+data_filename)
X, y = df['data'], df['label']
y = reshape(y, [-1, 1])
# scikit-learn estimators expect 1-D labels; passing the (n, 1) column to
# fit() triggers the column_or_1d DataConversionWarning.
y_flat = np.ravel(y)
colnames = ['identity'] + ['V%d' % (i) for i in range(X.shape[-1])]

tms = []
predict_train_all = []
predict_test_all = []
predict_test2_all = []

# correct label in the end
predicted_probas_test = []
predicted_probas_test2 = []

confusion_matrices = []
accuracies = []
f1s = []

for k in range(len(train_indices)):
    train_index = train_indices[k]
    test_index = test_indices[k]
    y_train = y_flat[train_index]
    y_test = y_flat[test_index]

    classifier = GaussianNB()

    # NOTE: time.clock exists only on Python 2 / Python < 3.8, which this notebook targets.
    tic = time.clock()
    classifier.fit(X[k][train_index], y_train)
    toc = time.clock()
    tms_loc = [toc - tic]

    tic = time.clock()
    predict_train = classifier.predict(X[k][train_index])
    toc = time.clock()
    tms_loc.append(toc - tic)

    tic = time.clock()
    predict_test = classifier.predict(X[k][test_index])
    toc = time.clock()
    tms_loc.append(toc - tic)

    predict_test2 = classifier.predict(X_test2[k])

    # Each entry is [train score, test score, test2 score].
    acc_loc = [
        accuracy_score(y_train, predict_train),
        accuracy_score(y_test, predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y_train, predict_train, average='weighted'),
        f1_score(y_test, predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]
    confusion_matrices.append(confusion_matrix(y_test, predict_test))

    # Class probabilities with the true label appended as the last column.
    predicted_probas_test.append(
        np.hstack([classifier.predict_proba(X[k][test_index]), reshape(y_test, [-1, 1])])
    )
    predicted_probas_test2.append(
        np.hstack([classifier.predict_proba(X_test2[k]), y_test2])
    )

    accuracies.append(acc_loc)
    f1s.append(f1_loc)
    tms.append(tms_loc)
    predict_train_all.append( predict_train )
    predict_test_all.append( predict_test )
    predict_test2_all.append( predict_test2 )
    # The results file is rewritten after every fold with everything collected so far.
    np.savez_compressed(
        dirname_results+filename_results, tms=tms, predict_train=predict_train_all,
        predict_test=predict_test_all, predict_test2=predict_test2_all, test_indices=test_indices,
        train_indices=train_indices, y_test2=y_test2.T, y=y, confusion_matrices=confusion_matrices,
        acc=accuracies, f1=f1s, predicted_probas_test=predicted_probas_test,
        predicted_probas_test2=predicted_probas_test2
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# Median over the CV splits (axis 0) of the [train, test, test2] scores.
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))

accuracies
[0.90111176 0.6728972  0.72727273]
f1 measure
[0.90479389 0.66350767 0.80925325]


In [27]:
# Reload the saved Gaussian Naive Bayes scores and print their medians over the
# CV splits (axis 0).
dirname_results = '../results/'
filename_results = 'autoencoder+NB.npz'

df = np.load(dirname_results + filename_results)
accuracies = np.array(df['acc'])
f1s = np.array(df['f1'])
for title, scores in (("accuracies", accuracies), ("f1 measure", f1s)):
    print(title)
    print(np.median(scores, axis=0))

accuracies
[0.90111176 0.6728972  0.72727273]
f1 measure
[0.90479389 0.66350767 0.80925325]


### Hybrid Bayesian classifier with bnlearn

To reproduce this part of the research, the user should additionally install an R kernel (please see the [Chemfin notebook](../Chemfin.ipynb)) and the bnlearn package.

[Back to contents](#Contents)

In [None]:
# Imports for the R-backed (rpy2 / bnlearn) part of the notebook.
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri
from rpy2.robjects.packages import importr
import numpy as np
import pandas as pd
import time

import sys
# Make the project modules in ../src/ importable.
sys.path.append('../src/')
from computational_utils import reshape

# Switch on rpy2's numpy and pandas converters
# (presumably so arrays / DataFrames can be passed straight to R functions — confirm against the rpy2 docs).
rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

To install bnlearn package, run code in the following cell. Otherwise please skip it.

In [None]:
# Install the R package 'bnlearn' through R's `utils` package (needed only once);
# the trailing semicolon suppresses the notebook's echo of the return value.
utils = importr('utils')
utils.install_packages('bnlearn');

Learn structure, fit training set and predict labels for training, validation and test2 parts.

In [None]:
# Cross-validated hybrid Bayesian network (bnlearn) on the autoencoded data:
# for every fold learn a structure with mmhc, fit it by MLE, predict the
# 'identity' node for the train / validation / test2 parts and store timings,
# predictions and scores.
bnlearn = importr('bnlearn')

data_dirname = '../data/'
dirname_results = '../results/'
filename_results = 'autoencoder+HBN'
data_filename = 'autoencoded_dataset.npz'
data_test2_filename = 'autoencoded_test2.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'

# cross-validation folds
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# encoded test2 part; labels become a column so they can be hstack-ed
df = np.load(data_dirname + data_test2_filename)
X_test2 = df['data']
y_test2 = reshape(df['label'], [-1, 1])

# encoded main dataset (first axis of X is indexed by the fold number below)
df = np.load(data_dirname + data_filename)
X = df['data']
y = reshape(df['label'], [-1, 1])
colnames = ['identity'] + ['V%d' % (i) for i in range(X.shape[-1])]

tms = []
predict_train_all = []
predict_test_all = []
predict_test2_all = []

confusion_matrices = []
accuracies = []
f1s = []


def _bn_frame(label_column, features):
    """Build the DataFrame handed to bnlearn.

    The first column, 'identity', holds the labels converted to strings;
    the remaining columns hold the features under the names in colnames.
    """
    frame = pd.DataFrame(np.hstack([label_column, features]), columns=colnames)
    frame['identity'] = frame['identity'].apply(str)
    return frame


def _predict_identity(model, frame):
    """Predict node 'identity' with 'bayes-lw' from all columns but the first."""
    return bnlearn.predict_bn_fit(
        model, node='identity', data=frame.iloc[:, 1:], method='bayes-lw'
    )


for k in range(len(train_indices)):
    train_index = train_indices[k]
    test_index = test_indices[k]

    dataset_train = _bn_frame(y[train_index], X[k, train_index, :])
    dataset_test = _bn_frame(y[test_index], X[k, test_index, :])
    dataset_test2 = _bn_frame(y_test2, X_test2[k, :, :])

    # lookup table: sorted unique training label strings -> integer labels
    dmap = np.unique(dataset_train.iloc[:, 0].values)
    dmap = np.array([float(label_str) for label_str in dmap]).astype('i')

    # first timing entry: structure learning only
    tic = time.clock()
    hBN_structure = bnlearn.mmhc(dataset_train)
    toc = time.clock()
    tms_loc = [toc - tic]
    fitted_bn = bnlearn.bn_fit(hBN_structure, dataset_train, method='mle')

    # second timing entry: prediction on the train and validation parts
    tic = time.clock()
    predict_train = _predict_identity(fitted_bn, dataset_train)
    predict_test = _predict_identity(fitted_bn, dataset_test)
    toc = time.clock()
    tms_loc.append(toc - tic)
    predict_test2 = _predict_identity(fitted_bn, dataset_test2)

    # returned values are shifted to 0-based positions and looked up in dmap
    # (presumably they are 1-based R factor codes -- confirm)
    predict_train = dmap[np.array(predict_train) - 1]
    predict_test = dmap[np.array(predict_test) - 1]
    predict_test2 = dmap[np.array(predict_test2) - 1]

    # scores in the order train / validation / test2
    acc_loc = [
        accuracy_score(y[train_index], predict_train),
        accuracy_score(y[test_index], predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y[train_index], predict_train, average='weighted'),
        f1_score(y[test_index], predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]

    tms.append(tms_loc)
    predict_train_all.append(predict_train)
    predict_test_all.append(predict_test)
    predict_test2_all.append(predict_test2)
    accuracies.append(acc_loc)
    f1s.append(f1_loc)

    # rewrite the results file after every fold
    np.savez_compressed(
        dirname_results + filename_results,
        tms=tms,
        predict_train=predict_train_all,
        predict_test=predict_test_all,
        predict_test2=predict_test2_all,
        test_indices=test_indices,
        train_indices=train_indices,
        y_test2=y_test2.T,
        y=y
    )

accuracies = np.array(accuracies)
f1s = np.array(f1s)
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))

In [30]:
# Recompute accuracy and weighted F1 of the hybrid Bayesian network from the
# predictions stored in the results file, and print the medians over folds
# (columns: train / validation / test2).
dirname_results = '../results/'
filename_results = 'autoencoder+HBN.npz'

df = np.load(
    dirname_results+filename_results
)

# Take the folds from the results file itself instead of relying on whatever
# train_indices / test_indices are left in the notebook namespace, so the
# scores always refer to the splits the stored predictions were made on.
train_indices = df['train_indices']
test_indices = df['test_indices']
y = df['y']
y_test2 = df['y_test2'].flatten()
# Each df[...] lookup reads the array from the archive again, so fetch the
# per-fold predictions once rather than once per loop iteration.
stored_predict_train = df['predict_train']
stored_predict_test = df['predict_test']
stored_predict_test2 = df['predict_test2']

accuracies = []
f1s = []
for k in range(len(train_indices)):
    train_index = train_indices[k]
    test_index = test_indices[k]

    predict_train = stored_predict_train[k]
    predict_test = stored_predict_test[k]
    predict_test2 = stored_predict_test2[k]

    acc_loc = [
        accuracy_score(y[train_index], predict_train),
        accuracy_score(y[test_index], predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y[train_index], predict_train, average='weighted'),
        f1_score(y[test_index], predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]

    accuracies.append(acc_loc)
    f1s.append(f1_loc)

accuracies = np.array(accuracies)
f1s = np.array(f1s)
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))

accuracies
[0.92175274 0.68831169 0.63636364]
f1 measure
[0.92322622 0.6599836  0.72402597]


### Autoencoder learned on whole database



In [27]:
# Train the autoencoder on the whole dataset level by level: level k builds
# an autoencoder over sizes[:k+2] and, for k > 0, starts from the layer
# parameters copied from the previous (shallower) level.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
model_filename_prefix = 'full_data_model_ae_'

filename_dataset = 'dataset.npz'

# widths of the successive encoding layers; the input width N is prepended below
sizes = [400, 100, 25]
# number of epochs passed to fit() for each level
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 14

df = np.load(data_dirname+filename_dataset)
T, labels = df['data'], df['label']
# unfold into matrix
T = reshape(T, [T.shape[0], -1])
# scale every row (sample) to unit Euclidean norm, in place
T /= np.linalg.norm(T, axis=1, keepdims=1)

learning_rate = 0.0025
betas = (0.9, 0.999)
eps = 1e-5
# factory that builds an Adam optimizer for a given set of parameters
optimizer = lambda params: torch.optim.Adam(params, lr=learning_rate, betas=betas, eps=eps) # Sparse
N = T.shape[1]
# AE structure + instance
# one ReLU followed by the same Sigmoid instance repeated; after N is
# prepended to sizes, nls and sizes have the same length
nls = [nn.ReLU()]+[nn.Sigmoid()]*len(sizes)
sizes = [N] + sizes
dataset = torch.from_numpy(T.copy())
data = ae.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
# per level: column 0 = time.time() difference, column 1 = time.clock() difference
times = np.zeros([len(sizes)-1, 2])
# per level: first element returned by ae.getStats
stats_integral = np.zeros([len(sizes)-1])
# per level and per sample: second element returned by ae.getStats
sample_stats = np.zeros([len(sizes)-1, T.shape[0]])
# per level: values returned by fit() (presumably one loss per epoch -- confirm);
# rows stay zero-padded up to max(nEpoch)
loss_values = np.zeros([len(sizes)-1, max(nEpoch)])
for k in xrange(len(sizes)-1):
    autoencoder = ae.AutoEncoder(sizes=sizes[:k+2], nls=nls[:k+2], optimizer=optimizer, loss=nn.SmoothL1Loss)
    if k > 0:
        # seed the layers shared with the previous level: encoder entries keep
        # their position 2*i, decoder entries saved from position 2*i are put
        # at position 2*(i+1) of the deeper model
        # NOTE(review): presumably even positions hold the layers with
        # weight/bias and odd positions the nonlinearities -- confirm in ae.AutoEncoder
        for i in xrange(k):
            autoencoder.Encoder[2*i].weight = weights[2*i]
            autoencoder.Encoder[2*i].bias = biases[2*i]
            autoencoder.Decoder[2*(i+1)].weight = weights[2*i+1]
            autoencoder.Decoder[2*(i+1)].bias = biases[2*i+1]
    # time the training of this level with both time.time() and time.clock()
    t1_time = time.time(); t1_clock = time.clock()
    loss_values_level = autoencoder.fit(data, nEpoch[k], verbose=0)
    t2_clock = time.clock(); t2_time = time.time()
    times[k, 0] = t2_time-t1_time
    times[k, 1] = t2_clock-t1_clock
    # every level writes to the same path, so the file ends up holding the
    # state of the last trained level
    torch.save(autoencoder.state_dict(), model_dirname+model_filename_prefix)
    print '(%d) Errors on input set (%d samples): ' % (k+1, T.shape[0])
    stats_level = ae.getStats(autoencoder, T)

    stats_integral[k] = stats_level[0]
    sample_stats[k] = stats_level[1]
    loss_values[k, :nEpoch[k]] = loss_values_level
    # rewrite the statistics file after every level
    np.savez_compressed(
        model_dirname+'full_ae_stats', stats_integral=stats_integral,
        loss_values=loss_values,
        sample_stats=sample_stats, nEpoch=nEpoch, times=times
    )
    # unless this was the last level, keep deep copies of the trained
    # parameters to seed the next level; the lists interleave entries as
    # [Encoder[0], Decoder[0], Encoder[2], Decoder[2], ...]
    if k < (len(sizes)-2):
        weights, biases = [], []
        for i in xrange(k+1):
            weights.append( copy.deepcopy(autoencoder.Encoder[2*i].weight) )
            weights.append( copy.deepcopy(autoencoder.Decoder[2*i].weight) )
            biases.append( copy.deepcopy(autoencoder.Encoder[2*i].bias) )
            biases.append( copy.deepcopy(autoencoder.Decoder[2*i].bias) )

(1) Errors on input set (2263 samples): 
min=5.964e-02 / mean=1.099e-01 / median=1.061e-01 / max=2.278e-01
(2) Errors on input set (2263 samples): 
min=4.737e-02 / mean=8.862e-02 / median=8.615e-02 / max=1.804e-01
(3) Errors on input set (2263 samples): 
min=5.128e-02 / mean=1.058e-01 / median=1.013e-01 / max=5.210e-01


In [32]:
# Encode the whole dataset with the autoencoder stored by the full-data
# training cell and save the codes together with labels and encoding time.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
model_fname = 'full_data_model_ae_'
save_filename = 'full_dataset_autoencoded'
filename_dataset = 'dataset.npz'

# same layer widths as in the training cell (needed to rebuild the model
# before its state dict is loaded)
sizes = [400, 100, 25]
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 14

# optimizer parameters
learning_rate = 0.0025
betas = (0.9, 0.999)
eps = 1e-5


def optimizer(params):
    """Build an Adam optimizer for params with the settings above."""
    return torch.optim.Adam(params, lr=learning_rate, betas=betas, eps=eps)


df = np.load(data_dirname + filename_dataset)
T = df['data']
y = df['label']
# unfold into matrix
T = reshape(T, [T.shape[0], -1])
# scale every row (sample) to unit Euclidean norm, in place
T /= np.linalg.norm(T, axis=1, keepdims=True)

N = T.shape[1]
sizes = [N] + sizes

# ReLU first, then one shared Sigmoid instance for each remaining entry
nls = [nn.ReLU()]
nls.extend([nn.Sigmoid()] * (len(sizes) - 1))

autoencoder = ae.AutoEncoder(sizes, nls, optimizer=optimizer, loss=nn.SmoothL1Loss)
saved_state = torch.load(model_dirname + model_fname)
autoencoder.load_state_dict(saved_state)

X = Variable(torch.from_numpy(T.copy()))
# time only the encoding call
tic = time.clock()
Y = autoencoder.encode(X)
toc = time.clock()
resultsTime = toc - tic
X = Y.data.numpy()

print("encoded with %s. Comp.time=%.5f s" % (model_fname, resultsTime))
np.savez_compressed(data_dirname + save_filename, data=X, label=y, time=resultsTime)

encoded with full_data_model_ae_. Comp.time=0.47915 s
