# Autoencoder

## Contents
- [1. Imports](#Imports)
- [2. Build models](#Build-models)
- [3. Approximation error](#Approximation-error)
- [4. Encode data with computed autoencoders](#Encode-data-with-computed-autoencoders)
- [5. Investigate how size of last encoding layer affects results: training models](#Investigate-how-size-of-last-encoding-layer-affects-results:-training-models)
- [6. Investigate how size of last encoding layer affects results: encode data](#Investigate-how-size-of-last-encoding-layer-affects-results:-encode-data)
- [7. Investigate how size of last encoding layer affects results: predict with logistic regression](#Investigate-how-size-of-last-encoding-layer-affects-results:-predict-with-logistic-regression)
- [8. Logistic regression classifier with encoded data](#Logistic-regression-classifier-with-encoded-data)
- [9. Gaussian Naive Bayes classifier with encoded data](#Gaussian-Naive-Bayes-classifier-with-encoded-data)
- [10. Hybrid Bayesian classifier with bnlearn](#Hybrid-Bayesian-classifier-with-bnlearn)




[Back to Chemfin](../Chemfin.ipynb)


### Imports
The first code cell contains all the necessary imports.

Requires [NumPy](http://www.numpy.org/), [scikit-learn](http://scikit-learn.org/), [PyTorch](http://pytorch.org/), and [rpy2](https://rpy2.readthedocs.io).

[Back to contents](#Contents)

In [None]:
import sys
sys.path.append('../src/')

import numpy as np
import os
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader

import autoencoder as ae

# Seed PyTorch's random number generator with a fixed value.
random_state = 150
torch.manual_seed(random_state);


from computational_utils import reshape

from io_work import stringSplitByNumbers

### Build models

The next cell contains a script that builds autoencoder models for the CV indices supplied in data/cv_indices.npz.

The parameters to control are:

- sizes: list of integers specifying the output size of each encoding layer
- batch_size: number of samples used to compute a new update at each epoch
- nEpoch: number of epochs for each layer
- num_workers: number of parallel worker processes to use

[Back to contents](#Contents)

In [None]:
# Build autoencoder models for the cross-validation splits stored in
# data/cv_indices.npz and write them under model_dirname.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
model_filename_prefix = 'model_ae_'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

# Training configuration (described in the parameter list above).
sizes = [400, 100, 25]
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 2

# np.load returns an NpzFile that keeps the archive open until it is closed;
# the `with` blocks release the file handle once the arrays have been read.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

with np.load(data_dirname+filename_dataset) as df:
    T, labels = df['data'], df['label']
# unfold into matrix: one row per sample
T = reshape(T, [T.shape[0], -1])
# normalize among samples: divide every row by its l2 norm (in place)
T /= np.linalg.norm(T, axis=1, keepdims=True)

ae.buildAutoencoderModels(
    T, train_indices, test_indices, sizes, model_dirname, nEpoch,
    batch_size, num_workers, model_filename_prefix
)

### Approximation error

This code encodes and decodes the data with the previously trained models. The resulting approximation error (relative residual in the $l_2$ norm) is printed sample-wise.

[Back to contents](#Contents)

In [None]:
# Report the relative residual of the trained autoencoders on the dataset.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
model_filename_prefix = 'model_ae_'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

sizes = [400, 100, 25]

# Close each NpzFile as soon as its arrays are read so that no file handle
# stays open for the rest of the session.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

with np.load(data_dirname+filename_dataset) as df:
    T, labels = df['data'], df['label']
# unfold into matrix: one row per sample
T = reshape(T, [T.shape[0], -1])
# normalize among samples: divide every row by its l2 norm (in place)
T /= np.linalg.norm(T, axis=1, keepdims=True)

ae.checkRelRes(T, train_indices, test_indices, sizes, model_dirname, model_filename_prefix)

### Encode data with computed autoencoders

This cell encodes the data with the models trained in the previous steps.

[Back to contents](#Contents)

In [None]:
# Encode the main dataset and the test2 dataset with the trained models and
# store the results next to the inputs with an 'autoencoded_' prefix.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/'
model_filename_prefix = 'model_ae_'
filename_dataset = 'dataset.npz'
save_filename = 'autoencoded_' + filename_dataset
filename_dataset2 = 'test2.npz'
save_filename2 = 'autoencoded_' + filename_dataset2

sizes = [400, 100, 25]
num_workers = 2

# Each archive is opened in a `with` block so that its file handle is closed
# once the corresponding encodeDataset call has finished.
for source_name, target_name in [
    (filename_dataset, save_filename),
    (filename_dataset2, save_filename2),
]:
    with np.load(data_dirname+source_name) as df:
        ae.encodeDataset(df, sizes, model_dirname, model_filename_prefix,
                         data_dirname+target_name, num_workers, return_result=0)

### Investigate how size of last encoding layer affects results: training models

Here we use only one repeat of 5-fold CV

[Back to contents](#Contents)

In [None]:
# Train autoencoders for every last-layer size from 1 to sizes[-1], using
# only the first n_splits CV splits.
data_dirname = '../data/'
model_dirname = '../models/autoencoder/last_layer/'
model_filename_prefix = 'model_ae_'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

sizes = [400, 100, 25]
# candidate sizes of the last encoding layer
sizes_ll = range(1, sizes[-1]+1)
nEpoch = [1000, 1000, 1000]
batch_size = 200
num_workers = 2
n_splits = 5

# Close each NpzFile as soon as its arrays are read so that no file handle
# stays open while the models are being trained.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

with np.load(data_dirname+filename_dataset) as df:
    T, labels = df['data'], df['label']
# unfold into matrix: one row per sample
T = reshape(T, [T.shape[0], -1])
# normalize among samples: divide every row by its l2 norm (in place)
T /= np.linalg.norm(T, axis=1, keepdims=True)

ae.investigateLastLayerTrain(
    T, train_indices[:n_splits], test_indices[:n_splits], sizes, sizes_ll,
    model_dirname, nEpoch, batch_size, num_workers, model_filename_prefix
)


### Investigate how size of last encoding layer affects results: encode data

Here we use only one repeat of 5-fold CV

[Back to contents](#Contents)

In [None]:
# Encode the dataset once per last-layer size, using the models whose file
# names start with 'll=<size>_'.
data_dirname = '../data/'
model_prefix = 'll='
model_dirname = '../models/autoencoder/last_layer/'
model_filename_prefix = 'model_ae_'
filename_dataset = 'dataset.npz'
save_filename_postfix = 'autoencoded_' + filename_dataset

sizes = [400, 100, 25]
num_workers = 2

# The archive is kept open for the whole loop and closed afterwards, instead
# of leaving the NpzFile handle open for the rest of the session.
with np.load(data_dirname+filename_dataset) as df:
    for k in range(sizes[-1]):
        # same layer sizes, with the last one replaced by k+1
        sizes_ll = sizes[:-1] + [k+1]
        size_tag = model_prefix + str(k+1) + '_'
        model_full_prefix = size_tag + model_filename_prefix
        save_filename = size_tag + save_filename_postfix
        ae.encodeDataset(
            df, sizes_ll, model_dirname, model_full_prefix,
            data_dirname+save_filename, num_workers, return_result=0
        )
        print(model_dirname+model_full_prefix)

### Investigate how size of last encoding layer affects results: predict with logistic regression

Here we use only one repeat of 5-fold CV

[Back to contents](#Contents)

In [None]:
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import sys
sys.path.append('../src/')

from computational_utils import reshape

In [None]:
# For every last-layer size, fit a logistic regression on each of the first
# n_splits CV splits of the correspondingly encoded data and collect
# timings, accuracy and weighted F1.
data_dirname = '../data/'
model_prefix = 'll='
model_dirname = '../models/autoencoder/last_layer/'
model_filename_prefix = 'model_ae_'
filename_dataset_base = 'autoencoded_dataset.npz'

ll_max = 25
n_splits = 5

dirname_results = '../results/'
filename_results = 'll_autoencoder+LR'

filename_cv = 'cv_indices.npz'
# Close the NpzFile once the index arrays have been read.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

# One entry per last-layer size; each entry holds one record per split.
tms = []
accuracies = []
f1s = []

for ll_size in range(1, ll_max+1):
    print('last layer size=%d' % ll_size)
    filename_data = model_prefix+str(ll_size)+'_'+filename_dataset_base
    # A new archive is opened on every iteration, so close it right away
    # rather than leaking one file handle per last-layer size.
    with np.load(data_dirname+filename_data) as df:
        X, y = df['data'], df['label']
    y = reshape(y, [-1, 1])

    tms_l = []
    accuracies_l = []
    f1s_l = []
    for k in range(n_splits):
        train_index = train_indices[k]
        test_index = test_indices[k]

        classifier = LogisticRegression(
            penalty='l1', dual=False, tol=0.0001, C=1000.0, fit_intercept=True,
            intercept_scaling=1, class_weight=None, random_state=None,
            solver='saga', max_iter=1000, multi_class='multinomial', verbose=0,
            warm_start=False, n_jobs=1
        )

        # fit; y is flattened because scikit-learn expects 1-D targets and
        # warns (DataConversionWarning) when handed a column vector
        tic = time.clock()
        classifier.fit(X[k][train_index], np.ravel(y[train_index]))
        toc = time.clock()
        # tms_loc = [fit, predict on train, predict on test]
        tms_loc = [toc-tic]

        tic = time.clock()
        predict_train = classifier.predict(X[k][train_index])
        toc = time.clock()
        tms_loc.append(toc-tic)

        tic = time.clock()
        predict_test = classifier.predict(X[k][test_index])
        toc = time.clock()
        tms_loc.append(toc-tic)

        # acc_loc / f1_loc = [train score, test score]
        acc_loc = [
            accuracy_score(y[train_index], predict_train),
            accuracy_score(y[test_index], predict_test),
        ]
        f1_loc = [
            f1_score(y[train_index], predict_train, average='weighted'),
            f1_score(y[test_index], predict_test, average='weighted'),
        ]

        accuracies_l.append(acc_loc)
        f1s_l.append(f1_loc)
        tms_l.append(tms_loc)

    tms.append(tms_l)
    accuracies.append(accuracies_l)
    f1s.append(f1s_l)

np.savez_compressed(
    dirname_results+filename_results, tms=tms,
    acc=accuracies, f1=f1s
)
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# median over the CV splits (axis 1) for every last-layer size
print("accuracies")
print(np.median(accuracies, axis=1))
print("f1 measure")
print(np.median(f1s, axis=1))

### Logistic regression classifier with encoded data

[Back to contents](#Contents)

In [None]:
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Logistic regression on the autoencoded data: one model per CV split,
# scored on the training part, the held-out part and the test2 dataset.
data_dirname = '../data/'
dirname_results = '../results/'
filename_results = 'autoencoder+LR'
data_filename = 'autoencoded_dataset.npz'
data_test2_filename = 'autoencoded_test2.npz'

filename_cv = 'cv_indices.npz'
# Each NpzFile is closed as soon as its arrays have been read.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

with np.load(data_dirname+data_test2_filename) as df:
    X_test2, y_test2 = df['data'], df['label']
y_test2 = reshape(y_test2, [-1, 1])

with np.load(data_dirname+data_filename) as df:
    X, y = df['data'], df['label']
y = reshape(y, [-1, 1])

# One entry per CV split.
tms = []
predict_train_all = []
predict_test_all = []
predict_test2_all = []

confusion_matrices = []
accuracies = []
f1s = []

for k in range(len(train_indices)):
    train_index = train_indices[k]
    test_index = test_indices[k]

    classifier = LogisticRegression(
        penalty='l1', dual=False, tol=0.0001, C=1000.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=None,
        solver='saga', max_iter=1000, multi_class='multinomial', verbose=0,
        warm_start=False, n_jobs=1
    )

    # fit; y is flattened because scikit-learn expects 1-D targets and
    # warns (DataConversionWarning) when handed a column vector
    tic = time.clock()
    classifier.fit(X[k][train_index], np.ravel(y[train_index]))
    toc = time.clock()
    # tms_loc = [fit, predict on train, predict on test]
    tms_loc = [toc-tic]

    tic = time.clock()
    predict_train = classifier.predict(X[k][train_index])
    toc = time.clock()
    tms_loc.append(toc-tic)

    tic = time.clock()
    predict_test = classifier.predict(X[k][test_index])
    toc = time.clock()
    tms_loc.append(toc-tic)

    # prediction on test2 is not timed
    predict_test2 = classifier.predict(X_test2[k])

    # acc_loc / f1_loc = [train score, test score, test2 score]
    acc_loc = [
        accuracy_score(y[train_index], predict_train),
        accuracy_score(y[test_index], predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y[train_index], predict_train, average='weighted'),
        f1_score(y[test_index], predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]
    confusion_matrices.append(confusion_matrix(y[test_index], predict_test))

    accuracies.append(acc_loc)
    f1s.append(f1_loc)
    tms.append(tms_loc)
    predict_train_all.append( predict_train )
    predict_test_all.append( predict_test )
    predict_test2_all.append( predict_test2 )
    # the results file is rewritten after every split with everything
    # collected so far
    np.savez_compressed(
        dirname_results+filename_results, tms=tms, predict_train=predict_train_all,
        predict_test=predict_test_all, predict_test2=predict_test2_all, test_indices=test_indices,
        train_indices=train_indices, y_test2=y_test2.T, y=y, confusion_matrices=confusion_matrices,
        acc=accuracies, f1=f1s
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# median over the CV splits (axis 0) of the train / test / test2 scores
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))

### Gaussian Naive Bayes classifier with encoded data

[Back to contents](#Contents)

In [None]:
import numpy as np
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Gaussian naive Bayes on the autoencoded data: one model per CV split,
# scored on the training part, the held-out part and the test2 dataset.
data_dirname = '../data/'
dirname_results = '../results/'
filename_results = 'autoencoder+NB'
data_filename = 'autoencoded_dataset.npz'
data_test2_filename = 'autoencoded_test2.npz'

filename_cv = 'cv_indices.npz'
# Each NpzFile is closed as soon as its arrays have been read.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

with np.load(data_dirname+data_test2_filename) as df:
    X_test2, y_test2 = df['data'], df['label']
y_test2 = reshape(y_test2, [-1, 1])

with np.load(data_dirname+data_filename) as df:
    X, y = df['data'], df['label']
y = reshape(y, [-1, 1])

# One entry per CV split.
tms = []
predict_train_all = []
predict_test_all = []
predict_test2_all = []

confusion_matrices = []
accuracies = []
f1s = []

for k in range(len(train_indices)):
    train_index = train_indices[k]
    test_index = test_indices[k]

    classifier = GaussianNB()

    # fit; y is flattened because scikit-learn expects 1-D targets and
    # warns (DataConversionWarning) when handed a column vector
    tic = time.clock()
    classifier.fit(X[k][train_index], np.ravel(y[train_index]))
    toc = time.clock()
    # tms_loc = [fit, predict on train, predict on test]
    tms_loc = [toc-tic]

    tic = time.clock()
    predict_train = classifier.predict(X[k][train_index])
    toc = time.clock()
    tms_loc.append(toc-tic)

    tic = time.clock()
    predict_test = classifier.predict(X[k][test_index])
    toc = time.clock()
    tms_loc.append(toc-tic)

    # prediction on test2 is not timed
    predict_test2 = classifier.predict(X_test2[k])

    # acc_loc / f1_loc = [train score, test score, test2 score]
    acc_loc = [
        accuracy_score(y[train_index], predict_train),
        accuracy_score(y[test_index], predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y[train_index], predict_train, average='weighted'),
        f1_score(y[test_index], predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]
    confusion_matrices.append(confusion_matrix(y[test_index], predict_test))

    accuracies.append(acc_loc)
    f1s.append(f1_loc)
    tms.append(tms_loc)
    predict_train_all.append( predict_train )
    predict_test_all.append( predict_test )
    predict_test2_all.append( predict_test2 )
    # the results file is rewritten after every split with everything
    # collected so far
    np.savez_compressed(
        dirname_results+filename_results, tms=tms, predict_train=predict_train_all,
        predict_test=predict_test_all, predict_test2=predict_test2_all, test_indices=test_indices,
        train_indices=train_indices, y_test2=y_test2.T, y=y, confusion_matrices=confusion_matrices,
        acc=accuracies, f1=f1s
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# median over the CV splits (axis 0) of the train / test / test2 scores
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))

### Hybrid Bayesian classifier with bnlearn

To reproduce this part of the research, the user should additionally install the R kernel (please see the [Chemfin notebook](../Chemfin.ipynb)) and the bnlearn package.

[Back to contents](#Contents)

In [None]:
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri
from rpy2.robjects.packages import importr
import numpy as np
import pandas as pd
import time

import sys
sys.path.append('../src/')
from computational_utils import reshape

# Activate rpy2's numpy and pandas converters.
rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

To install the bnlearn package, run the code in the following cell. Otherwise, please skip it.

In [None]:
# Import R's 'utils' package through rpy2 and use it to install bnlearn.
utils = importr('utils')
utils.install_packages('bnlearn');

Learn the structure, fit the training set, and predict labels for the training, validation, and test2 parts.

In [None]:
# Hybrid Bayesian network (bnlearn via rpy2) on the autoencoded data: for
# every CV split learn the structure with mmhc, fit it by MLE and predict the
# 'identity' node for the training part, the held-out part and test2.
bnlearn = importr('bnlearn')

data_dirname = '../data/'
dirname_results = '../results/'
filename_results = 'autoencoder+HBN'
data_filename = 'autoencoded_dataset.npz'
data_test2_filename = 'autoencoded_test2.npz'

filename_cv = 'cv_indices.npz'
# Each NpzFile is closed as soon as its arrays have been read.
with np.load(data_dirname+filename_cv) as df:
    test_indices, train_indices = df['test_indices'], df['train_indices']

with np.load(data_dirname+data_test2_filename) as df:
    X_test2, y_test2 = df['data'], df['label']
y_test2 = reshape(y_test2, [-1, 1])

with np.load(data_dirname+data_filename) as df:
    X, y = df['data'], df['label']
y = reshape(y, [-1, 1])
# first column holds the label, the rest are the encoded features V0, V1, ...
colnames = ['identity'] + ['V%d' % (i) for i in range(X.shape[-1])]


def _to_bn_frame(label_column, features):
    """Stack labels and features into a DataFrame whose 'identity' column is str."""
    frame = pd.DataFrame(np.hstack([label_column, features]), columns=colnames)
    frame['identity'] = frame['identity'].apply(str)
    return frame


# One entry per CV split.
tms = []
predict_train_all = []
predict_test_all = []
predict_test2_all = []

accuracies = []
f1s = []

for k in range(len(train_indices)):
    train_index = train_indices[k]
    test_index = test_indices[k]
    dataset_train = _to_bn_frame(y[train_index], X[k, train_index, :])
    # sorted unique label strings of the training part
    dmap = np.unique(dataset_train.iloc[:, 0].values)
    dataset_test = _to_bn_frame(y[test_index], X[k, test_index, :])
    dataset_test2 = _to_bn_frame(y_test2, X_test2[k, :, :])

    # tms_loc = [structure learning, prediction on train + test together]
    tic = time.clock()
    hBN_structure = bnlearn.mmhc(dataset_train)
    toc = time.clock()
    tms_loc = [toc-tic]
    fitted_bn = bnlearn.bn_fit(hBN_structure, dataset_train, method='mle')

    tic = time.clock()
    predict_train = bnlearn.predict_bn_fit(
        fitted_bn, node='identity', data=dataset_train.iloc[:, 1:], method='bayes-lw'
    )
    predict_test = bnlearn.predict_bn_fit(
        fitted_bn, node='identity', data=dataset_test.iloc[:, 1:], method='bayes-lw'
    )
    toc = time.clock()
    tms_loc.append(toc-tic)

    # prediction on test2 is not timed
    predict_test2 = bnlearn.predict_bn_fit(
        fitted_bn, node='identity', data=dataset_test2.iloc[:, 1:], method='bayes-lw'
    )

    # NOTE(review): the dmap[...] lookups below assume the converted R result
    # is an integer array that indexes dmap directly -- confirm the index
    # base and level order against the installed rpy2 version.
    predict_train = dmap[np.array(predict_train)]
    predict_test = dmap[np.array(predict_test)]
    predict_test2 = dmap[np.array(predict_test2)]

    # acc_loc / f1_loc = [train score, test score, test2 score]
    acc_loc = [
        accuracy_score(y[train_index], predict_train),
        accuracy_score(y[test_index], predict_test),
        accuracy_score(y_test2, predict_test2),
    ]
    f1_loc = [
        f1_score(y[train_index], predict_train, average='weighted'),
        f1_score(y[test_index], predict_test, average='weighted'),
        f1_score(y_test2, predict_test2, average='weighted'),
    ]

    tms.append(tms_loc)
    predict_train_all.append( predict_train )
    predict_test_all.append( predict_test )
    predict_test2_all.append( predict_test2 )

    accuracies.append(acc_loc)
    f1s.append(f1_loc)

    # the results file is rewritten after every split; acc and f1 are stored
    # too, matching the archives written by the LR and NB cells
    np.savez_compressed(
        dirname_results+filename_results, tms=tms, predict_train=predict_train_all,
        predict_test=predict_test_all, predict_test2=predict_test2_all, test_indices=test_indices,
        train_indices=train_indices, y_test2=y_test2.T, y=y,
        acc=accuracies, f1=f1s
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
# median over the CV splits (axis 0) of the train / test / test2 scores
print("accuracies")
print(np.median(accuracies, axis=0))
print("f1 measure")
print(np.median(f1s, axis=0))