# Tucker decomposition

## Contents

- [1. Imports](#Imports)
- [2. Train models with fixed rank for CV](#Train-models-with-fixed-rank-for-CV)
- [3. Predict with learned models](#Predict-with-learned-models)
- [4. Rank selection: build models](#Rank-selection:-build-models)
- [5. Rank selection: inspect accuracy](#Rank-selection:-inspect-accuracy)

[Back to Chemfin](../Chemfin.ipynb)

### Imports

The first code cell contains almost all of the necessary imports.

Required packages: [numpy](http://www.numpy.org/), [scikit-learn](http://scikit-learn.org/), [dcor](https://pypi.python.org/pypi/dcor).

[Back to contents](#Contents)

In [None]:
import sys
sys.path.append('../src/')

from computational_utils import reshape

import numpy as np
import copy
import time

from sklearn.metrics import accuracy_score

from tucker import estimateMzPolarityFactors
from tucker import TuckerClassifierLCMS

### Train models with fixed rank for CV

It is assumed that the CV indices have already been precomputed with [Initialize](./1_Initialize.ipynb).

[Back to contents](#Contents)

In [None]:
# Fit one TuckerClassifierLCMS per cross-validation fold at a fixed rank and
# store, for every fold, the fitted parameters and the running list of
# training times.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_prefix = 'model_td'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

maxitnum = 1000

rank_mz = 25
rank_pol = 2

r = [rank_mz, rank_pol]

# time.clock was removed in Python 3.8. Prefer time.process_time where it
# exists and fall back to time.clock on Python 2, so the cell runs on both.
# NOTE(review): process_time is CPU time, which matches time.clock on Unix;
# on Windows time.clock measured wall-clock time instead.
timer = getattr(time, 'process_time', None) or time.clock

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
# Size of the second axis of X (presumably the number of m/z channels).
Nmz = X.shape[1]

# Common prefix of every file written by this cell.
rank_tag = 'rankMZ='+str(rank_mz)

tms = []
for ind in range(len(train_indices)):
    fold_train = train_indices[ind]
    clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
    tic = timer()
    clf.fit(X[fold_train], y[fold_train], verbose=0)
    toc = timer()
    tms.append(toc-tic)
    clf.saveParameters(model_dirname+rank_tag+'_'+model_filename_prefix+'_'+str(ind))
    # The timing file is rewritten after every fold, so an interrupted run
    # still leaves the timings collected so far on disk.
    np.savez_compressed(
        model_dirname+rank_tag+'_times_train_'+model_filename_prefix,
        tms=tms
    )
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("ind=%d rank=%d time=%.2fs" % (ind, rank_mz, tms[-1]))

### Predict with learned models
The principal angle is used as the measure of distance between two column spaces.
Requires additional imports of sklearn metrics.

[Back to contents](#Contents)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the per-fold models trained at a fixed rank and score each of them on
# its training split, its test split and the separate 'test2' dataset.
# Accuracy, weighted F1, the test-split confusion matrix and prediction times
# are collected per fold and written to the results directory.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

rank_mz = 25
rank_pol = 2

r = [rank_mz, rank_pol]

# time.clock was removed in Python 3.8. Prefer time.process_time where it
# exists and fall back to time.clock on Python 2, so the cell runs on both.
# NOTE(review): process_time is CPU time, which matches time.clock on Unix;
# on Windows time.clock measured wall-clock time instead.
timer = getattr(time, 'process_time', None) or time.clock

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
# Size of the second axis of X (presumably the number of m/z channels).
Nmz = X.shape[1]

df = np.load(data_dirname+filename_dataset2)
X_test2, y_test2 = df['data'], df['label']
# Labels of the second test set are turned into a single column.
y_test2 = reshape(y_test2, [-1, 1])

tms = []
accuracies = []
f1s = []
confusion_matrices = []
for ind in range(len(train_indices)):
    fold_train = train_indices[ind]
    fold_test = test_indices[ind]

    clf = TuckerClassifierLCMS(Nmz, r)
    model_filename = 'rankMZ='+str(rank_mz)+'_'+model_filename_base+str(ind)+'.npz'
    print(model_filename)
    clf.loadParameters(model_dirname+model_filename)

    # Training split: only the predict call itself is timed.
    tic = timer()
    y_train_pred = clf.predict(X[fold_train])
    toc = timer()
    tms_loc = [toc-tic]
    acc_loc = [accuracy_score(y[fold_train], y_train_pred)]
    f1_loc = [f1_score(y[fold_train], y_train_pred, average='weighted')]

    # Test split of the same fold.
    tic = timer()
    y_test_pred = clf.predict(X[fold_test])
    toc = timer()
    tms_loc.append(toc-tic)
    conf_mat = confusion_matrix(y[fold_test], y_test_pred)
    acc_loc.append(accuracy_score(y[fold_test], y_test_pred))
    f1_loc.append(f1_score(y[fold_test], y_test_pred, average='weighted'))

    # Independent second test set; its prediction time is not recorded.
    y_test2_pred = clf.predict(X_test2)
    acc_loc.append(accuracy_score(y_test2, y_test2_pred))
    f1_loc.append(f1_score(y_test2, y_test2_pred, average='weighted'))

    tms.append(tms_loc)
    confusion_matrices.append(conf_mat)
    accuracies.append(acc_loc)
    f1s.append(f1_loc)
    # Results are rewritten after every fold so partial runs are preserved.
    # NOTE(review): there is no '_' between the rank and model_filename_base,
    # so the file is named e.g. 'rankMZ=25model_td_+CCA.npz'. The name is kept
    # unchanged so that existing readers of this file keep working.
    np.savez_compressed(
        results_dirname+'rankMZ='+str(rank_mz)+model_filename_base+'+CCA',
        tms=tms, confusion_matrices=confusion_matrices, accuracies=accuracies,
        f1s=f1s, classes=clf.classes
    )
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("ind=%d rank=%d time=%.2fs/%.2fs acc=%.4f/%.4f/%.4f f1=%.4f/%.4f/%.4f" % (
        ind, rank_mz, tms[-1][0], tms[-1][1], acc_loc[0], acc_loc[1], acc_loc[2],
        f1_loc[0], f1_loc[1], f1_loc[2]
    ))

### Rank selection: build models

Grid search followed by inspection of accuracy (a single repetition of K-fold).

[Back to contents](#Contents)

In [None]:
# Rank selection, step 1: for each of the first n_splits CV folds, fit one
# TuckerClassifierLCMS for every m/z rank in 1..rank_mz_max and store the
# fitted parameters and training times.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/rank_selection/'
model_filename_prefix = 'model_td'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

maxitnum = 1000
n_splits = 5

rank_mz_max = 25
ranks_mz = np.arange(rank_mz_max) + 1
rank_pol = 2

# time.clock was removed in Python 3.8. Prefer time.process_time where it
# exists and fall back to time.clock on Python 2, so the cell runs on both.
# NOTE(review): process_time is CPU time, which matches time.clock on Unix;
# on Windows time.clock measured wall-clock time instead.
timer = getattr(time, 'process_time', None) or time.clock

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
# Size of the second axis of X (presumably the number of m/z channels).
Nmz = X.shape[1]

# `tms` is a single flat list shared by all folds and ranks, so every per-rank
# timing file contains the timings of all fits done so far, not only those of
# its own rank. `tms_rank_mz` and `tms_fold` record, entry by entry, which
# rank and fold each timing belongs to; `tms` itself is stored unchanged so
# existing readers are unaffected.
tms = []
tms_rank_mz = []
tms_fold = []
for ind in range(n_splits):
    fold_train = train_indices[ind]
    for rank_mz in ranks_mz:
        r = [rank_mz, rank_pol]
        clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
        tic = timer()
        clf.fit(X[fold_train], y[fold_train], verbose=0)
        toc = timer()
        tms.append(toc-tic)
        tms_rank_mz.append(rank_mz)
        tms_fold.append(ind)

        rank_tag = 'rankMZ='+str(rank_mz)
        clf.saveParameters(model_dirname+rank_tag+'_'+model_filename_prefix+'_'+str(ind))
        # Rewritten after every fit so an interrupted run keeps its timings.
        np.savez_compressed(
            model_dirname+rank_tag+'_times_train_'+model_filename_prefix,
            tms=tms, tms_rank_mz=tms_rank_mz, tms_fold=tms_fold
        )
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("ind=%d rank=%d time=%.2fs" % (ind, rank_mz, tms[-1]))

### Rank selection: inspect accuracy

[Back to contents](#Contents)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import dcor

In [None]:
# Rank selection, step 2: load every (fold, rank) model written by the
# rank-selection training cell, predict on the train and test split of the
# fold, and report the per-rank median (over folds) of accuracy, weighted F1
# and prediction time.
metrics = ['cca', 'dcor']

metric_name = metrics[0]

# `dcor` is only referenced when it is actually selected, so the 'cca' path
# does not need the package.
if metric_name == 'cca':
    # None is forwarded to predict(); presumably it then uses its built-in
    # default measure -- confirm in tucker.py.
    metric = None
elif metric_name == 'dcor':
    metric = dcor.distance_correlation
else:
    raise NotImplementedError


data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/rank_selection/'
model_filename_base = 'model_td_'
# Used in the results filename below. It used to be picked up from whichever
# training cell had run earlier in the session; defining it here (with the
# same value) lets this cell run on its own without a NameError.
model_filename_prefix = 'model_td'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

n_splits = 5
rank_mz_max = 25
ranks_mz = np.arange(rank_mz_max) + 1
rank_pol = 2

# time.clock was removed in Python 3.8. Prefer time.process_time where it
# exists and fall back to time.clock on Python 2, so the cell runs on both.
# NOTE(review): process_time is CPU time, which matches time.clock on Unix;
# on Windows time.clock measured wall-clock time instead.
timer = getattr(time, 'process_time', None) or time.clock

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
# Size of the second axis of X (presumably the number of m/z channels).
Nmz = X.shape[1]

# Nested results: outer index is the fold, inner index is the rank, and the
# innermost pair is [train, test].
tms = []
accuracies = []
f1s = []
for ind in range(n_splits):
    fold_train = train_indices[ind]
    fold_test = test_indices[ind]

    tms_l = []
    accuracies_l = []
    f1s_l = []
    for rank_mz in ranks_mz:
        r = [rank_mz, rank_pol]

        clf = TuckerClassifierLCMS(Nmz, r)
        model_filename = 'rankMZ='+str(rank_mz)+'_'+model_filename_base+str(ind)+'.npz'
        print(model_filename)
        clf.loadParameters(model_dirname+model_filename)

        # Training split: only the predict call itself is timed.
        tic = timer()
        y_train_pred = clf.predict(X[fold_train], metric=metric)
        toc = timer()
        tms_loc = [toc-tic]
        acc_loc = [accuracy_score(y[fold_train], y_train_pred)]
        f1_loc = [f1_score(y[fold_train], y_train_pred, average='weighted')]

        # Test split of the same fold.
        tic = timer()
        y_test_pred = clf.predict(X[fold_test], metric=metric)
        toc = timer()
        tms_loc.append(toc-tic)
        acc_loc.append(accuracy_score(y[fold_test], y_test_pred))
        f1_loc.append(f1_score(y[fold_test], y_test_pred, average='weighted'))

        tms_l.append(tms_loc)
        accuracies_l.append(acc_loc)
        f1s_l.append(f1_loc)
    accuracies.append(accuracies_l)
    f1s.append(f1s_l)
    tms.append(tms_l)
    # Results are rewritten after every completed fold so partial runs are
    # preserved.
    np.savez_compressed(
        results_dirname+'rank_selection_metric='+metric_name+'_'+model_filename_prefix+'+'+metric_name,
        tms=tms, accuracies=accuracies, f1s=f1s
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
tms = np.array(tms)

# Medians are taken over the fold axis (axis 0). Prediction time is the sum of
# the train and test timings (last axis) before taking the median.
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Accuracies:")
print(np.median(accuracies, axis=0))
print("F1-measures:")
print(np.median(f1s, axis=0))
print("Prediction time (for all samples):")
print(np.median(np.sum(tms, axis=-1), axis=0))