# Tucker decomposition

## Contents

- [1. Imports](#Imports)
- [2. Train models with fixed rank for CV](#Train-models-with-fixed-rank-for-CV)
- [3. Predict with learned models](#Predict-with-learned-models)
- [4. Rank selection: build models](#Rank-selection:-build-models)
- [5. Rank selection: inspect accuracy](#Rank-selection:-inspect-accuracy)

[Back to Chemfin](../Chemfin.ipynb)

### Imports

The first code cell contains almost all of the necessary imports.

Required packages: [numpy](http://www.numpy.org/), [scipy](https://www.scipy.org/), [pandas](https://pandas.pydata.org/), [scikit-learn](http://scikit-learn.org/), [dcor](https://pypi.python.org/pypi/dcor).

[Back to contents](#Contents)

In [1]:
%env OMP_NUM_THREADS=8
%env MKL_NUM_THREADS=8
import sys
sys.path.append('../src/')

from computational_utils import reshape

import numpy as np
import copy
import time

from sklearn.metrics import accuracy_score

from tucker import estimateMzPolarityFactors
from tucker import TuckerClassifierLCMS

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import dcor

import pandas as pd

env: OMP_NUM_THREADS=8
env: MKL_NUM_THREADS=8


### Train models with fixed rank for CV

It is assumed that the CV indices have already been precomputed with [Initialize](./1_Initialize.ipynb).

[Back to contents](#Contents)

In [None]:
# Fit one Tucker classifier per cross-validation fold at a fixed rank, storing
# the fitted parameters and the list of fit durations after every fold.

# Input/output locations.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_prefix = 'model_td'

filename_dataset = 'dataset.npz'
# Alternative split file: 'cv_indices.npz'.
filename_cv = 'physical_cv_indices_nc.npz'

# Iteration cap handed to the classifier and the rank pair r = [rank_mz, rank_pol].
maxitnum = 1000
rank_mz = 25
rank_pol = 2
r = [rank_mz, rank_pol]

# Per-fold index arrays of the cross-validation split.
df = np.load(data_dirname+filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# Full dataset; the size of the second axis of X is passed to the classifier.
df = np.load(data_dirname+filename_dataset)
X = df['data']
y = df['label']
Nmz = X.shape[1]

# Every output file of this cell starts with the same rank tag.
rank_tag = 'rankMZ='+str(rank_mz)
times_path = model_dirname+rank_tag+'_times_train_'+model_filename_prefix

tms = []
for ind, fold_train in enumerate(train_indices):
    clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)

    # Only the fit itself is timed.
    t_start = time.clock()
    clf.fit(X[fold_train], y[fold_train], verbose=0)
    t_stop = time.clock()
    tms.append(t_stop - t_start)

    clf.saveParameters(model_dirname+rank_tag+'_'+model_filename_prefix+'_'+str(ind))
    # The timing file is rewritten after each fold, so an interrupted run
    # still leaves the durations measured so far on disk.
    np.savez_compressed(times_path, tms=tms)
    print("ind=%d rank=%d time=%.2fs" % (ind, rank_mz, tms[-1]))

ind=0 rank=25 time=28321.34s
ind=1 rank=25 time=31873.51s
ind=2 rank=25 time=32788.16s


### Predict with learned models
The principal angle is used as the measure of distance between two column spaces.
Requires additional imports of sklearn metrics.

[Back to contents](#Contents)

In [None]:
# Load the fixed-rank model of every cross-validation fold and score it on the
# fold's training split, the fold's test split and the separate `test2` set.
# All accumulated results are written to one .npz file after each fold.

# Input/output locations.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
# Alternative split file: 'cv_indices.npz'.
filename_cv = 'physical_cv_indices_nc.npz'

rank_mz = 25
rank_pol = 2
r = [rank_mz, rank_pol]

df = np.load(data_dirname+filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

df = np.load(data_dirname+filename_dataset)
X = df['data']
y = df['label']
Nmz = X.shape[1]

# Second test set; its labels are reshaped into a single column.
df = np.load(data_dirname+filename_dataset2)
X_test2 = df['data']
y_test2 = reshape(df['label'], [-1, 1])

# One entry per fold in each of these lists.
tms = []
accuracies = []
f1s = []
confusion_matrices = []

predicted_pa_test = []
predicted_pa_test2 = []

rank_tag = 'rankMZ='+str(rank_mz)
results_path = results_dirname+rank_tag+model_filename_base+'+CCA'

for ind in range(len(train_indices)):
    fold_train = train_indices[ind]
    fold_test = test_indices[ind]

    clf = TuckerClassifierLCMS(Nmz, r)
    model_filename = rank_tag+'_'+model_filename_base+str(ind)+'.npz'
    print(model_filename)
    clf.loadParameters(model_dirname+model_filename)

    # --- training split (timed) ---
    t_start = time.clock()
    y_train_pred = clf.predict(X[fold_train])
    t_stop = time.clock()
    tms_loc = [t_stop - t_start]
    y_train_true = y[fold_train]
    acc_loc = [accuracy_score(y_train_true, y_train_pred)]
    f1_loc = [f1_score(y_train_true, y_train_pred, average='weighted')]

    # --- test split of this fold (timed) ---
    t_start = time.clock()
    y_test_pred, y_test_pred_explicit = clf.predict(X[fold_test], return_all=True)
    t_stop = time.clock()
    tms_loc.append(t_stop - t_start)
    y_test_true = y[fold_test]
    conf_mat = confusion_matrix(y_test_true, y_test_pred)
    acc_loc.append(accuracy_score(y_test_true, y_test_pred))
    f1_loc.append(f1_score(y_test_true, y_test_pred, average='weighted'))

    # --- separate test2 set (not timed) ---
    y_test2_pred, y_test2_pred_explicit = clf.predict(X_test2, return_all=True)
    acc_loc.append(accuracy_score(y_test2, y_test2_pred))
    f1_loc.append(f1_score(y_test2, y_test2_pred, average='weighted'))

    tms.append(tms_loc)
    confusion_matrices.append(conf_mat)
    accuracies.append(acc_loc)
    f1s.append(f1_loc)

    # Attach the true labels as a 'TRUE' column before storing the raw values
    # of the second object returned by predict(..., return_all=True).
    y_test_pred_explicit = y_test_pred_explicit.assign(TRUE=y_test_true)
    predicted_pa_test.append(y_test_pred_explicit.values)
    y_test2_pred_explicit = y_test2_pred_explicit.assign(TRUE=y_test2)
    predicted_pa_test2.append(y_test2_pred_explicit.values)

    # Rewritten after every fold so partial results survive an interruption.
    np.savez_compressed(
        results_path,
        tms=tms, confusion_matrices=confusion_matrices, accuracies=accuracies,
        f1s=f1s, classes=clf.classes, predicted_pa_test=predicted_pa_test,
        predicted_pa_test2=predicted_pa_test2
    )
    # Values are reported in the order train/test(/test2).
    report = (ind, rank_mz) + tuple(tms_loc) + tuple(acc_loc) + tuple(f1_loc)
    print("ind=%d rank=%d time=%.2fs/%.2fs acc=%.4f/%.4f/%.4f f1=%.4f/%.4f/%.4f" % report)

rankMZ=25_model_td_0.npz


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


ind=0 rank=25 time=875.21s/83.42s acc=0.9760/0.7208/0.8636 f1=0.9761/0.7135/0.9211
rankMZ=25_model_td_1.npz
ind=1 rank=25 time=916.62s/75.94s acc=0.9846/0.8000/0.8636 f1=0.9847/0.7959/0.9211
rankMZ=25_model_td_2.npz
ind=2 rank=25 time=937.53s/61.44s acc=0.9839/0.8195/0.8182 f1=0.9841/0.8093/0.8850
rankMZ=25_model_td_3.npz
ind=3 rank=25 time=947.11s/53.59s acc=0.9813/0.7965/0.7955 f1=0.9816/0.7877/0.8537
rankMZ=25_model_td_4.npz
ind=4 rank=25 time=1008.11s/49.56s acc=0.9802/0.7757/0.8409 f1=0.9801/0.7657/0.9053
rankMZ=25_model_td_5.npz
ind=5 rank=25 time=891.58s/85.68s acc=0.9789/0.7273/0.8182 f1=0.9791/0.7133/0.8955
rankMZ=25_model_td_6.npz
ind=6 rank=25 time=927.99s/82.75s acc=0.9806/0.7862/0.8636 f1=0.9808/0.7636/0.9211
rankMZ=25_model_td_7.npz
ind=7 rank=25 time=958.98s/70.42s acc=0.9809/0.7594/0.8864 f1=0.9808/0.7471/0.9337
rankMZ=25_model_td_8.npz
ind=8 rank=25 time=950.80s/59.40s acc=0.9744/0.8407/0.8409 f1=0.9747/0.8272/0.8714
rankMZ=25_model_td_9.npz
ind=9 rank=25 time=966.80s/

In [16]:
# Summarise the stored fixed-rank cross-validation results: median accuracy
# and F1 over folds, plus the median prediction time per test sample.
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_'
results_dirname = '../results/'

rank = 25

df = np.load( results_dirname+'rankMZ='+str(rank)+model_filename_base+'+CCA.npz' )

# Median over folds (axis 0). The remaining axis follows the order in which
# the prediction cell appended the scores: train, test, test2.
accuracies = np.median(df['accuracies'], axis=0)
f1s = np.median(df['f1s'], axis=0)
tms = df['tms']

# NOTE: `test_indices` is not loaded in this cell; it has to exist in the
# session already (the prediction cell loads it) and must contain one entry
# per row of `tms`.
# A list comprehension replaces map(): map() only yields a list on Python 2.
numSampPerFold = [float(len(x)) for x in test_indices]
# Column 1 holds the test-split prediction time; convert it to time per sample.
tms[:, 1] /= numSampPerFold

tms = np.median(tms, axis=0)

dataDict = {
    'train': [accuracies[0], f1s[0]],
    'test': [accuracies[1], f1s[1]],
    'test2': [accuracies[2], f1s[2]],
    'index': ['accuracy', 'F1']
}

print('One sample (from test) prediction time: %.2f s' % (tms[1]))

table = pd.DataFrame(dataDict)
# set_index() returns a new frame; the result must be kept, otherwise the
# table is still shown with the default integer index and an 'index' column.
table = table.set_index('index')
table


One sample (from test) prediction time: 0.51 s


Unnamed: 0,index,test,test2,train
0,accuracy,0.785047,0.840909,0.979911
1,F1,0.765732,0.905303,0.979879


One sample (from test) prediction time: 0.47 s


Unnamed: 0,index,test,test2,train
0,accuracy,0.77931,0.840909,0.97969
1,F1,0.76262,0.897998,0.97977


### Rank selection: build models

Grid search over the m/z rank, followed by inspection of accuracy (K-fold cross-validation repeated once).

[Back to contents](#Contents)

In [None]:
# Rank selection, step 1: for each of the first `n_splits` folds, fit one
# classifier for every m/z rank from 1 to `rank_mz_max` and store its parameters.

# Input/output locations.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/rank_selection/'
model_filename_prefix = 'model_td'

filename_dataset = 'dataset.npz'
# Alternative split file: 'cv_indices.npz'.
filename_cv = 'physical_cv_indices_nc.npz'

maxitnum = 1000
n_splits = 5

# Candidate ranks 1, 2, ..., rank_mz_max; the second rank stays fixed.
rank_mz_max = 25
ranks_mz = np.arange(rank_mz_max) + 1
rank_pol = 2

df = np.load(data_dirname+filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

df = np.load(data_dirname+filename_dataset)
X = df['data']
y = df['label']
Nmz = X.shape[1]

tms = []
for ind in range(n_splits):
    fold_train = train_indices[ind]
    for rank_mz in ranks_mz:
        r = [rank_mz, rank_pol]
        rank_tag = 'rankMZ='+str(rank_mz)

        clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
        # Only the fit itself is timed.
        t_start = time.clock()
        clf.fit(X[fold_train], y[fold_train], verbose=0)
        t_stop = time.clock()
        tms.append(t_stop - t_start)

        clf.saveParameters(model_dirname+rank_tag+'_'+model_filename_prefix+'_'+str(ind))
        # NOTE(review): `tms` is never reset, so each rank-tagged timing file
        # receives every duration measured so far (all ranks and folds), not
        # only those of its own rank. Confirm this is what readers expect.
        np.savez_compressed(
            model_dirname+rank_tag+'_times_train_'+model_filename_prefix,
            tms=tms
        )
        print("ind=%d rank=%d time=%.2fs" % (ind, rank_mz, tms[-1]))

ind=0 rank=1 time=70.01s
ind=0 rank=2 time=324.70s
ind=0 rank=3 time=542.91s
ind=0 rank=4 time=1201.56s
ind=0 rank=5 time=1712.20s
ind=0 rank=6 time=2929.71s
ind=0 rank=7 time=4452.68s
ind=0 rank=8 time=5132.80s
ind=0 rank=9 time=6621.87s
ind=0 rank=10 time=7289.60s
ind=0 rank=11 time=8276.01s
ind=0 rank=12 time=9645.99s
ind=0 rank=13 time=11372.01s
ind=0 rank=14 time=12437.62s
ind=0 rank=15 time=14299.24s
ind=0 rank=16 time=13842.07s
ind=0 rank=17 time=14949.49s
ind=0 rank=18 time=15207.60s
ind=0 rank=19 time=17285.15s
ind=0 rank=20 time=19357.69s
ind=0 rank=21 time=22528.68s
ind=0 rank=22 time=21637.98s
ind=0 rank=23 time=26229.13s
ind=0 rank=24 time=26979.81s
ind=0 rank=25 time=24437.93s
ind=1 rank=1 time=95.13s
ind=1 rank=2 time=612.36s
ind=1 rank=3 time=853.66s
ind=1 rank=4 time=1473.31s
ind=1 rank=5 time=2529.90s
ind=1 rank=6 time=4313.67s
ind=1 rank=7 time=5164.87s
ind=1 rank=8 time=6304.96s
ind=1 rank=9 time=9310.80s
ind=1 rank=10 time=11987.69s
ind=1 rank=11 time=14162.58s
ind

### Rank selection: inspect accuracy

[Back to contents](#Contents)

In [None]:
# Rank selection, step 2: load every (fold, rank) model trained in the
# rank-selection cell, score it on the fold's training and test splits with
# the chosen similarity metric, and report the medians over folds.
metrics = ['cca', 'dcor']

metric_name = metrics[1]

# 'cca' passes metric=None to predict(); 'dcor' passes the distance
# correlation function from the dcor package.
if metric_name == 'cca':
    metric = None
elif metric_name == 'dcor':
    metric = dcor.distance_correlation
else:
    raise NotImplementedError


data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/rank_selection/'
model_filename_base = 'model_td_'
# Used in the results file name below. Defined here explicitly: the cell
# previously relied on whatever value an earlier cell had left in the session
# (a NameError in a fresh kernel). 'model_td' is the prefix under which the
# rank-selection models loaded here were saved.
model_filename_prefix = 'model_td'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
#filename_cv = 'cv_indices.npz'
filename_cv = 'physical_cv_indices_nc.npz'


n_splits = 5
rank_mz_max = 25
ranks_mz = np.arange(rank_mz_max) + 1
rank_pol = 2

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Nested lists indexed [fold][rank][split], split being 0 = train, 1 = test.
tms = []
accuracies = []
f1s = []
for ind in range(n_splits):
    tms_l = []
    accuracies_l = []
    f1s_l = []
    for i in range(len(ranks_mz)):
        rank_mz = ranks_mz[i]
        r = [rank_mz, rank_pol]

        clf = TuckerClassifierLCMS(Nmz, r)
        model_filename = 'rankMZ='+str(rank_mz)+'_'+model_filename_base+str(ind)+'.npz'
        print(model_filename)
        clf.loadParameters(model_dirname+model_filename)

        # Training split.
        tic = time.clock()
        y_train_pred = clf.predict(X[train_indices[ind]], metric=metric)
        toc = time.clock()
        tms_loc = [toc-tic]
        acc_loc = [accuracy_score(y[train_indices[ind]], y_train_pred)]
        f1_loc = [f1_score(y[train_indices[ind]], y_train_pred, average='weighted')]

        # Test split.
        tic = time.clock()
        y_test_pred = clf.predict(X[test_indices[ind]], metric=metric)
        toc = time.clock()
        tms_loc.append(toc-tic)
        acc_loc.append( accuracy_score(y[test_indices[ind]], y_test_pred) )
        f1_loc.append( f1_score(y[test_indices[ind]], y_test_pred, average='weighted') )


        tms_l.append(tms_loc)
        accuracies_l.append(acc_loc)
        f1s_l.append(f1_loc)
    accuracies.append(accuracies_l)
    f1s.append(f1s_l)
    tms.append(tms_l)
    # Rewritten after every completed fold so partial results are kept.
    np.savez_compressed(
        results_dirname+'rank_selection_metric='+metric_name+'_'+model_filename_prefix+'+'+metric_name,
        tms=tms, accuracies=accuracies, f1s=f1s
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
tms = np.array(tms)

# Medians are taken over folds (axis 0); one row per rank, columns train/test.
print("Accuracies:")
print(np.median(accuracies, axis=0))
print("F1-measures:")
print(np.median(f1s, axis=0))
# Train and test prediction times are summed before taking the median.
print("Prediction time (for all samples):")
print(np.median(np.sum(tms, axis=-1), axis=0))

rankMZ=1_model_td_0.npz


  return ((np.issubdtype(x.dtype, float) and


rankMZ=2_model_td_0.npz
rankMZ=3_model_td_0.npz
rankMZ=4_model_td_0.npz
rankMZ=5_model_td_0.npz
rankMZ=6_model_td_0.npz
rankMZ=7_model_td_0.npz


In [12]:
# Re-print the rank-selection summary from the `accuracies`, `f1s` and `tms`
# arrays already present in the session. Medians are taken along axis 0.
print("Accuracies:")
median_accuracies = np.median(accuracies, axis=0)
print(median_accuracies)

print("F1-measures:")
median_f1s = np.median(f1s, axis=0)
print(median_f1s)

print("Prediction time (for all samples):")
# Sum along the last axis first, then take the median of those totals.
total_times = np.sum(tms, axis=-1)
print(np.median(total_times, axis=0))

Accuracies:
[[0.43733333 0.33834586]
 [0.69262295 0.48051948]
 [0.69118414 0.4953271 ]
 [0.72954925 0.55862069]
 [0.73653396 0.56493506]
 [0.73419204 0.57142857]
 [0.74528841 0.57943925]
 [0.74700171 0.57943925]
 [0.74126239 0.56074766]
 [0.74346132 0.55172414]
 [0.72189696 0.53793103]
 [0.72676683 0.54482759]
 [0.72529983 0.55172414]
 [0.72482436 0.55862069]
 [0.721202   0.56551724]
 [0.71897607 0.55140187]
 [0.720726   0.56390977]
 [0.71311475 0.56493506]
 [0.71194379 0.54135338]
 [0.71253333 0.54887218]
 [0.70895938 0.53271028]
 [0.71721311 0.53246753]
 [0.71146667 0.54545455]
 [0.70474015 0.53271028]
 [0.72248244 0.55172414]]
F1-measures:
[[0.42402708 0.32614866]
 [0.71799773 0.47925063]
 [0.73309307 0.51305219]
 [0.76358876 0.57541959]
 [0.7869755  0.58786333]
 [0.7848737  0.60293754]
 [0.79814278 0.59773805]
 [0.805665   0.59279973]
 [0.79885924 0.5878348 ]
 [0.80584592 0.60085429]
 [0.79246676 0.56926705]
 [0.79560758 0.57813857]
 [0.79607765 0.58239875]
 [0.79740654 0.59489796]

## Taking a look at components

In [None]:
# Fit a single fixed-rank classifier on the complete dataset (no
# cross-validation split) and store its parameters and the fit duration.

# Input/output locations.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_prefix = 'model_td_all'

filename_dataset = 'dataset.npz'

maxitnum = 1000

rank_mz = 25
rank_pol = 2
r = [rank_mz, rank_pol]

df = np.load(data_dirname+filename_dataset)
X = df['data']
y = df['label']
Nmz = X.shape[1]

# Both output files share the rank tag.
rank_tag = 'rankMZ='+str(rank_mz)

clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
# Only the fit itself is timed.
t_start = time.clock()
clf.fit(X, y, verbose=0)
t_stop = time.clock()
tms = t_stop - t_start

clf.saveParameters(model_dirname+rank_tag+'_'+model_filename_prefix)
# Here `tms` is a single duration, not a list.
np.savez_compressed(
    model_dirname+rank_tag+'_times_train_'+model_filename_prefix,
    tms=tms
)

print('Evaluation time: %.2f s' % (tms))

In [15]:
import scipy.optimize

# Small positive offset that keeps the logarithm in func() finite when the
# corresponding entry of A is exactly zero.
eps = 10*np.spacing(1.)
def func(i, A, B, Omega):
    """Score column ``i``; lower is better, ``+inf`` marks it as not selectable.

    The score is ``mean(|B[:, ii]|) - log(eps + |A[ii]|)`` with
    ``ii = int(i)``, so it decreases when ``|A[ii]|`` grows and when the
    entries of column ``ii`` of ``B`` shrink in magnitude.

    Parameters
    ----------
    i : scalar or one-element array
        Candidate index; truncated to an integer. Optimisers may pass it as
        a one-element array.
    A : numpy.ndarray, 1-D
        Values indexed by ``ii``.
    B : numpy.ndarray, 2-D
        Values whose column ``ii`` is averaged in absolute value.
    Omega : container of int
        Indices that are excluded from selection.

    Returns
    -------
    float
        The score, or ``+inf`` when ``ii`` is outside ``[0, A.size)``, is
        contained in ``Omega``, or the score evaluates to NaN.
    """
    # .item() unwraps both scalars and one-element arrays; calling int() on
    # an array directly is deprecated in recent NumPy releases.
    ii = int(np.asarray(i).item())
    # A negative index would silently wrap around to the end of A and B, so
    # it is rejected like any other out-of-range index.
    if ii < 0 or ii >= A.size or ii in Omega:
        return np.inf
    rv = np.mean(np.abs(B[:, ii])) - np.log(eps+np.abs(A[ii]))
    # np.inf replaces the np.infty alias, which NumPy 2.0 removed.
    if np.isnan(rv):
        return np.inf
    return rv

# For the model trained on all data: correlate the samples of every class
# with the component matrix of every class, then pick `n_comp` component
# indices per class by minimising func() over the component index.

# Input/output locations.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_all'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'

rank_mz = 25
rank_pol = 2
rank = rank_mz, rank_pol
n_comp = 3

df = np.load(data_dirname + filename_dataset)
X = df['data']
y = df['label']
Nmz = X.shape[1]

clf = TuckerClassifierLCMS(Nmz, rank)
model_filename = 'rankMZ='+str(rank_mz)+'_'+model_filename_base+'.npz'
print(model_filename)
clf.loadParameters(model_dirname+model_filename)

# Per-class lookups taken from the fitted classifier.
components = clf.MZspaces
polspaces = clf.Polspaces_inv
corrDict = {}
Omegas = {}
n_classes = len(clf.classes)

# Outer loop: the class whose component matrix is examined.
for i_cl in range(n_classes):
    current_component_class = clf.classes[i_cl]
    class_components = components[current_component_class]
    class_polspace = polspaces[current_component_class]

    # Inner loop: the class whose samples are correlated with those components.
    tmp = []
    for j_cl in range(n_classes):
        current_sample_class = clf.classes[j_cl]
        which = np.where(y == current_sample_class)[0]
        n_samples = len(which)
        # Contract the last axis of the selected samples with the class matrix.
        Z = np.einsum('ijk,kl->ijl', X[which], class_polspace)
        Zt = Z.T

        # Average of the correlation matrices built from slices 0 and 1 of Zt.
        corr_first = np.corrcoef(Zt[0], class_components, rowvar=False)
        corr_second = np.corrcoef(Zt[1], class_components, rowvar=False)
        corrMat = (corr_first + corr_second) * 0.5
        # Keep only the sample-versus-component block.
        corrMat = corrMat[:n_samples, n_samples:]

        tmp.append(np.median(corrMat, axis=0))
    corrDict[current_component_class] = np.array(tmp)

    # Row indices of all classes except the current one.
    spind = tuple(k for k in range(n_classes) if k != i_cl)
    class_corr = corrDict[current_component_class]
    own_row = class_corr[i_cl, :]
    other_rows = class_corr[spind, :]

    # Omega is shared with the objective, so indices chosen so far are
    # excluded from later searches.
    Omega = []

    def f(i):
        return func(i, own_row, other_rows, Omega)

    # Search grid: every integer component index.
    ranges = (slice(0, class_components.shape[1], 1),)
    for l_c in range(n_comp):
        # NOTE(review): brute() is called with its default polishing step;
        # the result is truncated to an integer index afterwards.
        p = int(scipy.optimize.brute(f, ranges))
        Omega.append(p)
    Omegas[current_component_class] = Omega

np.savez_compressed(
    model_dirname+'sntd_components',
    classes=clf.classes,
    components=components,
    Omegas=Omegas,
    corrDict=corrDict
)



rankMZ=25_model_td_all.npz
