# Tucker decomposition

## Contents

- [1. Imports](#Imports)
- [2. Train models with fixed rank for CV](#Train-models-with-fixed-rank-for-CV)
- [3. Predict with learned models](#Predict-with-learned-models)
- [4. Rank selection: build models](#Rank-selection:-build-models)
- [5. Rank selection: inspect accuracy](#Rank-selection:-inspect-accuracy)

[Back to Chemfin](../Chemfin.ipynb)

### Imports

The first code cell contains almost all of the necessary imports.

Required packages: [numpy](http://www.numpy.org/), [scikit-learn](http://scikit-learn.org/), [dcor](https://pypi.python.org/pypi/dcor).

[Back to contents](#Contents)

In [2]:
%env OMP_NUM_THREADS=4
import sys
sys.path.append('../src/')

from computational_utils import reshape

import numpy as np
import copy
import time

from sklearn.metrics import accuracy_score

from tucker import estimateMzPolarityFactors
from tucker import TuckerClassifierLCMS

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import dcor

import pandas as pd

env: OMP_NUM_THREADS=4


### Train models with fixed rank for CV

It is assumed that the CV indices have already been precomputed with [Initialize](./1_Initialize.ipynb).

[Back to contents](#Contents)

In [10]:
# Where the inputs live and where the trained models are written.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_prefix = 'model_td'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

# Forwarded to TuckerClassifierLCMS as `maxitnum`.
maxitnum = 1000

# Ranks handed to the classifier as the pair [rank_mz, rank_pol].
rank_mz = 25
rank_pol = 2

r = [rank_mz, rank_pol]

# Precomputed cross-validation splits.
df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

# Samples and their labels.
df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Shared stem of every file written for this rank.
rank_stem = model_dirname + 'rankMZ=' + str(rank_mz)

# One training time per fold, in the units returned by time.clock().
tms = []
for ind, fold_train in enumerate(train_indices):
    clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
    tic = time.clock()
    clf.fit(X[fold_train], y[fold_train], verbose=0)
    toc = time.clock()
    tms.append(toc - tic)
    clf.saveParameters(rank_stem + '_' + model_filename_prefix + '_' + str(ind))
    # The timing file is rewritten after every fold, so an interrupted run
    # still leaves the times collected so far on disk.
    np.savez_compressed(
        rank_stem + '_times_train_' + model_filename_prefix,
        tms=tms
    )
    print("ind=%d rank=%d time=%.2fs" % (ind, rank_mz, tms[-1]))

ind=0 rank=25 time=52826.16s
ind=1 rank=25 time=52615.62s
ind=2 rank=25 time=57755.90s
ind=3 rank=25 time=52189.50s
ind=4 rank=25 time=53416.44s
ind=5 rank=25 time=51258.26s
ind=6 rank=25 time=55868.38s
ind=7 rank=25 time=50189.79s
ind=8 rank=25 time=51762.45s
ind=9 rank=25 time=55265.80s
ind=10 rank=25 time=52065.07s
ind=11 rank=25 time=50743.12s
ind=12 rank=25 time=52441.37s
ind=13 rank=25 time=56996.62s
ind=14 rank=25 time=64201.51s
ind=15 rank=25 time=54402.55s
ind=16 rank=25 time=59769.03s
ind=17 rank=25 time=59780.56s
ind=18 rank=25 time=66703.53s
ind=19 rank=25 time=57762.88s
ind=20 rank=25 time=51917.56s
ind=21 rank=25 time=50853.04s
ind=22 rank=25 time=48563.67s
ind=23 rank=25 time=54836.83s
ind=24 rank=25 time=67458.81s


### Predict with learned models
The principal angle is used as the measure of distance between two column spaces.
Requires additional imports of sklearn metrics.

[Back to contents](#Contents)

In [2]:
# Input data, stored models and the directory receiving the results.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

# Ranks of the stored models; they also select which model files are loaded.
rank_mz = 25
rank_pol = 2

r = [rank_mz, rank_pol]

# Precomputed cross-validation splits.
df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

# Main dataset (split by the CV indices above).
df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Second, separate test set; its labels are reshaped into a single column.
df = np.load(data_dirname+filename_dataset2)
X_test2, y_test2 = df['data'], df['label']
y_test2 = reshape(y_test2, [-1, 1])

# Per-fold accumulators.  Each entry of tms holds [train, test] prediction
# times; each entry of accuracies / f1s holds [train, test, test2] scores.
tms, accuracies, f1s, confusion_matrices = [], [], [], []
predicted_pa_test, predicted_pa_test2 = [], []

rank_tag = 'rankMZ=' + str(rank_mz)

for ind, fold_train in enumerate(train_indices):
    clf = TuckerClassifierLCMS(Nmz, r)
    model_filename = rank_tag + '_' + model_filename_base + str(ind) + '.npz'
    print(model_filename)
    clf.loadParameters(model_dirname + model_filename)

    # --- training part of the fold ---
    tic = time.clock()
    y_train_pred = clf.predict(X[fold_train])
    toc = time.clock()
    tms_loc = [toc - tic]
    y_train_true = y[fold_train]
    acc_loc = [accuracy_score(y_train_true, y_train_pred)]
    f1_loc = [f1_score(y_train_true, y_train_pred, average='weighted')]

    # --- held-out part of the fold ---
    fold_test = test_indices[ind]
    tic = time.clock()
    y_test_pred, y_test_pred_explicit = clf.predict(X[fold_test], return_all=True)
    toc = time.clock()
    tms_loc.append(toc - tic)
    y_test_true = y[fold_test]
    conf_mat = confusion_matrix(y_test_true, y_test_pred)
    acc_loc.append(accuracy_score(y_test_true, y_test_pred))
    f1_loc.append(f1_score(y_test_true, y_test_pred, average='weighted'))

    # --- second test set (not timed) ---
    y_test2_pred, y_test2_pred_explicit = clf.predict(X_test2, return_all=True)
    acc_loc.append(accuracy_score(y_test2, y_test2_pred))
    f1_loc.append(f1_score(y_test2, y_test2_pred, average='weighted'))

    tms.append(tms_loc)
    confusion_matrices.append(conf_mat)
    accuracies.append(acc_loc)
    f1s.append(f1_loc)

    # Attach the true labels as a TRUE column before storing the raw values
    # returned alongside the predictions.
    y_test_pred_explicit = y_test_pred_explicit.assign(TRUE=y_test_true)
    predicted_pa_test.append(y_test_pred_explicit.values)
    y_test2_pred_explicit = y_test2_pred_explicit.assign(TRUE=y_test2)
    predicted_pa_test2.append(y_test2_pred_explicit.values)

    # Results are rewritten after every fold so partial runs are preserved.
    np.savez_compressed(
        results_dirname + rank_tag + model_filename_base + '+CCA',
        tms=tms,
        confusion_matrices=confusion_matrices,
        accuracies=accuracies,
        f1s=f1s,
        classes=clf.classes,
        predicted_pa_test=predicted_pa_test,
        predicted_pa_test2=predicted_pa_test2
    )
    print("ind=%d rank=%d time=%.2fs/%.2fs acc=%.4f/%.4f/%.4f f1=%.4f/%.4f/%.4f" % (
        (ind, rank_mz) + tuple(tms_loc) + tuple(acc_loc) + tuple(f1_loc)
    ))

rankMZ=25_model_td_0.npz


  'recall', 'true', average, warn_for)


ind=0 rank=25 time=2041.82s/540.80s acc=0.9776/0.9119/0.8636 f1=0.9779/0.9109/0.9110
rankMZ=25_model_td_1.npz
ind=1 rank=25 time=2019.48s/537.77s acc=0.9728/0.9502/0.8409 f1=0.9726/0.9497/0.8684
rankMZ=25_model_td_2.npz
ind=2 rank=25 time=2098.12s/508.57s acc=0.9746/0.9381/0.9091 f1=0.9747/0.9400/0.9495
rankMZ=25_model_td_3.npz
ind=3 rank=25 time=2129.23s/522.27s acc=0.9725/0.9438/0.8182 f1=0.9722/0.9437/0.8657
rankMZ=25_model_td_4.npz
ind=4 rank=25 time=2069.94s/488.21s acc=0.9788/0.9204/0.9091 f1=0.9792/0.9194/0.9495
rankMZ=25_model_td_5.npz
ind=5 rank=25 time=2049.69s/536.31s acc=0.9759/0.9539/0.8636 f1=0.9761/0.9543/0.9207
rankMZ=25_model_td_6.npz
ind=6 rank=25 time=2145.20s/563.02s acc=0.9795/0.9481/0.8636 f1=0.9795/0.9485/0.9211
rankMZ=25_model_td_7.npz
ind=7 rank=25 time=2100.96s/509.59s acc=0.9763/0.9336/0.8409 f1=0.9763/0.9335/0.9081
rankMZ=25_model_td_8.npz
ind=8 rank=25 time=2114.12s/511.10s acc=0.9780/0.9528/0.8864 f1=0.9781/0.9518/0.9337
rankMZ=25_model_td_9.npz
ind=9 rank

In [9]:
# Summarise the cross-validation results written by the prediction cell:
# median accuracy / F1 over folds and the median per-sample prediction time.
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_'
results_dirname = '../results/'

rank = 25

df = np.load( results_dirname+'rankMZ='+str(rank)+model_filename_base+'+CCA.npz' )


# Medians over folds; the columns are [train, test, test2].
accuracies = np.median(df['accuracies'], axis=0)
f1s = np.median(df['f1s'], axis=0)
tms = df['tms']

# NOTE: `test_indices` is not loaded here; it must still be defined from the
# prediction cell above.
# Build an explicit float array so the in-place division below does not rely
# on `map` returning a list.
numSampPerFold = np.array([float(len(x)) for x in test_indices])
# Turn the per-fold test prediction time into a per-sample time.
tms[:, 1] /= numSampPerFold

tms = np.median(tms, axis=0)

dataDict = {
    'train': [accuracies[0], f1s[0]],
    'test': [accuracies[1], f1s[1]],
    'test2': [accuracies[2], f1s[2]],
    'index': ['accuracy', 'F1']
}

print('One sample (from test) prediction time: %.2f s' % (tms[1]))

table = pd.DataFrame(dataDict)
# DataFrame.set_index returns a new frame instead of modifying in place, so
# the result has to be rebound; otherwise 'index' stays an ordinary column.
table = table.set_index('index')
table


One sample (from test) prediction time: 1.14 s


Unnamed: 0,index,test,test2,train
0,accuracy,0.933628,0.863636,0.976256
1,F1,0.933472,0.910985,0.976235


### Rank selection: build models

Grid search over the m/z rank, followed by inspection of the accuracy (a single repetition of K-fold).

[Back to contents](#Contents)

In [None]:
# Inputs and the directory receiving one model per (fold, rank) pair.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/rank_selection/'
model_filename_prefix = 'model_td'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

maxitnum = 1000
# Only the first n_splits folds of the stored CV indices are used.
n_splits = 5

# Candidate m/z ranks 1..rank_mz_max; the second rank stays fixed.
rank_mz_max = 25
ranks_mz = np.arange(rank_mz_max) + 1
rank_pol = 2

# Precomputed cross-validation splits.
df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

# Samples and their labels.
df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Training times, appended in (fold, rank) order.
tms = []
for ind in xrange(n_splits):
    fold_train = train_indices[ind]
    for rank_mz in ranks_mz:
        r = [rank_mz, rank_pol]
        clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
        tic = time.clock()
        clf.fit(X[fold_train], y[fold_train], verbose=0)
        toc = time.clock()
        tms.append(toc - tic)

        rank_stem = model_dirname + 'rankMZ=' + str(rank_mz)
        clf.saveParameters(rank_stem + '_' + model_filename_prefix + '_' + str(ind))
        # Each per-rank timing file receives the whole list gathered so far.
        np.savez_compressed(
            rank_stem + '_times_train_' + model_filename_prefix,
            tms=tms
        )
        print("ind=%d rank=%d time=%.2fs" % (ind, rank_mz, tms[-1]))

### Rank selection: inspect accuracy

[Back to contents](#Contents)

In [None]:
# Evaluate every stored (fold, rank) model on its train and test split and
# report the median accuracy, F1 and prediction time per rank.
metrics = ['cca', 'dcor']

metric_name = metrics[0]

# `metric` is forwarded to clf.predict.
# NOTE(review): None presumably selects the classifier's built-in measure
# (labelled 'cca' here) - confirm in tucker.py.
if metric_name == 'cca':
    metric = None
elif metric_name == 'dcor':
    metric = dcor.distance_correlation
else:
    raise NotImplementedError


data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/rank_selection/'
model_filename_base = 'model_td_'
# Defined explicitly because the results filename below uses it.  Without
# this, the cell picks up whatever value another cell left behind, or raises
# NameError in a fresh kernel.
model_filename_prefix = 'model_td'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

n_splits = 5
rank_mz_max = 25
ranks_mz = np.arange(rank_mz_max) + 1
rank_pol = 2

df = np.load(data_dirname+filename_cv)
test_indices, train_indices = df['test_indices'], df['train_indices']

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Nested results: outer index = fold, inner index = rank, innermost pair =
# [train, test].
tms = []
accuracies = []
f1s = []
for ind in xrange(n_splits):
    tms_l = []
    accuracies_l = []
    f1s_l = []
    for i in xrange(len(ranks_mz)):
        rank_mz = ranks_mz[i]
        r = [rank_mz, rank_pol]

        clf = TuckerClassifierLCMS(Nmz, r)
        model_filename = 'rankMZ='+str(rank_mz)+'_'+model_filename_base+str(ind)+'.npz'
        print(model_filename)
        clf.loadParameters(model_dirname+model_filename)

        # Training split.
        tic = time.clock()
        y_train_pred = clf.predict(X[train_indices[ind]], metric=metric)
        toc = time.clock()
        tms_loc = [toc-tic]
        acc_loc = [accuracy_score(y[train_indices[ind]], y_train_pred)]
        f1_loc = [f1_score(y[train_indices[ind]], y_train_pred, average='weighted')]

        # Held-out split.
        tic = time.clock()
        y_test_pred = clf.predict(X[test_indices[ind]], metric=metric)
        toc = time.clock()
        tms_loc.append(toc-tic)
        acc_loc.append( accuracy_score(y[test_indices[ind]], y_test_pred) )
        f1_loc.append( f1_score(y[test_indices[ind]], y_test_pred, average='weighted') )


        tms_l.append(tms_loc)
        accuracies_l.append(acc_loc)
        f1s_l.append(f1_loc)
    accuracies.append(accuracies_l)
    f1s.append(f1s_l)
    tms.append(tms_l)
    # Rewritten after every fold so partial runs are preserved.
    np.savez_compressed(
        results_dirname+'rank_selection_metric='+metric_name+'_'+model_filename_prefix+'+'+metric_name,
        tms=tms, accuracies=accuracies, f1s=f1s
    )
accuracies = np.array(accuracies)
f1s = np.array(f1s)
tms = np.array(tms)

# Medians are taken over folds (axis 0); the time is train + test combined.
print("Accuracies:")
print(np.median(accuracies, axis=0))
print("F1-measures:")
print(np.median(f1s, axis=0))
print("Prediction time (for all samples):")
print(np.median(np.sum(tms, axis=-1), axis=0))

## Taking a look at the components

In [2]:
# Fit a single model on the complete dataset (no CV split) and store it.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_prefix = 'model_td_all'

filename_dataset = 'dataset.npz'

maxitnum = 1000

rank_mz = 25
rank_pol = 2

r = [rank_mz, rank_pol]

df = np.load(data_dirname+filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Shared stem of the two files written below.
rank_stem = model_dirname + 'rankMZ=' + str(rank_mz)

clf = TuckerClassifierLCMS(Nmz, r, maxitnum=maxitnum)
tic = time.clock()
clf.fit(X, y, verbose=0)
toc = time.clock()
# Here `tms` is a single number, not a list as in the CV cells.
tms = toc - tic

clf.saveParameters(rank_stem + '_' + model_filename_prefix)
np.savez_compressed(rank_stem + '_times_train_' + model_filename_prefix, tms=tms)

print('Evaluation time: %.2f s' % tms)

In [10]:
import scipy.optimize

# Small positive offset that keeps the logarithm in func finite when the
# corresponding entry of A is exactly zero.
eps = 10*np.spacing(1.)


def func(i, A, B, Omega):
    """Score candidate index ``i``; smaller values are better.

    The score is ``mean(|B[:, ii]|) - log(eps + |A[ii]|)`` with
    ``ii = int(i)``: it decreases as ``|A[ii]|`` grows and as the entries of
    column ``ii`` of ``B`` shrink in magnitude.

    Parameters
    ----------
    i : scalar or one-element array
        Candidate index; truncated to an integer.
    A : 1-D numpy array
        Values indexed by candidate.
    B : 2-D numpy array
        Array whose column ``ii`` is averaged in absolute value.
    Omega : container of int
        Indices to exclude.

    Returns
    -------
    float
        The score, or ``+inf`` when ``ii`` is past the end of ``A``, is
        listed in ``Omega``, or the score evaluates to NaN.
    """
    # .item() unwraps both plain scalars and one-element arrays; calling
    # int() directly on a one-element array is deprecated in recent NumPy.
    ii = int(np.asarray(i).item())
    if (ii >= A.size) or (ii in Omega):
        # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
        return +np.inf
    rv = np.mean(np.abs(B[:, ii])) - np.log(eps+np.abs(A[ii]))
    if np.isnan(rv):
        return +np.inf
    return rv

# For every class, pick n_comp column indices of that class's component
# matrix using func as the score, then save the picks together with the
# correlation tables they were derived from.
data_dirname = '../data/'
model_dirname = '../models/tucker_decomposition/'
model_filename_base = 'model_td_all'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'

rank_mz = 25
rank_pol = 2
rank = rank_mz, rank_pol
# Number of column indices selected per class.
n_comp = 3

df = np.load(data_dirname + filename_dataset)
X, y = df['data'], df['label']
Nmz = X.shape[1]

# Restore the classifier parameters saved for the full-data model.
clf = TuckerClassifierLCMS(Nmz, rank)
model_filename = 'rankMZ='+str(rank_mz)+'_'+model_filename_base+'.npz'
print model_filename
clf.loadParameters(model_dirname+model_filename)

# Both attributes are looked up by class label below.
# NOTE(review): presumably per-class m/z factor matrices and inverse polarity
# factors - confirm against tucker.py.
components = clf.MZspaces
polspaces = clf.Polspaces_inv
# class label -> 2-D array; row j holds the median correlations computed from
# the samples whose label is clf.classes[j].
corrDict = {}
#Z = X[test_indices[ind]].copy()
# the first loop is for component spaces
# class label -> list of the selected column indices
Omegas = {}
for i_cl in xrange(len(clf.classes)):
    tmp = []
    current_component_class = clf.classes[i_cl]
    # the second loop is for samples' classes
    for j_cl in xrange(len(clf.classes)):
        current_sample_class = clf.classes[j_cl]
        which = np.where(y == current_sample_class)[0]
        # Contract the last axis of the selected samples with this class's
        # polspaces matrix.
        Z = np.einsum('ijk,kl->ijl', X[which], polspaces[current_component_class])
        # rowvar=False: columns are the variables, so the result correlates
        # every column of Z.T[0] and every column of the component matrix.
        corrMat = np.corrcoef(Z.T[0], components[current_component_class], rowvar=False)
        
        # Average with the same correlation computed from the second slice.
        corrMat += np.corrcoef(Z.T[1], components[current_component_class], rowvar=False)
        corrMat *= 0.5
        # Keep only the cross block: first len(which) variables (rows) against
        # the remaining variables (columns).
        corrMat = corrMat[:len(which), len(which):]
        
        # One median per remaining column, taken over the rows.
        tmp.append( np.median(corrMat, axis=0) )
    corrDict[current_component_class] = np.array(tmp)
    # Row indices of every class except the current one (Python 2 list concat).
    spind = range(i_cl) + range(i_cl+1, len(clf.classes))
    spind = tuple(spind)
    # Indices already picked for this class; func returns +inf for them, and
    # the closure below sees every append because it holds this same list.
    Omega = []
    # A = row of the current class, B = rows of all other classes.
    f = lambda i: func(
        i,
        corrDict[current_component_class][i_cl, :],
        corrDict[current_component_class][spind, :],
        Omega
    )
    # Integer grid 0, 1, ..., (number of component columns - 1).
    ranges = (slice(0, components[current_component_class].shape[1], 1),)
    for l_c in xrange(n_comp):
        # NOTE(review): brute is called without `finish`, so SciPy's default
        # polishing step may move the result off the grid before int()
        # truncates it - confirm this is intended.
        p = scipy.optimize.brute(f, ranges)
        p = int(p)
        '''
        print (
            current_component_class,
            corrDict[current_component_class][i_cl, p],
            corrDict[current_component_class][:, p]
        )
        '''
        Omega.append(p)
    Omegas[current_component_class] = Omega

# Persist the selection next to the models.
np.savez_compressed(
    model_dirname+'sntd_components', classes=clf.classes, components=components, Omegas=Omegas, 
    corrDict=corrDict
)



rankMZ=25_model_td_all.npz
