# Matrix low-rank decomposition

## Contents

- [1. Imports](#Imports)
- [2. Train models with fixed rank for CV](#Train-models-with-fixed-rank-for-CV)
- [3. Predict with learned models](#Predict-with-learned-models)
- [4. Rank selection: build models](#Rank-selection:-build-models)
- [5. Rank selection: inspect accuracy](#Rank-selection:-inspect-accuracy)

[Back to Chemfin](../Chemfin.ipynb)

### Imports

The first code cell contains almost all of the necessary imports.

Required packages: [numpy](http://www.numpy.org/), [scipy](https://www.scipy.org/), [pandas](https://pandas.pydata.org/), [scikit-learn](http://scikit-learn.org/), [dcor](https://pypi.python.org/pypi/dcor).

[Back to contents](#Contents)

In [2]:
import sys
sys.path.append('../src/')

from computational_utils import reshape

import numpy as np
import pandas as pd
import copy
import time
import scipy

from sklearn.metrics import accuracy_score

from matrix import estimateMzPolarityFactors
from matrix import MatrixClassifierLCMS

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import dcor

### Train models with fixed rank for CV

It is assumed that the CV indices have already been precomputed with [Initialize](./1_Initialize.ipynb).

[Back to contents](#Contents)

In [4]:
# Input data and output model locations.
data_dirname = '../data/'
model_dirname = '../models/matrix_decomposition/'
model_filename_prefix = 'model_md_snn'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

# Settings passed to MatrixClassifierLCMS.
maxitnum = 1000
rank = 25

# Precomputed cross-validation index arrays (indexed by fold number below).
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# Dataset: every sample is flattened into a single row (column-major order).
df = np.load(data_dirname + filename_dataset)
X = df['data']
y = df['label']
X = np.reshape(X, [X.shape[0], -1], order='F')
Nfeatures = X.shape[1]

# Parts of the output paths that do not depend on the fold.
rank_tag = 'rank=' + str(rank)
times_path = model_dirname + rank_tag + '_times_train_' + model_filename_prefix

tms = []
for ind in xrange(len(train_indices)):
    fold_rows = train_indices[ind]
    clf = MatrixClassifierLCMS(Nfeatures, rank, maxitnum=maxitnum)

    # Only the fit call (with the row selection of X and y) is timed.
    tic = time.clock()
    clf.fit(X[fold_rows], y[fold_rows], verbose=0)
    toc = time.clock()
    tms.append(toc - tic)

    clf.saveParameters(model_dirname + rank_tag + '_' + model_filename_prefix + '_' + str(ind))
    # The timing file is rewritten after every fold, so an interrupted run
    # still leaves the timings collected so far on disk.
    np.savez_compressed(times_path, tms=tms)
    print("ind=%d rank=%d time=%.2fs" % (ind, rank, tms[-1]))

ind=0 rank=25 time=1744.36s
ind=1 rank=25 time=1900.87s
ind=2 rank=25 time=1713.61s
ind=3 rank=25 time=1681.82s
ind=4 rank=25 time=1699.67s
ind=5 rank=25 time=1712.97s
ind=6 rank=25 time=1759.30s
ind=7 rank=25 time=1749.07s
ind=8 rank=25 time=1966.37s
ind=9 rank=25 time=1688.98s
ind=10 rank=25 time=1685.06s
ind=11 rank=25 time=1725.52s
ind=12 rank=25 time=1669.19s
ind=13 rank=25 time=1713.50s
ind=14 rank=25 time=1915.29s
ind=15 rank=25 time=1709.36s
ind=16 rank=25 time=1683.56s
ind=17 rank=25 time=1858.98s
ind=18 rank=25 time=1741.92s
ind=19 rank=25 time=1770.65s
ind=20 rank=25 time=1680.97s
ind=21 rank=25 time=1754.97s
ind=22 rank=25 time=1730.09s
ind=23 rank=25 time=1684.22s
ind=24 rank=25 time=1766.32s


In [33]:
model_dirname = '../models/matrix_decomposition/'
model_filename = 'model_md_snn.npz'

# `rank` is not assigned in this cell; it is read from the notebook state.
times_path = model_dirname + 'rank=' + str(rank) + '_times_train_' + model_filename
df = np.load(times_path)

# Per-fold training times stored under the 'tms' key.
tms = df['tms']
print("Median of training time: %.2f s" % np.median(tms))

Median of training time: 1725.52 s
45.52341


### Predict with learned models
Principal angles are used as the measure of distance between two column spaces.
This section requires additional imports of sklearn metrics.

[Back to contents](#Contents)

In [5]:
# Locations of inputs, stored models and results.
data_dirname = '../data/'
model_dirname = '../models/matrix_decomposition/'
model_filename_base = 'model_md_snn_'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

rank = 25

# Precomputed cross-validation index arrays.
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# Main dataset, one flattened sample per row (column-major order).
df = np.load(data_dirname + filename_dataset)
X = df['data']
y = df['label']
X = np.reshape(X, [X.shape[0], -1], order='F')

Nfeatures = X.shape[1]

# Second test set, flattened the same way; labels become a column vector.
df = np.load(data_dirname + filename_dataset2)
X_test2 = df['data']
y_test2 = df['label']
X_test2 = np.reshape(X_test2, [X_test2.shape[0], -1], order='F')
y_test2 = reshape(y_test2, [-1, 1])

# Per-fold accumulators; each entry of tms/accuracies/f1s is a small list.
tms = []
accuracies = []
f1s = []
confusion_matrices = []

predicted_pa_test = []
predicted_pa_test2 = []

results_path = results_dirname + 'rank=' + str(rank) + model_filename_base + '+CCA'

for ind in xrange(len(train_indices)):
    train_rows = train_indices[ind]
    test_rows = test_indices[ind]

    clf = MatrixClassifierLCMS(Nfeatures, rank)
    model_filename = 'rank=' + str(rank) + '_' + model_filename_base + str(ind) + '.npz'
    print(model_filename)
    clf.loadParameters(model_dirname + model_filename)

    # --- training part of the fold (timed) ---
    tic = time.clock()
    y_train_pred = clf.predict(X[train_rows])
    toc = time.clock()
    time_train = toc - tic
    acc_train = accuracy_score(y[train_rows], y_train_pred)
    f1_train = f1_score(y[train_rows], y_train_pred, average='weighted')

    # --- test part of the fold (timed) ---
    tic = time.clock()
    y_test_pred, y_test_pred_explicit = clf.predict(X[test_rows], return_all=True)
    toc = time.clock()
    time_test = toc - tic
    conf_mat = confusion_matrix(y[test_rows], y_test_pred)
    acc_test = accuracy_score(y[test_rows], y_test_pred)
    f1_test = f1_score(y[test_rows], y_test_pred, average='weighted')

    # --- second test set (not timed) ---
    y_test2_pred, y_test2_pred_explicit = clf.predict(X_test2, return_all=True)
    acc_test2 = accuracy_score(y_test2, y_test2_pred)
    f1_test2 = f1_score(y_test2, y_test2_pred, average='weighted')

    tms_loc = [time_train, time_test]
    acc_loc = [acc_train, acc_test, acc_test2]
    f1_loc = [f1_train, f1_test, f1_test2]

    # Attach the true labels as a 'TRUE' column next to the detailed predictions.
    y_test_pred_explicit = y_test_pred_explicit.assign(TRUE=y[test_rows])
    predicted_pa_test.append(y_test_pred_explicit.values)
    y_test2_pred_explicit = y_test2_pred_explicit.assign(TRUE=y_test2)
    predicted_pa_test2.append(y_test2_pred_explicit.values)

    tms.append(tms_loc)
    confusion_matrices.append(conf_mat)
    accuracies.append(acc_loc)
    f1s.append(f1_loc)

    # Results are rewritten after every fold so partial runs are preserved.
    np.savez_compressed(
        results_path,
        tms=tms,
        confusion_matrices=confusion_matrices,
        accuracies=accuracies,
        f1s=f1s,
        classes=clf.classes,
        predicted_pa_test=predicted_pa_test,
        predicted_pa_test2=predicted_pa_test2
    )
    print("ind=%d rank=%d time=%.2fs/%.2fs acc=%.4f/%.4f/%.4f f1=%.4f/%.4f/%.4f" % (
        ind, rank, tms[-1][0], tms[-1][1], acc_loc[0], acc_loc[1], acc_loc[2],
        f1_loc[0], f1_loc[1], f1_loc[2]
    ))

rank=25_model_md_snn_0.npz


  'recall', 'true', average, warn_for)


ind=0 rank=25 time=1989.16s/539.97s acc=0.9927/0.9350/0.8182 f1=0.9928/0.9356/0.8413
rank=25_model_md_snn_1.npz
ind=1 rank=25 time=2017.46s/523.34s acc=0.9917/0.9589/0.8182 f1=0.9918/0.9583/0.8413
rank=25_model_md_snn_2.npz
ind=2 rank=25 time=2055.32s/545.82s acc=0.9906/0.9535/0.8182 f1=0.9908/0.9528/0.8413
rank=25_model_md_snn_3.npz
ind=3 rank=25 time=2048.50s/486.68s acc=0.9928/0.9551/0.8182 f1=0.9930/0.9552/0.8413
rank=25_model_md_snn_4.npz
ind=4 rank=25 time=2049.93s/502.04s acc=0.9907/0.9297/0.8182 f1=0.9909/0.9284/0.8516
rank=25_model_md_snn_5.npz
ind=5 rank=25 time=2002.13s/550.16s acc=0.9899/0.9476/0.7955 f1=0.9902/0.9485/0.8182
rank=25_model_md_snn_6.npz
ind=6 rank=25 time=2032.47s/540.94s acc=0.9928/0.9610/0.7955 f1=0.9930/0.9609/0.8293
rank=25_model_md_snn_7.npz
ind=7 rank=25 time=2020.74s/493.38s acc=0.9939/0.9469/0.8182 f1=0.9940/0.9472/0.8413
rank=25_model_md_snn_8.npz
ind=8 rank=25 time=1987.07s/517.46s acc=0.9851/0.9573/0.8182 f1=0.9852/0.9574/0.8516
rank=25_model_md_sn

In [34]:
# Summarize the stored per-fold prediction results: median accuracy / F1 for
# the train, test and test2 sets, plus the median per-sample prediction time.
model_dirname = '../models/matrix_decomposition/'
model_filename_base = 'model_md_snn_'
results_dirname = '../results/'

rank = 25

df = np.load( results_dirname+'rank='+str(rank)+model_filename_base+'+CCA.npz' )

# Median over folds (axis 0); the columns are train / test / test2.
accuracies = np.median(df['accuracies'], axis=0)
f1s = np.median(df['f1s'], axis=0)
tms = df['tms']

# `test_indices` is not loaded in this cell; it is read from the notebook state.
# Dividing the test-set timing column by the fold sizes gives a per-sample time.
numSampPerFold = [float(len(x)) for x in test_indices]
tms[:, 1] /= numSampPerFold

tms = np.median(tms, axis=0)

dataDict = {
    'train': [accuracies[0], f1s[0]],
    'test': [accuracies[1], f1s[1]],
    'test2': [accuracies[2], f1s[2]],
    'index': ['accuracy', 'F1']
}

print('One sample (from test) prediction time: %.2f s' % (tms[1]))

# set_index returns a new DataFrame; keep the result so the metric names
# actually become the row labels of the displayed table.
table = pd.DataFrame(dataDict).set_index('index')
table


One sample (from test) prediction time: 1.13 s


Unnamed: 0,index,test,test2,train
0,accuracy,0.947589,0.818182,0.991671
1,F1,0.948522,0.841311,0.991844


### Rank selection: build models

Grid search over the rank, followed by inspection of accuracy (K-fold cross-validation, repeated once).

[Back to contents](#Contents)

In [2]:
# Input data and output locations for the rank-selection models.
data_dirname = '../data/'
model_dirname = '../models/matrix_decomposition/rank_selection/'
model_filename_prefix = 'model_md_snn'

filename_dataset = 'dataset.npz'
filename_cv = 'cv_indices.npz'

maxitnum = 1000
n_splits = 5

# Candidate ranks: 1, 2, ..., rank_max.
rank_max = 25
ranks = np.arange(1, rank_max + 1)

# Precomputed cross-validation index arrays.
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# Dataset, one flattened sample per row (column-major order).
df = np.load(data_dirname + filename_dataset)
X = df['data']
y = df['label']
X = np.reshape(X, [X.shape[0], -1], order='F')

Nfeatures = X.shape[1]

tms = []
for ind in xrange(n_splits):
    fold_rows = train_indices[ind]
    for rank in ranks:
        rank_tag = 'rank=' + str(rank)
        clf = MatrixClassifierLCMS(Nfeatures, rank, maxitnum=maxitnum, Lambda=1e-3)

        # Only the fit call (with the row selection of X and y) is timed.
        tic = time.clock()
        clf.fit(X[fold_rows], y[fold_rows], verbose=0)
        toc = time.clock()
        tms.append(toc - tic)

        clf.saveParameters(model_dirname + rank_tag + '_' + model_filename_prefix + '_' + str(ind))
        # tms is never reset, so every per-rank timing file receives all
        # timings measured so far (across ranks and folds).
        np.savez_compressed(
            model_dirname + rank_tag + '_times_train_' + model_filename_prefix,
            tms=tms
        )
        print("ind=%d rank=%d time=%.2fs" % (ind, rank, tms[-1]))

ind=0 rank=1 time=124.49s
ind=0 rank=2 time=21.72s
ind=0 rank=3 time=75.26s
ind=0 rank=4 time=99.71s
ind=0 rank=5 time=151.40s
ind=0 rank=6 time=206.12s
ind=0 rank=7 time=301.79s
ind=0 rank=8 time=338.43s
ind=0 rank=9 time=421.20s
ind=0 rank=10 time=541.37s
ind=0 rank=11 time=623.05s
ind=0 rank=12 time=626.31s
ind=0 rank=13 time=783.98s
ind=0 rank=14 time=930.83s
ind=0 rank=15 time=1003.62s
ind=0 rank=16 time=967.44s
ind=0 rank=17 time=1039.24s
ind=0 rank=18 time=1127.81s
ind=0 rank=19 time=1187.08s
ind=0 rank=20 time=1279.45s
ind=0 rank=21 time=1330.18s
ind=0 rank=22 time=1430.10s
ind=0 rank=23 time=1529.84s
ind=0 rank=24 time=1609.34s
ind=0 rank=25 time=1772.09s
ind=1 rank=1 time=116.28s
ind=1 rank=2 time=37.05s
ind=1 rank=3 time=75.55s
ind=1 rank=4 time=129.93s
ind=1 rank=5 time=152.45s
ind=1 rank=6 time=282.78s
ind=1 rank=7 time=337.82s
ind=1 rank=8 time=398.52s
ind=1 rank=9 time=462.67s
ind=1 rank=10 time=571.09s
ind=1 rank=11 time=613.49s
ind=1 rank=12 time=718.40s
ind=1 rank=13 

### Rank selection: inspect accuracy

[Back to contents](#Contents)

In [3]:
metrics = ['cca', 'dcor']

metric_name = metrics[0]

# Map the metric name to the callable handed to clf.predict;
# 'cca' is passed as None.
metric_by_name = {
    'cca': None,
    'dcor': dcor.distance_correlation,
}
if metric_name not in metric_by_name:
    raise NotImplementedError
metric = metric_by_name[metric_name]


data_dirname = '../data/'
model_dirname = '../models/matrix_decomposition/rank_selection/'
model_filename_base = 'model_md_snn_'
results_dirname = '../results/'
model_filename_prefix = 'model_md_snn_'


filename_dataset = 'dataset.npz'
filename_dataset2 = 'test2.npz'
filename_cv = 'cv_indices.npz'

n_splits = 5
rank_max = 25
# Candidate ranks: 1, 2, ..., rank_max.
ranks = np.arange(1, rank_max + 1)
rank_pol = 2

# Precomputed cross-validation index arrays.
df = np.load(data_dirname + filename_cv)
test_indices = df['test_indices']
train_indices = df['train_indices']

# Dataset, one flattened sample per row (column-major order).
df = np.load(data_dirname + filename_dataset)
X = df['data']
y = df['label']
X = np.reshape(X, [X.shape[0], -1], order='F')

Nfeatures = X.shape[1]

results_path = (
    results_dirname + 'rank_selection_metric=' + metric_name + '_'
    + model_filename_prefix + '+' + metric_name
)

# Outer lists are indexed by fold, inner lists by rank; each innermost
# entry is a [train, test] pair.
tms = []
accuracies = []
f1s = []
for ind in xrange(n_splits):
    train_rows = train_indices[ind]
    test_rows = test_indices[ind]

    tms_l = []
    accuracies_l = []
    f1s_l = []
    for rank in ranks:
        clf = MatrixClassifierLCMS(Nfeatures, rank)
        model_filename = 'rank=' + str(rank) + '_' + model_filename_base + str(ind) + '.npz'
        print(model_filename)
        clf.loadParameters(model_dirname + model_filename)

        # Training part of the fold (timed).
        tic = time.clock()
        y_train_pred = clf.predict(X[train_rows], metric=metric)
        toc = time.clock()
        time_train = toc - tic
        acc_train = accuracy_score(y[train_rows], y_train_pred)
        f1_train = f1_score(y[train_rows], y_train_pred, average='weighted')

        # Test part of the fold (timed).
        tic = time.clock()
        y_test_pred = clf.predict(X[test_rows], metric=metric)
        toc = time.clock()
        time_test = toc - tic
        acc_test = accuracy_score(y[test_rows], y_test_pred)
        f1_test = f1_score(y[test_rows], y_test_pred, average='weighted')

        tms_loc = [time_train, time_test]
        acc_loc = [acc_train, acc_test]
        f1_loc = [f1_train, f1_test]

        tms_l.append(tms_loc)
        accuracies_l.append(acc_loc)
        f1s_l.append(f1_loc)

    accuracies.append(accuracies_l)
    f1s.append(f1s_l)
    tms.append(tms_l)
    # Results are rewritten after every fold so partial runs are preserved.
    np.savez_compressed(results_path, tms=tms, accuracies=accuracies, f1s=f1s)

accuracies = np.array(accuracies)
f1s = np.array(f1s)
tms = np.array(tms)

# Medians are taken over folds (axis 0).
print("Accuracies:")
print(np.median(accuracies, axis=0))
print("F1-measures:")
print(np.median(f1s, axis=0))
print("Prediction time (for all samples):")
print(np.median(np.sum(tms, axis=-1), axis=0))

rank=1_model_md_snn_0.npz


  'precision', 'predicted', average, warn_for)


rank=2_model_md_snn_0.npz
rank=3_model_md_snn_0.npz
rank=4_model_md_snn_0.npz
rank=5_model_md_snn_0.npz
rank=6_model_md_snn_0.npz
rank=7_model_md_snn_0.npz
rank=8_model_md_snn_0.npz
rank=9_model_md_snn_0.npz
rank=10_model_md_snn_0.npz
rank=11_model_md_snn_0.npz
rank=12_model_md_snn_0.npz
rank=13_model_md_snn_0.npz
rank=14_model_md_snn_0.npz
rank=15_model_md_snn_0.npz
rank=16_model_md_snn_0.npz
rank=17_model_md_snn_0.npz
rank=18_model_md_snn_0.npz
rank=19_model_md_snn_0.npz
rank=20_model_md_snn_0.npz
rank=21_model_md_snn_0.npz
rank=22_model_md_snn_0.npz
rank=23_model_md_snn_0.npz
rank=24_model_md_snn_0.npz
rank=25_model_md_snn_0.npz
rank=1_model_md_snn_1.npz
rank=2_model_md_snn_1.npz
rank=3_model_md_snn_1.npz
rank=4_model_md_snn_1.npz
rank=5_model_md_snn_1.npz
rank=6_model_md_snn_1.npz
rank=7_model_md_snn_1.npz
rank=8_model_md_snn_1.npz
rank=9_model_md_snn_1.npz
rank=10_model_md_snn_1.npz
rank=11_model_md_snn_1.npz
rank=12_model_md_snn_1.npz
rank=13_model_md_snn_1.npz
rank=14_model_md_s

## Feature extraction: inspect the results on whole dataset

[Back to contents](#Contents)

In [80]:
# Fit a single model on the whole dataset (no cross-validation split).
data_dirname = '../data/'
model_dirname = '../models/matrix_decomposition/'
model_filename_prefix = 'model_md_snn_full_dataset'

filename_dataset = 'dataset.npz'
maxitnum = 1000
rank = 25

# Dataset, one flattened sample per row (column-major order).
df = np.load(data_dirname + filename_dataset)
X = df['data']
y = df['label']
X = np.reshape(X, [X.shape[0], -1], order='F')
Nfeatures = X.shape[1]

clf = MatrixClassifierLCMS(Nfeatures, rank, maxitnum=maxitnum)

# Only the fit call is timed.
tic = time.clock()
clf.fit(X, y, verbose=0)
toc = time.clock()
tms = toc - tic

model_path = model_dirname + 'rank=' + str(rank) + '_' + model_filename_prefix
clf.saveParameters(model_path)
print('Evaluation time: %.2f s' % tms)

Evaluation time: 11574.40 s


In [111]:
# Machine epsilon; keeps the logarithm in func finite when A[ii] is exactly 0.
eps = np.spacing(1.)
def func(i, A, B, Omega):
    """Score candidate index ``i``; lower is better.

    The score is ``mean(|B[:, ii]|) - log(eps + |A[ii]|)`` with ``ii = int(i)``.

    Parameters
    ----------
    i : number
        Candidate index; truncated to an integer with ``int``.
    A : numpy array
        Indexed as ``A[ii]``; its ``size`` bounds the valid indices.
    B : numpy array
        Two-dimensional, indexed as ``B[:, ii]``.
    Omega : container
        Indices to exclude (checked with ``in``).

    Returns
    -------
    float
        The score, or ``+inf`` when ``ii`` is out of range, is in ``Omega``,
        or the score evaluates to NaN.
    """
    ii = int(i)
    if (ii >= A.size) or (ii in Omega):
        # np.inf instead of the np.infty alias, which NumPy 2.0 removed.
        return +np.inf
    rv = np.mean(np.abs(B[:, ii])) - np.log(eps+np.abs(A[ii]))
    if np.isnan(rv):
        return +np.inf
    return rv

# `import scipy` alone does not guarantee that the optimize submodule is
# loaded; import it explicitly before scipy.optimize.brute is used below.
import scipy.optimize

# For every class, correlate the samples of each class with that class'
# component space, then pick n_comp component indices by minimizing func.
data_dirname = '../data/'
label_plant_filename = 'species.csv'
label_plant_dict = pd.read_csv(data_dirname+label_plant_filename, index_col=0)
label_plant_dict = label_plant_dict.to_dict()['Specie name']

model_dirname = '../models/matrix_decomposition/'
model_filename_base = 'model_md_snn_full_dataset'
results_dirname = '../results/'

filename_dataset = 'dataset.npz'

rank = 25
n_comp = 3

# Dataset, one flattened sample per row (column-major order).
df = np.load(data_dirname + filename_dataset)
X, y = df['data'], df['label']
X = np.reshape(X, [X.shape[0], -1], order='F')
Nfeatures = X.shape[1]

clf = MatrixClassifierLCMS(Nfeatures, rank)
model_filename = 'rank='+str(rank)+'_'+model_filename_base+'.npz'
print(model_filename)
clf.loadParameters(model_dirname+model_filename)

components = clf.FeatureSpaces
corrDict = {}
Omegas = {}
# the first loop is for component spaces
for i_cl in xrange(len(clf.classes)):
    tmp = []
    current_component_class = clf.classes[i_cl]
    # the second loop is for samples' classes
    for j_cl in xrange(len(clf.classes)):
        current_sample_class = clf.classes[j_cl]
        which = np.where(y == current_sample_class)[0]
        # Columns are the variables (rowvar=False): first the selected
        # samples, then the columns of the component matrix.
        corrMat = np.corrcoef(X[which].T, components[current_component_class], rowvar=False)
        # Keep only the sample-vs-component block of the correlation matrix.
        corrMat = corrMat[:len(which), len(which):]
        tmp.append( np.median(corrMat, axis=0) )
    corrDict[current_component_class] = np.array(tmp)

    # Row indices of all classes except the current one.
    spind = tuple(k for k in range(len(clf.classes)) if k != i_cl)

    # Omega collects the indices chosen so far; func returns +inf for them,
    # so each brute-force pass below selects a new component.
    Omega = []
    f = lambda i: func(
        i,
        corrDict[current_component_class][i_cl, :],
        corrDict[current_component_class][spind, :],
        Omega
    )
    ranges = (slice(0, components[current_component_class].shape[1], 1),)
    for l_c in xrange(n_comp):
        p = scipy.optimize.brute(f, ranges)
        p = int(p)
        Omega.append(p)

    Omegas[current_component_class] = Omega

np.savez_compressed(
    model_dirname+'snmf_components', classes=clf.classes, components=components, Omegas=Omegas,
    corrDict=corrDict
)


rank=25_model_md_snn_full_dataset.npz
