In [1]:
import numpy as np
import pandas as pd

In [2]:
# NOTE(review): `sklearn.neighbors.kde` is a private module path; it was deprecated in
# scikit-learn 0.22 and removed in 0.24. The public spelling is
# `from sklearn.neighbors import KernelDensity` - confirm against the installed version.
from sklearn.neighbors.kde import KernelDensity
# argrelextrema locates the local minima of the density curve in KDESparse.
from scipy.signal import argrelextrema
# bisect_left maps each value to the index of the split interval it falls in.
from bisect import bisect_left

In [3]:
from scipy.sparse import csr_matrix, save_npz

Read data from file

In [4]:
# Open the .npz archive; its member arrays are pulled out by key in the following cells.
# NOTE(review): the 'arr_1' member displays as dtype=object below. Recent NumPy releases
# refuse to load object arrays unless `allow_pickle=True` is passed to np.load - confirm
# against the NumPy version in use (and only enable it for trusted files).
data = np.load('inga_out.npz')

In [5]:
# 'arr_0' is the default key np.savez gives its first unnamed array (presumably how the
# archive was written - confirm). Displays as a 1-D float array; it is not used by the
# functions visible in this notebook section.
mass = data['arr_0']
mass

array([ 552.293448,  338.046024,  724.221464, ...,  328.072907,
        262.19328 ,  636.278192])

In [6]:
# Second archive member; displays as an object-dtype array of identifier strings.
# Not used by the functions visible in this notebook section.
name = data['arr_1']
name

array(['UNPD98266', 'UNPD207163', 'UNPD3499', ..., 'UNPD98267',
       'UNPD47332', 'UNPD101003'], dtype=object)

In [7]:
# Third archive member; displays as a two-column float array.
# Column 0 is the value that KDESparse bins into groups; column 1 is the value that
# generateSparseMatrix stores as the matrix entry.
massabund = data['arr_2']
massabund

array([[  41.00329 ,    1.688456],
       [  43.01894 ,    2.135631],
       [  55.01894 ,    1.105409],
       ..., 
       [ 549.23414 ,   24.377134],
       [ 551.24979 ,   22.666363],
       [ 591.2447  ,    5.496205]])

In [8]:
# Fourth archive member; displays as a uint32 array. generateSparseMatrix uses it as the
# row index of each massabund row, so it must have one entry per row of massabund.
blockind = data['arr_3']
blockind

array([     0,      0,      0, ..., 220988, 220988, 220988], dtype=uint32)

In [9]:
def KDESparse(massabund, blockind, bandwidth):
    """
    Using kernel density estimation strategy to generate matrix.

    The first column of `massabund` is scored with a Gaussian KDE on a regular
    grid, the local minima of that curve are used as split points, every row is
    assigned to the interval ("group") its value falls in, and the rows are then
    assembled into a sparse matrix by `generateSparseMatrix`.

    Parameters
    ----------
    massabund : 2-D array
        Column 0 holds the values to bin; column 1 holds the matrix entries.
    blockind : 1-D integer array
        Row index in the output matrix for each row of `massabund`.
    bandwidth : float
        KDE bandwidth; also embedded in the names of the files written.

    Returns
    -------
    scipy.sparse matrix
        The matrix returned by `generateSparseMatrix`.

    Side effects
    ------------
    Writes the split points with ``np.save('splits_kde_<bandwidth>')`` and the
    matrix with ``save_npz('kde_sparse_<bandwidth>')``; `calculateDensity`
    additionally saves the density curve. Prints the shape of the split array.
    """
    values = massabund[:,0]
    # Evaluation grid: from the multiple of 100 at/below the minimum up to (but
    # excluding) the multiple of 100 above the maximum, in steps of 0.1.
    start = values.min() // 100 * 100
    end = values.max() // 100 * 100 + 100
    x = np.arange(start, end, 0.1)

    dens = calculateDensity(values, x, bandwidth)

    # Strict local minima of the density curve become the group boundaries.
    mi_ind = argrelextrema(dens, np.less)[0]
    mi_value = x[mi_ind]
    np.save('splits_kde_'+str(bandwidth), mi_value)

    # Close the last interval at the largest observed value so that every value
    # maps to an index in [0, len(valueRange) - 1].
    valueRange = np.append(mi_value, values.max())
    print('number of groups: {}'.format(valueRange.shape))

    # Vectorised equivalent of bisect_left(valueRange, v) for every v: index of
    # the first boundary that is >= v.
    groups = np.searchsorted(valueRange, values, side='left')

    features = generateSparseMatrix(massabund, blockind, groups)
    save_npz('kde_sparse_'+str(bandwidth), features)

    return features

In [10]:
def calculateDensity(values, x, bandwidth):
    """
    Fit a Gaussian kernel density estimate to `values` and score it on `x`.

    Parameters
    ----------
    values : 1-D array
        Observations to fit; reshaped into a single-feature column.
    x : 1-D array
        Grid points at which the fitted estimator is scored.
    bandwidth : float
        Kernel bandwidth passed to `KernelDensity`.

    Returns
    -------
    1-D array
        One `score_samples` value per grid point. scikit-learn documents these
        as log-density values, not raw densities.

    Side effects
    ------------
    Saves the returned array with ``np.save('dens<bandwidth>')``.
    """
    samples = values.reshape(-1, 1)
    grid = x.reshape(-1, 1)

    estimator = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    estimator.fit(samples)

    scores = estimator.score_samples(grid)
    np.save('dens'+str(bandwidth), scores)
    return scores

In [11]:
def generateSparseMatrix(massabund, blockind, groups):
    """
    Generating the corresponding sparse matrix based on the groups.

    Entry ``(blockind[i], groups[i])`` of the matrix receives
    ``massabund[i, 1]``; entries that share the same (row, column) pair are
    summed by scipy when the CSR matrix is built. Columns whose total is not
    strictly positive are then dropped, so the remaining columns are
    renumbered consecutively.

    Parameters
    ----------
    massabund : 2-D array
        Only column 1 (the stored value) is read here.
    blockind : integer array
        Row index for each row of `massabund`.
    groups : integer array
        Column index for each row of `massabund`.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(blockind.max() + 1, number of kept columns)``.

    Raises
    ------
    ValueError
        If the inputs are empty (no maximum index exists) or their lengths
        do not match.
    """
    # Feed the value column and the integer indices to scipy directly instead of
    # first concatenating everything into one temporary float array: this avoids
    # a full copy of the data and keeps the indices as integers.
    data = np.asarray(massabund)[:, 1]
    row = np.ravel(blockind)
    col = np.ravel(groups)
    n_rows = int(row.max()) + 1
    n_cols = int(col.max()) + 1

    result = csr_matrix((data, (row, col)), shape=(n_rows, n_cols))

    # result.sum(axis=0) is a 1 x n_cols matrix; flatten the comparison into a
    # boolean mask selecting the columns to keep.
    goodCol = np.ravel(result.sum(axis=0) > 0)
    newresult = result[:, goodCol]

    return newresult

In [12]:
# Build one sparse feature matrix per KDE bandwidth. The return value is discarded:
# each KDESparse call writes its density curve, split points and matrix to disk under
# file names that include the bandwidth, and prints the shape of the split array.
for n in [0.12, 0.1, 0.08]:
    KDESparse(massabund, blockind, n)

number of groups: (2401,)
number of groups: (2555,)
number of groups: (2795,)
