In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema
from bisect import bisect_left

Read data from file

In [3]:
# The archive stores at least one object-dtype array (the identifiers in
# 'arr_1'), and NumPy >= 1.16.3 refuses to unpickle object arrays unless
# allow_pickle=True is passed, so the later data['arr_1'] lookup would raise.
# NOTE(review): unpickling can execute arbitrary code -- only load an
# 'inga_out.npz' that comes from a trusted source.
data = np.load('inga_out.npz', allow_pickle=True)

In [4]:
# First array in the archive: a 1-D float array (see the output below).
# Presumably one mass per compound -- TODO confirm against the file's producer.
mass = data['arr_0']
mass

array([ 552.293448,  338.046024,  724.221464, ...,  328.072907,
        262.19328 ,  636.278192])

In [5]:
# Second array in the archive: object-dtype array of identifier strings
# (values such as 'UNPD98266' in the output below).
name = data['arr_1']
name

array(['UNPD98266', 'UNPD207163', 'UNPD3499', ..., 'UNPD98267',
       'UNPD47332', 'UNPD101003'], dtype=object)

In [6]:
# Third array in the archive: a two-column float array. Downstream code bins
# on column 0 and writes column 1 into the feature matrix; presumably these
# are (mass, abundance) pairs -- TODO confirm.
massabund = data['arr_2']
massabund

array([[  41.00329 ,    1.688456],
       [  43.01894 ,    2.135631],
       [  55.01894 ,    1.105409],
       ..., 
       [ 549.23414 ,   24.377134],
       [ 551.24979 ,   22.666363],
       [ 591.2447  ,    5.496205]])

In [7]:
# Fourth array in the archive: uint32 indices. generateMatrix uses blockind[i]
# as the output row for massabund[i].
blockind = data['arr_3']
blockind

array([     0,      0,      0, ..., 220988, 220988, 220988], dtype=uint32)

In [8]:
def KDENormal(name, massabund, blockind, bandwidth):
    """
    Build a feature matrix whose column boundaries come from a kernel
    density estimate of the values in the first column of ``massabund``.

    Steps:
      1. Evaluate the density (via ``calculateDensity``) on a 0.1-spaced grid
         that runs from the hundred at or below the smallest value up to the
         next hundred above the largest value.
      2. Take the strict local minima of that curve as split points and save
         them with ``np.save`` under ``'splits_kde_' + str(bandwidth)``.
      3. Append the largest value as the final upper limit, print the shape
         of the resulting limits array, and hand the limits to
         ``generateMatrix``.
      4. Save the matrix with ``np.save`` under ``'kde_' + str(bandwidth)``.

    Parameters
    ----------
    name : unused by this function; kept for the callers' signature.
    massabund : 2-D array; column 0 is the quantity that gets binned.
    blockind : passed through unchanged to ``generateMatrix``.
    bandwidth : kernel bandwidth; also used to tag the output file names.

    Returns
    -------
    The matrix returned by ``generateMatrix``.
    """
    tag = str(bandwidth)
    values = massabund[:, 0]
    largest = values.max()

    # Grid limits are rounded outwards to multiples of 100.
    grid_lo = values.min() // 100 * 100
    grid_hi = largest // 100 * 100 + 100
    grid = np.arange(grid_lo, grid_hi, 0.1)

    density = calculateDensity(values, grid, bandwidth)

    # argrelextrema returns one index array per axis; the curve is 1-D.
    minima_idx = argrelextrema(density, np.less)[0]
    split_points = grid[minima_idx]
    np.save('splits_kde_' + tag, split_points)

    # The largest observed value closes the last group.
    upper_limits = np.append(split_points, largest)
    print('number of groups: {}'.format(upper_limits.shape))

    features = generateMatrix(massabund, blockind, upper_limits)
    np.save('kde_' + tag, features)

    return features

In [9]:
def calculateDensity(values, x, bandwidth):
    """
    Fit a Gaussian kernel density estimator to ``values`` and score it at
    every point of ``x``.

    Both inputs are reshaped to single-column 2-D arrays before being passed
    to scikit-learn. The result of ``score_samples`` (which scikit-learn
    documents as the log-density of each sample) is saved with ``np.save``
    under ``'dens' + str(bandwidth)`` and then returned.

    Parameters
    ----------
    values : array of observations used to fit the estimator.
    x : array of positions at which the fitted estimator is evaluated.
    bandwidth : bandwidth of the Gaussian kernel; also tags the saved file.

    Returns
    -------
    1-D array with one score per element of ``x``.
    """
    observations = values.reshape(-1, 1)
    positions = x.reshape(-1, 1)

    estimator = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    estimator.fit(observations)
    dens = estimator.score_samples(positions)

    np.save('dens' + str(bandwidth), dens)
    return dens

In [10]:
def generateMatrix(massabund, blockind, splits):
    """
    Generating the corresponding matrix based on the splits.

    Row ``i`` of ``massabund`` is a pair ``(m, a)``. The value ``a`` is
    written to ``result[blockind[i], col]`` where ``col`` is the first column
    whose upper limit satisfies ``m <= splits[col]``. Pairs whose ``m`` exceeds
    every limit are dropped. When several pairs land in the same cell the one
    that appears last in ``massabund`` wins. Finally, columns whose sum is not
    positive are removed.

    The column lookup is a binary search over all limits for every pair, so
    the result no longer depends on the pairs of a block being ordered by
    ascending ``m`` (the previous "resume from the last column" shortcut put
    an out-of-order pair into the wrong column).

    Parameters
    ----------
    massabund : array of shape (n, 2); column 0 is binned, column 1 is stored.
    blockind : array of n non-negative integer row indices.
    splits : 1-D array of upper limits, one per candidate column.

    Returns
    -------
    2-D float array with ``blockind.max() + 1`` rows and one column per
    limit that received a positive total. An empty ``blockind`` yields an
    array of shape (0, 0).
    """
    massabund = np.asarray(massabund)
    blockind = np.asarray(blockind)
    splits = np.asarray(splits)

    # blockind.max() is undefined for empty input; there is nothing to place.
    if blockind.size == 0:
        return np.zeros((0, 0))

    n_rows = int(blockind.max()) + 1
    n_cols = splits.size
    result = np.zeros([n_rows, n_cols])

    # Only the first blockind.size pairs are used, as in the original loop.
    masses = massabund[:blockind.size, 0]
    abundances = massabund[:blockind.size, 1]
    rows = blockind.astype(np.intp)  # avoid uint32 overflow in index maths

    # "First column with m <= limit" equals a left binary search on the
    # running maximum of the limits (identical to `splits` when it is sorted).
    running_limits = np.maximum.accumulate(splits)
    cols = np.searchsorted(running_limits, masses, side='left')

    # An index equal to n_cols means m is above every limit: drop the pair.
    in_range = cols < n_cols
    rows = rows[in_range]
    cols = cols[in_range]
    abundances = abundances[in_range]

    # NumPy does not define which value survives a fancy assignment with
    # repeated indices, so keep explicitly the LAST pair for each cell:
    # np.unique reports first occurrences, hence the reversed view.
    flat = rows * n_cols + cols
    _, first_in_reversed = np.unique(flat[::-1], return_index=True)
    last = flat.size - 1 - first_in_reversed
    result[rows[last], cols[last]] = abundances[last]

    goodCol = result.sum(axis=0) > 0
    return result[:, goodCol]

In [11]:
def generateDataFrame(name, features):
    """
    Taking name and features to generate the corresponding dataframe.

    The frame has a leading ``'name'`` column followed by one column per
    feature, labelled ``'fraction1'``, ``'fraction2'``, ... in order.

    The feature columns keep the numeric dtype of ``features``. (Stacking
    the names and the features into a single array first, as was done
    before, forces one common object/string dtype onto every column.)

    Parameters
    ----------
    name : array-like with one label per row of ``features``.
    features : 2-D array of shape (n_rows, n_features).

    Returns
    -------
    pandas.DataFrame with ``n_features + 1`` columns.

    Raises
    ------
    ValueError
        If the number of names differs from the number of feature rows.
    """
    features = np.asarray(features)
    column_name = ['fraction' + str(i + 1) for i in range(features.shape[1])]

    df = pd.DataFrame(features, columns=column_name)
    # Insert the labels afterwards so they do not affect the feature dtypes.
    df.insert(0, 'name', np.asarray(name).ravel())
    return df

In [12]:
# Run the KDE-based binning once per bandwidth. Each call saves its split
# points, density curve and feature matrix to .npy files and prints the shape
# of the group-limit array; the returned matrices are not kept here.
for n in [0.12, 0.1, 0.08]:
    KDENormal(name, massabund, blockind, n)

number of groups: (2401,)
number of groups: (2555,)
number of groups: (2795,)
