In [1]:
# Worker-count setting; no cell visible in this part of the notebook reads it.
# NOTE(review): -1 presumably means "use all cores" (joblib / scikit-learn
# convention) -- confirm against the cell that consumes it.
n_cores = -1

In [2]:
import numpy as np
import pandas as pd

In [3]:
from bisect import bisect_left
from scipy.sparse import csr_matrix, save_npz

In [4]:
# The archive holds an object-dtype array (`arr_1`, the names).  NumPy >= 1.16.3
# refuses to unpickle object arrays unless allow_pickle=True is passed, so
# without it `data['arr_1']` raises ValueError on a fresh kernel.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file here.
data = np.load('inga_out.npz', allow_pickle=True)

In [5]:
# First positional array of the archive; the displayed values are floats.
# NOTE(review): presumably one mass per entry of `name` -- confirm with the
# script that wrote inga_out.npz.
mass = data['arr_0']
mass

array([ 552.293448,  338.046024,  724.221464, ...,  328.072907,
        262.19328 ,  636.278192])

In [6]:
# Second positional array: identifier strings (the displayed values look like
# 'UNPD...' IDs), stored with dtype=object.
name = data['arr_1']
name

array(['UNPD98266', 'UNPD207163', 'UNPD3499', ..., 'UNPD98267',
       'UNPD47332', 'UNPD101003'], dtype=object)

In [7]:
# Two-column float array.  Column 0 is the value that equalWidthSparse bins;
# column 1 is the value generateSparseMatrix stores in the sparse matrix.
# NOTE(review): presumably (fragment mass, abundance) pairs -- confirm.
massabund = data['arr_2']
massabund

array([[  41.00329 ,    1.688456],
       [  43.01894 ,    2.135631],
       [  55.01894 ,    1.105409],
       ..., 
       [ 549.23414 ,   24.377134],
       [ 551.24979 ,   22.666363],
       [ 591.2447  ,    5.496205]])

In [8]:
# uint32 array that generateSparseMatrix uses as the sparse-matrix row index
# of each `massabund` row.
# NOTE(review): presumably the index of the owning entry in `mass`/`name` -- confirm.
blockind = data['arr_3']
blockind

array([     0,      0,      0, ..., 220988, 220988, 220988], dtype=uint32)

### Equal-width Binning

In [9]:
def equalWidthSparse(massabund, blockind, threshold):
    """
    Using equal-width strategy to generate sparse matrix.

    Bin edges are laid out every ``threshold`` units, starting at the largest
    multiple of ``threshold`` that is <= the smallest value in column 0 of
    ``massabund``.  Each row is assigned the number of edges strictly below
    its column-0 value (left bisection), and that number becomes its column
    index in the sparse matrix built by ``generateSparseMatrix``.

    Parameters
    ----------
    massabund : 2-D array
        Column 0 holds the values to bin; the whole array is forwarded to
        ``generateSparseMatrix``.
    blockind : 1-D array
        Row index for every row of ``massabund``; forwarded unchanged.
    threshold : int or float
        Bin width.  Must be positive.

    Returns
    -------
    The matrix returned by ``generateSparseMatrix``.

    Raises
    ------
    ValueError
        If ``threshold`` is not a positive number.

    Side effects
    ------------
    Saves the matrix with ``scipy.sparse.save_npz`` under the name
    ``'equalWidth_' + str(threshold)`` in the current working directory
    (``save_npz`` appends ``.npz`` when the name does not already end in it).
    """
    # `not (x > 0)` also rejects NaN.  A zero or negative width would otherwise
    # fail obscurely inside np.arange or silently yield a single useless bin.
    if not threshold > 0:
        raise ValueError('threshold must be a positive number, got %r' % (threshold,))

    values = massabund[:, 0]
    start = values.min() // threshold * threshold
    end = values.max() // threshold * threshold + threshold
    valueRange = np.arange(start, end, threshold)

    # Vectorised equivalent of calling bisect_left(valueRange, m) for every m:
    # side='left' returns the same leftmost insertion point.
    groups = np.searchsorted(valueRange, values, side='left')

    features = generateSparseMatrix(massabund, blockind, groups)
    save_npz('equalWidth_'+str(threshold), features)

    return features

In [10]:
def generateSparseMatrix(massabund, blockind, groups):
    """
    Generating the corresponding sparse matrix based on the groups.

    Row ``i`` of ``massabund`` contributes the value ``massabund[i, 1]`` to
    cell ``(blockind[i], groups[i])``.  Entries that land on the same cell are
    summed when the CSR matrix is built.  Columns whose total is not positive
    are then removed, so the result only keeps columns that carry signal.

    Parameters
    ----------
    massabund : 2-D array
        Column 1 supplies the stored values.
    blockind : 1-D array of non-negative integers
        Row index for each row of ``massabund``.
    groups : 1-D array of non-negative integers
        Column index for each row of ``massabund``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(blockind.max() + 1, number_of_kept_columns)``.
    """
    data = np.asarray(massabund)[:, 1]
    # Use the index arrays directly as integers.  Stacking them next to the
    # float columns would copy the whole data set and hand float-valued
    # indices to the sparse constructor.
    row = np.asarray(blockind).ravel().astype(np.intp)
    col = np.asarray(groups).ravel().astype(np.intp)
    n_rows = int(row.max()) + 1
    n_cols = int(col.max()) + 1

    result = csr_matrix((data, (row, col)), shape=(n_rows, n_cols))

    # sum(axis=0) returns a 1 x n_cols matrix; flatten it into a boolean mask.
    goodCol = np.asarray(result.sum(axis=0)).ravel() > 0

    return result[:, goodCol]

In [11]:
# Build and save one equal-width feature matrix per bin width.  Each call
# writes its matrix to disk inside equalWidthSparse; the return value is
# discarded here.
for n in [1, 0.5, 0.1, 0.05, 0.01]:
    equalWidthSparse(massabund, blockind, n)

### Equal-frequency Binning

Equal-frequency binning does not work very well for this distribution.