In [1]:
# NOTE(review): not referenced by any cell visible here; presumably a worker-count
# setting where -1 means "use all cores" — confirm it is still needed.
n_cores = -1

In [2]:
import numpy as np
import pandas as pd

In [3]:
# 'arr_1' in this archive is an object-dtype array (see its recorded repr below).
# NumPy >= 1.16.3 defaults to allow_pickle=False and raises ValueError when such
# an entry is accessed, so pickling has to be enabled explicitly.
# NOTE(review): unpickling can execute arbitrary code — only load archives that
# you produced yourself or otherwise trust.
data = np.load('inga_out.npz', allow_pickle=True)

In [4]:
# Entry 'arr_0' of the archive; the recorded output shows a 1-D float array
# (presumably one mass per compound — TODO confirm). Not used by the cells below.
mass = data['arr_0']
mass

array([ 552.293448,  338.046024,  724.221464, ...,  328.072907,
        262.19328 ,  636.278192])

In [5]:
# Entry 'arr_1' of the archive; the recorded output shows an object-dtype array
# of identifier strings such as 'UNPD98266'.
name = data['arr_1']
name

array(['UNPD98266', 'UNPD207163', 'UNPD3499', ..., 'UNPD98267',
       'UNPD47332', 'UNPD101003'], dtype=object)

In [6]:
# Entry 'arr_2' of the archive: a two-column float array. generateMatrix reads
# column 0 as the value to bin and column 1 as the value written into the matrix
# (presumably fragment mass and abundance, going by the variable name).
massabund = data['arr_2']
massabund

array([[  41.00329 ,    1.688456],
       [  43.01894 ,    2.135631],
       [  55.01894 ,    1.105409],
       ..., 
       [ 549.23414 ,   24.377134],
       [ 551.24979 ,   22.666363],
       [ 591.2447  ,    5.496205]])

In [7]:
# Entry 'arr_3' of the archive: a uint32 array. generateMatrix uses blockind[i]
# as the output row index for the pair stored in massabund[i].
blockind = data['arr_3']
blockind

array([     0,      0,      0, ..., 220988, 220988, 220988], dtype=uint32)

### Splitting

In [8]:
def splitting(name, massabund, blockind, threshold):
    """
    Build the feature matrix using the gap-splitting strategy.

    The values in column 0 of `massabund` are sorted and cut wherever two
    neighbouring values are at least `threshold` apart (see recurSplitting).
    Two files are written to the working directory as a side effect:

    * 'splits_splitting_<threshold>.npy' -- the sorted cut points, without
      the closing upper bound that is appended afterwards;
    * 'splitting_<threshold>.npy'        -- the resulting feature matrix.

    Parameters
    ----------
    name : unused by this function; kept so an optional CSV export via
        generateDataFrame(name, features) can be added by the caller.
    massabund : 2-D array, column 0 is binned, column 1 is stored.
    blockind : 1-D array giving the output row of each `massabund` row.
    threshold : positive number, minimum gap that triggers a cut.

    Returns
    -------
    The matrix produced by generateMatrix.
    """
    suffix = str(threshold)
    sortedMasses = np.sort(massabund[:, 0])

    # Collect the cut points, then put them in ascending order before saving.
    cutPoints = []
    recurSplitting(sortedMasses, cutPoints, threshold)
    cutPoints.sort()
    np.save('splits_splitting_' + suffix, cutPoints)

    # Close the last bin with the largest observed value.
    cutPoints.append(sortedMasses.max())
    upperBounds = np.array(cutPoints)

    features = generateMatrix(massabund, blockind, upperBounds)
    np.save('splitting_' + suffix, features)

    return features

In [9]:
def recurSplitting(arr, res, threshold):
    """
    Split values into groups until, within each group, no two consecutive
    values differ by `threshold` or more.

    Every cut point is the midpoint of the two values around a gap and is
    appended to `res`. The largest gap of a range is cut first (the first
    one on ties), then the left part is processed completely, then the
    right part -- the same order a recursive formulation produces.

    The work list is an explicit stack rather than real recursion, so the
    number of nested cuts is not limited by Python's recursion depth, and
    the per-range scan for the largest gap is done by NumPy.

    Parameters
    ----------
    arr : 1-D sequence of numbers, expected in ascending order.
    res : list that receives the cut points (modified in place).
    threshold : positive number; gaps >= threshold are cut.

    Raises
    ------
    ValueError
        If `threshold` is not positive.
    """
    if threshold <= 0:
        raise ValueError('Threshold should be positive.')

    values = np.asarray(arr, dtype=float)
    gaps = np.diff(values)  # gaps[j] == values[j+1] - values[j]
    # A NaN gap never wins a "greater than" comparison in a scalar scan;
    # mask it so argmax skips it as well.
    gaps[np.isnan(gaps)] = -np.inf

    pending = [(0, values.size)]  # half-open index ranges still to examine
    while pending:
        lo, hi = pending.pop()
        if hi - lo < 2:
            continue
        window = gaps[lo:hi - 1]
        k = int(np.argmax(window))  # first occurrence of the largest gap
        if window[k] < threshold:
            continue
        cut = lo + k + 1
        res.append((values[cut - 1] + values[cut]) / 2)
        # Push the right part first so the left part is handled first.
        pending.append((cut, hi))
        pending.append((lo, cut))

In [10]:
def generateMatrix(massabund, blockind, splits):
    """
    Generate the feature matrix that corresponds to the given splits.

    Row i of `massabund` is a (mass, abundance) pair belonging to output row
    blockind[i]. The abundance is written into the first column whose upper
    limit satisfies mass <= splits[col]. Pairs whose mass exceeds every
    limit are dropped. When several pairs of one row fall into the same
    column, the one that appears last in `massabund` wins. Columns whose
    sum is not positive are removed from the result.

    Each mass is binned independently with a binary search, so the result
    does not depend on the masses of a row being stored in ascending order.

    Parameters
    ----------
    massabund : 2-D array, column 0 = mass, column 1 = abundance.
    blockind : 1-D integer array, output row of each `massabund` row.
    splits : 1-D array of upper bin limits; must be in ascending order.

    Returns
    -------
    2-D float array with blockind.max() + 1 rows and one column per
    non-empty bin. Empty input yields an array of shape (0, 0).
    """
    massabund = np.asarray(massabund)
    blockind = np.asarray(blockind)
    splits = np.asarray(splits)

    if blockind.size == 0:
        # No peaks at all: there is no row count to derive, return empty.
        return np.zeros((0, 0))

    nCols = splits.size
    result = np.zeros([int(blockind.max()) + 1, nCols])

    masses = massabund[:blockind.size, 0]
    abundances = massabund[:blockind.size, 1]

    # side='left' returns the first col with mass <= splits[col];
    # a mass above every limit (or NaN) maps to nCols and is filtered out.
    cols = np.searchsorted(splits, masses, side='left')
    placed = cols < nCols

    flatCell = blockind[placed].astype(np.int64) * nCols + cols[placed]
    values = abundances[placed]

    # Fancy assignment with repeated indices has no guaranteed order, so
    # pick the last occurrence of every cell explicitly: the first
    # occurrence in the reversed sequence is the last one in the original.
    cells, firstInReversed = np.unique(flatCell[::-1], return_index=True)
    result.flat[cells] = values[::-1][firstInReversed]

    goodCol = result.sum(axis=0) > 0
    return result[:, goodCol]

In [11]:
def generateDataFrame(name, features):
    """
    Take names and features and generate the corresponding dataframe.

    The frame has a leading 'name' column followed by one column per
    feature, labelled 'fraction1', 'fraction2', ...

    The frame is built from the numeric matrix and the name column is
    inserted afterwards. Concatenating names and features into a single
    array would force one common dtype on everything, turning the feature
    columns into objects or, for str-typed names, into strings.

    Parameters
    ----------
    name : 1-D array-like with one label per row of `features`.
    features : 2-D numeric array.

    Returns
    -------
    pandas.DataFrame with columns ['name', 'fraction1', ...].

    Raises
    ------
    ValueError
        If the number of names differs from the number of feature rows.
    """
    features = np.asarray(features)
    column_name = ['fraction' + str(i + 1) for i in range(features.shape[1])]
    df = pd.DataFrame(features, columns=column_name)
    df.insert(0, 'name', np.asarray(name).reshape(-1))
    return df

In [12]:
# Build one feature matrix per gap threshold. Each call to splitting() also
# writes 'splits_splitting_<n>.npy' and 'splitting_<n>.npy' to the working
# directory; the returned matrices are discarded here.
for n in [0.2, 0.1, 0.05]:
    splitting(name, massabund, blockind, n)