# Load Dataset and Mapping

In [1]:
import numpy as np
# Read the labels from the CSV file as a NumPy array of strings.
data = np.genfromtxt(
    'data.csv',
    dtype='str',
)

# Label groups to encode: an entry is either a list of labels or one bare label.
mapping = [
    ['a', 'b'],
    'c',
    ['d', 'e', 'f'],
]

In [2]:
# Print every group next to its position in `mapping`.
for position, group in enumerate(mapping):
    print(position, group)

0 ['a', 'b']
1 c
2 ['d', 'e', 'f']


# Current Implementation v0.1.2
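(Do not rerun the cells in this section once the new version has replaced v0.1.2: they record the baseline timings and the reference output `enc_old` that the new code is compared against.)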

In [3]:
import sys
sys.path.append('..')
from grouplabelencode import grouplabelencode, grouplabelencode_loop

In [4]:
# Baseline timing: the imported grouplabelencode entry point on the full dataset.
%timeit encoded = grouplabelencode(data, mapping, nastate=True)

2.52 s ± 190 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Baseline timing: the imported inner loop called directly with explicit codes [0, 1, 2].
# The trailing 3 is presumably the code for unmatched labels - confirm against
# grouplabelencode_loop's signature.
%timeit encoded = grouplabelencode_loop(data, mapping, [0,1,2], 3)

2.39 s ± 192 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Keep the imported loop's output as the reference that the new implementation
# is compared against at the end of the notebook.
enc_old = grouplabelencode_loop(data, mapping, [0,1,2], 3)

# New Code

In [7]:
def grouplabelencode_loop_new(data: list, mapping: list, encoding: list, nacode: int = None) -> list:
    """Replace every label in ``data`` with the code of the group containing it.

    Parameters
    ----------
    data : iterable
        Labels to encode, processed in order.
    mapping : list
        Groups of labels. An entry is either a collection of labels
        (e.g. ``['a', 'b']``) or a single scalar label (e.g. ``'c'``).
    encoding : list
        Codes aligned with ``mapping``: ``encoding[i]`` is emitted for a
        label found in ``mapping[i]``.
    nacode : optional
        Value emitted for a label that belongs to no group (default ``None``).

    Returns
    -------
    list
        One code per element of ``data``, in the same order. When a label
        occurs in several groups, the first matching group wins.
    """
    # Wrap scalar groups once, up front. Left bare, a string group turns
    # `label in group` into a substring test (so '' or 'b' would match 'abc'),
    # and a numeric group raises TypeError because it is not a container.
    groups = [
        [group] if isinstance(group, (str, bytes, int, float)) else group
        for group in mapping
    ]

    out = []
    for label in data:
        enc = nacode
        for idx, group in enumerate(groups):
            if label in group:
                enc = encoding[idx]
                break  # first matching group wins
        out.append(enc)
    return out

def grouplabelencode_new(data: list, mapping: dict, nacode: int = None, nastate: bool = False) -> list:
    """Encode the labels in ``data`` according to the label groups in ``mapping``.

    Parameters
    ----------
    data : iterable
        Labels to encode.
    mapping : list or dict
        Despite the annotation, both forms are accepted:

        * list  -- each entry is a group; a group's position is its code.
        * dict  -- ``{code: group}``; the keys are used as the codes.

        A group is a collection of labels or a single scalar label
        (str, bytes, int or float), which is treated as a one-element group.
    nacode : optional
        Code for labels that belong to no group. It is passed through
        unchanged whenever it is not ``None``.
    nastate : bool
        When ``True`` and ``nacode`` is ``None``, unmatched labels are encoded
        as ``len(mapping)``. Note that for a dict mapping this default is not
        checked against the dict's keys, so it can coincide with one of them.

    Returns
    -------
    list
        One code per element of ``data``.

    Raises
    ------
    TypeError
        If ``mapping`` is neither a list nor a dict.
    """
    # Validate and unpack the mapping first, so that an invalid argument is
    # reported by this check and not by a failing len() call further down.
    if isinstance(mapping, list):
        groups = mapping
        codes = list(range(len(mapping)))
    elif isinstance(mapping, dict):
        groups = list(mapping.values())
        codes = list(mapping.keys())
    else:
        raise TypeError("'mapping' must be list-of-list or dict.")

    # What value is used for missing data?
    if nastate and nacode is None:
        nacode = len(mapping)

    # Convert scalar elements into a list so membership is an equality test,
    # not a substring test
    groups = [
        [group] if isinstance(group, (str, bytes, int, float)) else group
        for group in groups
    ]

    # Loop over 'data' array
    return grouplabelencode_loop_new(data, groups, codes, nacode=nacode)

In [8]:
# Time the new wrapper on the same inputs. With nastate=True and no nacode,
# unmatched labels are encoded as len(mapping).
%timeit encoded = grouplabelencode_new(data, mapping, nastate=True)

1.35 s ± 137 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Time the new inner loop directly, with explicit codes [0, 1, 2] and nacode=3.
%timeit encoded = grouplabelencode_loop_new(data, mapping, [0,1,2], 3)

1.62 s ± 288 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Encode with the new loop, then check element by element that the result
# reproduces the reference output stored in `enc_old`.
enc_new = grouplabelencode_loop_new(data, mapping, encoding=[0, 1, 2], nacode=3)
(np.array(enc_old) == np.array(enc_new)).all()

True