In [1]:
import pandas as pd
import pathlib
import numpy as np
from pprint import pprint
from functools import partial

from {{ cookiecutter.module_name }}.data import datasets, utils, Dataset
from {{ cookiecutter.module_name }}.data.datasets import (build_dataset_dict, fetch_and_unpack, fetch_text_file, read_space_delimited,
                                   load_dataset)
from {{ cookiecutter.module_name }}.data.utils import hash_file, list_dir, head_file, normalize_labels
from {{ cookiecutter.module_name }}.paths import interim_data_path, raw_data_path, processed_data_path

In [2]:
%load_ext autoreload
%autoreload 2

# Adding and Processing Natural Datasets
## The LVQ-PAK Finnish Phonetic dataset

The Learning Vector Quantization project includes a simple Finnish phonetic dataset
consisting of 20-dimensional data and their associated targets. Let's explore this dataset and
add it to our global `datasets.json` so it can be unpacked and processed automatically.

In [3]:
dataset_name='lvq-pak'

Download the tarfile and build the dataset dictionary for it. If we know the hash of this file, we should include it here. If not, one will be computed from this download and used for comparison on subsequent downloads.

In [4]:
# Grab the source code package
lvq_pak = build_dataset_dict(url="http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar")
lvq_pak

{'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar',
 'hash_type': 'sha1',
 'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',
 'name': None,
 'file_name': None}

The **name** field can be used to indicate the type of datafile being downloaded. Usually, this is just informational. However, if you specify names `DESCR` or `LICENSE`, the downloaded (text) file will be used as the dataset description and license text, respectively.

Usually you will want to give these unique names, so they don't clash with other downloaded files. (e.g. "LICENSE.txt" is a terrible name to use). We use the **file_name** option for this:

In [5]:
descr = build_dataset_dict(url='http://www.cis.hut.fi/research/lvq_pak/README', file_name=f'{dataset_name}.readme',
                       name='DESCR')

In [6]:
# notice the files have been downloaded to the RAW directory
list_dir(raw_data_path)

['.gitkeep', 'lvq-pak.license', 'lvq_pak-3.1.tar', 'lvq-pak.readme']

Next, we combine the complete set of files into a URL list and use this to build our json file entry.

In [7]:
url_list = [lvq_pak, descr]
url_list

[{'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar',
  'hash_type': 'sha1',
  'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',
  'name': None,
  'file_name': None},
 {'url': 'http://www.cis.hut.fi/research/lvq_pak/README',
  'hash_type': 'sha1',
  'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',
  'name': 'DESCR',
  'file_name': 'lvq-pak.readme'}]

In [8]:
newds_dict = datasets.add_dataset_by_urllist(dataset_name, url_list)
pprint(newds_dict)

{'action': 'fetch_and_process',
 'load_function': functools.partial(<function new_dataset at 0x7f7c20955d90>, dataset_name='lvq-pak'),
 'load_function_args': [],
 'load_function_kwargs': {'dataset_name': 'lvq-pak'},
 'load_function_module': '{{ cookiecutter.module_name }}.data.datasets',
 'load_function_name': 'new_dataset',
 'url_list': [{'file_name': None,
               'hash_type': 'sha1',
               'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',
               'name': None,
               'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar'},
              {'file_name': 'lvq-pak.readme',
               'hash_type': 'sha1',
               'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',
               'name': 'DESCR',
               'url': 'http://www.cis.hut.fi/research/lvq_pak/README'}]}


See that a generic `load_function` (`new_dataset`) has been used to process the data. This does nothing more than populate the DESCR and LICENSE fields (if possible), creating an otherwise empty `Dataset` object.

In [9]:
# Now, call the (generic) load function and notice that DESCR has been set.
# LICENSE is still None at this point, because no license file has been supplied yet.
dset = newds_dict['load_function']()
type(dset)

{{ cookiecutter.module_name }}.data.dset.Dataset

In [10]:
print(dset.DESCR)

************************************************************************
*                                                                      *
*                              LVQ_PAK                                 *
*                                                                      *
*                                The                                   *
*                                                                      *
*                   Learning  Vector  Quantization                     *
*                                                                      *
*                          Program  Package                            *
*                                                                      *
*                   Version 3.1 (April 7, 1995)                        *
*                                                                      *
*                          Prepared by the                             *
*                    LVQ Programming Team of the   

In [11]:
print(dset.LICENSE)

None


Datasets should *always* have an explicit license. Reading the project documentation, we see a license in one of the text files. We can extract and use that.

In [12]:
license_txt = '''
************************************************************************
*                                                                      *
*                              LVQ_PAK                                 *
*                                                                      *
*                                The                                   *
*                                                                      *
*                   Learning  Vector  Quantization                     *
*                                                                      *
*                          Program  Package                            *
*                                                                      *
*                   Version 3.1 (April 7, 1995)                        *
*                                                                      *
*                          Prepared by the                             *
*                    LVQ Programming Team of the                       *
*                 Helsinki University of Technology                    *
*           Laboratory of Computer and Information Science             *
*                Rakentajanaukio 2 C, SF-02150 Espoo                   *
*                              FINLAND                                 *
*                                                                      *
*                      Copyright (c) 1991-1995                         *
*                                                                      *
************************************************************************
*                                                                      *
*  NOTE: This program package is copyrighted in the sense that it      *
*  may be used for scientific purposes. The package as a whole, or     *
*  parts thereof, cannot be included or used in any commercial         *
*  application without written permission granted by its producents.   *
*  No programs contained in this package may be copied for commercial  *
*  distribution.                                                       *
*                                                                      *
*  All comments concerning this program package may be sent to the     *
*  e-mail address 'lvq@nucleus.hut.fi'.                                *
*                                                                      *
************************************************************************
'''

In [13]:
# Build the LICENSE entry from the literal text above, then rebuild the URL list
# from its parts. Rebuilding (instead of `url_list += ...`) keeps this cell
# idempotent: re-running it will not append the license entry a second time.
license_entry = build_dataset_dict(from_txt=license_txt, file_name=f'{dataset_name}.license', name='LICENSE')
url_list = [lvq_pak, descr, license_entry]

Now, reload the dataset from scratch and check that the license is there

In [14]:
newds_dict = datasets.add_dataset_by_urllist(dataset_name, url_list)
dset = datasets.load_dataset(dataset_name)
print(dset.LICENSE)


************************************************************************
*                                                                      *
*                              LVQ_PAK                                 *
*                                                                      *
*                                The                                   *
*                                                                      *
*                   Learning  Vector  Quantization                     *
*                                                                      *
*                          Program  Package                            *
*                                                                      *
*                   Version 3.1 (April 7, 1995)                        *
*                                                                      *
*                          Prepared by the                             *
*                    LVQ Programming Team of the  

## Processing the data
The next step is to write the importer that actually processes the data we will be using for this dataset.

The important things to generate are the `data` and `target` entries. A `metadata` entry is optional, but recommended if you want to save additional information about the dataset.

Usually, this functionality gets bundled up into a function and added to `datasets.py`


In [15]:
# Unpack the file
untar_dir = fetch_and_unpack(dataset_name)
unpack_dir = untar_dir / 'lvq_pak-3.1'
list_dir(unpack_dir)

['balance.c',
 'config.h',
 'lvq_rout.c',
 'ex1.dat',
 'knntest.c',
 'classify.c',
 'README',
 'fileio.h',
 'mindist.c',
 'lvq_pak.h',
 'labels.c',
 'pick.c',
 'lvq_run.c',
 'ex2.dat',
 'lvqtrain.c',
 'accuracy.c',
 'datafile.c',
 'cmatr.c',
 'fileio.c',
 'makefile.dos',
 'VERSION',
 'version.h',
 'mcnemar.c',
 'sammon.c',
 'stddev.c',
 'elimin.c',
 'lvq_pak.c',
 'lvq_rout.h',
 'showlabs.c',
 'extract.c',
 'setlabel.c',
 'labels.h',
 'version.c',
 'datafile.h',
 'makefile.unix',
 'eveninit.c',
 'errors.h']

In this dataset, the training and test datasets are stored in files named `ex1.dat` and `ex2.dat`, respectively.

In [16]:
datafile_train = unpack_dir / 'ex1.dat'
datafile_test = unpack_dir / 'ex2.dat'

datafile_train.exists() and datafile_test.exists()

True

According to the documentation, the data format is space-delimited, with the class label included as the last column. Let's have a look

In [17]:
print(head_file(datafile_train))

20
# Example data from speech signal
21.47 -19.90 -20.68 -6.73 13.67 -11.95 13.83 12.02 7.62 -6.15 -4.38 -2.91 4.80 -7.39 -3.54 -0.87 -5.02 -1.41 -2.33 2.12 A
0.05 28.38 9.52 -11.30 3.11 -11.88 -2.90 -11.04 2.32 -13.80 1.71 -0.40 -1.36 3.91 3.21 -0.98 -0.14 -4.70 0.30 0.27 I
-4.71 -4.61 -0.64 1.78 -1.48 5.98 12.55 -0.50 4.74 4.68 3.27 -0.36 9.24 3.39 -0.40 -1.59 0.94 2.17 -0.10 -0.45 #



Indeed, the datafile consists of 1 line containing the dimension of the data, a comment, and then 21 space-delimited columns, the final column being the target class label. 

**Note:** We have to be a little careful importing the data, because '#' is used both as the comment delimiter, and as a class label.

Fortunately, we have a helper function for this. We will get a little cheeky and skip the first 2 lines (hoping there are no other comments). The documentation also says there are 1962 entries in each of the training and test datasets.

In [18]:
# Training file: skip the dimension line and the comment line shown by head_file above
data, target = read_space_delimited(datafile_train, skiprows=[0,1])
# NOTE(review): only the first line is skipped for the test file -- presumably
# ex2.dat has no comment line after the dimension line; confirm against the file
data2, target2 = read_space_delimited(datafile_test, skiprows=[0])

# The documentation says each file holds 1962 entries; check the shapes agree
data.shape, target.shape, data2.shape, target2.shape

((1962, 20), (1962,), (1962, 20), (1962,))

In [19]:
target

array(['A', 'I', '#', ..., '#', 'Y', '#'], dtype=object)

This seems to work, so let's wrap this functionality up into a processing function.
By convention, the function takes a `dataset_name`, and any other options that may be useful for reading the data, and returns a dictionary that matches the `Dataset` constructor signature.

We will place this function in `localdata.py` (and add it to `__all__`) to make it visible to our dataset code.

In [20]:
%%file ../src/data/localdata.py
"""
Custom dataset processing/generation functions should be added to this file
"""

from {{ cookiecutter.module_name }}.data.utils import read_space_delimited, normalize_labels
from {{ cookiecutter.module_name }}.paths import interim_data_path
import numpy as np

__all__ = ['process_lvq_pak']

def process_lvq_pak(dataset_name='lvq-pak', kind='all', numeric_labels=True, metadata=None):
    """
    Build the `Dataset` constructor options for the LVQ-PAK phonetic data.

    Reads `ex1.dat` (training) and/or `ex2.dat` (test) from the
    `lvq_pak-3.1` directory under `interim_data_path / dataset_name`.

    dataset_name: string (default: 'lvq-pak')
        name of the dataset; also the name of the directory under
        `interim_data_path` that holds the unpacked files
    kind: {'test', 'train', 'all'}, default 'all'
        which file(s) to read; 'all' stacks the training rows followed by the test rows
    numeric_labels: boolean (default: True)
        if set, target is a vector of integers, and label_map is created in the metadata
        to reflect the mapping to the string targets
    metadata: dict or None (default: None)
        optional metadata to attach to the dataset. The caller's dict is never
        modified; when a label_map is added it is added to a shallow copy.

    Returns a dict with keys 'dataset_name', 'data', 'target' and 'metadata'.

    Raises ValueError if `kind` is not one of the supported values.
    """
    # Validate up front so a typo fails fast, before any file is touched
    if kind not in ('train', 'test', 'all'):
        raise ValueError(f'Unknown kind: {kind}')

    unpack_dir = interim_data_path / dataset_name / 'lvq_pak-3.1'

    # ex1.dat starts with a dimension line and a comment line, so two rows are skipped.
    # NOTE(review): only one row is skipped for ex2.dat -- presumably it has no
    # comment line; confirm against the file.
    if kind == 'train':
        data, target = read_space_delimited(unpack_dir / 'ex1.dat', skiprows=[0, 1])
    elif kind == 'test':
        data, target = read_space_delimited(unpack_dir / 'ex2.dat', skiprows=[0])
    else:  # kind == 'all'
        train_data, train_target = read_space_delimited(unpack_dir / 'ex1.dat', skiprows=[0, 1])
        test_data, test_target = read_space_delimited(unpack_dir / 'ex2.dat', skiprows=[0])
        data = np.vstack((train_data, test_data))
        target = np.append(train_target, test_target)

    if numeric_labels:
        # Work on a copy so the caller's metadata dict is not mutated
        metadata = {} if metadata is None else dict(metadata)
        target, label_map = normalize_labels(target)
        metadata['label_map'] = label_map

    return {
        'dataset_name': dataset_name,
        'data': data,
        'target': target,
        'metadata': metadata,
    }


Overwriting ../src/data/localdata.py


Let's make sure this works as expected

In [21]:
# Imported here rather than in the first cell: localdata.py has only just been
# (re)written by the %%file cell above
from {{ cookiecutter.module_name }}.data.localdata import process_lvq_pak

# Build a Dataset for each supported `kind` and report the resulting shapes
for kind in ['train', 'test', 'all']:
    dset_opts = process_lvq_pak(kind=kind)
    dset = Dataset(**dset_opts)
    print(f'{kind}: data={dset.data.shape} target={dset.target.shape}')

train: data=(1962, 20) target=(1962,)
test: data=(1962, 20) target=(1962,)
all: data=(3924, 20) target=(3924,)


This all looks good


In [22]:
datasets.add_dataset_from_function(dataset_name, process_lvq_pak)

{'action': 'fetch_and_process',
 'load_function_args': [],
 'load_function_kwargs': {},
 'load_function_module': '{{ cookiecutter.module_name }}.data.localdata',
 'load_function_name': 'process_lvq_pak',
 'url_list': [{'file_name': None,
   'hash_type': 'sha1',
   'hash_value': '86024a871724e521341da0ffb783956e39aadb6e',
   'name': None,
   'url': 'http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar'},
  {'file_name': 'lvq-pak.readme',
   'hash_type': 'sha1',
   'hash_value': '138b69cc0b4e02950cec5833752e50a54d36fd0f',
   'name': 'DESCR',
   'url': 'http://www.cis.hut.fi/research/lvq_pak/README'},
  {'contents': "\n************************************************************************\n*                                                                      *\n*                              LVQ_PAK                                 *\n*                                                                      *\n*                                The                                   *\n*   

Finally, re-load the dataset and save a copy of it

In [26]:
lvq = load_dataset(dataset_name)
print(f"data:{lvq.data.shape}, target:{lvq.target.shape}")
lvq.dump()

data:(3924, 20), target:(3924,)


In [27]:
lvq

{'metadata': {'label_map': {0: '#',
   1: '&',
   2: 'A',
   3: 'D',
   4: 'E',
   5: 'F',
   6: 'H',
   7: 'I',
   8: 'J',
   9: 'L',
   10: 'M',
   11: 'N',
   12: 'O',
   13: 'R',
   14: 'S',
   15: 'U',
   16: 'V',
   17: 'Y',
   18: '[',
   19: '\\'},
  'dataset_name': 'lvq-pak',
  'data_hash': 'c87a90aee1ddadb50282e68b9f0155a74b6d7a61',
  'target_hash': '6332a9bca3d44ccf311a79013a4e3937abe12be5'},
 'data': array([['21.47', '-19.90', '-20.68', ..., '-1.41', '-2.33', '2.12'],
        ['0.05', '28.38', '9.52', ..., '-4.70', '0.30', '0.27'],
        ['-4.71', '-4.61', '-0.64', ..., '2.17', '-0.10', '-0.45'],
        ...,
        ['-2.63', '-6.59', '0.19', ..., '0.76', '0.89', '-3.48'],
        ['5.35', '4.96', '18.75', ..., '-0.57', '0.00', '1.35'],
        ['-0.37', '-5.27', '-1.74', ..., '3.48', '-0.90', '-1.00']],
       dtype=object),
 'target': array([ 2,  7,  0, ...,  0, 17,  0])}

In [24]:
list_dir(processed_data_path)

['.gitkeep', 'lvq-pak.dataset', 'lvq-pak.metadata']

In [25]:
pprint(lvq.metadata)

{'data_hash': 'c87a90aee1ddadb50282e68b9f0155a74b6d7a61',
 'dataset_name': 'lvq-pak',
 'label_map': {0: '#',
               1: '&',
               2: 'A',
               3: 'D',
               4: 'E',
               5: 'F',
               6: 'H',
               7: 'I',
               8: 'J',
               9: 'L',
               10: 'M',
               11: 'N',
               12: 'O',
               13: 'R',
               14: 'S',
               15: 'U',
               16: 'V',
               17: 'Y',
               18: '[',
               19: '\\'},
 'target_hash': '6332a9bca3d44ccf311a79013a4e3937abe12be5'}
