# BOSS: Bag-of-SFA Symbols

* Website: https://www2.informatik.hu-berlin.de/~schaefpa/boss/

* Paper: https://www2.informatik.hu-berlin.de/~schaefpa/boss.pdf

**Note: an Internet connection is required to download the datasets used in this benchmark.**

In [1]:
import numpy as np
from pyts.transformation import BOSS
from pyts.classification import KNeighborsClassifier
from pyts.datasets import fetch_ucr_dataset
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier


# Report the library versions this benchmark was run with.
import pyts
import sklearn

for label, module in (("pyts", pyts), ("sklearn", sklearn)):
    print(label + " version: {0}".format(module.__version__))

pyts version: 0.13.0
sklearn version: 1.2.2


In [2]:
# 1-nearest-neighbor classifier using the 'boss' metric; it is the final
# step of every pipeline built in the evaluation loop.
knn = KNeighborsClassifier(
    metric='boss',
    n_neighbors=1,
)

# BOSS hyper-parameters for each benchmarked UCR dataset.
#
# Scalar values describe a single BOSS transformer. Array values describe an
# ensemble: the evaluation loop builds one BOSS pipeline per index and combines
# them with a VotingClassifier. All arrays of a dataset must therefore have the
# same length, since entry ``i`` of every array configures ensemble member ``i``.
dataset_params = {

    # 9 members: word sizes (10, 12, 14) crossed with window sizes (60, 80, 100).
    'Adiac': {'word_size': np.tile(np.arange(10, 16, 2), 3),
              'window_size': np.repeat(np.arange(60, 110, 20), 3),
              'norm_mean': np.full(9, True),
              'drop_sum': np.full(9, True)},

    'ECG200': {'word_size': 8,
               'window_size': 40,
               'norm_mean': False,
               'drop_sum': False},

    'GunPoint': {'word_size': 8,
                 'window_size': 40,
                 'norm_mean': True,
                 'drop_sum': True},

    'MiddlePhalanxTW': {'word_size': 10,
                        'window_size': 30,
                        'norm_mean': False,
                        'drop_sum': False},

    'Plane': {'word_size': 6,
              'window_size': 10,
              'norm_mean': True,
              'drop_sum': True},

    # 19 members: np.arange(18, 37) yields the window sizes 18..36 (19 values).
    # The companion arrays were previously built with 20 entries; the extra
    # entry was never read because the ensemble size is taken from
    # ``window_size``. They are sized to 19 so every array agrees.
    'SyntheticControl': {'word_size': np.full(19, 5),
                         'window_size': np.arange(18, 37),
                         'norm_mean': np.full(19, True),
                         'drop_sum': np.full(19, True)}

}

In [3]:
# Evaluate BOSS + 1-NN on every configured dataset and print the test accuracy.
for dataset, params in dataset_params.items():
    # Header: the dataset name underlined with dashes of the same width.
    print(dataset)
    print('-' * len(dataset))

    X_train, X_test, y_train, y_test = fetch_ucr_dataset(dataset, return_X_y=True)

    if not isinstance(params['window_size'], np.ndarray):
        # Scalar hyper-parameters: a single BOSS -> k-NN pipeline.
        clf = Pipeline([('boss', BOSS(**params, sparse=False)), ('knn', knn)])
    else:
        # Array hyper-parameters: entry `idx` of every array configures one
        # pipeline; the pipelines are then combined by a VotingClassifier.
        # The number of members is taken from `window_size`.
        members = []
        for idx in range(len(params['window_size'])):
            member_params = {name: values[idx] for name, values in params.items()}
            member = Pipeline([('boss', BOSS(**member_params, sparse=False)),
                               ('knn', knn)])
            members.append(('pipeline_' + str(idx), member))
        clf = VotingClassifier(members)

    # Fit on the training split, then score on the held-out test split.
    fitted = clf.fit(X_train, y_train)
    accuracy = fitted.score(X_test, y_test)

    print('Accuracy on the test set: {0:.3f}'.format(accuracy))
    print()

Adiac
-----
Accuracy on the test set: 0.752

ECG200
------
Accuracy on the test set: 0.870

GunPoint
--------
Accuracy on the test set: 1.000

MiddlePhalanxTW
---------------
Accuracy on the test set: 0.526

Plane
-----
Accuracy on the test set: 1.000

SyntheticControl
----------------
Accuracy on the test set: 0.963

