# BOSS: Bag-of-SFA Symbols

* Website: https://www2.informatik.hu-berlin.de/~schaefpa/boss/

* Paper: https://www2.informatik.hu-berlin.de/~schaefpa/boss.pdf

In [1]:
import numpy as np
from pyts.transformation import BOSS
from pyts.classification import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier


import pyts
print("pyts: {0}".format(pyts.__version__))

pyts: 0.8.0


In [2]:
PATH = "UCRArchive_2018/" # Change this value if necessary

In [3]:
# Little trick to convert BOSS output, which is a sparse matrix, to a dense array
from sklearn.preprocessing import FunctionTransformer


transformer = FunctionTransformer(func=lambda x: x.toarray(),
                                  validate=False, check_inverse=False)

In [4]:
knn = KNeighborsClassifier(n_neighbors=1, metric='boss')

# Adiac

In [5]:
dataset_adiac = "Adiac"
file_train_adiac = PATH + str(dataset_adiac) + "/" + str(dataset_adiac) + "_TRAIN.tsv"
file_test_adiac = PATH + str(dataset_adiac) + "/" + str(dataset_adiac) + "_TEST.tsv"

train_adiac = np.genfromtxt(fname=file_train_adiac, delimiter="\t", skip_header=0)
test_adiac = np.genfromtxt(fname=file_test_adiac, delimiter="\t", skip_header=0)

X_train_adiac, y_train_adiac = train_adiac[:, 1:], train_adiac[:, 0]
X_test_adiac, y_test_adiac = test_adiac[:, 1:], test_adiac[:, 0]

In [6]:
window_size_adiac = np.repeat(np.arange(60, 110, 20), 3)
norm_mean_adiac = np.full(window_size_adiac.size, True)
word_size_adiac = np.tile(np.arange(10, 16, 2), 3)

boss_adiac = [BOSS(word_size=word_size, n_bins=4, norm_mean=norm_mean,
                   drop_sum=norm_mean, window_size=window_size)
             for (word_size, norm_mean, window_size) in zip(
                 word_size_adiac, norm_mean_adiac, window_size_adiac)]

pipeline_adiac = [Pipeline([("boss", boss),
                            ('to_dense', transformer),
                            ("knn", knn)]) for boss in boss_adiac]

voting_adiac = VotingClassifier([("pipeline_" + str(i), pipeline)
                                 for i, pipeline in enumerate(pipeline_adiac)])

accuracy_adiac = voting_adiac.fit(X_train_adiac, y_train_adiac).score(X_test_adiac, y_test_adiac)

print("Dataset: {}".format(dataset_adiac))
print("Accuracy on the testing set: {0:.3f}".format(accuracy_adiac))

Dataset: Adiac
Accuracy on the testing set: 0.752


# ECG200

In [7]:
dataset_ecg200 = "ECG200"
file_train_ecg200 = PATH + str(dataset_ecg200) + "/" + str(dataset_ecg200) + "_TRAIN.tsv"
file_test_ecg200 = PATH + str(dataset_ecg200) + "/" + str(dataset_ecg200) + "_TEST.tsv"

train_ecg200 = np.genfromtxt(fname=file_train_ecg200, delimiter="\t", skip_header=0)
test_ecg200 = np.genfromtxt(fname=file_test_ecg200, delimiter="\t", skip_header=0)

X_train_ecg200, y_train_ecg200 = train_ecg200[:, 1:], train_ecg200[:, 0]
X_test_ecg200, y_test_ecg200 = test_ecg200[:, 1:], test_ecg200[:, 0]

In [8]:
boss_ecg200 = BOSS(word_size=8, n_bins=4, norm_mean=False,
                   drop_sum=False, window_size=40)

pipeline_ecg200 = Pipeline([("boss", boss_ecg200),
                            ('to_dense', transformer),
                            ("knn", knn)])
accuracy_ecg200 = pipeline_ecg200.fit(
    X_train_ecg200, y_train_ecg200).score(X_test_ecg200, y_test_ecg200)

print("Dataset: {}".format(dataset_ecg200))
print("Accuracy on the testing set: {0:.3f}".format(accuracy_ecg200))

Dataset: ECG200
Accuracy on the testing set: 0.870


# GunPoint

In [9]:
dataset_gunpoint = "GunPoint"
file_train_gunpoint = PATH + str(dataset_gunpoint) + "/" + str(dataset_gunpoint) + "_TRAIN.tsv"
file_test_gunpoint = PATH + str(dataset_gunpoint) + "/" + str(dataset_gunpoint) + "_TEST.tsv"

train_gunpoint = np.genfromtxt(fname=file_train_gunpoint, delimiter="\t", skip_header=0)
test_gunpoint = np.genfromtxt(fname=file_test_gunpoint, delimiter="\t", skip_header=0)

X_train_gunpoint, y_train_gunpoint = train_gunpoint[:, 1:], train_gunpoint[:, 0]
X_test_gunpoint, y_test_gunpoint = test_gunpoint[:, 1:], test_gunpoint[:, 0]

In [10]:
boss_gunpoint = BOSS(word_size=8, n_bins=4, norm_mean=True,
                     drop_sum=True, window_size=40)

pipeline_gunpoint = Pipeline([("boss", boss_gunpoint),
                              ('to_dense', transformer),
                              ("knn", knn)])
accuracy_gunpoint = pipeline_gunpoint.fit(
    X_train_gunpoint, y_train_gunpoint).score(X_test_gunpoint, y_test_gunpoint)

print("Dataset: {}".format(dataset_gunpoint))
print("Accuracy on the testing set: {0:.3f}".format(accuracy_gunpoint))

Dataset: GunPoint
Accuracy on the testing set: 1.000


# MiddlePhalanxTW

In [11]:
dataset_mdtw = "MiddlePhalanxTW"
file_train_mdtw = PATH + str(dataset_mdtw) + "/" + str(dataset_mdtw) + "_TRAIN.tsv"
file_test_mdtw = PATH + str(dataset_mdtw) + "/" + str(dataset_mdtw) + "_TEST.tsv"

train_mdtw = np.genfromtxt(fname=file_train_mdtw, delimiter="\t", skip_header=0)
test_mdtw = np.genfromtxt(fname=file_test_mdtw, delimiter="\t", skip_header=0)

X_train_mdtw, y_train_mdtw = train_mdtw[:, 1:-29], train_mdtw[:, 0]
X_test_mdtw, y_test_mdtw = test_mdtw[:, 1:-29], test_mdtw[:, 0]

In [12]:
boss_mdtw = BOSS(word_size=6, n_bins=4, norm_mean=False,
                 drop_sum=False, window_size=30)

pipeline_mdtw = Pipeline([("boss", boss_mdtw),
                          ('to_dense', transformer),
                          ("knn", knn)])
accuracy_mdtw = pipeline_mdtw.fit(
    X_train_mdtw, y_train_mdtw).score(X_test_mdtw, y_test_mdtw)

print("Dataset: {}".format(dataset_mdtw))
print("Accuracy on the testing set: {0:.3f}".format(accuracy_mdtw))

Dataset: MiddlePhalanxTW
Accuracy on the testing set: 0.526


# Plane

In [13]:
dataset_plane = "Plane"
file_train_plane = PATH + str(dataset_plane) + "/" + str(dataset_plane) + "_TRAIN.tsv"
file_test_plane = PATH + str(dataset_plane) + "/" + str(dataset_plane) + "_TEST.tsv"

train_plane = np.genfromtxt(fname=file_train_plane, delimiter="\t", skip_header=0)
test_plane = np.genfromtxt(fname=file_test_plane, delimiter="\t", skip_header=0)

X_train_plane, y_train_plane = train_plane[:, 1:], train_plane[:, 0]
X_test_plane, y_test_plane = test_plane[:, 1:], test_plane[:, 0]

In [14]:
boss_plane = BOSS(word_size=6, n_bins=4, norm_mean=True,
                  drop_sum=True, window_size=10)

pipeline_plane = Pipeline([("boss", boss_plane),
                           ('to_dense', transformer),
                           ("knn", knn)])
accuracy_plane = pipeline_plane.fit(
    X_train_plane, y_train_plane).score(X_test_plane, y_test_plane)

print("Dataset: {}".format(dataset_plane))
print("Accuracy on the testing set: {0:.3f}".format(accuracy_plane))

Dataset: Plane
Accuracy on the testing set: 1.000


# SyntheticControl

In [15]:
dataset_synthetic = "SyntheticControl"
file_train_synthetic = PATH + str(dataset_synthetic) + "/" + str(dataset_synthetic) + "_TRAIN.tsv"
file_test_synthetic = PATH + str(dataset_synthetic) + "/" + str(dataset_synthetic) + "_TEST.tsv"

train_synthetic = np.genfromtxt(fname=file_train_synthetic, delimiter="\t", skip_header=0)
test_synthetic = np.genfromtxt(fname=file_test_synthetic, delimiter="\t", skip_header=0)

X_train_synthetic, y_train_synthetic = train_synthetic[:, 1:], train_synthetic[:, 0]
X_test_synthetic, y_test_synthetic = test_synthetic[:, 1:], test_synthetic[:, 0]

In [16]:
window_size_synthetic = np.arange(18, 37, dtype='int64')
norm_mean_synthetic = np.full(window_size_adiac.size, False)
word_size_synthetic = np.full(window_size_adiac.size, 6)

boss_synthetic = [BOSS(word_size=word_size, n_bins=4, norm_mean=norm_mean,
                  drop_sum=norm_mean, window_size=window_size)
             for (word_size, norm_mean, window_size) in zip(
                 word_size_synthetic, norm_mean_synthetic, window_size_synthetic)]

pipeline_synthetic = [Pipeline([("boss", boss),
                                ('to_dense', transformer),
                                ("knn", knn)]) for boss in boss_synthetic]

voting_synthetic = VotingClassifier([("pipeline_" + str(i), pipeline)
                                     for i, pipeline in enumerate(pipeline_synthetic)])

accuracy_synthetic = voting_synthetic.fit(
    X_train_synthetic, y_train_synthetic).score(X_test_synthetic, y_test_synthetic)

print("Dataset: {}".format(dataset_synthetic))
print("Accuracy on the testing set: {0:.3f}".format(accuracy_synthetic))

Dataset: SyntheticControl
Accuracy on the testing set: 0.963
