# BOSSVS: Bag of SFA Symbols in Vector Space

* Website: https://www2.informatik.hu-berlin.de/~schaefpa/bossVS/

* Paper: https://www2.informatik.hu-berlin.de/~schaefpa/bossvs.pdf

In [1]:
import os

import numpy as np
import pyts
from pyts.classification import BOSSVS
from sklearn.ensemble import VotingClassifier


# Show which pyts version is installed in this kernel, so the reported
# accuracies can be tied to a specific library release.
print("pyts: {0}".format(pyts.__version__))

pyts: 0.8.0


In [2]:
# Base location of the UCR archive files. The loading cells below read
# "<dataset>/<dataset>_TRAIN.tsv" and "<dataset>/<dataset>_TEST.tsv" under it.
PATH = "UCRArchive_2018/" # Change this value if necessary

# Adiac

In [3]:
# Load the Adiac train/test splits from the archive directory.
dataset_adiac = "Adiac"

# os.path.join inserts the separator itself, so PATH may be given with or
# without a trailing "/".
file_train_adiac = os.path.join(PATH, dataset_adiac, dataset_adiac + "_TRAIN.tsv")
file_test_adiac = os.path.join(PATH, dataset_adiac, dataset_adiac + "_TEST.tsv")

# Files are parsed as tab-separated values; no header line is skipped.
train_adiac = np.genfromtxt(fname=file_train_adiac, delimiter="\t", skip_header=0)
test_adiac = np.genfromtxt(fname=file_test_adiac, delimiter="\t", skip_header=0)

# Column 0 is used as the class label; the remaining columns are the series.
X_train_adiac, y_train_adiac = train_adiac[:, 1:], train_adiac[:, 0]
X_test_adiac, y_test_adiac = test_adiac[:, 1:], test_adiac[:, 0]

In [4]:
# BOSSVS configuration used for the Adiac dataset.
clf_adiac = BOSSVS(
    window_size=80,
    word_size=12,
    n_bins=4,
    norm_mean=True,
    drop_sum=True,
)

# Fit on the training split, then score on the test split.
clf_adiac.fit(X_train_adiac, y_train_adiac)
accuracy_adiac = clf_adiac.score(X_test_adiac, y_test_adiac)

print("Dataset: {}".format(dataset_adiac))
print("Accuracy on the test set: {0:.3f}".format(accuracy_adiac))

Dataset: Adiac
Accuracy on the test set: 0.703


# ECG200

In [5]:
# Load the ECG200 train/test splits from the archive directory.
dataset_ecg200 = "ECG200"

# os.path.join inserts the separator itself, so PATH may be given with or
# without a trailing "/".
file_train_ecg200 = os.path.join(PATH, dataset_ecg200, dataset_ecg200 + "_TRAIN.tsv")
file_test_ecg200 = os.path.join(PATH, dataset_ecg200, dataset_ecg200 + "_TEST.tsv")

# Files are parsed as tab-separated values; no header line is skipped.
train_ecg200 = np.genfromtxt(fname=file_train_ecg200, delimiter="\t", skip_header=0)
test_ecg200 = np.genfromtxt(fname=file_test_ecg200, delimiter="\t", skip_header=0)

# Column 0 is used as the class label; the remaining columns are the series.
X_train_ecg200, y_train_ecg200 = train_ecg200[:, 1:], train_ecg200[:, 0]
X_test_ecg200, y_test_ecg200 = test_ecg200[:, 1:], test_ecg200[:, 0]

In [6]:
# BOSSVS configuration used for the ECG200 dataset.
clf_ecg200 = BOSSVS(
    window_size=40,
    word_size=5,
    n_bins=4,
    norm_mean=False,
    drop_sum=False,
)

# Fit on the training split, then score on the test split.
clf_ecg200.fit(X_train_ecg200, y_train_ecg200)
accuracy_ecg200 = clf_ecg200.score(X_test_ecg200, y_test_ecg200)

print("Dataset: {}".format(dataset_ecg200))
print("Accuracy on the test set: {0:.3f}".format(accuracy_ecg200))

Dataset: ECG200
Accuracy on the test set: 0.860


# GunPoint

In [7]:
# Load the GunPoint train/test splits from the archive directory.
dataset_gunpoint = "GunPoint"

# os.path.join inserts the separator itself, so PATH may be given with or
# without a trailing "/".
file_train_gunpoint = os.path.join(PATH, dataset_gunpoint, dataset_gunpoint + "_TRAIN.tsv")
file_test_gunpoint = os.path.join(PATH, dataset_gunpoint, dataset_gunpoint + "_TEST.tsv")

# Files are parsed as tab-separated values; no header line is skipped.
train_gunpoint = np.genfromtxt(fname=file_train_gunpoint, delimiter="\t", skip_header=0)
test_gunpoint = np.genfromtxt(fname=file_test_gunpoint, delimiter="\t", skip_header=0)

# Column 0 is used as the class label; the remaining columns are the series.
X_train_gunpoint, y_train_gunpoint = train_gunpoint[:, 1:], train_gunpoint[:, 0]
X_test_gunpoint, y_test_gunpoint = test_gunpoint[:, 1:], test_gunpoint[:, 0]

In [8]:
# BOSSVS configuration used for the GunPoint dataset.
clf_gunpoint = BOSSVS(
    window_size=40,
    word_size=14,
    n_bins=4,
    norm_mean=True,
    drop_sum=True,
)

# Fit on the training split, then score on the test split.
clf_gunpoint.fit(X_train_gunpoint, y_train_gunpoint)
accuracy_gunpoint = clf_gunpoint.score(X_test_gunpoint, y_test_gunpoint)

print("Dataset: {}".format(dataset_gunpoint))
print("Accuracy on the test set: {0:.3f}".format(accuracy_gunpoint))

Dataset: GunPoint
Accuracy on the test set: 1.000


# MiddlePhalanxTW

In [9]:
# Load the MiddlePhalanxTW train/test splits from the archive directory.
dataset_mdtw = "MiddlePhalanxTW"

# os.path.join inserts the separator itself, so PATH may be given with or
# without a trailing "/".
file_train_mdtw = os.path.join(PATH, dataset_mdtw, dataset_mdtw + "_TRAIN.tsv")
file_test_mdtw = os.path.join(PATH, dataset_mdtw, dataset_mdtw + "_TEST.tsv")

# Files are parsed as tab-separated values; no header line is skipped.
train_mdtw = np.genfromtxt(fname=file_train_mdtw, delimiter="\t", skip_header=0)
test_mdtw = np.genfromtxt(fname=file_test_mdtw, delimiter="\t", skip_header=0)

# Column 0 is used as the class label. Unlike the other datasets in this
# notebook, the last 29 columns of each series are discarded here.
# NOTE(review): the reason for the "-29" truncation is not documented in this
# notebook -- confirm it is intentional before changing it.
X_train_mdtw, y_train_mdtw = train_mdtw[:, 1:-29], train_mdtw[:, 0]
X_test_mdtw, y_test_mdtw = test_mdtw[:, 1:-29], test_mdtw[:, 0]

In [10]:
# BOSSVS configuration used for the MiddlePhalanxTW dataset.
clf_mdtw = BOSSVS(
    window_size=25,
    word_size=10,
    n_bins=4,
    norm_mean=False,
    drop_sum=False,
)

# Fit on the training split, then score on the test split.
clf_mdtw.fit(X_train_mdtw, y_train_mdtw)
accuracy_mdtw = clf_mdtw.score(X_test_mdtw, y_test_mdtw)

print("Dataset: {}".format(dataset_mdtw))
print("Accuracy on the test set: {0:.3f}".format(accuracy_mdtw))

Dataset: MiddlePhalanxTW
Accuracy on the test set: 0.545


# Plane

In [11]:
# Load the Plane train/test splits from the archive directory.
dataset_plane = "Plane"

# os.path.join inserts the separator itself, so PATH may be given with or
# without a trailing "/".
file_train_plane = os.path.join(PATH, dataset_plane, dataset_plane + "_TRAIN.tsv")
file_test_plane = os.path.join(PATH, dataset_plane, dataset_plane + "_TEST.tsv")

# Files are parsed as tab-separated values; no header line is skipped.
train_plane = np.genfromtxt(fname=file_train_plane, delimiter="\t", skip_header=0)
test_plane = np.genfromtxt(fname=file_test_plane, delimiter="\t", skip_header=0)

# Column 0 is used as the class label; the remaining columns are the series.
X_train_plane, y_train_plane = train_plane[:, 1:], train_plane[:, 0]
X_test_plane, y_test_plane = test_plane[:, 1:], test_plane[:, 0]

In [12]:
# BOSSVS configuration used for the Plane dataset.
clf_plane = BOSSVS(
    window_size=10,
    word_size=6,
    n_bins=4,
    norm_mean=False,
    drop_sum=False,
)

# Fit on the training split, then score on the test split.
clf_plane.fit(X_train_plane, y_train_plane)
accuracy_plane = clf_plane.score(X_test_plane, y_test_plane)

print("Dataset: {}".format(dataset_plane))
print("Accuracy on the test set: {0:.3f}".format(accuracy_plane))

Dataset: Plane
Accuracy on the test set: 1.000


# SyntheticControl

In [13]:
# Load the SyntheticControl train/test splits from the archive directory.
dataset_synthetic = "SyntheticControl"

# os.path.join inserts the separator itself, so PATH may be given with or
# without a trailing "/".
file_train_synthetic = os.path.join(PATH, dataset_synthetic, dataset_synthetic + "_TRAIN.tsv")
file_test_synthetic = os.path.join(PATH, dataset_synthetic, dataset_synthetic + "_TEST.tsv")

# Files are parsed as tab-separated values; no header line is skipped.
train_synthetic = np.genfromtxt(fname=file_train_synthetic, delimiter="\t", skip_header=0)
test_synthetic = np.genfromtxt(fname=file_test_synthetic, delimiter="\t", skip_header=0)

# Column 0 is used as the class label; the remaining columns are the series.
X_train_synthetic, y_train_synthetic = train_synthetic[:, 1:], train_synthetic[:, 0]
X_test_synthetic, y_test_synthetic = test_synthetic[:, 1:], test_synthetic[:, 0]

In [14]:
# One BOSSVS model per window size from 18 to 36 (inclusive). Every model
# uses word_size=6 and norm_mean=False; drop_sum is set to the same value
# as norm_mean.
window_size_synthetic = np.arange(18, 37, dtype='int64')
norm_mean_synthetic = np.zeros(window_size_synthetic.size, dtype=bool)
word_size_synthetic = np.full_like(window_size_synthetic, 6)

bossvs_synthetic = [
    BOSSVS(
        window_size=win,
        word_size=word,
        n_bins=4,
        norm_mean=flag,
        drop_sum=flag,
    )
    for word, flag, win in zip(
        word_size_synthetic, norm_mean_synthetic, window_size_synthetic)
]

# Combine the individual models into a voting ensemble; each estimator is
# registered under the name "bossvs_<index>".
voting_synthetic = VotingClassifier(
    [("bossvs_" + str(idx), model) for idx, model in enumerate(bossvs_synthetic)]
)

# Fit on the training split, then score on the test split.
voting_synthetic.fit(X_train_synthetic, y_train_synthetic)
accuracy_synthetic = voting_synthetic.score(X_test_synthetic, y_test_synthetic)

print("Dataset: {}".format(dataset_synthetic))
print("Accuracy on the test set: {0:.3f}".format(accuracy_synthetic))

Dataset: SyntheticControl
Accuracy on the test set: 0.980
