In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

from pyts.classification import BOSSVS
#from shapelets.classification import LtsShapeletClassifier ---- НЕ РАБОТАЕТ

from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.dictionary_based import BOSSEnsemble
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.classification.shapelet_based import ShapeletTransformClassifier
#from sktime.classification.shapelet_based import MrSEQLClassifier
from sktime.datasets import load_basic_motions
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.classification.sklearn import RotationForest

from sktime.classification.hybrid import HIVECOTEV1
from sktime.classification.hybrid import HIVECOTEV2
#from sktime.contrib.vector_classifiers._rotation_forest import RotationForest


In [12]:
# !pip install shapelets --user

In [14]:
# !pip install shapelets-platform --user

In [67]:
# !pip install sktime

In [None]:
# !pip install pyts

**This dataset is composed of two collections of heartbeat signals derived from the famous dataset in heartbeat classification: The PTB Diagnostic ECG Database.**

This dataset has been used in exploring heartbeat classification using deep neural network architectures, and observing some of the capabilities of transfer learning on it. The signals correspond to electrocardiogram (ECG) shapes of heartbeats for the normal case and the cases affected by different arrhythmias and myocardial infarction. These signals are preprocessed and segmented, with each segment corresponding to a heartbeat.

Arrhythmia Dataset
Number of Categories: 5
Sampling Frequency: 125Hz
Data Source: Physionet's MIT-BIH Arrhythmia Dataset
Classes: ['N': 0, 'S': 1, 'V': 2, 'F': 3, 'Q': 4]

Remark: All the samples are cropped, downsampled and padded with zeroes if necessary to the fixed dimension of 188.

In [2]:
X_train = np.load("train_x.npy")
y_train = np.load("train_y.npy")
X_test_real = np.load("test_x.npy")

In [3]:
def cm_plot(y_test, y_pred):

    cm = confusion_matrix(y_test, y_pred)
    # Normalise
    cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots(figsize=(5,5))
    sns.heatmap(cmn, cmap='Blues', annot=True, fmt='.2f')
    sns.set(font_scale=1.3)
    plt.title("Confusion Matrix")

    return plt.show()

In [4]:
X_train.shape

(14667, 187)

In [5]:
X_test_real.shape

(7225, 187)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.33, random_state = 42)

## 1. KNeighborsTimeSeriesClassifier:

In [46]:
%%time
classifier = KNeighborsTimeSeriesClassifier(distance = "lcss", weights = 'distance', n_jobs = -1, )
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
f1_score(y_test, y_pred, average = "macro")

Wall time: 2h 12min 15s


0.18070215175537938

**KNeighborsTimeSeriesClassifier f1_score on X_test = 0.18**

## 2. ShapeletTransformClassifier with RotationForest estimator:

In [21]:
clf = ShapeletTransformClassifier(estimator = RotationForest(n_estimators=3), 
                                  n_shapelet_samples = 100,
                                  max_shapelets = 10,
                                  batch_size = 20,
                                  time_limit_in_minutes = 10)

In [22]:
%%time
clf.fit(X_train, y_train)

Wall time: 7min 47s


ShapeletTransformClassifier(batch_size=20,
                            estimator=RotationForest(n_estimators=3),
                            max_shapelets=10, n_shapelet_samples=100,
                            time_limit_in_minutes=10)

In [23]:
%%time
y_pred = clf.predict(X_test)
f1_score(y_test, y_pred, average = "macro")

Wall time: 1.41 s


0.24433393447596535

**ShapeletTransformClassifier f1_score on X_test = 0.244**

## 3. HIVECOTEV2:

In [35]:
clf = HIVECOTEV2(

    stc_params={

        "estimator": RotationForest(n_estimators = 3),

        "n_shapelet_samples": 500,

        "max_shapelets": 20,

        "batch_size": 100,

    },

    drcif_params = {"n_estimators": 10},

    time_limit_in_minutes = 30

)

In [36]:
%%time
clf.fit(X_train, y_train)

Wall time: 1h 9min 11s


HIVECOTEV2(drcif_params={'n_estimators': 10, 'time_limit_in_minutes': 5.0},
           stc_params={'batch_size': 100,
                       'estimator': RotationForest(n_estimators=3),
                       'max_shapelets': 20, 'n_shapelet_samples': 500,
                       'time_limit_in_minutes': 5.0},
           time_limit_in_minutes=30)

**HIVECOTEV2 f1_score on X_test = 0.817**

In [44]:
# %%time
# y_pred = clf.predict(X_test_real)

## Submit:

In [45]:
y_pred_csv = pd.DataFrame(list(range(len(y_pred))), columns=["ID"])
#y_pred_csv = pd.DataFrame()
y_pred_csv["ID"] = np.array(range(len(y_pred))).astype(int).astype(str)
y_pred_csv["Answer"] = np.array(y_pred).astype(int).astype(str)
y_pred_csv.to_csv("testing_submission.csv", index=None)
y_pred_csv

Unnamed: 0,ID,Answer
0,0,0
1,1,0
2,2,0
3,3,2
4,4,0
...,...,...
7220,7220,0
7221,7221,2
7222,7222,0
7223,7223,0
