In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import warnings
warnings.filterwarnings('ignore') 

from tensorflow import keras
from sklearn.preprocessing import RobustScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from datasets import load_data, random_benchmark, list_datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score
from Imputation import remove_and_impute
from Models import SAE, CNN_AE, LSTM_AE, GRU_AE, Bi_LSTM_AE, CNN_Bi_LSTM_AE, Causal_CNN_AE, Wavenet, Attention_Bi_LSTM_AE, Attention_CNN_Bi_LSTM_AE, Attention_Wavenet

# Seed NumPy's and TensorFlow's global random number generators with the same
# fixed value so that re-running the notebook starts from the same random state.
np.random.seed(7)
tf.random.set_seed(7)

In [2]:
# Baseline classifiers, created once at module level and re-fitted on every
# dataset. The estimators that accept a random_state use the same fixed value (7).
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=7,
    n_jobs=-1,
)
svm_clf = SVC(gamma='scale', random_state=7)
knn_clf = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=-1,
    weights='distance',
)
mlp_clf = MLPClassifier(random_state=7)

In [3]:
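# Optional GPU pinning (disabled). Uncomment both lines to order CUDA devices by
# PCI bus id and expose only device "1" to this process.
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"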

In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _pad_rows(frame, maxlen):
    """Pad/truncate every series of every row in `frame` to exactly `maxlen` values.

    Each row contributes one `(n_columns, maxlen)` float32 array produced by
    `pad_sequences`; the per-row arrays are stacked into a single array.
    """
    padded = []
    for _, row in frame.iterrows():
        # list(...) iterates positionally, so the values are copied in order
        # regardless of how a pandas Series cell happens to be indexed.
        series = [list(row[col]) for col in row.index]
        padded.append(pad_sequences(series, maxlen=maxlen, dtype='float32'))
    return np.array(padded)


def flatten_ts(train, test):
    """Turn nested time-series frames into scaled, fixed-width 2-D feature matrices.

    Every cell of `train` / `test` is expected to hold one sequence (anything
    supporting `len()` and iteration); presumably one column per series
    dimension - confirm against `load_data`.

    Steps:
      1. The common length is the mean sequence length over all cells of
         `train`, rounded up. Sequences longer than this are truncated and
         shorter ones are padded by `pad_sequences` (Keras defaults: zeros are
         added, and values are dropped, at the start of the sequence).
      2. Each sample is flattened column after column, i.e. all time steps of
         the first column, then all time steps of the second column, and so on.
      3. A `RobustScaler` is fitted on the flattened training matrix only and
         applied to both matrices.

    Args:
        train: DataFrame of training samples (rows) by series columns.
        test: DataFrame of test samples with the same number of columns as `train`.

    Returns:
        A tuple `(train_scaled, test_scaled, flat_width)` where the first two
        are the scaler outputs for train and test and `flat_width` is
        `common_length * number_of_train_columns`.

    Raises:
        ValueError: if `train` contains no sequences, because the common
            length cannot be derived from an empty set.
    """
    train_lens = [len(row[col]) for _, row in train.iterrows() for col in row.index]
    if not train_lens:
        raise ValueError('flatten_ts: `train` contains no sequences to derive a length from')

    maxlen = np.ceil(np.average(train_lens)).astype(int)
    # The test matrix is deliberately laid out with the training column count.
    flat_width = maxlen * train.columns.shape[0]

    train_df = pd.DataFrame(_pad_rows(train, maxlen).reshape(train.shape[0], flat_width))
    test_df = pd.DataFrame(_pad_rows(test, maxlen).reshape(test.shape[0], flat_width))

    # Fit on the training data only so no test statistics leak into the scaling.
    scaler = RobustScaler()
    scaler.fit(train_df)
    return scaler.transform(train_df), scaler.transform(test_df), flat_width

def rnn_reshape(train, test, n_steps, n_features):
    """Reshape the train and test arrays to `(n_samples, n_steps, n_features)`.

    The number of samples is taken from each array's own first axis; the
    remaining elements are rearranged by the array's `reshape` method.

    Returns:
        A tuple `(train_3d, test_3d)`.
    """
    def _to_3d(flat):
        n_samples = flat.shape[0]
        return flat.reshape(n_samples, n_steps, n_features)

    train_3d = _to_3d(train)
    test_3d = _to_3d(test)
    return train_3d, test_3d

In [5]:
def evaluate(data_name, univariate):
    """Fit the four baseline classifiers on one dataset and record their scores.

    The dataset is loaded with `load_data`, flattened and scaled with
    `flatten_ts`, and then the module-level `rf_clf`, `svm_clf`, `knn_clf` and
    `mlp_clf` are each fitted on the training split and scored on the test
    split (accuracy and macro-averaged F1). Each score pair is printed.

    Side effect: one record is appended to the module-level list `results`,
    which must already exist when this function is called.

    Args:
        data_name: dataset name passed to `load_data`.
        univariate: forwarded to `load_data` as its `univariate` keyword.

    Returns:
        The dict that was appended to `results`.
    """
    print('Data: ', data_name)
    train_x, train_y, test_x, test_y = load_data(data_name, univariate=univariate)
    n_features = train_x.iloc[0].shape[0]

    X_train, X_test, n_steps = flatten_ts(train_x, test_x)

    # (label used in the printed line, prefix used in the record keys, estimator)
    baselines = (
        ('RF', 'RF', rf_clf),
        ('SVM', 'SVM', svm_clf),
        ('1-NN', '1NN', knn_clf),
        ('MLP', 'MLP', mlp_clf),
    )

    scores = {}
    for label, key, clf in baselines:
        clf.fit(X_train, train_y)
        pred = clf.predict(X_test)
        scores[key] = {'accuracy': accuracy_score(test_y, pred),
                       'f1': f1_score(test_y, pred, average='macro')}
        print(label + ' >>', scores[key])

    # NOTE(review): flatten_ts returns padded_length * n_columns as its third
    # value, so for multivariate data the first number in 'dim' is the
    # flattened width rather than the per-column step count. Left unchanged so
    # newly written CSVs stay comparable with existing ones.
    record = {'dataset': data_name, 'dim': str(n_steps)+', '+str(n_features)}
    # All accuracy columns first, then all F1 columns, to keep the CSV column order.
    record.update({key + '-ACC': s['accuracy'] for key, s in scores.items()})
    record.update({key + '-F1': s['f1'] for key, s in scores.items()})

    results.append(record)
    return record

In [6]:
# Univariate benchmark: evaluate every selected dataset and save one CSV row per dataset.
# `results` is reset here because evaluate() appends to this module-level list.
results = []
selected_uni_datasets = ['Earthquakes', 'ArrowHead', 'BeetleFly', 'ChlorineConcentration', 'Chinatown', 'DiatomSizeReduction', 'ECG200', 'ECG5000', 'ECGFiveDays',
                         'FreezerSmallTrain', 'Fungi', 'GunPoint', 'GunPointAgeSpan','GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'Herring', 
                         'InsectEPGRegularTrain', 'InsectEPGSmallTrain', 'InsectWingbeatSound', 'Lightning2', 'MedicalImages', 'MiddlePhalanxTW',
                         'NonInvasiveFetalECGThorax2', 'OliveOil', 'PhalangesOutlinesCorrect', 'PickupGestureWiimoteZ','PigAirwayPressure', 'PowerCons',
                         'ProximalPhalanxOutlineAgeGroup', 'SemgHandGenderCh2', 'SemgHandMovementCh2', 'SemgHandSubjectCh2', 'SmoothSubspace', 'StarLightCurves',
                         'SyntheticControl', 'Trace', 'UMD', 'UWaveGestureLibraryAll', 'Wafer', 'Yoga']

# Create the output folder before the long loop so a missing directory cannot
# make the final to_csv fail after all models have already been trained.
os.makedirs('./results', exist_ok=True)

for dataset in selected_uni_datasets:
    evaluate(dataset, univariate=True)

pd.DataFrame(results).to_csv('./results/uni-baseline-results.csv', index=False)

Data:  Earthquakes
RF >> {'accuracy': 0.7482014388489209, 'f1': 0.4279835390946502}
SVM >> {'accuracy': 0.7482014388489209, 'f1': 0.4279835390946502}
1-NN >> {'accuracy': 0.7050359712230215, 'f1': 0.5077308456422216}
MLP >> {'accuracy': 0.6906474820143885, 'f1': 0.4299475441106342}
Data:  ArrowHead
RF >> {'accuracy': 0.7085714285714285, 'f1': 0.706712080216457}
SVM >> {'accuracy': 0.64, 'f1': 0.637575075075075}
1-NN >> {'accuracy': 0.7657142857142857, 'f1': 0.7647301587301586}
MLP >> {'accuracy': 0.7942857142857143, 'f1': 0.796788660819519}
Data:  BeetleFly
RF >> {'accuracy': 0.85, 'f1': 0.8465473145780051}
SVM >> {'accuracy': 0.8, 'f1': 0.7916666666666665}
1-NN >> {'accuracy': 0.75, 'f1': 0.7333333333333334}
MLP >> {'accuracy': 0.85, 'f1': 0.8465473145780051}
Data:  ChlorineConcentration
RF >> {'accuracy': 0.7109375, 'f1': 0.6371877832816195}
SVM >> {'accuracy': 0.5755208333333334, 'f1': 0.34300721479598745}
1-NN >> {'accuracy': 0.7143229166666667, 'f1': 0.672897882001864}
MLP >> {'ac

In [7]:
# Multivariate benchmark: evaluate every selected dataset and save one CSV row per dataset.
# `results` is reset here because evaluate() appends to this module-level list.
results = []
selected_mul_datasets = ['ArticularyWordRecognition', 'AtrialFibrillation', 'BasicMotions', 'Cricket',
                         'ERing', 'HandMovementDirection', 'Handwriting', 'JapaneseVowels', 'PenDigits', 'RacketSports', 'SelfRegulationSCP1',
                         'SelfRegulationSCP2', 'SpokenArabicDigits', 'StandWalkJump', 'EthanolConcentration']

# Create the output folder before the long loop so a missing directory cannot
# make the final to_csv fail after all models have already been trained.
os.makedirs('./results', exist_ok=True)

for dataset in selected_mul_datasets:
    evaluate(dataset, univariate=False)

pd.DataFrame(results).to_csv('./results/mul-baseline-results.csv', index=False)

Data:  ArticularyWordRecognition
RF >> {'accuracy': 0.9766666666666667, 'f1': 0.9769874396135266}
SVM >> {'accuracy': 0.9766666666666667, 'f1': 0.9765144768275204}
1-NN >> {'accuracy': 0.9666666666666667, 'f1': 0.9670741529002398}
MLP >> {'accuracy': 0.97, 'f1': 0.9697131390348781}
Data:  AtrialFibrillation
RF >> {'accuracy': 0.2, 'f1': 0.17663817663817663}
SVM >> {'accuracy': 0.2, 'f1': 0.13333333333333333}
1-NN >> {'accuracy': 0.2, 'f1': 0.12499999999999999}
MLP >> {'accuracy': 0.13333333333333333, 'f1': 0.1272727272727273}
Data:  BasicMotions
RF >> {'accuracy': 0.925, 'f1': 0.9246031746031745}
SVM >> {'accuracy': 0.925, 'f1': 0.9244322928533455}
1-NN >> {'accuracy': 0.55, 'f1': 0.46828533785055526}
MLP >> {'accuracy': 0.825, 'f1': 0.8188150858429496}
Data:  Cricket
RF >> {'accuracy': 0.9305555555555556, 'f1': 0.9287490287490288}
SVM >> {'accuracy': 0.9444444444444444, 'f1': 0.9407758907758907}
1-NN >> {'accuracy': 0.9166666666666666, 'f1': 0.9131146631146629}
MLP >> {'accuracy': 0.9