In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import warnings
warnings.filterwarnings('ignore') 

from tensorflow import keras
from sklearn.preprocessing import RobustScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from datasets import load_data, random_benchmark, list_datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score
from Imputation import remove_and_impute
from Models import SAE, CNN_AE, LSTM_AE, GRU_AE, Bi_LSTM_AE, CNN_Bi_LSTM_AE, Causal_CNN_AE, Wavenet, Attention_Bi_LSTM_AE, Attention_CNN_Bi_LSTM_AE, Attention_Wavenet

np.random.seed(7)
tf.random.set_seed(7)

In [2]:
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=7)
svm_clf = SVC(gamma='scale', random_state=7)
knn_clf = KNeighborsClassifier(n_neighbors=1, weights='distance', n_jobs=-1)
mlp_clf = MLPClassifier(random_state=7)

In [3]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3072)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)


1 Physical GPUs, 1 Logical GPUs


In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def flatten_ts(train, test):
    new_train, new_test = [], []
    train_lens = []
    
    for _, row in train.iterrows():
        for i in row.index:
            train_lens.append(len(row[i]))

    maxlen = np.ceil(np.average(train_lens)).astype(int)
    
    for _, row in train.iterrows():
        new_list = []
        for i in row.index:
            ts = []
            for j in range(len(row[i])):
                ts.append(row[i][j])
            new_list.append(ts)
        new_train.append(pad_sequences(new_list, maxlen=maxlen, dtype='float32'))
        
    for _, row in test.iterrows():
        new_list = []
        for i in row.index:
            ts = []
            for j in range(len(row[i])):
                ts.append(row[i][j])
            new_list.append(ts)
        new_test.append(pad_sequences(new_list, maxlen=maxlen, dtype='float32'))
            
    train_df = pd.DataFrame(np.array(new_train).reshape(train.shape[0], maxlen * train.columns.shape[0]))
    test_df = pd.DataFrame(np.array(new_test).reshape(test.shape[0], maxlen * train.columns.shape[0]))

    scaler = RobustScaler()
    scaler.fit(train_df)
    return scaler.transform(train_df), scaler.transform(test_df), maxlen * train.columns.shape[0]
#     return np.array(train_df), np.array(test_df), maxlen * train.columns.shape[0]

def rnn_reshape(train, test, n_steps, n_features):
#     train, test = flatten_ts(train, test)
    return train.reshape(train.shape[0], n_steps, n_features), test.reshape(test.shape[0], n_steps, n_features)

In [5]:
from sklearn.model_selection import train_test_split
es = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [6]:
def LSTM_Model(n_steps, n_features, n_classes):
    return keras.models.Sequential([
        keras.layers.LSTM(128, return_sequences=True, input_shape=[n_steps, n_features]),
        keras.layers.LSTM(128),
        keras.layers.Dense(n_classes, activation='softmax')
    ])

In [7]:
from TRepNet import TRepNet
# SyntheticControl, PhalangesOutlinesCorrect
# SelfRegulationSCP2, SelfRegulationSCP1
from sklearn.model_selection import GridSearchCV

In [8]:
data_name = 'TwoPatterns'
print('Data: ', data_name)
train_x, train_y, test_x, test_y = load_data(data_name, univariate=True)

n_features = train_x.columns.shape[0]

X_train, X_test, n_steps = flatten_ts(train_x, test_x)
X_train, X_test = rnn_reshape(X_train, X_test, n_steps // n_features, n_features)

encoder, decoder = TRepNet(n_steps // n_features, n_features, activation='elu')
model = keras.models.Sequential([encoder, decoder])
model.compile(loss="mae", optimizer=keras.optimizers.Nadam(lr=0.001, clipnorm=1.), metrics=['mae'])
history = model.fit(X_train, X_train, epochs=500, batch_size=16, validation_data=[X_test, X_test], callbacks=[es], verbose=0, shuffle=False)

Data:  TwoPatterns


In [13]:
def evaluate(data_name, univariate, lb_rate):
    train_x, train_y, test_x, test_y = load_data(data_name, univariate=univariate)    
    n_features = train_x.columns.shape[0]

    X_train, X_test, n_steps = flatten_ts(train_x, test_x)
    X_train, X_test = rnn_reshape(X_train, X_test, n_steps // n_features, n_features)
    if lb_rate > np.unique(train_y).shape[0] and train_x.shape[0] - lb_rate > np.unique(train_y).shape[0]:
        X_train, X_test, n_steps = flatten_ts(train_x, test_x)
        X_train, _, train_y, _ = train_test_split(X_train, train_y, train_size=lb_rate, stratify=train_y, random_state=7)
        
#         train_y = np.where(train_y == 'left', 0, train_y)
#         train_y = np.where(train_y == 'right', 1, train_y)
#         test_y = np.where(test_y == 'left', 0, test_y)
#         test_y = np.where(test_y == 'right', 1, test_y)
        
        # # RF
        # rf_clf.fit(X_train, train_y)
        # pred = rf_clf.predict(X_test)
        # rf_scores = {'accuracy': accuracy_score(test_y, pred), 'f1': f1_score(test_y, pred, average='weighted')}
        # print('RF >>', rf_scores)

        # SVM
        svm_clf = SVC(gamma='scale', random_state=7)
        svm_clf.fit(X_train, train_y)
        pred = svm_clf.predict(X_test)
        svm_scores = {'accuracy': accuracy_score(test_y, pred), 'f1': f1_score(test_y, pred, average='weighted')}
        print('B-SVM >>', svm_scores)

        # # 1-NN
        # knn_clf.fit(X_train, train_y)
        # pred = knn_clf.predict(X_test)
        # knn_scores = {'accuracy': accuracy_score(test_y, pred), 'f1': f1_score(test_y, pred, average='weighted')}
        # print('1-NN >>', knn_scores)

        # # MLP
        # mlp_clf.fit(X_train, train_y)
        # pred = mlp_clf.predict(X_test)
        # mlp_scores = {'accuracy': accuracy_score(test_y, pred), 'f1': f1_score(test_y, pred, average='weighted')}
        # print('MLP >>', mlp_scores)

        # LSTM
        X_train, X_test = rnn_reshape(X_train, X_test, n_steps // n_features, n_features)
#         train_y, test_y = train_y.astype(float), test_y.astype(float)
        n_classes = np.unique(train_y).shape[0]
        if np.min(train_y.astype(int)) == 1:
            n_classes = n_classes + 1
        y_train, y_test = keras.utils.to_categorical(train_y), keras.utils.to_categorical(test_y)
        model = LSTM_Model(n_steps // n_features, n_features, n_classes)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        history = model.fit(X_train, y_train, epochs=10, validation_split=0.1, shuffle=False, verbose=0, callbacks=[es])
#         pred = keras.utils.to_categorical(model.predict_classes(X_test))
        lstm_scores = {'accuracy': model.evaluate(X_test, y_test, verbose=0)[1], 'f1': ''}
        print('LSTM >>', lstm_scores)
        
        # TRepNet-SVM
#         X_train, X_test = rnn_reshape(X_train, X_test, n_steps // n_features, n_features)
        # Codings
        codings_train = encoder.predict(X_train)
        codings_test = encoder.predict(X_test)
        
        tsvm_clf = SVC(random_state=7, gamma='scale')
        nb_classes = np.unique(train_y).shape[0]
        train_size = codings_train.shape[0]
        if train_size // nb_classes < 5 or train_size < 50:
            tsvm_clf.fit(codings_train, train_y)
        else:
            grid_search = GridSearchCV(svm_clf, {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, np.inf]}, cv=5, iid=False, n_jobs=-1)
            if train_size <= 10000:
                grid_search.fit(codings_train, train_y)
            else:
                codings_train, _, train_y, _  = train_test_split(codings_train, train_y, train_size=10000, random_state=7, stratify=train_y)
                grid_search.fit(codings_train, train_y)       
            tsvm_clf = grid_search.best_estimator_

            tsvm_clf.fit(codings_train, train_y)

        tpred = tsvm_clf.predict(codings_test)
        TrepNet_scores = {'accuracy': accuracy_score(test_y, tpred), 'f1': f1_score(test_y, tpred, average='weighted')}
        print('TRepNet-SVM >>', TrepNet_scores)
        
        results.append({'dataset': data_name, 'dim': str(n_steps)+', '+str(n_features), '# Labels': lb_rate,
                        # 'RF-ACC': rf_scores['accuracy'], 
                        'SVM-ACC': svm_scores['accuracy'],
                        # '1NN-ACC': knn_scores['accuracy'],
                        # 'MLP-ACC': mlp_scores['accuracy'], 
                        'LSTM-ACC': lstm_scores['accuracy'],
                        'TRepNet-ACC': TrepNet_scores['accuracy'],
                        # 'RF-F1': rf_scores['f1'], 
                        'SVM-F1': svm_scores['f1'],
                        # '1NN-F1': knn_scores['f1'], 
                        # 'MLP-F1': mlp_scores['f1'],
                        'LSTM-F1': lstm_scores['f1'],
                        'TRepNet-F1': TrepNet_scores['f1']
                        })

In [16]:
# selected_uni_datasets = ['ArrowHead', 'BeetleFly', 'ChlorineConcentration', 'Crop', 'Earthquakes','ECG200', 'ECG5000', 'ECGFiveDays',
#                          'FreezerSmallTrain', 'Fungi', 'GunPoint', 'GunPointAgeSpan','GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'Herring', 
#                          'InsectEPGRegularTrain', 'InsectEPGSmallTrain', 'InsectWingbeatSound', 'Lightning2', 'MedicalImages', 'MiddlePhalanxTW',
#                          'NonInvasiveFetalECGThorax2', 'OliveOil', 'PhalangesOutlinesCorrect', 'PickupGestureWiimoteZ','PigAirwayPressure', 'PowerCons',
#                          'ProximalPhalanxOutlineAgeGroup', 'SemgHandGenderCh2', 'SemgHandMovementCh2', 'SemgHandSubjectCh2', 'SmoothSubspace', 'StarLightCurves',
#                          'SyntheticControl', 'Trace', 'UMD', 'UWaveGestureLibraryAll', 'Wafer', 'Yoga'] # DiatomSizeReduction

# for mr in [int(10 * pow(1.5, i)) for i in range(11)]:
#     print('Label #', mr)
#     results = []
#     for dataset in selected_uni_datasets:
#         evaluate(dataset, univariate=True, lb_rate=mr)
#     pd.DataFrame(results).to_csv('./results/sparse labels/uni-LSTM-' + str(mr) +'.csv', index=False)

results = []
for mr in [10, 20, 30, 40, 50, 100, 150, 200, 300]:
    print('Label #', mr)
    evaluate(data_name, univariate=True, lb_rate=mr)
pd.DataFrame(results).to_csv('./results/sparse labels/uni-'+data_name+'.csv', index=False)

Label # 10
B-SVM >> {'accuracy': 0.341, 'f1': 0.23254596532534247}
LSTM >> {'accuracy': 0.459, 'f1': ''}
TRepNet-SVM >> {'accuracy': 0.35575, 'f1': 0.2402914694937631}
Label # 20
B-SVM >> {'accuracy': 0.4195, 'f1': 0.4197986561940946}
LSTM >> {'accuracy': 0.44125, 'f1': ''}
TRepNet-SVM >> {'accuracy': 0.449, 'f1': 0.44742501055773143}
Label # 30
B-SVM >> {'accuracy': 0.4275, 'f1': 0.3953407561540016}
LSTM >> {'accuracy': 0.47925, 'f1': ''}
TRepNet-SVM >> {'accuracy': 0.4025, 'f1': 0.3409962300872288}
Label # 40
B-SVM >> {'accuracy': 0.42275, 'f1': 0.3835733413994904}
LSTM >> {'accuracy': 0.4885, 'f1': ''}
TRepNet-SVM >> {'accuracy': 0.443, 'f1': 0.39578424254094474}
Label # 50
B-SVM >> {'accuracy': 0.4755, 'f1': 0.4564317458700733}
LSTM >> {'accuracy': 0.49675, 'f1': ''}
TRepNet-SVM >> {'accuracy': 0.533, 'f1': 0.5293335928019307}
Label # 100
B-SVM >> {'accuracy': 0.5285, 'f1': 0.5194589636698052}
LSTM >> {'accuracy': 0.536, 'f1': ''}
TRepNet-SVM >> {'accuracy': 0.5875, 'f1': 0.5845055

In [11]:
# selected_mul_datasets = ['ArticularyWordRecognition', 'AtrialFibrillation', 'BasicMotions', 'Cricket', 'EthanolConcentration',
#                          'ERing', 'HandMovementDirection', 'Handwriting', 'JapaneseVowels', 'PenDigits', 'RacketSports', 'SelfRegulationSCP1',
#                          'SelfRegulationSCP2', 'SpokenArabicDigits', 'StandWalkJump']
                         
# for mr in [int(10 * pow(1.5, i)) for i in range(11)]:
#     print('Label #', mr)
#     results = []
#     for dataset in [selected_mul_datasets]:
#         evaluate(dataset, univariate=False, lb_rate=mr)
#     pd.DataFrame(results).to_csv('./results/sparse labels/mul-LSTM-' + str(mr) + '.csv', index=False)

# results = []
# for mr in [20, 30, 40, 50, 100, 150, 200, 300]:
#     print('Label #', mr)
#     evaluate(data_name, univariate=False, lb_rate=mr)
# pd.DataFrame(results).to_csv('./results/sparse labels/mul-'+data_name+'.csv', index=False)