In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import warnings
warnings.filterwarnings('ignore') 

from tensorflow import keras
from sklearn.preprocessing import RobustScaler, Normalizer, StandardScaler
from datasets import load_data, random_benchmark, list_datasets
from tensorflow.keras.layers import Conv1D, LSTM, GRU, Bidirectional, MaxPool1D, TimeDistributed, RepeatVector, Dense, Attention, Input, Embedding, Dropout, BatchNormalization
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score

# Seed every random number generator the notebook can draw from, so that a
# fresh Restart & Run All reproduces the same numbers. The stdlib `random`
# module is imported further down in the notebook as well, so it is seeded here
# together with NumPy and TensorFlow.
import random

SEED = 7
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# Baseline classifier: an SVC with gamma='scale' and a fixed random_state.
# evaluate() re-fits this same estimator object for every dataset it scores.
svm_clf = SVC(
    gamma='scale',
    random_state=7,
)

In [3]:
# Number CUDA devices by PCI bus id and expose only device "0" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Carve a single virtual device with a 1024 MB memory limit out of the
        # first physical GPU, then report how many devices are visible.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [
                tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024),
            ],
        )
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Virtual devices must be configured before the GPUs are initialized;
        # when that is no longer possible the error is only reported.
        print(err)


1 Physical GPUs, 1 Logical GPUs


In [4]:
# import tensorflow.compat.v1 as tf
# from tqdm import tqdm
# tf.disable_v2_behavior()

In [5]:
def get_output_dim(original_dim):
    """Shrink a dimension by a factor of 1.3 and clamp it to [128, 512].

    Parameters
    ----------
    original_dim : int or float
        Dimension to shrink.

    Returns
    -------
    int
        512 when ``original_dim // 1.3`` is at least 512, 128 when it is at
        most 128, otherwise the floor-divided value converted to ``int``.
    """
    shrunk = original_dim // 1.3

    # Guard clauses handle the two clamped ends first.
    if shrunk >= 512:
        return 512
    if shrunk <= 128:
        return 128
    return int(shrunk)

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def flatten_ts(train, test):
    """Pad all series to one length, flatten every sample and robust-scale it.

    Every cell of ``train`` / ``test`` must hold one sequence that supports
    ``len()`` and integer indexing ``cell[k]`` for ``k`` in ``range(len(cell))``.

    The common length is the mean length of all training cells, rounded up.
    ``pad_sequences`` is called with its default padding/truncation mode, so
    cells are padded or cut to that length. Each sample's columns are then
    laid out side by side in one flat row.

    Parameters
    ----------
    train, test : pandas.DataFrame
        One sample per row, one sequence per cell. ``test`` is reshaped with
        the column count of ``train``.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test, width)`` where the arrays come from a
        ``RobustScaler`` fitted on the training rows only and ``width`` is
        ``maxlen * number_of_train_columns``.
    """
    n_columns = train.columns.shape[0]

    # Target length: average training-cell length, rounded up to an integer.
    lengths = [
        len(sample[col])
        for _, sample in train.iterrows()
        for col in sample.index
    ]
    maxlen = np.ceil(np.average(lengths)).astype(int)
    width = maxlen * n_columns

    def pad_rows(frame):
        """Return one padded (n_columns, maxlen) float32 array per row of `frame`."""
        padded = []
        for _, sample in frame.iterrows():
            sequences = [
                [sample[col][k] for k in range(len(sample[col]))]
                for col in sample.index
            ]
            padded.append(pad_sequences(sequences, maxlen=maxlen, dtype='float32'))
        return padded

    new_train = pad_rows(train)
    new_test = pad_rows(test)

    train_df = pd.DataFrame(np.array(new_train).reshape(train.shape[0], width))
    test_df = pd.DataFrame(np.array(new_test).reshape(test.shape[0], width))

    # Fit the scaler on training rows only, then apply it to both splits.
    scaler = RobustScaler()
    scaler.fit(train_df)
    return scaler.transform(train_df), scaler.transform(test_df), width
#     return np.array(train_df), np.array(test_df), maxlen * train.columns.shape[0]

def rnn_reshape(train, test, n_steps, n_features):
    """Reshape two flattened arrays to ``(samples, n_steps, n_features)``.

    Parameters
    ----------
    train, test : array-like with ``.shape`` and ``.reshape``
        Arrays whose first axis is the sample axis.
    n_steps, n_features : int
        Sizes of the second and third axis of the result.

    Returns
    -------
    tuple
        ``(train_3d, test_3d)``.
    """
    def to_3d(flat):
        # Keep the sample axis, split the remaining values into steps x features.
        return flat.reshape(flat.shape[0], n_steps, n_features)

    return to_3d(train), to_3d(test)

In [7]:
# def normalize(Data):
#     Data=np.array(Data)
#     Dim = len(Data[0,:])
#     Min_Val = np.zeros(Dim)
#     Max_Val = np.zeros(Dim)
#     for i in range(Dim):
#         Min_Val[i] = np.min(Data[:,i])
#         Data[:,i] = Data[:,i] - np.min(Data[:,i])
#         Max_Val[i] = np.max(Data[:,i])
#         Data[:,i] = Data[:,i] / (np.max(Data[:,i]) + 1e-10)   
#     return pd.DataFrame(Data)

# def making_masking(data):
#     dataa=data.copy()
#     for i in range(data.shape[1]):
#         #print(i)
#         for j in range((data.shape[0])):
#             if np.isnan(data[i].iloc[j]):
#                 dataa[i].iloc[j]=0
#             else:
#                 dataa[i].iloc[j]=1
#     return(dataa)

# def gain_imputation(gain_input):
#     mask_input=making_masking(gain_input)
#     #print(mask_input)
#     #%% System Parameters
#     # 1. Mini batch size
#     mb_size = 32
#     # 2. Missing rate
#     # 3. Hint rate
#     p_hint = 0.8
#     # 4. Loss Hyperparameters
#     alpha = 10
#     # 5. Train Rate
#     train_rate = 1

#     #%% Data

#     Data=gain_input.to_numpy()

#     # Parameters
#     No = len(Data)
#     Dim = len(Data[0,:])

#     # Hidden state dimensions
#     H_Dim1 = Dim
#     H_Dim2 = Dim

#     # Normalization (0 to 1)
#     Min_Val = np.zeros(Dim)
#     Max_Val = np.zeros(Dim)


#     Data_df=pd.DataFrame(Data)
#     #%% Missing introducing
#     Missing=mask_input.to_numpy()
        
#     #%% Train Test Division    
    

#     Train_No = int(No * train_rate)
#     Test_No = No - Train_No
        
#     # Train / Test Features
#     trainX = Data
#     testX = Data

#     # Train / Test Missing Indicators
#     trainM = Missing
#     testM = Missing

#     #%% Necessary Functions

#     # 1. Xavier Initialization Definition
#     def xavier_init(size):
#         in_dim = size[0]
#         xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
#         return tf.random_normal(shape = size, stddev = xavier_stddev)
        
#     # Hint Vector Generation
#     def sample_M(m, n, p):
#         A = np.random.uniform(0., 1., size = [m, n])
#         B = A > p
#         C = 1.*B
#         return C
    
#     '''
#     GAIN Consists of 3 Components
#     - Generator
#     - Discriminator
#     - Hint Mechanism
#     '''   
    
#     #%% GAIN Architecture   
    
#     #%% 1. Input Placeholders
#     # 1.1. Data Vector
#     X = tf.placeholder(tf.float32, shape = [None, Dim])
#     # 1.2. Mask Vector 
#     M = tf.placeholder(tf.float32, shape = [None, Dim])
#     # 1.3. Hint vector
#     H = tf.placeholder(tf.float32, shape = [None, Dim])
#     # 1.4. X with missing values
#     New_X = tf.placeholder(tf.float32, shape = [None, Dim])

#     #%% 2. Discriminator
#     D_W1 = tf.Variable(xavier_init([Dim*2, H_Dim1]))     # Data + Hint as inputs
#     D_b1 = tf.Variable(tf.zeros(shape = [H_Dim1]))

#     D_W2 = tf.Variable(xavier_init([H_Dim1, H_Dim2]))
#     D_b2 = tf.Variable(tf.zeros(shape = [H_Dim2]))

#     D_W3 = tf.Variable(xavier_init([H_Dim2, Dim]))
#     D_b3 = tf.Variable(tf.zeros(shape = [Dim]))       # Output is multi-variate

#     theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

#     #%% 3. Generator
#     G_W1 = tf.Variable(xavier_init([Dim*2, H_Dim1]))     # Data + Mask as inputs (Random Noises are in Missing Components)
#     G_b1 = tf.Variable(tf.zeros(shape = [H_Dim1]))

#     G_W2 = tf.Variable(xavier_init([H_Dim1, H_Dim2]))
#     G_b2 = tf.Variable(tf.zeros(shape = [H_Dim2]))

#     G_W3 = tf.Variable(xavier_init([H_Dim2, Dim]))
#     G_b3 = tf.Variable(tf.zeros(shape = [Dim]))

#     theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

#     #%% GAIN Function

#     #%% 1. Generator
#     def generator(new_x,m):
#         inputs = tf.concat(axis = 1, values = [new_x,m])  # Mask + Data Concatenate
#         G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
#         G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)   
#         G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) # [0,1] normalized Output
        
#         return G_prob
        
#     #%% 2. Discriminator
#     def discriminator(new_x, h):
#         inputs = tf.concat(axis = 1, values = [new_x,h])  # Hint + Data Concatenate
#         D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)  
#         D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
#         D_logit = tf.matmul(D_h2, D_W3) + D_b3
#         D_prob = tf.nn.sigmoid(D_logit)  # [0,1] Probability Output
        
#         return D_prob

#     #%% 3. Other functions
#     # Random sample generator for Z
#     def sample_Z(m, n):
#         return np.random.uniform(0.,0.1, size = [m, n])        

#     # Mini-batch generation
#     def sample_idx(m, n):
#         A = np.random.permutation(m)
#         idx = A[:n]
#         return idx

#     #%% Structure
#     # Generator
#     G_sample = generator(New_X,M)

#     # Combine with original data
#     Hat_New_X = New_X * M + G_sample * (1-M)

#     # Discriminator
#     D_prob = discriminator(Hat_New_X, H)

#     #%% Loss
#     D_loss1 = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1-M) * tf.log(1. - D_prob + 1e-8)) 
#     G_loss1 = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))
#     MSE_train_loss = tf.reduce_mean((M * New_X - M * G_sample)**2) / tf.reduce_mean(M)

#     D_loss = D_loss1
#     G_loss = G_loss1 + alpha * MSE_train_loss 

#     #%% MSE Performance metric
#     MSE_test_loss = tf.reduce_mean(((1-M) * X - (1-M)*G_sample)**2) / tf.reduce_mean(1-M)

#     #%% Solver
#     D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
#     G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

#     # Sessions
#     sess = tf.Session()
#     sess.run(tf.global_variables_initializer())

#     #%% Iterations

#     #%% Start Iterations
#     for it in tqdm(range(10000)):    
#         #%% Inputs
#         mb_idx = sample_idx(Train_No, mb_size)
#         X_mb = trainX[mb_idx,:]  
#         Z_mb = sample_Z(mb_size, Dim) 
#         M_mb = trainM[mb_idx,:]  
#         H_mb1 = sample_M(mb_size, Dim, 1-p_hint)
#         H_mb = M_mb * H_mb1
        
#         New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb  # Missing Data Introduce
        
#         _, D_loss_curr = sess.run([D_solver, D_loss1], feed_dict = {M: M_mb, New_X: New_X_mb, H: H_mb})
#         _, G_loss_curr, MSE_train_loss_curr, MSE_test_loss_curr = sess.run([G_solver, G_loss1, MSE_train_loss, MSE_test_loss],
#                                                                         feed_dict = {X: X_mb, M: M_mb, New_X: New_X_mb, H: H_mb})
                
            
#         #%% Intermediate Losses
#         if it % 100 == 0:
#             print('Iter: {}'.format(it))
#             print('Train_loss: {:.4}'.format(np.sqrt(MSE_train_loss_curr)))
#             print('Test_loss: {:.4}'.format(np.sqrt(MSE_test_loss_curr)))
#             print()
        
#     #%% Final Loss
    
   
#     Z_mb = sample_Z(Train_No, Dim) 
   
#     M_mb = trainM
#     X_mb = trainX

#     New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb  # Missing Data Introduce
#     MSE_final, Sample = sess.run([MSE_test_loss, G_sample], feed_dict = {X: trainX, M: trainM, New_X: New_X_mb})
#     #print('Final Test RMSE: ' + str(np.sqrt(MSE_final)))
#     New_X_mb=pd.DataFrame(New_X_mb)
#     New_X_mb = New_X_mb.fillna(New_X_mb.mean()).fillna(0)
#     np.array(New_X_mb)
#     return(New_X_mb)

In [8]:
import random

def remove_and_impute(train_data, test_data, missing_rate, method='mean', random_state=None):
    """Blank out a random share of the flattened training data, then impute it.

    Both splits are first flattened and scaled by ``flatten_ts``. A fraction
    ``missing_rate`` of the training cells is then replaced by NaN and filled
    again with the chosen strategy. The first and the last column are never
    blanked. The test split is returned exactly as ``flatten_ts`` produced it.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Raw splits, passed straight to ``flatten_ts``.
    missing_rate : float
        Fraction of the eligible training cells to blank out.
    method : str, default 'mean'
        'mean' fills with the column mean, 'last' forward-fills then
        back-fills, and any other value fills with 0. Cells that are still NaN
        afterwards are always set to 0.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy generator is used, so the mask follows the notebook-wide
        ``np.random.seed``.

    Returns
    -------
    tuple
        ``(imputed_train, test, n_steps)`` where ``n_steps`` is the third value
        returned by ``flatten_ts`` (the flattened row width).

    Raises
    ------
    ValueError
        If the requested number of cells is negative or exceeds the number of
        eligible cells.
    """
    train, test, n_steps = flatten_ts(train_data, test_data)

    # Work on a private copy so NaN can be written without touching `train`.
    values = np.array(train)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(float)
    n_rows, n_cols = values.shape

    # Eligible cells are all rows x the interior columns 1 .. n_cols - 2.
    n_inner_cols = max(n_cols - 2, 0)
    n_cells = n_rows * n_inner_cols
    n_missing = int(round(missing_rate * n_cells))

    if n_missing != 0:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # Draw distinct flat positions, then map them back to (row, column).
        picked = rng.choice(n_cells, size=n_missing, replace=False)
        values[picked // n_inner_cols, 1 + picked % n_inner_cols] = np.nan

    new_train = pd.DataFrame(values)

    if method == 'mean':
        new_train = new_train.fillna(new_train.mean()).fillna(0)
    elif method == 'last':
        # NOTE(review): ffill/bfill run along axis 0 here, i.e. a gap is filled
        # from the neighbouring training sample rather than from the previous
        # time step of the same sample - confirm this is intended.
        new_train = new_train.ffill().bfill().fillna(0)
    else:
        # No other imputer is wired in, so unknown methods fall back to zeros.
        new_train = new_train.fillna(0)

    return np.array(new_train), np.array(test), n_steps

In [9]:
from TRepNet import TRepNet

In [10]:
# Early-stopping callback for Keras training runs: patience of 5, and the best
# weights seen are restored when training stops.
es = keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)
# mc = keras.callbacks.ModelCheckpoint('model.h5', save_best_only=True)

In [11]:
# Imputation strategy that evaluate() forwards to remove_and_impute():
# 'mean' or 'last'; any other value ends up as zero filling. The value is also
# embedded in the names of the result CSV files.
method = "mean"

In [12]:
def evaluate(data_name, univariate, missing_rate):
    """Score the SVM baseline on one dataset with artificially missing values.

    The dataset is loaded with ``load_data``, its training split is corrupted
    and imputed by ``remove_and_impute`` (using the module-level ``method``),
    and the module-level ``svm_clf`` is fitted on the result and scored on the
    test split. Only the SVM baseline is evaluated here.

    Parameters
    ----------
    data_name : str
        Dataset name understood by ``load_data``.
    univariate : bool
        Forwarded to ``load_data``.
    missing_rate : float
        Fraction of training cells to blank out before imputation.

    Returns
    -------
    None
        The scores are printed and one record with the keys 'dataset', 'dim',
        'SVM-ACC' and 'SVM-F1' is appended to the module-level ``results``
        list, which must exist before this function is called.
    """
    print('Data: ', data_name)

    train_x, train_y, test_x, test_y = load_data(data_name, univariate=univariate)
    X_train, X_test, n_steps = remove_and_impute(
        train_x,
        test_x,
        missing_rate=missing_rate,
        method=method,
    )
    n_features = train_x.columns.shape[0]

    # SVM: fit on the imputed training data, predict the untouched test data.
    svm_clf.fit(X_train, train_y)
    predictions = svm_clf.predict(X_test)
    svm_scores = {
        'accuracy': accuracy_score(test_y, predictions),
        'f1': f1_score(test_y, predictions, average='weighted'),
    }
    print('SVM >>', svm_scores)

    record = {
        'dataset': data_name,
        'dim': ', '.join([str(n_steps), str(n_features)]),
        'SVM-ACC': svm_scores['accuracy'],
        'SVM-F1': svm_scores['f1'],
    }
    results.append(record)

In [13]:
selected_uni_datasets = ['ArrowHead', 'BeetleFly', 'ChlorineConcentration', 'Crop', 'DiatomSizeReduction', 'Earthquakes','ECG200', 'ECG5000', 'ECGFiveDays',
                         'FreezerSmallTrain', 'Fungi', 'GunPoint', 'GunPointAgeSpan','GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'Herring', 
                         'InsectEPGRegularTrain', 'InsectEPGSmallTrain', 'InsectWingbeatSound', 'Lightning2', 'MedicalImages', 'MiddlePhalanxTW',
                         'NonInvasiveFetalECGThorax2', 'OliveOil', 'PhalangesOutlinesCorrect', 'PickupGestureWiimoteZ','PigAirwayPressure', 'PowerCons',
                         'ProximalPhalanxOutlineAgeGroup', 'SemgHandGenderCh2', 'SemgHandMovementCh2', 'SemgHandSubjectCh2', 'SmoothSubspace', 'StarLightCurves',
                         'SyntheticControl', 'Trace', 'UMD', 'UWaveGestureLibraryAll', 'Wafer', 'Yoga']

# Create the output folder before any work is done: DataFrame.to_csv does not
# create missing directories, and failing only after a full sweep over all
# datasets would throw the collected scores away.
uni_output_dir = './results/missing values'
os.makedirs(uni_output_dir, exist_ok=True)

# One CSV per missing rate (in percent), one row per dataset.
for mr in [90, 80, 60, 50, 40, 20, 10]:
    print('Missing Rate:', mr)
    # evaluate() appends to the module-level `results`, so start a fresh list
    # for every missing rate.
    results = []
    for dataset in selected_uni_datasets:
        evaluate(dataset, univariate=True, missing_rate=mr/100)
    pd.DataFrame(results).to_csv(uni_output_dir + '/uni-baseline-' + str(mr) + '-' + method + '.csv', index=False)

Missing Rate: 90
Data:  ArrowHead
SVM >> {'accuracy': 0.4114285714285714, 'f1': 0.41559023609443774}
Data:  BeetleFly
SVM >> {'accuracy': 0.5, 'f1': 0.3333333333333333}
Data:  ChlorineConcentration
SVM >> {'accuracy': 0.23671875, 'f1': 0.10048266513047843}
Data:  Crop
SVM >> {'accuracy': 0.06976190476190476, 'f1': 0.04833819637355565}
Data:  DiatomSizeReduction
SVM >> {'accuracy': 0.3006535947712418, 'f1': 0.13899563175353893}
Data:  Earthquakes
SVM >> {'accuracy': 0.7482014388489209, 'f1': 0.6404357995085412}
Data:  ECG200
SVM >> {'accuracy': 0.36, 'f1': 0.2074536408864767}
Data:  ECG5000
SVM >> {'accuracy': 0.7431111111111111, 'f1': 0.6971968171560134}
Data:  ECGFiveDays
SVM >> {'accuracy': 0.6445993031358885, 'f1': 0.6171183319144475}
Data:  FreezerSmallTrain
SVM >> {'accuracy': 0.5, 'f1': 0.3333333333333333}
Data:  Fungi
SVM >> {'accuracy': 0.13978494623655913, 'f1': 0.11509185368351378}
Data:  GunPoint
SVM >> {'accuracy': 0.6333333333333333, 'f1': 0.5790530360740102}
Data:  GunPoi

In [None]:
selected_mul_datasets = ['ArticularyWordRecognition', 'AtrialFibrillation', 'BasicMotions', 'Cricket', 'EthanolConcentration',
                         'ERing', 'HandMovementDirection', 'Handwriting', 'JapaneseVowels', 'PenDigits', 'RacketSports', 'SelfRegulationSCP1',
                         'SelfRegulationSCP2', 'SpokenArabicDigits', 'StandWalkJump']

# Create the output folder before any work is done: DataFrame.to_csv does not
# create missing directories, and failing only after a full sweep over all
# datasets would throw the collected scores away.
mul_output_dir = './results/missing values'
os.makedirs(mul_output_dir, exist_ok=True)

# One CSV per missing rate (in percent), one row per dataset.
for mr in [90, 80, 60, 50, 40, 20, 10]:
    print('Missing Rate:', mr)
    # evaluate() appends to the module-level `results`, so start a fresh list
    # for every missing rate.
    results = []
    for dataset in selected_mul_datasets:
        evaluate(dataset, univariate=False, missing_rate=mr/100)
    pd.DataFrame(results).to_csv(mul_output_dir + '/mul-baseline-' + str(mr) + '-' + method + '.csv', index=False)

Missing Rate: 90
Data:  ArticularyWordRecognition
SVM >> {'accuracy': 0.04, 'f1': 0.003076923076923077}
Data:  AtrialFibrillation
SVM >> {'accuracy': 0.4, 'f1': 0.28654970760233917}
Data:  BasicMotions
SVM >> {'accuracy': 0.25, 'f1': 0.1}
Data:  Cricket
SVM >> {'accuracy': 0.08333333333333333, 'f1': 0.012820512820512822}
Data:  EthanolConcentration
SVM >> {'accuracy': 0.3155893536121673, 'f1': 0.20678513148511632}
Data:  ERing
SVM >> {'accuracy': 0.4444444444444444, 'f1': 0.37692743730315553}
Data:  HandMovementDirection
SVM >> {'accuracy': 0.40540540540540543, 'f1': 0.23388773388773387}
Data:  Handwriting
SVM >> {'accuracy': 0.052941176470588235, 'f1': 0.007510324630062788}
Data:  JapaneseVowels
SVM >> {'accuracy': 0.12972972972972974, 'f1': 0.05661013854244131}
Data:  PenDigits
SVM >> {'accuracy': 0.6889651229273871, 'f1': 0.6906894810229469}
Data:  RacketSports
SVM >> {'accuracy': 0.2631578947368421, 'f1': 0.10964912280701755}
Data:  SelfRegulationSCP1
SVM >> {'accuracy': 0.55631399