In [1]:
# import mysql.connector

# mydb = mysql.connector.connect(
#   host="localhost",
#   user="root",
#   passwd="",
#   database="anomaly_detection_decision_support"
# )

# mycursor = mydb.cursor()


def open_connection():
    """Open a MySQL connection to the decision-support database.

    Connection settings are read from environment variables so that no
    credentials are stored in the notebook:

    * ``ANOMALY_DB_HOST``     - server address (default ``"34.68.13.182"``)
    * ``ANOMALY_DB_USER``     - user name (default ``"root"``)
    * ``ANOMALY_DB_PASSWORD`` - password (required, no default)
    * ``ANOMALY_DB_NAME``     - schema name
      (default ``"anomaly_detection_decision_support"``)

    Returns:
        tuple: ``(mydb, mycursor)`` - the open connection and a cursor on it.

    Raises:
        RuntimeError: if ``ANOMALY_DB_PASSWORD`` is not set.
    """
    # Imported locally so this cell does not depend on imports made by a
    # later cell of the notebook.
    import os

    password = os.environ.get("ANOMALY_DB_PASSWORD")
    if password is None:
        raise RuntimeError(
            "ANOMALY_DB_PASSWORD is not set; export the database password "
            "in the environment before opening a connection"
        )

    import mysql.connector

    mydb = mysql.connector.connect(
        host=os.environ.get("ANOMALY_DB_HOST", "34.68.13.182"),
        user=os.environ.get("ANOMALY_DB_USER", "root"),
        passwd=password,
        database=os.environ.get("ANOMALY_DB_NAME", "anomaly_detection_decision_support"),
    )

    mycursor = mydb.cursor()
    return mydb, mycursor

def close_connection(mydb, mycursor):
    """Close a cursor and then the connection it belongs to.

    Args:
        mydb: connection object exposing ``close()``.
        mycursor: cursor object exposing ``close()``.

    The connection is closed even when closing the cursor raises, so a
    failing cursor cannot leak the underlying connection. Any exception
    raised by either ``close()`` call still propagates to the caller.
    """
    try:
        mycursor.close()
    finally:
        mydb.close()

In [2]:
import os
import json
import mysql.connector


def get_datasets(path):
    """Collect dataset descriptions found below ``path``.

    Every directory under ``path`` (at any depth) that contains a
    ``metadata.json`` file is treated as a dataset. Directories without
    that file (for example sub-folders holding raw data) are skipped.

    Args:
        path: root directory to search.

    Returns:
        list[dict]: one dict per dataset, as loaded from ``metadata.json``,
        with the entries of its ``'files'`` list rewritten to paths joined
        onto the dataset directory.

    Raises:
        KeyError: if a ``metadata.json`` has no ``'files'`` entry.
    """
    datasets = []
    for root, dirs, _files in os.walk(path):
        for dirname in dirs:
            dataset_dir = os.path.join(root, dirname)
            metadata_path = os.path.join(dataset_dir, 'metadata.json')
            # os.walk also visits nested folders; only those carrying a
            # metadata file describe a dataset.
            if not os.path.isfile(metadata_path):
                continue
            with open(metadata_path, encoding='utf-8') as json_file:
                metadata = json.load(json_file)
            metadata['files'] = [os.path.join(dataset_dir, filename) for filename in metadata['files']]
            datasets.append(metadata)
    return datasets

def get_methods():
    """Return the catalogue of methods to benchmark.

    Returns:
        list[dict]: one entry per method with the keys ``'name'``,
        ``'parameters'`` (an empty dict for every method) and
        ``'isSupervised'``.
    """
    # (method name, needs labels?)
    catalogue = (
        ('gaussian', False),
        ('linear_regression', False),
        ('pca', False),
        ('kmeans', False),
        ('neural_network', True),
    )

    # A fresh parameters dict is created per entry so entries never share state.
    return [
        {'name': method_name, 'parameters': {}, 'isSupervised': needs_labels}
        for method_name, needs_labels in catalogue
    ]
                       

In [3]:
#initialize tensorflow
import tensorflow as tf
from tensorflow.python.client import device_lib
import os
# Ask TensorFlow's native layer to log errors only.
# NOTE(review): this is assigned after `import tensorflow` above; the variable
# may need to be set before that import to have any effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# When COLAB_TPU_ADDR is present, mirror it into TPU_ADDR, which is the
# variable the TPU setup below looks at.
if 'COLAB_TPU_ADDR' in os.environ:
    os.environ['TPU_ADDR'] = os.environ['COLAB_TPU_ADDR']
# else:
#   os.environ['TPU_ADDR'] = '10.15.20.26:8470'

# `strategy` stays None unless a TPU address is configured.
strategy = None
if 'TPU_ADDR' in os.environ:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu='grpc://{}'.format(os.environ['TPU_ADDR'])
    )
    tf.config.experimental_connect_to_cluster(resolver)
    # The TPU system has to be initialised before anything else uses it.
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

def get_devices():
    """List the devices the benchmark should run on.

    Returns:
        list[str]: ``['ASIC']`` when the module-level ``strategy`` is set;
        otherwise the names of all local devices reported by
        ``device_lib.list_local_devices()`` whose name does not contain
        ``'XLA'``.
    """
    if not strategy:
        return [
            local_device.name
            for local_device in device_lib.list_local_devices()
            if 'XLA' not in local_device.name
        ]
    return ['ASIC']

In [4]:
# data cleansing
import numpy as np
import pandas as pd
from pymfe.mfe import MFE
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def transform_data(dataset, rows = 1000, mfe = True):
    """Load a dataset's CSV files and turn them into benchmark inputs.

    Args:
        dataset: dataset description with at least the keys ``'files'``
            (list of CSV paths) and ``'label'`` (name of the target column,
            or a falsy value for unlabelled data).
        rows: total row budget; each file contributes
            ``int(rows / len(dataset['files']))`` rows.
        mfe: when ``True``, additionally extract pymfe meta-features from
            the "general", "statistical" and "info-theory" groups.

    Returns:
        tuple: ``(features, target, feature_scores, ft)`` where
        ``features`` is the standardised feature matrix, ``target`` is the
        label array (``None`` without a label), ``feature_scores`` maps each
        feature column to its ExtraTrees importance (``None`` without a
        label) and ``ft`` is the extracted meta-features (``()`` when
        ``mfe`` is not ``True``).

    Raises:
        ZeroDivisionError: if ``dataset['files']`` is empty.
    """
    # read data from files: every file gets an equal share of the row budget
    chunk = int(rows/len(dataset['files']))
    frames = []
    reference_columns = None
    for filename in dataset['files']:
        # NOTE(review): error_bad_lines / warn_bad_lines are deprecated in
        # newer pandas releases in favour of on_bad_lines='skip' - confirm
        # against the installed pandas version before upgrading.
        df_temp = pd.read_csv(filename, error_bad_lines=False, warn_bad_lines=False,
                              index_col=None, nrows=chunk)
        if reference_columns is not None:
            # later files are aligned by position onto the first file's header
            df_temp.columns = reference_columns
        elif df_temp.columns.size != 0:
            reference_columns = df_temp.columns
        frames.append(df_temp)
    # Concatenate once instead of growing a frame inside the loop: avoids
    # repeated copying and avoids concatenating with an empty seed frame.
    df = pd.concat(frames, axis=0)

    # convert objects to category codes
    for col_name in df.columns:
        if df[col_name].dtype == 'object':
            df[col_name] = df[col_name].astype('category').cat.codes.astype('int64')

    # split dataset into features and target arrays
    label = dataset['label']
    features = target = feature_scores = None
    if label:
        feature_frame = df.drop(columns=[label])
        features = np.nan_to_num(feature_frame.to_numpy())
        target = df[label].to_numpy()

        # features selection: importance of every feature column for the label
        clf = ExtraTreesClassifier(n_estimators=50)
        clf = clf.fit(features, target)
        feature_scores = pd.DataFrame(np.vstack([clf.feature_importances_]),
                                      columns=feature_frame.columns.values).to_dict(orient='records')[0]
    else:
        features = df.to_numpy()

    # standardize dataset; NaNs left after scaling are replaced by zeros
    scaler = preprocessing.StandardScaler()
    features = scaler.fit_transform(features)
    features = np.nan_to_num(features)

    # Extract general, statistical and information-theoretic measures
    ft = ()
    if mfe == True:
        # A separate name is used for the extractor so the `mfe` flag
        # argument is not overwritten.
        extractor = MFE(groups=["general", "statistical", "info-theory"], suppress_warnings = True)
        if label:
            extractor.fit(features, target)
        else:
            extractor.fit(features)
        ft = np.nan_to_num(extractor.extract())

    return features, target, feature_scores, ft

In [5]:
import tensorflow_probability as tfp

def gaussian(features):
    """Time the estimation of a Gaussian model (mean and covariance).

    The covariance is computed as ``X^T X / n - mu^T mu``. The density
    evaluation itself is currently disabled, so the "test" interval spans
    no work between its two timestamps.

    Args:
        features: 2-D array-like of samples by features.
            Assumes float64 values, since the row count is cast to
            ``tf.float64`` for the division - TODO confirm.

    Returns:
        tuple: ``(train_time, test_time)`` as NumPy scalars; ``test_time``
        is divided by the number of rows.
    """
    # --- training interval -------------------------------------------------
    train_started = tf.timestamp()
    features = tf.constant(features)
    column_count = features.shape[1]
    mu = tf.reshape(tf.reduce_mean(features, axis=0), [1, column_count])
    mean_outer = tf.matmul(tf.transpose(mu), mu)
    second_moment = tf.matmul(tf.transpose(features), features) / tf.cast(tf.shape(features)[0], tf.float64)
    sigma = second_moment - mean_outer
    # (disabled) mvn = tfp.distributions.MultivariateNormalTriL(loc=mu, scale_tril=tf.linalg.cholesky(sigma))
    train_time = tf.timestamp() - train_started

    # --- test interval -----------------------------------------------------
    test_started = tf.timestamp()
    # (disabled) mvn.prob(tf.constant(features))
    test_elapsed = tf.timestamp() - test_started
    test_time = test_elapsed / features.shape[0]

    return train_time.numpy(), test_time.numpy()

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def linear(features):
    """Time fitting a single-unit linear Keras model and a distance pass.

    A one-neuron linear ``Dense`` layer is fitted with ``features`` as both
    input and target; afterwards a point-to-hyperplane style distance is
    computed from the learned weights and bias for every row.

    Args:
        features: 2-D array of samples by features.

    Returns:
        tuple: ``(train_time, test_time)`` as NumPy scalars; ``test_time``
        is divided by the number of rows.
    """
    # single linear unit over all input columns
    model = Sequential()
    model.add(Dense(1, input_dim=features.shape[1], activation='linear'))
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=['mean_absolute_error', 'mean_squared_error'],
    )

    # training interval
    fit_started = tf.timestamp()
    model.fit(features, features)
    train_time = tf.timestamp() - fit_started

    # kernel of the dense layer as a row vector
    weights = tf.transpose(model.get_weights()[0])

    # test interval: |w.x + b| / ||w|| for every row
    distance_started = tf.timestamp()
    projection = tf.reduce_sum(tf.multiply(weights, features), axis=1)
    bias = model.get_weights()[1]
    numerator = tf.math.abs(tf.math.add(projection, bias))
    weight_norm = tf.math.sqrt(tf.reduce_sum(tf.multiply(weights, weights), axis=1))
    distance = tf.math.divide(numerator, weight_norm)
    test_time = (tf.timestamp() - distance_started) / features.shape[0]

    return train_time.numpy(), test_time.numpy()
    

In [7]:
def pca(features):
    """Time a singular value decomposition of the mean-centred features.

    Args:
        features: 2-D array of samples by features.

    Returns:
        tuple: ``(train_time, test_time)`` as NumPy scalars; both carry the
        same value, the duration of the single SVD call.
    """
    # Centre every column on its mean.
    # NOTE(review): whether this `-=` rebinds the local name or writes into
    # the caller's array depends on how NumPy and TensorFlow resolve the
    # in-place operator - confirm callers do not rely on `features` being
    # left untouched.
    features -= tf.reduce_mean(features, axis=0)

    svd_started = tf.timestamp()
    tf.linalg.svd(features)  # decomposition result itself is not used
    elapsed = (tf.timestamp() - svd_started).numpy()

    # the same measurement is reported for training and for testing
    return elapsed, elapsed

In [8]:
# function which returns data (necessary for tensorflow v1)
def input_fn():
    """Feed the module-level ``features`` to an estimator for one epoch.

    Takes no arguments: it reads the global name ``features`` at call time,
    converts it to a float32 tensor and limits it to a single epoch.
    """
    as_float32 = tf.convert_to_tensor(features, dtype=tf.float32)
    return tf.compat.v1.train.limit_epochs(as_float32, num_epochs=1)


def kmeans(features):
    """Time training and prediction of a 5-cluster KMeans estimator.

    Args:
        features: 2-D array of samples by features; converted to a float32
            tensor and fed to the estimator for exactly one epoch.

    Returns:
        tuple: ``(train_time, test_time)`` as NumPy scalars. ``test_time``
        is the total time to predict a cluster index for every row.
        NOTE(review): some other methods in this notebook report per-row
        test time (divided by the number of rows) - confirm which unit the
        stored inference time is meant to have.
    """
    def _input_fn():
        # Bind the estimator input to this function's argument instead of
        # whatever global happens to be called `features`.
        return tf.compat.v1.train.limit_epochs(
            tf.convert_to_tensor(features, dtype=tf.float32), num_epochs=1)

    # build a model
    model = tf.compat.v1.estimator.experimental.KMeans(5)

    # train a model
    start_time = tf.timestamp()
    model.train(_input_fn)
    train_time = (tf.timestamp()-start_time)

    # test a model
    start_time = tf.timestamp()
    # predict_cluster_index yields its results lazily; it has to be consumed
    # for the prediction to actually run inside the timed interval.
    for _cluster_index in model.predict_cluster_index(_input_fn):
        pass
    test_time = (tf.timestamp()-start_time)

    return train_time.numpy(), test_time.numpy()

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def nn(features, target):
    """Time training and prediction of a small binary classifier.

    The network has one hidden ReLU layer as wide as the input and a single
    sigmoid output, trained with binary cross-entropy.

    Args:
        features: 2-D array of samples by features.
        target: label array passed to ``fit`` / ``evaluate``.

    Returns:
        tuple: ``(train_time, test_time, validation)``. The two times are
        NumPy scalars (``test_time`` divided by the number of rows);
        ``validation`` is the output of ``model.evaluate``: the loss
        followed by mean squared error, AUC, precision, recall and
        accuracy, in that order.
    """
    # create a NN
    model = Sequential()
    model.add(Dense(features.shape[1], input_dim=features.shape[1], activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # build a model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
        tf.keras.metrics.MeanSquaredError(),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        # The output is a sigmoid probability, so accuracy has to threshold
        # it; plain Accuracy() would compare probabilities to labels for
        # exact equality.
        tf.keras.metrics.BinaryAccuracy()
    ])

    # measure training time
    start_time = tf.timestamp()
    model.fit(features, target)
    train_time = tf.timestamp()-start_time

    # measure test time
    start_time = tf.timestamp()
    model.predict(features)  # predictions are only produced for timing
    test_time = (tf.timestamp()-start_time)/features.shape[0]

    # Score the fitted model. This uses the same data it was trained on,
    # so it is not a held-out or cross-validated estimate.
    validation = model.evaluate(features, target)

    return train_time.numpy(), test_time.numpy(), validation
    

In [10]:
def evaluate(device, methods, dataset, features, target):
    """Run every applicable method on one dataset and store its timings.

    A method is applicable when its ``'isSupervised'`` flag matches whether
    a ``target`` is available. Each result is passed to
    ``insert_evaluation_info``.

    Args:
        device: device name forwarded to ``insert_evaluation_info``.
        methods: list of method dicts with ``'name'`` and ``'isSupervised'``.
        dataset: dataset description forwarded to ``insert_evaluation_info``.
        features: feature matrix handed to the method.
        target: label array, or ``None`` for unlabelled data.

    Raises:
        ValueError: if an applicable method has an unknown name. Without
            this check no fresh result would exist for that method, and a
            previous method's result could be stored under its name.
    """
    known_names = ('gaussian', 'linear_regression', 'pca', 'kmeans', 'neural_network')
    isSupervised = target is not None
    for method in methods:
        if method['isSupervised'] != isSupervised:
            continue

        name = method['name']
        if name not in known_names:
            raise ValueError('Unknown method name: ' + repr(name))

        # announce the method before it runs so the log shows what is in progress
        print('Running',name)

        if name == 'gaussian':
            result = gaussian(features)
        elif name == 'linear_regression':
            result = linear(features)
        elif name == 'pca':
            result = pca(features)
        elif name == 'kmeans':
            result = kmeans(features)
        else:  # 'neural_network'
            result = nn(features, target)

        insert_evaluation_info(device, method, dataset, result)

In [16]:
def clear_db():
    """Reset the database to its seed state.

    In order: empties and re-seeds ``algorithm``; empties
    ``device_characterization`` and ``device`` and re-seeds three devices
    with their characteristics; then empties ``dataset_characterization``,
    ``feature_score``, ``dataset``, ``performance``, ``parameter`` and
    ``evaluation``. The connection is closed even if a statement fails.
    """
    # every algorithm is seeded with the same complexity string
    algorithms = ('gaussian', 'linear_regression', 'pca', 'kmeans', 'neural_network')

    # NOTE(review): 'fequency' looks like a typo for 'frequency'; it is kept
    # as is because it is stored data that other code may look up by name.
    characteristic_names = (
        'transistor_count', 'core_count', 'technology', 'power_dissipation',
        'flops', 'fequency', 'memory_type', 'memory_size', 'memory_bandwidth', 'weight',
    )
    # (name, type, values in the order of characteristic_names)
    devices = (
        ('Intel Xeon', 'CPU', ('7.2', '2', '22', '180', '90', '4300', 'DRAM', '13', '', '')),
        ('Tesla K80', 'GPU', ('7.1', '2496', '28', '300', '2910', '1562', 'DRAM', '12', '', '')),
        ('Google TPU', 'ASIC', ('2.1', '2496', '28', '40', '180000', '700', 'SRAM', '16', '', '')),
    )

    mydb, mycursor = open_connection()
    try:
        # algorithms
        mycursor.execute('DELETE FROM algorithm')
        mydb.commit()
        sql = "INSERT INTO algorithm (name, complexity) VALUES (%s, %s)"
        for algorithm_name in algorithms:
            mycursor.execute(sql, (algorithm_name, 'O(n^2)'))
        mydb.commit()

        # devices and their characteristics
        mycursor.execute('DELETE FROM device_characterization')
        mycursor.execute('DELETE FROM device')
        for device_name, device_type, values in devices:
            sql = "INSERT INTO device (name, type) VALUES (%s, %s)"
            mycursor.execute(sql, (device_name, device_type))
            # id of the device row just inserted
            device_id = mycursor.lastrowid
            sql = "INSERT INTO device_characterization (device_id, name, value) VALUES (%s, %s, %s)"
            for characteristic, value in zip(characteristic_names, values):
                mycursor.execute(sql, (device_id, characteristic, value))
        mydb.commit()

        # dataset and evaluation data
        for table in ('dataset_characterization', 'feature_score', 'dataset',
                      'performance', 'parameter', 'evaluation'):
            mycursor.execute('DELETE FROM ' + table)
        mydb.commit()
    finally:
        close_connection(mydb, mycursor)
    
def insert_data_info(dataset, feature_score, ft):
    """Store a dataset with its meta-features and feature scores.

    If a dataset with the same name already exists nothing is inserted and
    the first column of the existing row is returned.

    Args:
        dataset: dataset description; reads ``'name'``, ``'type_of_data'``,
            ``'domain'``, ``'anomaly_types'``, ``'anomaly_space'``,
            ``'anomaly_entropy'``, ``'label'`` and ``'files'``. On insert,
            ``dataset['id']`` is set to the new row id.
        feature_score: mapping of feature name to score, or a falsy value.
        ft: meta-features; only stored when it has exactly two entries
            (names and values).

    Returns:
        The id of the existing or newly inserted dataset row.
    """
    mydb, mycursor = open_connection()
    try:
        # Parameterised lookup: dataset names may contain quotes.
        mycursor.execute("SELECT * FROM dataset WHERE name=%s", (str(dataset['name']),))
        # fetchall() so no unread rows are left on the cursor before closing
        existing_rows = mycursor.fetchall()
        if existing_rows:
            return existing_rows[0][0]

        sql = "INSERT INTO dataset (name, type_of_data, domain, anomaly_types, anomaly_space, anomaly_entropy, label, files) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        # list-valued fields are stored as their repr without the brackets
        val = (dataset['name'], str(dataset['type_of_data']).strip('[]'), str(dataset['domain']).strip('[]'), str(dataset['anomaly_types']).strip('[]'), dataset['anomaly_space'], dataset['anomaly_entropy'], dataset['label'], str(dataset['files']).strip('[]'))
        mycursor.execute(sql, val)
        mydb.commit()
        dataset['id'] = mycursor.lastrowid

        if len(ft) == 2:
            sql = "INSERT INTO dataset_characterization (dataset_id, name, value) VALUES (%s, %s, %s)"
            for i in range(len(ft[0])):
                mycursor.execute(sql, (dataset['id'], str(ft[0][i]), str(ft[1][i])))
            mydb.commit()

        if feature_score:
            sql = "INSERT INTO feature_score (dataset_id, name, value) VALUES (%s, %s, %s)"
            for key in feature_score:
                mycursor.execute(sql, (dataset['id'], str(key), str(feature_score[key])))
            mydb.commit()

        return dataset['id']
    finally:
        # runs on every path, including the early return for existing datasets
        close_connection(mydb, mycursor)


def insert_evaluation_info(device, method, dataset, result):
    """Store one benchmark result (timings and optional quality metrics).

    Args:
        device: device name; names containing ``'CPU'`` or ``'GPU'`` map to
            those device types, anything else maps to ``'ASIC'``.
        method: method dict; ``method['name']`` selects the algorithm row.
        dataset: dataset description; ``dataset['id']`` selects the dataset row.
        result: ``(train_time, test_time)`` or
            ``(train_time, test_time, validation)`` where ``validation`` is
            the list returned by the neural network's ``model.evaluate``:
            loss first, then mean squared error, AUC, precision, recall and
            accuracy.

    Raises:
        LookupError: if the device type, dataset or algorithm is not in the
            database.
    """
    # 'ASIC' is the type under which clear_db seeds the 'Google TPU' device.
    device_type = 'ASIC'
    if 'CPU' in device:
        device_type = 'CPU'
    elif 'GPU' in device:
        device_type = 'GPU'

    def _first_column(cursor, sql, params, description):
        # Return the first column of the first matching row; reads all rows
        # so nothing is left unread on the cursor.
        cursor.execute(sql, params)
        rows = cursor.fetchall()
        if not rows:
            raise LookupError('No ' + description + ' found for ' + repr(params[0]))
        return rows[0][0]

    mydb, mycursor = open_connection()
    try:
        device_id = _first_column(mycursor, "SELECT * FROM device WHERE type=%s", (device_type,), 'device')
        dataset_id = _first_column(mycursor, "SELECT * FROM dataset WHERE id=%s", (dataset['id'],), 'dataset')
        algorithm_id = _first_column(mycursor, "SELECT * FROM algorithm WHERE name=%s", (method['name'],), 'algorithm')

        # NOTE(review): evaluation_id is always written as 1 while the
        # performance rows below use the cursor's lastrowid - confirm
        # against the table schema that this is intended.
        sql = "INSERT INTO evaluation (evaluation_id, dataset_id, algorithm_id, device_id, training_time, inference_time) VALUES (%s, %s, %s, %s, %s, %s)"
        val = (1, dataset_id, algorithm_id, device_id, str(result[0]), str(result[1]))
        mycursor.execute(sql, val)
        mydb.commit()

        if len(result) == 3:
            evaluation_id = mycursor.lastrowid
            sql = "INSERT INTO performance (evaluation_id, name, value) VALUES (%s, %s, %s)"
            # result[2][0] is the loss; the named metrics start at index 1.
            metric_names = ('mean_squared_error', 'auc', 'precision', 'recall', 'accuracy')
            for position, metric_name in enumerate(metric_names, start=1):
                mycursor.execute(sql, (evaluation_id, metric_name, str(result[2][position])))
            mydb.commit()
    finally:
        close_connection(mydb, mycursor)

In [17]:
# Reset the database before a benchmark run: clear_db() deletes the rows of
# the benchmark tables and re-inserts the algorithm and device seed rows.
clear_db()

In [None]:
# Benchmark driver: for every dataset, prepare the data, register it in the
# database and run all applicable methods on every available device.
datasets = get_datasets('../GoogleDrive/Academic/PhD/III/datasets/')
devices = get_devices()
methods = get_methods()

# tf.profiler.experimental.start('logdir')

print('*** Datasets for evaluation:', len(datasets),'***')
for dataset_number, dataset in enumerate(datasets, start=1):
    print('*** Dataset:',dataset_number,'Name:',dataset['name'])

    # 100 rows per dataset, with meta-feature extraction enabled.
    # `features` must keep this global name: input_fn() reads it.
    features, target, feature_score, ft = transform_data(dataset, 100, True)
    dataset['id'] = insert_data_info(dataset, feature_score, ft)
    print('Features:', features.shape)

    for device in devices:
        # 'ASIC' runs under the TPU strategy scope, everything else is
        # pinned to the named device.
        placement = strategy.scope() if device == 'ASIC' else tf.device(device)
        with placement:
            evaluate(device, methods, dataset, features, target)

print('*** DONE ***')

# tf.profiler.experimental.stop()
    
    
    

*** Datasets for evaluation: 15 ***
*** Dataset: 1 Name: Green Card & H1B (2014-2018)




Features: (100, 27)
Running gaussian
Running linear_regression
Running pca
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/gv/w4dx2qm514jdz__wrb8hzjc80000gn/T/tmpv_89lwlg', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Will set it as 'np.nan' for all summary functions.
 Exception message: ZeroDivisionError('float division by zero').
 Will set it as 'np.nan' for all summary functions.


In [41]:
# %load_ext tensorboard
# %tensorboard --logdir=./logdir