## Read data from database

In [8]:
import os
# specify which GPU will be used
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [9]:
import os

import pymysql
from warnings import filterwarnings

_connection = None

def get_connection(db_config):
    """
    get db connection
    :return:
    """
    global _connection
    if _connection is None:
        _connection = pymysql.connect(host=db_config['host'], user=db_config['username'],
                                      password=db_config['password'],
                                      db=db_config['db'], charset="utf8")
        filterwarnings('ignore', category=pymysql.Warning)

    return _connection


def close():
    """
    close DB connection
    :return:
    """
    global _connection
    if _connection is not None:
        _connection.close()
    _connection = None

In [10]:
db = {
    'host': '172.26.187.242',
    'username': 'malware_r',
    'password': 'GEg22v2O7jbfWhb3',
    'db': 'malware'
}

### Fields

- mw_file_suffix: file name after hash value
- mw_file_prefix: directory
- mw_em_f: features of ember, splitted by ";"

In [11]:
import time

# the base function which can query sql and return dict data
def get_specific_data(table_suffix, sql=None):
    start_time = time.time()
    
    global _connection
    if _connection is None:
        raise Exception("please init db connect first")

    cursor = _connection.cursor()
    cursor.execute("SET NAMES utf8mb4")

    ret = []
        
    cursor.execute(sql)

    field_names = [i[0] for i in cursor.description]

    for row in cursor:
        temp = {}
        for key in range(len(row)):
            temp[field_names[key]] = row[key]
        ret.append(temp)
     
    cursor.close()
    # _connection.close()
    print("--- %s seconds ---" % (time.time() - start_time))
    
    return ret

In [12]:
close()
res1 = []
get_connection(db)
table_suffix = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F"]
# table_suffix = ["0","8"]
# Iterate all partitions of databases
for suffix in table_suffix:
    sql = """ 
select
  a.mw_file_hash,
  a.section_name,
  b.mw_file_suffix as mw_file_size,
  b.mw_file_prefix as mw_file_directory,
  b.mw_num_engines,
  a.pointerto_raw_data,
  a.virtual_size,
  d.mw_em_f
from (
       select
         mw_file_hash,
         section_name,
         pointerto_raw_data,
         virtual_size,
         count(1) as cnt
       from mw_index_2017_section_%s
       where CNT_CODE = 1 and MEM_EXECUTE = 1
       group by mw_file_hash) a
  inner join mw_index_2017_%s b on a.mw_file_hash = b.mw_file_hash and a.cnt = 1 and b.mw_num_engines <> -1 and
                                  (b.mw_num_engines >= 4 or b.mw_num_engines = 0) and
                                  b.mw_file_prefix in ('201701')
  inner join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
    """ % (suffix, suffix, suffix)
    res1.extend(get_specific_data(suffix, sql))
close()
print(len(res1))

--- 9.282475709915161 seconds ---
--- 10.822624206542969 seconds ---
--- 10.618620157241821 seconds ---
--- 10.548434257507324 seconds ---
--- 10.782588005065918 seconds ---
--- 10.80064058303833 seconds ---
--- 10.719825029373169 seconds ---
--- 10.58968210220337 seconds ---
--- 10.071983575820923 seconds ---
--- 10.547618865966797 seconds ---
--- 10.178751945495605 seconds ---
--- 10.721812009811401 seconds ---
--- 9.94730520248413 seconds ---
--- 8.613046646118164 seconds ---
--- 9.755885601043701 seconds ---
--- 9.516873121261597 seconds ---
77683


In [13]:
close()
res2 = []
get_connection(db)
table_suffix = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F"]
# table_suffix = ["0","8"]
# Iterate all partitions of databases
for suffix in table_suffix:
    sql = """ 
select
  a.mw_file_hash,
  a.section_name,
  b.mw_file_suffix as mw_file_size,
  b.mw_file_prefix as mw_file_directory,
  b.mw_num_engines,
  a.pointerto_raw_data,
  a.virtual_size,
  d.mw_em_f
from (
       select
         mw_file_hash,
         section_name,
         pointerto_raw_data,
         virtual_size,
         count(1) as cnt
       from mw_index_2017_section_%s
       where CNT_CODE = 1 and MEM_EXECUTE = 1
       group by mw_file_hash) a
  inner join mw_index_2017_%s b on a.mw_file_hash = b.mw_file_hash and a.cnt = 1 and b.mw_num_engines <> -1 and
                                  (b.mw_num_engines >= 4 or b.mw_num_engines = 0) and a.virtual_size < 10240 and
                                  b.mw_file_prefix in ('201705')
  inner join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
    """ % (suffix, suffix, suffix)
    res2.extend(get_specific_data(suffix, sql))
close()
print(len(res2))

--- 4.398529767990112 seconds ---
--- 7.239321947097778 seconds ---
--- 7.314236164093018 seconds ---
--- 7.149776220321655 seconds ---
--- 7.3079514503479 seconds ---
--- 7.48460841178894 seconds ---
--- 7.394437789916992 seconds ---
--- 7.230358123779297 seconds ---
--- 6.433512926101685 seconds ---
--- 7.308030366897583 seconds ---
--- 6.981007814407349 seconds ---
--- 7.216529369354248 seconds ---
--- 6.75273323059082 seconds ---
--- 6.520577430725098 seconds ---
--- 6.680319786071777 seconds ---
--- 6.436001777648926 seconds ---
14478


## Check and split data

In [14]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
from sklearn.model_selection import train_test_split

max_length = 10240

train_data = pd.DataFrame(res1)
# train_data = train_data.loc[train_data.virtual_size <= max_length]
# train_data = train_data.reset_index(drop=True)
train_data.mw_num_engines[train_data.mw_num_engines == 0 ] = 0
train_data.mw_num_engines[train_data.mw_num_engines >= 4 ] = 1
train_label = train_data.mw_num_engines.ravel()

test_data = pd.DataFrame(res2)
# test_data = test_data.loc[test_data.virtual_size <= max_length]
# test_data = test_data.reset_index(drop=True)
test_data.mw_num_engines[test_data.mw_num_engines == 0 ] = 0
test_data.mw_num_engines[test_data.mw_num_engines >= 4 ] = 1
test_label = test_data.mw_num_engines.ravel()

x_train, x_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.1, random_state=2345)
x_test = test_data
y_test = test_label
del train_data, test_data, train_label, res1, res2

x_train = x_train.reset_index(drop=True)
x_val = x_val.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## EMBER

In [30]:
import pandas as pd
import re
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, confusion_matrix

In [32]:
def get_ember_feature(data):
    ember_f = np.zeros((len(data.mw_em_f), 2351), dtype=float)
    for index, item in data.iterrows():
        ember_f[index, :] = item['mw_em_f'].split(';')
    return ember_f

In [55]:
def get_model(x_train, y_train, x_val, y_val):
    params = {'application': 'binary'}
    lgbm_dataset = lgb.Dataset(x_train, y_train.ravel())
    valid_sets = lgb.Dataset(x_val, y_val.ravel())

    model = lgb.train(params, lgbm_dataset, valid_sets=valid_sets, early_stopping_rounds=10)
    y_pred = model.predict(x_val, num_iteration=model.best_iteration)
    
    loss = log_loss(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred)
    acc = accuracy_score(y_val, (y_pred > 0.5).astype(int))
#     model.save_model(file_path + "-%04d-%.5f-%.5f.h5" % (model.best_iteration, loss, acc),
#                      num_iteration=model.best_iteration)
    print("val loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)
        
    return model

In [34]:
x_etrain = get_ember_feature(x_train)
x_eval = get_ember_feature(x_val)
x_etest = get_ember_feature(x_test)

In [35]:
import matplotlib.pyplot as plt
model = get_model(x_etrain, y_train, x_eval, y_val)

[1]	valid_0's binary_logloss: 0.614185
[2]	valid_0's binary_logloss: 0.550422
[3]	valid_0's binary_logloss: 0.496149
[4]	valid_0's binary_logloss: 0.450833
[5]	valid_0's binary_logloss: 0.410366
[6]	valid_0's binary_logloss: 0.376211
[7]	valid_0's binary_logloss: 0.346214
[8]	valid_0's binary_logloss: 0.319633
[9]	valid_0's binary_logloss: 0.296475
[10]	valid_0's binary_logloss: 0.275275
[11]	valid_0's binary_logloss: 0.255923
[12]	valid_0's binary_logloss: 0.239463
[13]	valid_0's binary_logloss: 0.223765
[14]	valid_0's binary_logloss: 0.21015
[15]	valid_0's binary_logloss: 0.19773
[16]	valid_0's binary_logloss: 0.186524
[17]	valid_0's binary_logloss: 0.176393
[18]	valid_0's binary_logloss: 0.167543
[19]	valid_0's binary_logloss: 0.158879
[20]	valid_0's binary_logloss: 0.15158
[21]	valid_0's binary_logloss: 0.144474
[22]	valid_0's binary_logloss: 0.138378
[23]	valid_0's binary_logloss: 0.132022
[24]	valid_0's binary_logloss: 0.126577
[25]	valid_0's binary_logloss: 0.121727
[26]	valid_0

In [28]:
def estimate_model(y_pred, test_y):
    
    loss = log_loss(test_y, y_pred)
    auc = roc_auc_score(test_y, y_pred)
    acc = accuracy_score(test_y, (y_pred > 0.5).astype(int))
    print("loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)

    fp_np_index = np.where(test_y == 0)
    fp_np = y_pred[fp_np_index].shape[0]
    thre_index = int(np.ceil(fp_np - fp_np * 0.001))

    sorted_pred_prob = np.sort(y_pred[fp_np_index], axis=0)
    thre = sorted_pred_prob[thre_index]
    if thre == 1:
        thre = max(sorted_pred_prob[np.where(sorted_pred_prob != 1)])

    y_pred_prob = np.vstack((y_pred.transpose(), (1 - y_pred).transpose())).transpose()
    y_pred_prob[:, 1] = thre
    y_pred_label = np.argmin(y_pred_prob, axis=-1)

    tn, fp, fn, tp = confusion_matrix(test_y, y_pred_label).ravel()
    fp_rate = fp / (fp + tn)
    recall_rate = tp / (tp + fn)

    print("thre: %.10f"%  thre)
    print("fp:  %.10f"%  fp_rate)
    print("recall:  %.10f"%  recall_rate)
    
    return auc, loss, recall_rate

In [36]:
y_p = model.predict(x_etest)
y_pred_e = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred_e[i, 0] = y_p[i]

estimate_model(y_pred_e, y_test)

loss : 0.04347
auc score : 0.99870
accuracy score : 0.98895
thre: 0.9313507233
fp:  0.0008241004
recall:  0.8116390216


(0.9986965581035286, 0.04346982346640275, 0.8116390216474557)

## Malcon

In [15]:
import keras
import numpy as np

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'

    def __init__(self, list_IDs, datasets, labels, batch_size=32, dim=8192, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.datasets = datasets
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.zeros((self.batch_size, self.dim), dtype=float)
        y = np.zeros(self.batch_size, dtype=float)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            base_path = "/ssd/2017/{0}/{1}{2}"
            item = self.datasets.loc[ID]
            file_path = base_path.format(item["mw_file_directory"], item["mw_file_hash"], item["mw_file_size"])
            in_file = open(file_path, 'rb')
            in_file.seek(item['pointerto_raw_data'])
            if item['virtual_size'] > max_length:
                bytes_data = [int(single_byte) for single_byte in in_file.read(max_length)]
            else:
                bytes_data = [int(single_byte) for single_byte in in_file.read(item['virtual_size'])]
            X[i, 0:len(bytes_data)] = bytes_data
            y[i] = self.labels[ID]

        return X, y

Using TensorFlow backend.


In [74]:
import hashlib
import json
import time

import keras
from keras import Input
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Dense, Embedding, Conv1D, Multiply, GlobalMaxPooling1D, Dropout, Activation
from keras.models import load_model
from sklearn.model_selection import train_test_split

class TMalConv(object):
    """
    train of mal conv
    """

    def __init__(self):
        self.max_len = max_length
        self.history = None
        self.model = None
        self.p_md5 = None
        self.time = time.time()
        self.summary = {
            'time':time.time(),
            'batch_size': 16,
            'epochs': 64,
            'g_c_filter': 128,
            'g_c_kernel_size': 500,
            'g_c_stride': 500,
        }

    def run(self):
        """
        :return:
        """
        self.train()
        
    def get_p(self, key):
        """
        get the parameter from the summary
        :param key:
        :return:
        """
        return self.summary[key]

    def gate_cnn(self, gate_cnn_input):
        """
        construct a gated cnn by the specific kernel size
        :param gate_cnn_input:
        :param kernel_size:
        :return:
        """
        gate_cnn_input = Activation('relu')(gate_cnn_input)
        
        conv1_out = Conv1D(self.get_p("g_c_filter"), self.get_p("g_c_kernel_size"), strides=self.get_p("g_c_stride"))(
            gate_cnn_input)
        conv2_out = Conv1D(self.get_p("g_c_filter"), self.get_p("g_c_kernel_size"), strides=self.get_p("g_c_stride"),
                           activation="sigmoid")(gate_cnn_input)
        
        merged = Multiply()([conv1_out, conv2_out])
        
        gate_cnn_output = GlobalMaxPooling1D()(merged)
        return gate_cnn_output

    def get_model(self):
        """
        get a model
        :param max_len:
        :param kernel_sizes:
        :return:
        """
        net_input = Input(shape=(self.max_len,))

        embedding_out = Embedding(256, 8, input_length=self.max_len)(net_input)
        merged = self.gate_cnn(embedding_out)

        dense_out = Dense(128)(merged)
        
        net_output = Dense(1, activation='sigmoid')(dense_out)

        model = keras.models.Model(inputs=net_input, outputs=net_output)
        
        model.summary()
        return model

    def train(self):
        batch_size = self.get_p("batch_size")
        epochs = self.get_p("epochs")

        self.model = self.get_model()

        print('Length of the train: ', len(x_train))
        print('Length of the validation: ', len(x_val))
        
#         tensor_board = TensorBoard(log_dir='./logs/', batch_size=batch_size)
        file_path = "/home/zhaoqi/BaseTrain/models/"+ str(self.time) +"-{epoch:04d}-{val_loss:.5f}-{val_acc:.5f}.h5"
        early_stopping = EarlyStopping("val_loss", patience=15, verbose=0, mode='auto')
        check_point = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=False, mode='auto')
        callbacks_list = [check_point, early_stopping]

        # Generators
        training_generator = DataGenerator(range(len(x_train)), x_train, y_train, batch_size, self.max_len)
        validation_generator = DataGenerator(range(len(x_val)), x_val, y_val, batch_size, self.max_len)
        
        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        self.model.fit_generator(generator=training_generator,
                                 validation_data=validation_generator,
                                 use_multiprocessing=True,
                                 epochs=epochs,
                                 workers=6,
                                 callbacks=callbacks_list)

In [16]:
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [75]:
t_instance = TMalConv()
t_instance.run()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 10240)        0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 10240, 8)     2048        input_5[0][0]                    
__________________________________________________________________________________________________
activation_4 (Activation)       (None, 10240, 8)     0           embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 20, 128)      512128      activation_4[0][0]               
__________________________________________________________________________________________________
conv1d_10 

Process ForkPoolWorker-648:
Process ForkPoolWorker-645:
Process ForkPoolWorker-646:
Process ForkPoolWorker-649:
Process ForkPoolWorker-647:
Process ForkPoolWorker-644:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/li

KeyboardInterrupt: 

In [26]:
model_dir = '/home/zhaoqi/BaseTrain/models/'
f_name = '1532937061.0332327-0006-0.13849-0.96546.h5'
c_model = load_model(model_dir + f_name)

test_generator = DataGenerator(range(len(x_test)), x_test, y_test, 16, max_length, False)
y_pred = c_model.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)



In [31]:
estimate_model(y_pred, y_test[0:len(y_pred)])

loss : 0.10697
auc score : 0.98493
accuracy score : 0.97283
thre: 0.9910161495
fp:  0.0006416720
recall:  0.7291139241


(0.9849250009572864, 0.10696545849992999, 0.7291139240506329)

## Merge features

### Merge Malconv and Ember

In [68]:
from keras.models import Model

model_f = Model(c_model.input, c_model.layers[-2].output)

train_generator = DataGenerator(range(len(x_train)), x_train, y_train, 16, max_length, False)
malcon_train_x = model_f.predict_generator(generator=train_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

val_generator = DataGenerator(range(len(x_val)), x_val, y_val, 16, max_length, False)
malcon_val_x = model_f.predict_generator(generator=val_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

test_generator = DataGenerator(range(len(x_test)), x_test, y_test, 16, max_length, False)
malcon_test_x = model_f.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)



In [70]:
def merge_feature(m_data, e_data):
    num = len(m_data)
    m_x = np.zeros((num, 128+2351), dtype=float)
    
    for index in range(num):
        m_x[index, 0:128] = m_data[index]
        m_x[index, 128:128+2351] = e_data[index]  
    return m_x

In [71]:
merge_train_x = merge_feature(malcon_train_x, x_etrain)
merge_val_x = merge_feature(malcon_val_x, x_eval)
merge_test_x = merge_feature(malcon_test_x, x_etest)

# model_m = get_model(merge_train_x, y_train[0:len(merge_train_x)], merge_val_x, y_val[0:len(merge_val_x)] )

In [73]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop

batch_size = 16
epochs = 20

model = Sequential()

model.add(Dense(128, activation='relu', input_shape=(128+2351,)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(merge_train_x, y_train[0:len(merge_train_x)],
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(merge_val_x, y_val[0:len(merge_val_x)]))
score = model.evaluate(merge_test_x, y_test[0:len(merge_test_x)], verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 128)               317440    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 33        
Total params: 321,601
Trainable params: 321,601
Non-trainable params: 0
_________________________________________________________________
Train on 69904 samples, validate on 7760 samples
Epoch 1/20
Epoch 2/20

KeyboardInterrupt: 