## Read data from database

In [1]:
import os
# specify which GPU will be used
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import os

import pymysql
from warnings import filterwarnings

_connection = None

def get_connection(db_config):
    """
    get db connection
    :return:
    """
    global _connection
    if _connection is None:
        _connection = pymysql.connect(host=db_config['host'], user=db_config['username'],
                                      password=db_config['password'],
                                      db=db_config['db'], charset="utf8")
        filterwarnings('ignore', category=pymysql.Warning)

    return _connection


def close():
    """
    close DB connection
    :return:
    """
    global _connection
    if _connection is not None:
        _connection.close()
    _connection = None

In [3]:
db = {
    'host': '172.26.187.242',
    'username': 'malware_r',
    'password': 'GEg22v2O7jbfWhb3',
    'db': 'malware'
}

### Fields

- mw_file_suffix: file name after hash value
- mw_file_prefix: directory
- mw_em_f: features of ember, splitted by ";"

In [4]:
import time

# the base function which can query sql and return dict data
def get_specific_data(table_suffix, sql=None):
    start_time = time.time()
    
    global _connection
    if _connection is None:
        raise Exception("please init db connect first")

    cursor = _connection.cursor()
    cursor.execute("SET NAMES utf8mb4")

    ret = []
        
    cursor.execute(sql)

    field_names = [i[0] for i in cursor.description]

    for row in cursor:
        temp = {}
        for key in range(len(row)):
            temp[field_names[key]] = row[key]
        ret.append(temp)
     
    cursor.close()
    # _connection.close()
    print("--- %s seconds ---" % (time.time() - start_time))
    
    return ret

In [5]:
close()
res1 = []
get_connection(db)
table_suffix = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F"]
# table_suffix = ["0","3","9","A","F"]
# Iterate all partitions of databases
for suffix in table_suffix:
    sql = """ 
select
  a.mw_file_hash,
  a.section_name,
  c.mw_file_suffix as mw_file_size,
  c.mw_file_prefix as mw_file_directory,
  c.mw_num_engines,
  a.pointerto_raw_data,
  a.virtual_size,
  d.mw_em_f
from mw_index_2017_section_%s as a
  inner join mw_index_2017_%s c on a.mw_file_hash = c.mw_file_hash
  inner join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
where (CNT_CODE = 1 or MEM_EXECUTE = 1) and c.mw_num_engines <> -1 and (c.mw_num_engines >= 4 or c.mw_num_engines = 0) and
      c.mw_file_prefix in ('201704')
    """ % (suffix, suffix, suffix)
    res1.extend(get_specific_data(suffix, sql))
close()
print(len(res1))

--- 5.31632137298584 seconds ---
--- 3.9965932369232178 seconds ---
--- 3.33967924118042 seconds ---
--- 3.151942253112793 seconds ---
--- 2.9704651832580566 seconds ---
--- 3.045807361602783 seconds ---
--- 3.091031789779663 seconds ---
--- 3.0149919986724854 seconds ---
--- 3.0040297508239746 seconds ---
--- 3.0724172592163086 seconds ---
--- 3.0729897022247314 seconds ---
--- 3.0630974769592285 seconds ---
--- 2.9828712940216064 seconds ---
--- 2.9788856506347656 seconds ---
--- 3.0753777027130127 seconds ---
--- 2.9015636444091797 seconds ---
162542


In [6]:
close()
res2 = []
get_connection(db)
table_suffix = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F"]
# table_suffix = ["0","3","9","A","F"]
# Iterate all partitions of databases
for suffix in table_suffix:
    sql = """ 
select
  a.mw_file_hash,
  a.section_name,
  c.mw_file_suffix as mw_file_size,
  c.mw_file_prefix as mw_file_directory,
  c.mw_num_engines,
  a.pointerto_raw_data,
  a.virtual_size,
  d.mw_em_f
from mw_index_2017_section_%s as a
  inner join mw_index_2017_%s c on a.mw_file_hash = c.mw_file_hash
  inner join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
where (CNT_CODE = 1 or MEM_EXECUTE = 1) and c.mw_num_engines <> -1 and (c.mw_num_engines >= 4 or c.mw_num_engines = 0) and
      c.mw_file_prefix in ('201705')
    """ % (suffix, suffix, suffix)
    res2.extend(get_specific_data(suffix, sql))
close()
print(len(res2))

--- 4.072739362716675 seconds ---
--- 4.032433032989502 seconds ---
--- 4.26564359664917 seconds ---
--- 4.275423288345337 seconds ---
--- 4.170823574066162 seconds ---
--- 4.200479507446289 seconds ---
--- 4.490224123001099 seconds ---
--- 4.505908489227295 seconds ---
--- 4.325603008270264 seconds ---
--- 4.958554983139038 seconds ---
--- 5.262883424758911 seconds ---
--- 4.585867881774902 seconds ---
--- 4.1625049114227295 seconds ---
--- 4.266520738601685 seconds ---
--- 4.174576997756958 seconds ---
--- 4.029135465621948 seconds ---
187577


## Check and split data

In [7]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
from sklearn.model_selection import train_test_split

max_length = 10000

train_data = pd.DataFrame(res1)
# train_data = train_data.loc[train_data.virtual_size <= max_length]
# train_data = train_data.reset_index(drop=True)
train_data.mw_num_engines[train_data.mw_num_engines == 0 ] = 0
train_data.mw_num_engines[train_data.mw_num_engines >= 4 ] = 1
train_label = train_data.mw_num_engines.ravel()

test_data = pd.DataFrame(res2)
# test_data = test_data.loc[test_data.virtual_size <= max_length]
# test_data = test_data.reset_index(drop=True)
test_data.mw_num_engines[test_data.mw_num_engines == 0 ] = 0
test_data.mw_num_engines[test_data.mw_num_engines >= 4 ] = 1
test_label = test_data.mw_num_engines.ravel()

x_train, x_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.1, random_state=2345)
x_test = test_data
y_test = test_label

x_train = x_train.reset_index(drop=True)
x_val = x_val.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
import pandas as pd
import re
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, confusion_matrix

In [9]:
def estimate_model(y_pred, test_y):
    
    loss = log_loss(test_y, y_pred)
    auc = roc_auc_score(test_y, y_pred)
    acc = accuracy_score(test_y, (y_pred > 0.5).astype(int))
    print("loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)

    fp_np_index = np.where(test_y == 0)
    fp_np = y_pred[fp_np_index].shape[0]
    thre_index = int(np.ceil(fp_np - fp_np * 0.001))

    sorted_pred_prob = np.sort(y_pred[fp_np_index], axis=0)
    thre = sorted_pred_prob[thre_index]
    if thre == 1:
        thre = max(sorted_pred_prob[np.where(sorted_pred_prob != 1)])

    y_pred_prob = np.vstack((y_pred.transpose(), (1 - y_pred).transpose())).transpose()
    y_pred_prob[:, 1] = thre
    y_pred_label = np.argmin(y_pred_prob, axis=-1)

    tn, fp, fn, tp = confusion_matrix(test_y, y_pred_label).ravel()
    fp_rate = fp / (fp + tn)
    recall_rate = tp / (tp + fn)

    print("thre: %.10f"%  thre)
    print("fp:  %.10f"%  fp_rate)
    print("recall:  %.10f"%  recall_rate)
    
    return auc, loss, recall_rate

## Autoencoder

In [10]:
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [11]:
import keras
import numpy as np

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'

    def __init__(self, list_IDs, datasets, batch_size=32, dim=10000, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.datasets = datasets
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, Y = self.__data_generation(list_IDs_temp)

        return X, Y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.zeros((self.batch_size, self.dim), dtype=float)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            base_path = "/ssd/2017/{0}/{1}{2}"
            item = self.datasets.loc[ID]
            file_path = base_path.format(item["mw_file_directory"], item["mw_file_hash"], item["mw_file_size"])
            in_file = open(file_path, 'rb')
            in_file.seek(item['pointerto_raw_data'])
            if item['virtual_size'] > self.dim:
                bytes_data = [int(single_byte) for single_byte in in_file.read(self.dim)]
            else:
                bytes_data = [int(single_byte) for single_byte in in_file.read(item['virtual_size'])]
            X[i, 0:len(bytes_data)] = bytes_data

#         X = X.reshape((-1, 100, 100, 1)) / 255.0
#         T = X.reshape((-1, 10000)) / 255.0
        Y = X.reshape((-1, 10000, 1)) / 255.0
        return Y, Y

In [12]:
import hashlib
import json
import time

import keras
from keras import Input
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Dense, Embedding, Conv1D, Conv2D, Multiply, GlobalMaxPooling1D, Dropout, Activation
from keras.layers import UpSampling2D, Flatten, merge, MaxPooling2D, MaxPooling1D, UpSampling1D
from keras.models import load_model, Model
from keras.layers import merge, Dropout, BatchNormalization, Maximum, Add
from keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split

class Autoencoder():
    def __init__(self, autoencoder_name, encoder_name):
        self.autoencoder_name = autoencoder_name
        self.encoder_name = encoder_name
        self.autoencoder = None
        self.encoder = None
        self.start_time = time.time()

    def get_model(self):

        input_sequence = Input(shape=(10000,1));
    
        x = Conv1D(32, 5, padding='same')(input_sequence)
        x = Conv1D(32, 5, padding='causal', dilation_rate=1)(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=2)(x)
        x = MaxPooling1D(2, padding='same')(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=4)(x)
        x = MaxPooling1D(2, padding='same')(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=8)(x)
        x = MaxPooling1D(2, padding='same')(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=16)(x)
        x = MaxPooling1D(5, padding='same')(x)
        encoded = Dense(2, activation='sigmoid')(x)
        
        x = Conv1D(32, 3, padding='causal', dilation_rate=16)(x)
        x = UpSampling1D(5)(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=8)(x)
        x = UpSampling1D(2)(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=4)(x)
        x = UpSampling1D(2)(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 3, padding='same')(x)
        x = Conv1D(32, 3, padding='causal', dilation_rate=2)(x)
        x = UpSampling1D(2)(x)
        x = Activation('relu')(x)
        x = Conv1D(32, 5, padding='same')(x)
        x = Conv1D(32, 5, padding='causal', dilation_rate=1)(x)
        x = UpSampling1D(5)(x)
        decoded = Conv1D(1, 3, activation='sigmoid', padding='same')(x)
        
## Deep
#         x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
#         x = MaxPooling2D((2, 2), padding='same')(x)
#         x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
#         x = MaxPooling2D((2, 2), padding='same')(x)
#         x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
#         x = MaxPooling2D((5, 5), padding='same')(x)
#         encoded = Dense(2, activation='sigmoid')(x)
        
#         # at this point the representation is (4, 4, 8) i.e. 128-dimensional

#         x = Conv2D(16, (3, 3), activation='relu', padding='same')(encoded)
#         x = UpSampling2D((5, 5))(x)
#         x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
#         x = UpSampling2D((2, 2))(x)
#         x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
#         x = UpSampling2D((2, 2))(x)
#         decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

## Base
#         x = Conv2D(32, 4, activation='relu', padding='same')(input_img)
#         x = MaxPooling2D(25, padding='same')(x)
#         encoded = Dense(1, activation='sigmoid')(x)

#         x = Conv2D(32, 4, activation='relu', padding='same')(encoded)
#         x = UpSampling2D(25)(x)
#         decoded = Conv2D(1, 4, activation='sigmoid', padding='same')(x)

        self.autoencoder = Model(inputs=input_sequence, outputs=decoded)
        self.encoder = Model(inputs=input_sequence, outputs=encoded)

        self.autoencoder.compile(loss='mean_squared_error', optimizer=RMSprop())
#         self.autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
        self.autoencoder.summary()

    def train(self, train_df, max_epoch, batch_size=32):
        self.get_model()
        partition_train, partition_validation = train_test_split(range(len(train_df)), test_size=0.05,
                                                                 random_state=1234)
        print('Length of the train: ', len(partition_train))
        print('Length of the validation: ', len(partition_validation))

        #         tensor_board = TensorBoard(log_dir='./logs/', batch_size=batch_size)
        file_path = "/home/zhaoqi/autoencoder/models/"+ str(self.start_time) +"-{epoch:04d}-{val_loss:.5f}_12_3_a_0.h5"
        #         early_stopping = EarlyStopping("val_loss", patience=2, verbose=0, mode='auto')
        check_point = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
        callbacks_list = [check_point]
        
        # Generators
        training_generator = DataGenerator(partition_train, train_df, batch_size)
        validation_generator = DataGenerator(partition_validation, train_df, batch_size)

        self.autoencoder.fit_generator(generator=training_generator,
                                       validation_data=validation_generator,
                                       use_multiprocessing=True,
                                       epochs=max_epoch,
                                       workers=6,
                                       callbacks=callbacks_list)
        self.autoencoder.save(self.autoencoder_name)
        self.encoder.save(self.encoder_name)
        
        
autoencoder = Autoencoder('autoencoder.h5', 'encoder.h5')
autoencoder.train(x_train, max_epoch=64, batch_size=16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10000, 1)          0         
_________________________________________________________________
activation_1 (Activation)    (None, 10000, 1)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 10000, 32)         192       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 10000, 32)         5152      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2000, 32)          0         
_________________________________________________________________
activation_2 (Activation)    (None, 2000, 32)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2000, 32)          3104      
__________


Epoch 00010: val_loss did not improve from 0.06632
Epoch 11/64

Epoch 00011: val_loss improved from 0.06632 to 0.06626, saving model to /home/zhaoqi/autoencoder/models/1533489233.0026522-0011-0.06626_12_3_a_0.h5
Epoch 12/64

Epoch 00012: val_loss did not improve from 0.06626
Epoch 13/64

Epoch 00013: val_loss did not improve from 0.06626
Epoch 14/64

Epoch 00014: val_loss improved from 0.06626 to 0.06618, saving model to /home/zhaoqi/autoencoder/models/1533489233.0026522-0014-0.06618_12_3_a_0.h5
Epoch 15/64

Epoch 00015: val_loss improved from 0.06618 to 0.06618, saving model to /home/zhaoqi/autoencoder/models/1533489233.0026522-0015-0.06618_12_3_a_0.h5
Epoch 16/64

Epoch 00016: val_loss improved from 0.06618 to 0.06607, saving model to /home/zhaoqi/autoencoder/models/1533489233.0026522-0016-0.06607_12_3_a_0.h5
Epoch 17/64

Epoch 00017: val_loss improved from 0.06607 to 0.06594, saving model to /home/zhaoqi/autoencoder/models/1533489233.0026522-0017-0.06594_12_3_a_0.h5
Epoch 18/64

Ep

Process ForkPoolWorker-653:
Process ForkPoolWorker-650:
Process ForkPoolWorker-654:
Process ForkPoolWorker-645:
Process ForkPoolWorker-648:
Process ForkPoolWorker-652:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call l

KeyboardInterrupt: 

Process ForkPoolWorker-655:
Process ForkPoolWorker-656:
Process ForkPoolWorker-659:
Process ForkPoolWorker-660:
Process ForkPoolWorker-657:
Process ForkPoolWorker-658:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(

In [15]:
from keras.models import Model

model_dir = '/home/zhaoqi/autoencoder/models/'
f_name = '1533489233.0026522-0041-0.06559_12_3_a_0.h5'
c_model = load_model(model_dir + f_name)

model_f = Model(c_model.input, c_model.layers[-21].output)

model_f.summary()

train_generator = DataGenerator(range(len(x_train)), x_train, 16, max_length, False)
malcon_train_x = model_f.predict_generator(generator=train_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

val_generator = DataGenerator(range(len(x_val)), x_val, 16, max_length, False)
malcon_val_x = model_f.predict_generator(generator=val_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

test_generator = DataGenerator(range(len(x_test)), x_test, 16, max_length, False)
malcon_test_x = model_f.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10000, 1)          0         
_________________________________________________________________
activation_1 (Activation)    (None, 10000, 1)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 10000, 32)         192       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 10000, 32)         5152      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2000, 32)          0         
_________________________________________________________________
activation_2 (Activation)    (None, 2000, 32)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2000, 32)          3104      
__________

In [16]:
malcon_train_x = malcon_train_x.reshape(len(malcon_train_x), -1)
malcon_val_x = malcon_val_x.reshape(len(malcon_val_x), -1)
malcon_test_x = malcon_test_x.reshape(len(malcon_test_x), -1)

In [17]:
def get_ember_feature(data):
    ember_f = np.zeros((len(data.mw_em_f), 2351), dtype=float)
    for index, item in data.iterrows():
        ember_f[index, :] = item['mw_em_f'].split(';')
    return ember_f

In [30]:
def get_max_feature(origin_data, feature_data):
    feature_data = pd.DataFrame(feature_data)
    feature_data['mw_file_hash'] = origin_data.iloc[0:len(feature_data)][['mw_file_hash']]
    feature_data = feature_data.groupby('mw_file_hash').first().merge(origin_data[['mw_file_hash','mw_em_f','mw_num_engines']], how='inner', on='mw_file_hash')
    tmp_ember = get_ember_feature(feature_data[['mw_em_f']])
    tmp_label = feature_data[['mw_num_engines']].mw_num_engines
    tmp_data = feature_data.drop(['mw_num_engines', 'mw_em_f', 'mw_file_hash'], axis=1)
    return tmp_data, tmp_label, tmp_ember

In [31]:
max_train_data, max_train_label, max_train_ember = get_max_feature(x_train, malcon_train_x)
max_val_data, max_val_label, max_val_ember = get_max_feature(x_val, malcon_val_x)
max_test_data, max_test_label, max_test_ember = get_max_feature(x_test, malcon_test_x)

In [32]:
def merge_feature(m_data, e_data):
    num = len(m_data)
    m_x = np.zeros((num, 100+2351), dtype=float)
    
    for index in range(num):
        m_x[index, 0:100] = m_data.iloc[index]
        m_x[index, 100:100+2351] = e_data[index]  
    return m_x

In [33]:
merge_train_x = merge_feature(max_train_data, max_train_ember)
merge_val_x = merge_feature(max_val_data, max_val_ember)
merge_test_x = merge_feature(max_test_data, max_test_ember)

In [34]:
def get_model(x_train, y_train, x_val, y_val):
    params = {'application': 'binary'}
    lgbm_dataset = lgb.Dataset(x_train, y_train.ravel())
    valid_sets = lgb.Dataset(x_val, y_val.ravel())

    model = lgb.train(params, lgbm_dataset, valid_sets=valid_sets, num_boost_round=10000, early_stopping_rounds=10)
    y_pred = model.predict(x_val, num_iteration=model.best_iteration)
    
    loss = log_loss(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred)
    acc = accuracy_score(y_val, (y_pred > 0.5).astype(int))
#     model.save_model(file_path + "-%04d-%.5f-%.5f.h5" % (model.best_iteration, loss, acc),
#                      num_iteration=model.best_iteration)
    print("val loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)
        
    return model

In [35]:
model_m = get_model(merge_train_x[:,:], max_train_label, merge_val_x[:,:], max_val_label )

[1]	valid_0's binary_logloss: 0.585956
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.524512
[3]	valid_0's binary_logloss: 0.47253
[4]	valid_0's binary_logloss: 0.429127
[5]	valid_0's binary_logloss: 0.392087
[6]	valid_0's binary_logloss: 0.359213
[7]	valid_0's binary_logloss: 0.331271
[8]	valid_0's binary_logloss: 0.306276
[9]	valid_0's binary_logloss: 0.283473
[10]	valid_0's binary_logloss: 0.263831
[11]	valid_0's binary_logloss: 0.245734
[12]	valid_0's binary_logloss: 0.229145
[13]	valid_0's binary_logloss: 0.215276
[14]	valid_0's binary_logloss: 0.202024
[15]	valid_0's binary_logloss: 0.189604
[16]	valid_0's binary_logloss: 0.178571
[17]	valid_0's binary_logloss: 0.16941
[18]	valid_0's binary_logloss: 0.16042
[19]	valid_0's binary_logloss: 0.153047
[20]	valid_0's binary_logloss: 0.14555
[21]	valid_0's binary_logloss: 0.139449
[22]	valid_0's binary_logloss: 0.133392
[23]	valid_0's binary_logloss: 0.127458
[24]	valid_0's binary_logloss: 

[199]	valid_0's binary_logloss: 0.0224839
[200]	valid_0's binary_logloss: 0.0224485
[201]	valid_0's binary_logloss: 0.0224325
[202]	valid_0's binary_logloss: 0.0223319
[203]	valid_0's binary_logloss: 0.0222744
[204]	valid_0's binary_logloss: 0.0221952
[205]	valid_0's binary_logloss: 0.022149
[206]	valid_0's binary_logloss: 0.0220725
[207]	valid_0's binary_logloss: 0.0220375
[208]	valid_0's binary_logloss: 0.0219415
[209]	valid_0's binary_logloss: 0.0218851
[210]	valid_0's binary_logloss: 0.021797
[211]	valid_0's binary_logloss: 0.0217401
[212]	valid_0's binary_logloss: 0.0216603
[213]	valid_0's binary_logloss: 0.0216002
[214]	valid_0's binary_logloss: 0.0215656
[215]	valid_0's binary_logloss: 0.0214614
[216]	valid_0's binary_logloss: 0.0213829
[217]	valid_0's binary_logloss: 0.0213222
[218]	valid_0's binary_logloss: 0.0212672
[219]	valid_0's binary_logloss: 0.0212212
[220]	valid_0's binary_logloss: 0.0211599
[221]	valid_0's binary_logloss: 0.0211016
[222]	valid_0's binary_logloss: 0.02

[395]	valid_0's binary_logloss: 0.0147002
[396]	valid_0's binary_logloss: 0.0146897
[397]	valid_0's binary_logloss: 0.0146828
[398]	valid_0's binary_logloss: 0.0146949
[399]	valid_0's binary_logloss: 0.0146787
[400]	valid_0's binary_logloss: 0.0146641
[401]	valid_0's binary_logloss: 0.0146371
[402]	valid_0's binary_logloss: 0.0146582
[403]	valid_0's binary_logloss: 0.014645
[404]	valid_0's binary_logloss: 0.0146556
[405]	valid_0's binary_logloss: 0.0146201
[406]	valid_0's binary_logloss: 0.0145973
[407]	valid_0's binary_logloss: 0.0146094
[408]	valid_0's binary_logloss: 0.0145774
[409]	valid_0's binary_logloss: 0.0145609
[410]	valid_0's binary_logloss: 0.014544
[411]	valid_0's binary_logloss: 0.0145211
[412]	valid_0's binary_logloss: 0.0145077
[413]	valid_0's binary_logloss: 0.0144927
[414]	valid_0's binary_logloss: 0.0145022
[415]	valid_0's binary_logloss: 0.0144813
[416]	valid_0's binary_logloss: 0.0144599
[417]	valid_0's binary_logloss: 0.0144879
[418]	valid_0's binary_logloss: 0.01

In [36]:
y_p = model_m.predict(merge_test_x[:,:])
y_pred = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred[i, 0] = y_p[i]

estimate_model(y_pred, max_test_label)

loss : 0.05631
auc score : 0.99817
accuracy score : 0.98090
thre: 0.9897997097
fp:  0.0009848283
recall:  0.8738945737


(0.9981702410359492, 0.0563072817443707, 0.873894573726255)

In [37]:
model_m = get_model(merge_train_x[:,:100], max_train_label, merge_val_x[:,:100], max_val_label )

[1]	valid_0's binary_logloss: 0.618532
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.582645
[3]	valid_0's binary_logloss: 0.55228
[4]	valid_0's binary_logloss: 0.525854
[5]	valid_0's binary_logloss: 0.502203
[6]	valid_0's binary_logloss: 0.479921
[7]	valid_0's binary_logloss: 0.459869
[8]	valid_0's binary_logloss: 0.443109
[9]	valid_0's binary_logloss: 0.428411
[10]	valid_0's binary_logloss: 0.415526
[11]	valid_0's binary_logloss: 0.403852
[12]	valid_0's binary_logloss: 0.392598
[13]	valid_0's binary_logloss: 0.382316
[14]	valid_0's binary_logloss: 0.37204
[15]	valid_0's binary_logloss: 0.362759
[16]	valid_0's binary_logloss: 0.354982
[17]	valid_0's binary_logloss: 0.347322
[18]	valid_0's binary_logloss: 0.340256
[19]	valid_0's binary_logloss: 0.333634
[20]	valid_0's binary_logloss: 0.327862
[21]	valid_0's binary_logloss: 0.322343
[22]	valid_0's binary_logloss: 0.317448
[23]	valid_0's binary_logloss: 0.312211
[24]	valid_0's binary_logloss

[203]	valid_0's binary_logloss: 0.182717
[204]	valid_0's binary_logloss: 0.182578
[205]	valid_0's binary_logloss: 0.182494
[206]	valid_0's binary_logloss: 0.182383
[207]	valid_0's binary_logloss: 0.182246
[208]	valid_0's binary_logloss: 0.181977
[209]	valid_0's binary_logloss: 0.181857
[210]	valid_0's binary_logloss: 0.181646
[211]	valid_0's binary_logloss: 0.181516
[212]	valid_0's binary_logloss: 0.181444
[213]	valid_0's binary_logloss: 0.181364
[214]	valid_0's binary_logloss: 0.181299
[215]	valid_0's binary_logloss: 0.181168
[216]	valid_0's binary_logloss: 0.181082
[217]	valid_0's binary_logloss: 0.18101
[218]	valid_0's binary_logloss: 0.180883
[219]	valid_0's binary_logloss: 0.180722
[220]	valid_0's binary_logloss: 0.180401
[221]	valid_0's binary_logloss: 0.180078
[222]	valid_0's binary_logloss: 0.179922
[223]	valid_0's binary_logloss: 0.179804
[224]	valid_0's binary_logloss: 0.179685
[225]	valid_0's binary_logloss: 0.179505
[226]	valid_0's binary_logloss: 0.179381
[227]	valid_0's b

[404]	valid_0's binary_logloss: 0.161182
[405]	valid_0's binary_logloss: 0.161145
[406]	valid_0's binary_logloss: 0.161086
[407]	valid_0's binary_logloss: 0.161022
[408]	valid_0's binary_logloss: 0.160916
[409]	valid_0's binary_logloss: 0.160929
[410]	valid_0's binary_logloss: 0.160875
[411]	valid_0's binary_logloss: 0.160846
[412]	valid_0's binary_logloss: 0.160756
[413]	valid_0's binary_logloss: 0.160755
[414]	valid_0's binary_logloss: 0.160552
[415]	valid_0's binary_logloss: 0.160539
[416]	valid_0's binary_logloss: 0.16046
[417]	valid_0's binary_logloss: 0.160432
[418]	valid_0's binary_logloss: 0.160392
[419]	valid_0's binary_logloss: 0.160334
[420]	valid_0's binary_logloss: 0.160288
[421]	valid_0's binary_logloss: 0.160275
[422]	valid_0's binary_logloss: 0.160245
[423]	valid_0's binary_logloss: 0.160046
[424]	valid_0's binary_logloss: 0.159776
[425]	valid_0's binary_logloss: 0.159728
[426]	valid_0's binary_logloss: 0.159586
[427]	valid_0's binary_logloss: 0.159534
[428]	valid_0's b

[605]	valid_0's binary_logloss: 0.150931
[606]	valid_0's binary_logloss: 0.150929
[607]	valid_0's binary_logloss: 0.150934
[608]	valid_0's binary_logloss: 0.15087
[609]	valid_0's binary_logloss: 0.150868
[610]	valid_0's binary_logloss: 0.150811
[611]	valid_0's binary_logloss: 0.150774
[612]	valid_0's binary_logloss: 0.150761
[613]	valid_0's binary_logloss: 0.150756
[614]	valid_0's binary_logloss: 0.150738
[615]	valid_0's binary_logloss: 0.150669
[616]	valid_0's binary_logloss: 0.150638
[617]	valid_0's binary_logloss: 0.150545
[618]	valid_0's binary_logloss: 0.150518
[619]	valid_0's binary_logloss: 0.150509
[620]	valid_0's binary_logloss: 0.150457
[621]	valid_0's binary_logloss: 0.15041
[622]	valid_0's binary_logloss: 0.150359
[623]	valid_0's binary_logloss: 0.150318
[624]	valid_0's binary_logloss: 0.150271
[625]	valid_0's binary_logloss: 0.150246
[626]	valid_0's binary_logloss: 0.150272
[627]	valid_0's binary_logloss: 0.150269
[628]	valid_0's binary_logloss: 0.150157
[629]	valid_0's bi

[806]	valid_0's binary_logloss: 0.145812
[807]	valid_0's binary_logloss: 0.14582
[808]	valid_0's binary_logloss: 0.145865
[809]	valid_0's binary_logloss: 0.145907
[810]	valid_0's binary_logloss: 0.145888
[811]	valid_0's binary_logloss: 0.145867
[812]	valid_0's binary_logloss: 0.145803
[813]	valid_0's binary_logloss: 0.14576
[814]	valid_0's binary_logloss: 0.145742
[815]	valid_0's binary_logloss: 0.145755
[816]	valid_0's binary_logloss: 0.145763
[817]	valid_0's binary_logloss: 0.145773
[818]	valid_0's binary_logloss: 0.145748
[819]	valid_0's binary_logloss: 0.145778
[820]	valid_0's binary_logloss: 0.145777
[821]	valid_0's binary_logloss: 0.145754
[822]	valid_0's binary_logloss: 0.145716
[823]	valid_0's binary_logloss: 0.145729
[824]	valid_0's binary_logloss: 0.145772
[825]	valid_0's binary_logloss: 0.14574
[826]	valid_0's binary_logloss: 0.145722
[827]	valid_0's binary_logloss: 0.145707
[828]	valid_0's binary_logloss: 0.145716
[829]	valid_0's binary_logloss: 0.145748
[830]	valid_0's bin

In [38]:
y_p = model_m.predict(merge_test_x[:,:100])
y_pred = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred[i, 0] = y_p[i]

estimate_model(y_pred, max_test_label)

loss : 0.27903
auc score : 0.93920
accuracy score : 0.89754
thre: 0.9915712339
fp:  0.0009759560
recall:  0.4764220257


(0.9391988074240127, 0.2790333533394877, 0.47642202570199577)

In [39]:
model_m = get_model(merge_train_x[:,100:], max_train_label, merge_val_x[:,100:], max_val_label )

[1]	valid_0's binary_logloss: 0.585956
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.524512
[3]	valid_0's binary_logloss: 0.47255
[4]	valid_0's binary_logloss: 0.429085
[5]	valid_0's binary_logloss: 0.391782
[6]	valid_0's binary_logloss: 0.358682
[7]	valid_0's binary_logloss: 0.330118
[8]	valid_0's binary_logloss: 0.30549
[9]	valid_0's binary_logloss: 0.282788
[10]	valid_0's binary_logloss: 0.262999
[11]	valid_0's binary_logloss: 0.244561
[12]	valid_0's binary_logloss: 0.22845
[13]	valid_0's binary_logloss: 0.214385
[14]	valid_0's binary_logloss: 0.201323
[15]	valid_0's binary_logloss: 0.189575
[16]	valid_0's binary_logloss: 0.179587
[17]	valid_0's binary_logloss: 0.16962
[18]	valid_0's binary_logloss: 0.161338
[19]	valid_0's binary_logloss: 0.153084
[20]	valid_0's binary_logloss: 0.145796
[21]	valid_0's binary_logloss: 0.139744
[22]	valid_0's binary_logloss: 0.13396
[23]	valid_0's binary_logloss: 0.128681
[24]	valid_0's binary_logloss: 0

[199]	valid_0's binary_logloss: 0.021443
[200]	valid_0's binary_logloss: 0.0213981
[201]	valid_0's binary_logloss: 0.0213591
[202]	valid_0's binary_logloss: 0.0212672
[203]	valid_0's binary_logloss: 0.0211935
[204]	valid_0's binary_logloss: 0.0210976
[205]	valid_0's binary_logloss: 0.0210232
[206]	valid_0's binary_logloss: 0.0209166
[207]	valid_0's binary_logloss: 0.0208736
[208]	valid_0's binary_logloss: 0.02076
[209]	valid_0's binary_logloss: 0.0207311
[210]	valid_0's binary_logloss: 0.0206424
[211]	valid_0's binary_logloss: 0.0205676
[212]	valid_0's binary_logloss: 0.0205113
[213]	valid_0's binary_logloss: 0.0204498
[214]	valid_0's binary_logloss: 0.0203973
[215]	valid_0's binary_logloss: 0.0203363
[216]	valid_0's binary_logloss: 0.0202619
[217]	valid_0's binary_logloss: 0.0201873
[218]	valid_0's binary_logloss: 0.0201314
[219]	valid_0's binary_logloss: 0.0200799
[220]	valid_0's binary_logloss: 0.0199886
[221]	valid_0's binary_logloss: 0.0199328
[222]	valid_0's binary_logloss: 0.019

[395]	valid_0's binary_logloss: 0.0136386
[396]	valid_0's binary_logloss: 0.0136352
[397]	valid_0's binary_logloss: 0.013627
[398]	valid_0's binary_logloss: 0.0136012
[399]	valid_0's binary_logloss: 0.0135917
[400]	valid_0's binary_logloss: 0.0135856
[401]	valid_0's binary_logloss: 0.013574
[402]	valid_0's binary_logloss: 0.0135828
[403]	valid_0's binary_logloss: 0.0135756
[404]	valid_0's binary_logloss: 0.0135811
[405]	valid_0's binary_logloss: 0.0135521
[406]	valid_0's binary_logloss: 0.0135475
[407]	valid_0's binary_logloss: 0.0135488
[408]	valid_0's binary_logloss: 0.013535
[409]	valid_0's binary_logloss: 0.0135221
[410]	valid_0's binary_logloss: 0.013512
[411]	valid_0's binary_logloss: 0.0135056
[412]	valid_0's binary_logloss: 0.0135053
[413]	valid_0's binary_logloss: 0.0134987
[414]	valid_0's binary_logloss: 0.0135022
[415]	valid_0's binary_logloss: 0.0134959
[416]	valid_0's binary_logloss: 0.0134572
[417]	valid_0's binary_logloss: 0.0134277
[418]	valid_0's binary_logloss: 0.0134

In [40]:
y_p = model_m.predict(merge_test_x[:,100:])
y_pred = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred[i, 0] = y_p[i]

estimate_model(y_pred, max_test_label)

loss : 0.05525
auc score : 0.99812
accuracy score : 0.98196
thre: 0.9834881903
fp:  0.0009759560
recall:  0.8613909001


(0.9981159760489502, 0.05525050625772819, 0.8613909001041973)