## Read the sample index from MySQL

In [80]:
import os
# Expose only the GPU with index 2 to CUDA-based libraries loaded later in this notebook.
os.environ.update({'CUDA_VISIBLE_DEVICES': '2'})

In [81]:
import os

import pymysql
from warnings import filterwarnings

# Module-level cache for the single shared MySQL connection (None until one is opened).
_connection = None


def get_connection(db_config):
    """
    Return the shared MySQL connection, opening it on the first call.

    :param db_config: mapping with the keys 'host', 'username', 'password' and 'db'
    :return: the cached pymysql connection; once a connection exists, later
             calls return it unchanged and do not look at db_config
    """
    global _connection
    if _connection is not None:
        return _connection

    connect_kwargs = {
        'host': db_config['host'],
        'user': db_config['username'],
        'password': db_config['password'],
        'db': db_config['db'],
        'charset': "utf8",
    }
    _connection = pymysql.connect(**connect_kwargs)
    # Installed once, right after the first successful connect.
    filterwarnings('ignore', category=pymysql.Warning)
    return _connection


def close():
    """
    Close the shared DB connection, if one is open, and forget it.

    The cached handle is cleared even when closing raises (for example because
    the connection was already closed), so a later get_connection() opens a
    fresh connection instead of handing back a dead one. The exception raised
    by the connection's close() still propagates to the caller.

    :return: None
    """
    global _connection
    if _connection is None:
        return
    try:
        _connection.close()
    finally:
        _connection = None

In [82]:
# Connection settings passed to get_connection().
# NOTE(review): the credentials are hard-coded in the notebook. Each value can
# now be overridden through an environment variable so real secrets need not
# live in the file; the literal defaults are kept only so existing runs behave
# exactly as before.
db = {
    'host': os.environ.get('MALWARE_DB_HOST', '172.26.187.242'),
    'username': os.environ.get('MALWARE_DB_USERNAME', 'malware_r'),
    'password': os.environ.get('MALWARE_DB_PASSWORD', 'GEg22v2O7jbfWhb3'),
    'db': os.environ.get('MALWARE_DB_NAME', 'malware'),
}

In [142]:
import time

# Query templates, keyed by the variable name each one had in the original
# cell. Every '%s' placeholder receives the same table suffix. The SQL text is
# unchanged, including the whitespace before each closing quote.
_QUERY_TEMPLATES = {
    # '.text' sections right-joined to the index and feature tables, prefix 201701.
    "sql": """
select
  a.mw_file_hash,
  a.section_name,
  c.mw_file_suffix as mw_file_size,
  c.mw_file_prefix as mw_file_directory,
  c.mw_num_engines,
  a.pointerto_raw_data,
  a.virtual_size,
  d.mw_em_f
from mw_index_2017_section_%s as a
  right join mw_index_2017_%s c on a.mw_file_hash = c.mw_file_hash
  right join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
where a.section_name = '.text' and c.mw_num_engines <> -1 and (c.mw_num_engines > 8 or c.mw_num_engines < 4) and
      c.mw_file_prefix in ('201701')
group by mw_file_hash
    """,
    # Sections with MEM_EXECUTE = 1 joined to index and features, prefixes 201701 and 201703.
    "sql2": """
select
  b.mw_file_hash,
  b.mw_file_prefix as mw_file_directory,
  b.mw_file_suffix as mw_file_size,
  b.mw_num_engines,
  a.section_name,
  a.virtual_size,
  a.pointerto_raw_data,
  c.mw_em_f
from mw_index_2017_section_%s a
  inner join mw_index_2017_%s b on a.mw_file_hash = b.mw_file_hash
  inner join mw_index_2017_feature_%s c on a.mw_file_hash = c.mw_file_hash
where MEM_EXECUTE = 1 and (mw_num_engines > 8 or mw_num_engines < 4) and mw_num_engines <> -1
      and mw_file_prefix in ('201701', '201703')
group by b.mw_file_hash;
    """,
    # Index table only, prefix 201705.
    "sql3": """
select
  mw_file_hash,
  mw_file_prefix as mw_file_directory,
  mw_file_suffix as mw_file_size,
  mw_num_engines
from mw_index_2017_%s
where (mw_num_engines > 8 or mw_num_engines < 4) and mw_num_engines <> -1
      and mw_file_prefix in ('201705');
    """,
    # Section table joined to the index table (no features), prefix 201701.
    "sql4": """
select
  b.mw_file_hash,
  mw_file_prefix as mw_file_directory,
  mw_file_suffix as mw_file_size,
  mw_num_engines,
  a.virtual_size,
  a.pointerto_raw_data
from mw_index_2017_section_%s a
  inner join mw_index_2017_%s b on a.mw_file_hash = b.mw_file_hash
where (mw_num_engines > 8 or mw_num_engines < 4) and mw_num_engines <> -1
      and mw_file_prefix in ('201701')
group by b.mw_file_hash;
    """,
    # Files having exactly one '.text' section with a non-zero raw-data pointer, prefix 201703.
    "sql5": """
select
  a.mw_file_hash,
  c.mw_file_prefix as mw_file_directory,
  c.mw_file_suffix as mw_file_size,
  c.mw_num_engines,
  b.section_name,
  b.virtual_size,
  b.pointerto_raw_data,
  d.mw_em_f
from (select
        mw_file_hash,
        section_name,
        count(1) as cnt
      from mw_index_2017_section_%s
      where section_name = '.text' and pointerto_raw_data <> 0
      group by mw_file_hash, section_name) a inner join mw_index_2017_section_%s b
    on a.mw_file_hash = b.mw_file_hash and a.cnt = 1 and b.section_name = '.text'
  inner join mw_index_2017_%s c on a.mw_file_hash = c.mw_file_hash and (c.mw_num_engines > 8 or c.mw_num_engines < 4) and c.mw_num_engines<> -1 and c.mw_file_prefix = '201703'
  inner join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
    """,
}


def get_specific_data(table_suffix, query="sql2"):
    """
    Run one of the predefined index queries against the tables of one suffix.

    Prints the elapsed wall-clock time once the rows have been fetched.

    :param table_suffix: suffix substituted into every table name of the query
    :param query: key into _QUERY_TEMPLATES; the default "sql2" is the query
                  this function has always executed
    :return: list with one dict per result row, mapping column name to value
    :raises Exception: if no connection has been opened with get_connection()
    :raises ValueError: if query is not a known template name
    """
    start_time = time.time()

    if _connection is None:
        raise Exception("please init db connect first")

    try:
        template = _QUERY_TEMPLATES[query]
    except KeyError:
        raise ValueError("unknown query %r; expected one of %s"
                         % (query, sorted(_QUERY_TEMPLATES))) from None

    # Table names cannot be bound as query parameters, so the suffix is
    # interpolated into the SQL text; it must come from trusted code.
    statement = template % ((table_suffix,) * template.count("%s"))

    cursor = _connection.cursor()
    try:
        cursor.execute("SET NAMES utf8mb4")
        cursor.execute(statement)
        field_names = [column[0] for column in cursor.description]
        ret = [dict(zip(field_names, row)) for row in cursor]
    finally:
        # Release the cursor even when a statement fails.
        cursor.close()

    print("--- %s seconds ---" % (time.time() - start_time))

    return ret

In [None]:
# Start from a clean state: drop any connection left over from an earlier run.
close()
res = []
get_connection(db)
# One query per table suffix: the sixteen hexadecimal digits.
table_suffix = list("0123456789ABCDEF")
try:
    for suffix in table_suffix:
        res.extend(get_specific_data(suffix))
finally:
    # Release the connection even if one of the queries fails.
    close()
print(len(res))

--- 9.431426048278809 seconds ---
--- 11.111799955368042 seconds ---
--- 9.329645872116089 seconds ---
--- 11.745375871658325 seconds ---
--- 9.812025785446167 seconds ---
--- 9.292504787445068 seconds ---


## check and split data

In [None]:
import pandas as pd
# One row per result dict collected in `res`; the columns are the dict keys.
data = pd.DataFrame(res)

In [None]:
data.head()

import numpy as np
import scipy.stats as stats
import pylab as pl

# Keep only the rows whose virtual_size is at most 300000, renumbered from 0.
clean_data = data.loc[data.virtual_size <= 300000].reset_index(drop=True)

print(clean_data.shape)

# Sorted sizes, so the fitted curve below is drawn from left to right.
h = sorted(clean_data.virtual_size)

# Normal density using the sample mean and standard deviation of the sizes.
fit = stats.norm.pdf(h, np.mean(h), np.std(h))

pl.plot(h, fit, '-o')

# Normalised histogram of the same data. `density=True` replaces the former
# `normed=True` argument, which newer Matplotlib releases no longer accept.
pl.hist(h, density=True)

# Render the figure.
pl.show()

In [None]:
# Binarise the engine count in place: fewer than 4 engines -> 0, more than 8 -> 1.
# `.loc` writes into clean_data directly; the former chained form
# `clean_data.col[mask] = value` may assign to a temporary copy instead.
clean_data.loc[clean_data.mw_num_engines < 4, 'mw_num_engines'] = 0
clean_data.loc[clean_data.mw_num_engines > 8, 'mw_num_engines'] = 1
# Labels as a NumPy array, in the row order of clean_data.
label = clean_data.mw_num_engines.values
label

In [127]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows (and their labels) as the test set; the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(clean_data, label, test_size=0.1, random_state=1234)

In [133]:
# Largest virtual_size among the training rows.
max_length = max(x_train.virtual_size.ravel())
# NOTE(review): the computed value is overwritten immediately, so the line
# above has no effect. Presumably 299996 is pinned to match the input length
# of the saved model loaded later in this notebook; confirm.
max_length = 299996

In [129]:
# Renumber both frames from 0: DataGenerator looks rows up with `.loc[ID]`,
# and the IDs it is given come from range(len(...)).
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

# Mal Conv

In [131]:
import keras
import numpy as np

class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that builds batches of raw section bytes read from disk.

    Each sample is one row of `datasets`. The bytes of its section are read
    from the sample file, starting at `pointerto_raw_data`, and written into a
    zero-initialised row of width `dim`.

    NOTE(review): the default base_path points at the 201703 directory although
    the query executed earlier in this notebook also selects prefix '201701';
    confirm the files really live there.
    """

    def __init__(self, list_IDs, datasets, labels, batch_size=32, dim=8192, shuffle=True,
                 base_path="/malware_data_2017/201703/{0}{1}"):
        """
        :param list_IDs: row labels of `datasets` (also indexes into `labels`) to serve
        :param datasets: DataFrame with the columns mw_file_hash, mw_file_size,
                         pointerto_raw_data and virtual_size
        :param labels: target values, indexed by the same IDs
        :param batch_size: number of samples per batch
        :param dim: width of each sample row; longer sections are truncated
        :param shuffle: reshuffle the sample order at the start and after every epoch
        :param base_path: format string for the sample file path; {0} receives
                          mw_file_hash and {1} receives mw_file_size
        """
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.datasets = datasets
        self.shuffle = shuffle
        self.base_path = base_path
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Build and return batch number `index` as an (X, y) pair."""
        # Positions of this batch within the (possibly shuffled) order.
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Translate positions into sample IDs.
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Read the bytes of every listed sample.

        :return: X of shape (batch_size, dim) and y of shape (batch_size,),
                 both float; unused positions stay zero
        """
        X = np.zeros((self.batch_size, self.dim), dtype=float)
        y = np.zeros(self.batch_size, dtype=float)

        for i, ID in enumerate(list_IDs_temp):
            item = self.datasets.loc[ID]
            file_path = self.base_path.format(item["mw_file_hash"], item["mw_file_size"])
            # Never read more than one row can hold; without this cap a
            # section longer than `dim` would fail the assignment below.
            n_bytes = max(0, min(int(item['virtual_size']), self.dim))
            # The context manager closes the file even if seek/read fails.
            with open(file_path, 'rb') as in_file:
                in_file.seek(item['pointerto_raw_data'])
                raw = in_file.read(n_bytes)
            if raw:
                # Decode all bytes in one step instead of one int() call per byte.
                X[i, :len(raw)] = np.frombuffer(raw, dtype=np.uint8)
            y[i] = self.labels[ID]

        return X, y

In [107]:
import hashlib
import json
import time

import keras
from keras import Input
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Dense, Embedding, Conv1D, Multiply, GlobalMaxPooling1D
from keras.models import load_model
from sklearn.model_selection import train_test_split

class TMalConv(object):
    """
    Builds and trains a gated 1-D convolutional network over raw byte sequences.
    """

    def __init__(self, train_df=None, label_df=None, max_len=None):
        """
        :param train_df: training DataFrame; defaults to the notebook global x_train
        :param label_df: training labels; defaults to the notebook global y_train
        :param max_len: model input length; defaults to the notebook global max_length
        """
        self.train_df = x_train if train_df is None else train_df
        self.label_df = y_train if label_df is None else label_df
        self.v_x = None
        self.v_y = None
        self.max_len = max_length if max_len is None else max_len
        self.history = None
        self.model = None
        self.p_md5 = None
        # NOTE(review): 's_test_size' and 's_random_state' are not read by
        # train(), which splits with test_size=0.1 and random_state=1234.
        self.summary = {
            'time': time.time(),
            'batch_size': 16,
            'epochs': 12,
            's_test_size': 0.05,
            's_random_state': 5242,
            'g_c_filter': 128,
            'g_c_kernel_size': 500,
            'g_c_stride': 500,
        }

    def run(self):
        """
        Entry point: train the model.
        :return: None
        """
        self.train()

    def get_p(self, key):
        """
        Get a parameter from the summary.
        :param key: name of the parameter
        :return: the stored value
        """
        return self.summary[key]

    def gate_cnn(self, gate_cnn_input):
        """
        Build the gated convolution block.

        Two Conv1D layers with the same filter count, kernel size and stride
        are applied to the input, one of them with a sigmoid activation; their
        outputs are multiplied element-wise and max-pooled over the sequence.
        :param gate_cnn_input: tensor produced by the embedding layer
        :return: pooled output tensor
        """
        filters = self.get_p("g_c_filter")
        kernel_size = self.get_p("g_c_kernel_size")
        stride = self.get_p("g_c_stride")

        conv1_out = Conv1D(filters, kernel_size, strides=stride)(gate_cnn_input)
        conv2_out = Conv1D(filters, kernel_size, strides=stride,
                           activation="sigmoid")(gate_cnn_input)
        merged = Multiply()([conv1_out, conv2_out])
        gate_cnn_output = GlobalMaxPooling1D()(merged)
        return gate_cnn_output

    def get_model(self):
        """
        Build the network: embedding of the 256 byte values into 8 dimensions,
        gated convolution block, Dense(128), then a single sigmoid output.
        :return: the (uncompiled) Keras model
        """
        net_input = Input(shape=(self.max_len,))

        embedding_out = Embedding(256, 8, input_length=self.max_len)(net_input)
        merged = self.gate_cnn(embedding_out)

        dense_out = Dense(128)(merged)
        net_output = Dense(1, activation='sigmoid')(dense_out)

        model = keras.models.Model(inputs=net_input, outputs=net_output)

        return model

    def train(self):
        """
        Split the training rows 90/10 into train and validation partitions and
        fit the model, writing a checkpoint after every epoch.
        :return: None
        """
        batch_size = self.get_p("batch_size")
        epochs = self.get_p("epochs")

        self.model = self.get_model()

        partition_train, partition_validation = train_test_split(range(len(self.train_df)), test_size=0.1, random_state=1234)
        print('Length of the train: ', len(partition_train))
        print('Length of the validation: ', len(partition_validation))

        file_path = "/home/zhaoqi/BaseTrain/models/{epoch:04d}-{val_loss:.5f}-{val_acc:.5f}.h5"
        # Create the checkpoint directory up front so that a missing directory
        # fails here rather than when the first checkpoint is written.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        check_point = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=False, mode='auto')
        callbacks_list = [check_point]

        # Both generators read from the same frame; they differ only in the row IDs served.
        training_generator = DataGenerator(partition_train, self.train_df, self.label_df, batch_size, self.max_len)
        validation_generator = DataGenerator(partition_validation, self.train_df, self.label_df, batch_size, self.max_len)

        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        self.model.fit_generator(generator=training_generator,
                                 validation_data=validation_generator,
                                 use_multiprocessing=True,
                                 epochs=epochs,
                                 workers=12,
                                 callbacks=callbacks_list)

In [108]:
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf

# Session configuration with GPU memory growth enabled (memory is taken as
# needed rather than reserved up front).
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Make Keras use a session created with this configuration.
set_session(tf.Session(config=config))

In [109]:
# Build the trainer from the notebook globals (x_train, y_train, max_length)
# and start training.
t_instance = TMalConv()
t_instance.run()

Length of the train:  75239
Length of the validation:  8360
Epoch 1/12
 747/4702 [===>..........................] - ETA: 11:50 - loss: 0.1652 - acc: 0.9401

Process ForkPoolWorker-201:
Process ForkPoolWorker-205:
Process ForkPoolWorker-211:
Process ForkPoolWorker-200:
Process ForkPoolWorker-208:
Process ForkPoolWorker-206:
Process ForkPoolWorker-202:
Process ForkPoolWorker-209:
Process ForkPoolWorker-207:
Process ForkPoolWorker-204:
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/zhaoqi/anaconda3/envs/tf/l

KeyboardInterrupt
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
KeyboardInterrupt
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
Traceback (most recent call last):
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/pool.py", line 125, in worker
    put((job, i, result))
KeyboardInterrupt
  File "/home/zhaoqi/anaconda3/envs/tf/lib/python3.5/multiprocessing/queues.py", line 341, in put
 

KeyboardInterrupt: 

KeyboardInterrupt


In [134]:
# Checkpoint written during training; the file name follows the
# "{epoch}-{val_loss}-{val_acc}" pattern used by the ModelCheckpoint callback.
model_dir = '/home/zhaoqi/BaseTrain/models/'
f_name = '0006-0.12993-0.96117.h5'
c_model = load_model(model_dir + f_name)
# Unshuffled generator over the test rows, so predictions follow the row order.
# DataGenerator serves only full batches of 16, so trailing rows that do not
# fill a batch get no prediction.
test_generator = DataGenerator(range(len(x_test)), x_test, y_test, 16, max_length, False)
y_pred = c_model.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)



In [111]:
len(y_pred)

9280

# Estimate

In [95]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, confusion_matrix

def estimate_model(y_pred, test_y):
    """
    Print and return evaluation metrics for predicted scores.

    Besides log loss, AUC and accuracy at a 0.5 cut-off, the function derives
    a threshold from the scores of the negative samples (label 0) such that
    roughly 0.1% of them lie above it, then reports the false-positive rate
    and recall obtained when scores strictly above that threshold count as
    positive.

    :param y_pred: predicted scores, shape (n,) or (n, 1)
    :param test_y: true binary labels (0 or 1), length n
    :return: tuple (auc, loss, recall_rate)
    """
    # Work on flat arrays so the threshold is a plain scalar whichever of the
    # two accepted shapes was passed in.
    scores = np.asarray(y_pred, dtype=float).reshape(-1)
    truth = np.asarray(test_y).reshape(-1)

    loss = log_loss(truth, scores)
    auc = roc_auc_score(truth, scores)
    acc = accuracy_score(truth, (scores > 0.5).astype(int))
    print("loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)

    thre = _threshold_for_fp_rate(scores[truth == 0], 0.001)
    y_pred_label = (scores > thre).astype(int)

    tn, fp, fn, tp = confusion_matrix(truth, y_pred_label).ravel()
    fp_rate = fp / (fp + tn)
    recall_rate = tp / (tp + fn)

    print("thre: %.5f"%  thre)
    print("fp:  %.5f"%  fp_rate)
    print("recall:  %.5f"%  recall_rate)

    return auc, loss, recall_rate


def _threshold_for_fp_rate(negative_scores, max_fp_rate):
    """Return the score below which about (1 - max_fp_rate) of the negative scores fall."""
    ordered = np.sort(negative_scores)
    count = ordered.shape[0]
    # With fewer than 1 / max_fp_rate negatives the rounded-up position equals
    # `count`, one past the end; clamp it to the last valid position.
    position = min(int(np.ceil(count - count * max_fp_rate)), count - 1)
    thre = ordered[position]
    if thre == 1:
        # A threshold of exactly 1 can never be exceeded; fall back to the
        # largest negative score below 1 when there is one.
        below_one = ordered[ordered != 1]
        if below_one.size:
            thre = below_one.max()
    return thre

In [135]:
# y_pred can be shorter than y_test because the generator serves only full
# batches, so compare against the first len(y_pred) labels.
estimate_model(y_pred, y_test[0:len(y_pred)])

loss : 0.14071
auc score : 0.98705
accuracy score : 0.95358
thre: 0.99792
fp:  0.00062
recall:  0.72155


(0.9870460807542742, 0.1407084855590686, 0.7215489210759681)

# EMBER

In [113]:
import pandas as pd
import re
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [114]:
def get_ember_feature(data, n_features=2351):
    """
    Parse the ';'-separated feature strings in data.mw_em_f into a float matrix.

    Rows are filled by position, so the result does not depend on the index
    labels of `data` (the frame no longer has to be re-indexed from 0 first).

    :param data: DataFrame with an `mw_em_f` column of ';'-separated numbers
    :param n_features: number of values expected per row (default 2351)
    :return: float array of shape (len(data), n_features)
    :raises ValueError: if a row does not contain exactly n_features numbers
    """
    ember_f = np.zeros((len(data.mw_em_f), n_features), dtype=float)
    for position, raw in enumerate(data.mw_em_f):
        ember_f[position, :] = np.array(raw.split(';'), dtype=float)
    return ember_f

## ember train data

In [115]:
def get_model(data, label):
    """
    Train a binary LightGBM model and print its validation metrics.

    10% of the rows are held out (fixed seed) as the validation set used for
    early stopping after 10 rounds without improvement. The model is returned
    but not saved.

    :param data: feature matrix
    :param label: array of binary labels aligned with `data`
    :return: the trained LightGBM model
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, label, test_size=0.1, random_state=1234)

    train_set = lgb.Dataset(train_x, train_y.ravel())
    valid_set = lgb.Dataset(valid_x, valid_y.ravel())

    booster = lgb.train({'application': 'binary'}, train_set, 100000,
                        valid_sets=valid_set, early_stopping_rounds=10)
    valid_pred = booster.predict(valid_x)

    # Compute every metric first, then report them in the original order.
    report = (
        ("val loss : %.5f", log_loss(valid_y, valid_pred)),
        ("auc score : %.5f", roc_auc_score(valid_y, valid_pred)),
        ("accuracy score : %.5f", accuracy_score(valid_y, (valid_pred > 0.5).astype(int))),
    )
    for template, value in report:
        print(template % value)

    return booster

In [99]:
# Feature matrix parsed from the mw_em_f column of the training rows; the
# labels are the ones produced by the train/test split.
X_ember_train = get_ember_feature(x_train)
y_ember_train = y_train

In [100]:
X_ember_train.shape

(83599, 2351)

In [101]:
import matplotlib.pyplot as plt
# Train the LightGBM model on the EMBER features alone.
model = get_model(X_ember_train, y_ember_train)

[1]	valid_0's binary_logloss: 0.606705
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.538164
[3]	valid_0's binary_logloss: 0.480632
[4]	valid_0's binary_logloss: 0.431482
[5]	valid_0's binary_logloss: 0.390057
[6]	valid_0's binary_logloss: 0.353771
[7]	valid_0's binary_logloss: 0.322006
[8]	valid_0's binary_logloss: 0.2943
[9]	valid_0's binary_logloss: 0.269763
[10]	valid_0's binary_logloss: 0.248075
[11]	valid_0's binary_logloss: 0.228586
[12]	valid_0's binary_logloss: 0.211549
[13]	valid_0's binary_logloss: 0.196008
[14]	valid_0's binary_logloss: 0.182362
[15]	valid_0's binary_logloss: 0.170003
[16]	valid_0's binary_logloss: 0.158991
[17]	valid_0's binary_logloss: 0.148514
[18]	valid_0's binary_logloss: 0.139195
[19]	valid_0's binary_logloss: 0.130832
[20]	valid_0's binary_logloss: 0.122769
[21]	valid_0's binary_logloss: 0.115702
[22]	valid_0's binary_logloss: 0.109301
[23]	valid_0's binary_logloss: 0.103047
[24]	valid_0's binary_logloss

[199]	valid_0's binary_logloss: 0.0171568
[200]	valid_0's binary_logloss: 0.0171378
[201]	valid_0's binary_logloss: 0.0171202
[202]	valid_0's binary_logloss: 0.0170908
[203]	valid_0's binary_logloss: 0.0170418
[204]	valid_0's binary_logloss: 0.0170329
[205]	valid_0's binary_logloss: 0.0170123
[206]	valid_0's binary_logloss: 0.0170025
[207]	valid_0's binary_logloss: 0.0169855
[208]	valid_0's binary_logloss: 0.0169326
[209]	valid_0's binary_logloss: 0.0169537
[210]	valid_0's binary_logloss: 0.0169676
[211]	valid_0's binary_logloss: 0.0169786
[212]	valid_0's binary_logloss: 0.0170085
[213]	valid_0's binary_logloss: 0.0169838
[214]	valid_0's binary_logloss: 0.016971
[215]	valid_0's binary_logloss: 0.016944
[216]	valid_0's binary_logloss: 0.0168979
[217]	valid_0's binary_logloss: 0.0169188
[218]	valid_0's binary_logloss: 0.0169076
[219]	valid_0's binary_logloss: 0.0169059
[220]	valid_0's binary_logloss: 0.0169407
[221]	valid_0's binary_logloss: 0.0169343
[222]	valid_0's binary_logloss: 0.01

## Ember test data

In [136]:
# Same feature parsing as for the training rows, applied to the held-out test rows.
X_ember_test = get_ember_feature(x_test)
y_ember_test = y_test

In [137]:
y_p = model.predict(X_ember_test)
# Copy the scores into a float column vector of shape (n, 1).
y_pred_e = np.array(y_p, dtype=float).reshape(len(y_p), 1)

estimate_model(y_pred_e, y_ember_test)

loss : 0.20951
auc score : 0.99284
accuracy score : 0.95592
thre: 0.99972
fp:  0.00062
recall:  0.71602


(0.9928395083486442, 0.20951220307005827, 0.7160165484633569)

# Mal Conv + Ember

Extract 128 features per sample from the trained MalConv model: the output of its second-to-last layer, `Dense(128)`.

In [138]:
from keras.models import Model

# Sub-model sharing c_model's input and ending at its second-to-last layer.
model_f = Model(c_model.input, c_model.layers[-2].output)

# Unshuffled generators (last argument False) keep the outputs in row order.
# Only full batches of 16 are served, so each result can have fewer rows than
# the frame it was computed from.
train_generator = DataGenerator(range(len(x_train)), x_train, y_train, 16, max_length, False)
malcon_train_x = model_f.predict_generator(generator=train_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

test_generator = DataGenerator(range(len(x_test)), x_test, y_test, 16, max_length, False)
malcon_test_x = model_f.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)



## merge mal conv and ember *train* data

In [139]:
# Only the first `num` training rows have MalConv features.
num = len(malcon_train_x)
# Columns 0-2350 hold the EMBER features, columns 2351-2478 the MalConv features.
ebm_X = np.zeros((num, 2351+128), dtype=float)
ebm_X[:, 0:2351] = X_ember_train[:num]
ebm_X[:, 2351:2351+128] = malcon_train_x[:num]
ebm_y = np.array(y_ember_train[:num], dtype=float)

import matplotlib.pyplot as plt
# Train a second LightGBM model on the combined EMBER + MalConv features.
model_m = get_model(ebm_X, ebm_y)

[1]	valid_0's binary_logloss: 0.606903
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.53851
[3]	valid_0's binary_logloss: 0.481297
[4]	valid_0's binary_logloss: 0.433021
[5]	valid_0's binary_logloss: 0.391424
[6]	valid_0's binary_logloss: 0.355563
[7]	valid_0's binary_logloss: 0.32417
[8]	valid_0's binary_logloss: 0.296084
[9]	valid_0's binary_logloss: 0.271401
[10]	valid_0's binary_logloss: 0.250009
[11]	valid_0's binary_logloss: 0.230353
[12]	valid_0's binary_logloss: 0.213244
[13]	valid_0's binary_logloss: 0.197991
[14]	valid_0's binary_logloss: 0.183817
[15]	valid_0's binary_logloss: 0.171655
[16]	valid_0's binary_logloss: 0.160104
[17]	valid_0's binary_logloss: 0.14963
[18]	valid_0's binary_logloss: 0.140102
[19]	valid_0's binary_logloss: 0.131657
[20]	valid_0's binary_logloss: 0.124192
[21]	valid_0's binary_logloss: 0.117441
[22]	valid_0's binary_logloss: 0.110525
[23]	valid_0's binary_logloss: 0.104306
[24]	valid_0's binary_logloss:

## merge mal conv and ember *test* data

In [140]:
# Only the first `num` test rows have MalConv features.
num = len(malcon_test_x)
# Same column layout as the training matrix: EMBER features first, MalConv features after.
ebm_X_t = np.zeros((num, 2351+128), dtype=float)
ebm_X_t[:, 0:2351] = X_ember_test[:num]
ebm_X_t[:, 2351:2351+128] = malcon_test_x[:num]
ebm_y_t = np.array(y_ember_test[:num], dtype=float)

In [141]:
y_p = model_m.predict(ebm_X_t)
# Copy the scores into a float column vector of shape (n, 1).
y_pred = np.array(y_p, dtype=float).reshape(len(y_p), 1)

estimate_model(y_pred, ebm_y_t)

loss : 0.18107
auc score : 0.99241
accuracy score : 0.95565
thre: 0.99873
fp:  0.00062
recall:  0.74254


(0.9924079403417576, 0.1810718398054507, 0.7425362104640851)