## Read data from database

In [1]:
import os
# specify which GPU will be used
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import os

import pymysql
from warnings import filterwarnings

_connection = None

def get_connection(db_config):
    """
    get db connection
    :return:
    """
    global _connection
    if _connection is None:
        _connection = pymysql.connect(host=db_config['host'], user=db_config['username'],
                                      password=db_config['password'],
                                      db=db_config['db'], charset="utf8")
        filterwarnings('ignore', category=pymysql.Warning)

    return _connection


def close():
    """
    close DB connection
    :return:
    """
    global _connection
    if _connection is not None:
        _connection.close()
    _connection = None

In [3]:
db = {
    'host': '172.26.187.242',
    'username': 'malware_r',
    'password': 'GEg22v2O7jbfWhb3',
    'db': 'malware'
}

### Fields

- mw_file_suffix: file name after hash value
- mw_file_prefix: directory
- mw_em_f: features of ember, splitted by ";"

In [4]:
import time

# the base function which can query sql and return dict data
def get_specific_data(table_suffix, sql=None):
    start_time = time.time()
    
    global _connection
    if _connection is None:
        raise Exception("please init db connect first")

    cursor = _connection.cursor()
    cursor.execute("SET NAMES utf8mb4")

    ret = []
    
    sql = """
select
  a.mw_file_hash,
  a.section_name,
  c.mw_file_suffix as mw_file_size,
  c.mw_file_prefix as mw_file_directory,
  c.mw_num_engines,
  a.pointerto_raw_data,
  a.virtual_size,
  d.mw_em_f
from mw_index_2017_section_%s as a
  inner join mw_index_2017_%s c on a.mw_file_hash = c.mw_file_hash
  inner join mw_index_2017_feature_%s d on a.mw_file_hash = d.mw_file_hash
where a.section_name = '.text' and c.mw_num_engines <> -1 and (c.mw_num_engines > 6 or c.mw_num_engines = 0) and
      c.mw_file_prefix in ('201701', '201703')
group by mw_file_hash
    """ % (table_suffix, table_suffix, table_suffix)
    
    if sql is None:
        sql = sql
    cursor.execute(sql)

    field_names = [i[0] for i in cursor.description]

    for row in cursor:
        temp = {}
        for key in range(len(row)):
            temp[field_names[key]] = row[key]
        ret.append(temp)
     
    cursor.close()
    # _connection.close()
    print("--- %s seconds ---" % (time.time() - start_time))
    
    return ret

In [5]:
close()
res = []
get_connection(db)
table_suffix = ["0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F"]
# Iterate all partitions of databases
for suffix in table_suffix:
    res.extend(get_specific_data(suffix))
close()
print(len(res))

--- 8.362408638000488 seconds ---
--- 6.992529630661011 seconds ---
--- 6.6882483959198 seconds ---
--- 6.539879322052002 seconds ---
--- 4.872157335281372 seconds ---
--- 6.577094793319702 seconds ---
--- 8.06700873374939 seconds ---
--- 6.630244255065918 seconds ---
--- 6.555645942687988 seconds ---
--- 8.353384017944336 seconds ---
--- 5.908476829528809 seconds ---
--- 5.881518363952637 seconds ---
--- 6.985220432281494 seconds ---
--- 5.4133665561676025 seconds ---
--- 5.166863918304443 seconds ---
--- 5.010003089904785 seconds ---
239085


## Check and split data

In [6]:
import pandas as pd
data = pd.DataFrame(res)
del res

In [7]:
data.head()

Unnamed: 0,mw_em_f,mw_file_directory,mw_file_hash,mw_file_size,mw_num_engines,pointerto_raw_data,section_name,virtual_size
0,0.21537522971630094;0.02217341773211956;0.0131...,201703,000002D8A582B2083F1F2DEF365A82234766D546EC8021...,_958490,0,1024,.text,440152
1,0.2265700697898865;0.01793677918612957;0.01251...,201703,000043B9D37086041959E44EB6EED77F05DA44492F3B6A...,_6343168,0,634368,.text,5549270
2,0.30814531445503235;0.004307170398533343;0.003...,201701,0000A3EAB7F06DB29113F7BDE61328D56BC874B16DD895...,_339434,11,1024,.text,13352
3,0.7433035969734192;0.004743303637951613;0.0041...,201701,0000D95594F0BC57CA94FD90C9477A7B984A8429B7A45B...,_3584,7,1024,.text,566
4,0.013131157495081423;0.004108430817723274;0.00...,201701,00015BCA398B78199C5442DB5B28D14405549FB135FA69...,_2098368,0,1024,.text,25467


In [8]:
import numpy as np
import scipy.stats as stats
import pylab as pl

max_length = 300000

# only keep the data whoose length less than max_length
clean_data = data.loc[data.virtual_size <= max_length]
clean_data = clean_data.reset_index(drop=True)
del data

print(clean_data.shape)

h = sorted(clean_data.virtual_size.ravel())  #sorted

fit = stats.norm.pdf(h, np.mean(h), np.std(h))  #this is a fitting indeed

pl.plot(h,fit,'-o')

pl.hist(h,normed=True)      #use this to draw histogram of your data

pl.show()                   #use may also need add this 

(195320, 8)




<Figure size 640x480 with 1 Axes>

In [9]:
# re-value label
clean_data.mw_num_engines[clean_data.mw_num_engines == 0 ] = 0
clean_data.mw_num_engines[clean_data.mw_num_engines > 6 ] = 1
label = clean_data.mw_num_engines.ravel()
label

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([1, 1, 0, ..., 1, 0, 1])

In [10]:
from sklearn.model_selection import train_test_split

x_tmp, x_test, y_tmp, y_test = train_test_split(clean_data, label, test_size=0.1, random_state=1234)
x_train, x_val, y_train, y_val = train_test_split(x_tmp, y_tmp, test_size=0.1, random_state=1234)
del x_tmp, y_tmp

In [11]:
if False:
    _, x_train, _, y_train = train_test_split(x_train, y_train, test_size=0.1, random_state=1234)
    _, x_val, _, y_val = train_test_split(x_val, y_val, test_size=0.1, random_state=1234)
    _, x_test, _, y_test = train_test_split(x_test, y_test, test_size=0.1, random_state=1234)
    
x_train = x_train.reset_index(drop=True)
x_val = x_val.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

## EMBER

In [12]:
import pandas as pd
import re
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, confusion_matrix

In [13]:
def get_ember_feature(data):
    ember_f = np.zeros((len(data.mw_em_f), 2351), dtype=float)
    for index, item in data.iterrows():
        ember_f[index, :] = item['mw_em_f'].split(';')
    return ember_f

In [14]:
def get_model(x_train, y_train, x_val, y_val):
    params = {'application': 'binary'}
    lgbm_dataset = lgb.Dataset(x_train, y_train.ravel())
    valid_sets = lgb.Dataset(x_val, y_val.ravel())

    model = lgb.train(params, lgbm_dataset, 100000, valid_sets=valid_sets, early_stopping_rounds=10)
    y_pred = model.predict(x_val)
    
    loss = log_loss(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred)
    acc = accuracy_score(y_val, (y_pred > 0.5).astype(int))
#     model.save_model(file_path + "-%04d-%.5f-%.5f.h5" % (model.best_iteration, loss, acc),
#                      num_iteration=model.best_iteration)
    print("val loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)
        
    return model

In [15]:
x_etrain = get_ember_feature(x_train)
x_eval = get_ember_feature(x_val)
x_etest = get_ember_feature(x_test)

In [16]:
import matplotlib.pyplot as plt
model = get_model(x_etrain, y_train, x_eval, y_val)

[1]	valid_0's binary_logloss: 0.614484
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.550099
[3]	valid_0's binary_logloss: 0.494397
[4]	valid_0's binary_logloss: 0.448523
[5]	valid_0's binary_logloss: 0.408788
[6]	valid_0's binary_logloss: 0.374659
[7]	valid_0's binary_logloss: 0.343673
[8]	valid_0's binary_logloss: 0.317212
[9]	valid_0's binary_logloss: 0.293053
[10]	valid_0's binary_logloss: 0.272161
[11]	valid_0's binary_logloss: 0.253889
[12]	valid_0's binary_logloss: 0.237777
[13]	valid_0's binary_logloss: 0.223329
[14]	valid_0's binary_logloss: 0.20963
[15]	valid_0's binary_logloss: 0.197764
[16]	valid_0's binary_logloss: 0.186851
[17]	valid_0's binary_logloss: 0.176721
[18]	valid_0's binary_logloss: 0.167491
[19]	valid_0's binary_logloss: 0.159512
[20]	valid_0's binary_logloss: 0.151825
[21]	valid_0's binary_logloss: 0.144917
[22]	valid_0's binary_logloss: 0.138606
[23]	valid_0's binary_logloss: 0.132997
[24]	valid_0's binary_loglos

[200]	valid_0's binary_logloss: 0.0376357
[201]	valid_0's binary_logloss: 0.0376006
[202]	valid_0's binary_logloss: 0.0375417
[203]	valid_0's binary_logloss: 0.0375119
[204]	valid_0's binary_logloss: 0.037477
[205]	valid_0's binary_logloss: 0.037536
[206]	valid_0's binary_logloss: 0.03754
[207]	valid_0's binary_logloss: 0.0374733
[208]	valid_0's binary_logloss: 0.0374413
[209]	valid_0's binary_logloss: 0.0373689
[210]	valid_0's binary_logloss: 0.0373494
[211]	valid_0's binary_logloss: 0.0372682
[212]	valid_0's binary_logloss: 0.0372647
[213]	valid_0's binary_logloss: 0.0372082
[214]	valid_0's binary_logloss: 0.0371941
[215]	valid_0's binary_logloss: 0.0372099
[216]	valid_0's binary_logloss: 0.0371795
[217]	valid_0's binary_logloss: 0.0371744
[218]	valid_0's binary_logloss: 0.0371375
[219]	valid_0's binary_logloss: 0.0370788
[220]	valid_0's binary_logloss: 0.0370617
[221]	valid_0's binary_logloss: 0.0370499
[222]	valid_0's binary_logloss: 0.0370233
[223]	valid_0's binary_logloss: 0.0370

In [17]:
def estimate_model(y_pred, test_y):
    
    loss = log_loss(test_y, y_pred)
    auc = roc_auc_score(test_y, y_pred)
    acc = accuracy_score(test_y, (y_pred > 0.5).astype(int))
    print("loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)

    fp_np_index = np.where(test_y == 0)
    fp_np = y_pred[fp_np_index].shape[0]
    thre_index = int(np.ceil(fp_np - fp_np * 0.001))

    sorted_pred_prob = np.sort(y_pred[fp_np_index], axis=0)
    thre = sorted_pred_prob[thre_index]
    if thre == 1:
        thre = max(sorted_pred_prob[np.where(sorted_pred_prob != 1)])

    y_pred_prob = np.vstack((y_pred.transpose(), (1 - y_pred).transpose())).transpose()
    y_pred_prob[:, 1] = thre
    y_pred_label = np.argmin(y_pred_prob, axis=-1)

    tn, fp, fn, tp = confusion_matrix(test_y, y_pred_label).ravel()
    fp_rate = fp / (fp + tn)
    recall_rate = tp / (tp + fn)

    print("thre: %.10f"%  thre)
    print("fp:  %.10f"%  fp_rate)
    print("recall:  %.5f"%  recall_rate)
    
    return auc, loss, recall_rate

In [18]:
y_p = model.predict(x_etest)
y_pred_e = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred_e[i, 0] = y_p[i]

estimate_model(y_pred_e, y_test)

loss : 0.03353
auc score : 0.99921
accuracy score : 0.98843
thre: 0.9819368905
fp:  0.0008126778
recall:  0.9167010735


(0.9992092686369798, 0.0335258567883324, 0.916701073492981)

## Malcon

In [19]:
import keras
import numpy as np

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'

    def __init__(self, list_IDs, datasets, labels, batch_size=32, dim=8192, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.datasets = datasets
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.zeros((self.batch_size, self.dim), dtype=float)
        y = np.zeros(self.batch_size, dtype=float)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            base_path = "/ssd/2017/{0}/{1}{2}"
            item = self.datasets.loc[ID]
            file_path = base_path.format(item["mw_file_directory"], item["mw_file_hash"], item["mw_file_size"])
            in_file = open(file_path, 'rb')
            in_file.seek(item['pointerto_raw_data'])
            bytes_data = [int(single_byte) for single_byte in in_file.read(item['virtual_size'])]
            X[i, 0:len(bytes_data)] = bytes_data
            y[i] = self.labels[ID]

        return X, y

Using TensorFlow backend.


In [20]:
import hashlib
import json
import time

import keras
from keras import Input
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Dense, Embedding, Conv1D, Multiply, GlobalMaxPooling1D, Dropout
from keras.models import load_model
from sklearn.model_selection import train_test_split

class TMalConv(object):
    """
    train of mal conv
    """

    def __init__(self):
        self.max_len = max_length
        self.history = None
        self.model = None
        self.p_md5 = None
        self.time = time.time()
        self.summary = {
            'time':time.time(),
            'batch_size': 32,
            'epochs': 64,
            'g_c_filter': 128,
            'g_c_kernel_size': 500,
            'g_c_stride': 500,
        }

    def run(self):
        """
        :return:
        """
        self.train()
        
    def get_p(self, key):
        """
        get the parameter from the summary
        :param key:
        :return:
        """
        return self.summary[key]

    def gate_cnn(self, gate_cnn_input):
        """
        construct a gated cnn by the specific kernel size
        :param gate_cnn_input:
        :param kernel_size:
        :return:
        """
        conv1_out = Conv1D(self.get_p("g_c_filter"), self.get_p("g_c_kernel_size"), strides=self.get_p("g_c_stride"))(
            gate_cnn_input)
        conv2_out = Conv1D(self.get_p("g_c_filter"), self.get_p("g_c_kernel_size"), strides=self.get_p("g_c_stride"),
                           activation="sigmoid")(gate_cnn_input)
        merged = Multiply()([conv1_out, conv2_out])
        gate_cnn_output = GlobalMaxPooling1D()(merged)
        return gate_cnn_output

    def get_model(self):
        """
        get a model
        :param max_len:
        :param kernel_sizes:
        :return:
        """
        net_input = Input(shape=(self.max_len,))

        embedding_out = Embedding(256, 8, input_length=self.max_len)(net_input)
        merged = self.gate_cnn(embedding_out)

        dense_out = Dense(128)(merged)
        
        net_output = Dense(1, activation='sigmoid')(dense_out)

        model = keras.models.Model(inputs=net_input, outputs=net_output)

        return model

    def train(self):
        batch_size = self.get_p("batch_size")
        epochs = self.get_p("epochs")

        self.model = self.get_model()

        print('Length of the train: ', len(x_train))
        print('Length of the validation: ', len(x_val))
        
#         tensor_board = TensorBoard(log_dir='./logs/', batch_size=batch_size)
        file_path = "/home/zhaoqi/BaseTrain/models/"+ str(self.time) +"-{epoch:04d}-{val_loss:.5f}-{val_acc:.5f}.h5"
        early_stopping = EarlyStopping("val_loss", patience=3, verbose=0, mode='auto')
        check_point = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
        callbacks_list = [check_point, early_stopping]

        # Generators
        training_generator = DataGenerator(range(len(x_train)), x_train, y_train, batch_size, self.max_len)
        validation_generator = DataGenerator(range(len(x_val)), x_val, y_val, batch_size, self.max_len)

        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        self.model.fit_generator(generator=training_generator,
                                 validation_data=validation_generator,
                                 use_multiprocessing=True,
                                 epochs=epochs,
                                 workers=6,
                                 callbacks=callbacks_list)

In [21]:
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [22]:
t_instance = TMalConv()
t_instance.run()

Length of the train:  158209
Length of the validation:  17579
Epoch 1/64

Epoch 00001: val_loss improved from inf to 0.13008, saving model to /home/zhaoqi/BaseTrain/models/1531979342.479591-0001-0.13008-0.95205.h5
Epoch 2/64

Epoch 00002: val_loss improved from 0.13008 to 0.12904, saving model to /home/zhaoqi/BaseTrain/models/1531979342.479591-0002-0.12904-0.95409.h5
Epoch 3/64

Epoch 00003: val_loss improved from 0.12904 to 0.12892, saving model to /home/zhaoqi/BaseTrain/models/1531979342.479591-0003-0.12892-0.95415.h5
Epoch 4/64

  % delta_t_median)




  % delta_t_median)




  % delta_t_median)



Epoch 00004: val_loss did not improve from 0.12892
Epoch 5/64
 362/4945 [=>............................] - ETA: 34:46 - loss: 0.0607 - acc: 0.9848

  % delta_t_median)


 946/4945 [====>.........................] - ETA: 31:04 - loss: 0.0630 - acc: 0.9840

  % delta_t_median)


1037/4945 [=====>........................] - ETA: 30:37 - loss: 0.0624 - acc: 0.9840

  % delta_t_median)


1066/4945 [=====>........................] - ETA: 30:28 - loss: 0.0618 - acc: 0.9842

  % delta_t_median)


1067/4945 [=====>........................] - ETA: 30:28 - loss: 0.0617 - acc: 0.9842

  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)



Epoch 00005: val_loss did not improve from 0.12892
Epoch 6/64
 177/4945 [>.............................] - ETA: 39:05 - loss: 0.0552 - acc: 0.9868

  % delta_t_median)


 183/4945 [>.............................] - ETA: 39:04 - loss: 0.0566 - acc: 0.9862

  % delta_t_median)


 273/4945 [>.............................] - ETA: 38:57 - loss: 0.0545 - acc: 0.9860

  % delta_t_median)


 359/4945 [=>............................] - ETA: 38:23 - loss: 0.0554 - acc: 0.9856

  % delta_t_median)


 365/4945 [=>............................] - ETA: 38:20 - loss: 0.0555 - acc: 0.9856

  % delta_t_median)


 448/4945 [=>............................] - ETA: 37:56 - loss: 0.0571 - acc: 0.9855

  % delta_t_median)


 538/4945 [==>...........................] - ETA: 37:18 - loss: 0.0540 - acc: 0.9861

  % delta_t_median)


 634/4945 [==>...........................] - ETA: 36:34 - loss: 0.0551 - acc: 0.9857

  % delta_t_median)


 718/4945 [===>..........................] - ETA: 35:57 - loss: 0.0549 - acc: 0.9859

  % delta_t_median)


 724/4945 [===>..........................] - ETA: 35:53 - loss: 0.0553 - acc: 0.9858

  % delta_t_median)


 814/4945 [===>..........................] - ETA: 35:11 - loss: 0.0548 - acc: 0.9859

  % delta_t_median)


 896/4945 [====>.........................] - ETA: 34:35 - loss: 0.0554 - acc: 0.9857

  % delta_t_median)


 985/4945 [====>.........................] - ETA: 33:52 - loss: 0.0553 - acc: 0.9859

  % delta_t_median)


 991/4945 [=====>........................] - ETA: 33:50 - loss: 0.0553 - acc: 0.9858

  % delta_t_median)


1074/4945 [=====>........................] - ETA: 33:10 - loss: 0.0564 - acc: 0.9856

  % delta_t_median)


1080/4945 [=====>........................] - ETA: 33:06 - loss: 0.0564 - acc: 0.9856

  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)



Epoch 00006: val_loss did not improve from 0.12892


In [23]:
model_dir = '/home/zhaoqi/BaseTrain/models/'
f_name = '1531919274.01446370006-0.13225-0.95475.h5'
c_model = load_model(model_dir + f_name)
test_generator = DataGenerator(range(len(x_test)), x_test, y_test, 32, max_length, False)
y_pred = c_model.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)



In [24]:
estimate_model(y_pred, y_test[0:len(y_pred)])

ValueError: Found input variables with inconsistent numbers of samples: [19552, 19532]

## Merge features

In [None]:
from keras.models import Model

model_f = Model(c_model.input, c_model.layers[-2].output)

train_generator = DataGenerator(range(len(x_train)), x_train, y_train, 32, max_length, False)
malcon_train_x = model_f.predict_generator(generator=train_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

val_generator = DataGenerator(range(ßßlen(x_val)), x_val, y_val, 32, max_length, False)
malcon_val_x = model_f.predict_generator(generator=val_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

test_generator = DataGenerator(range(len(x_test)), x_test, y_test, 32, max_length, False)
malcon_test_x = model_f.predict_generator(generator=test_generator, max_queue_size=10, workers=6, use_multiprocessing=True, verbose=1)

In [None]:
model_m = get_model(malcon_train_x, y_train[0:len(malcon_train_x)], malcon_val_x, y_val[0:len(malcon_val_x)] )

In [None]:
y_p = model_m.predict(malcon_test_x)
y_pred = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred[i, 0] = y_p[i]

estimate_model(y_pred, y_test[0:len(malcon_test_x)])

In [None]:
def merge_feature(m_data, e_data):
    num = len(m_data)
    m_x = np.zeros((num, 128+2351), dtype=float)
    
    for index in range(num):
        m_x[index, 0:128] = m_data[index]
        m_x[index, 128:128+2351] = e_data[index]    
    return m_x

In [None]:
merge_train_x = merge_feature(malcon_train_x, x_etrain)
merge_val_x = merge_feature(malcon_val_x, x_eval)
merge_test_x = merge_feature(malcon_test_x, x_etest)

model_m = get_model(merge_train_x, y_train[0:len(merge_train_x)], merge_val_x, y_val[0:len(merge_val_x)] )

In [None]:
y_p = model_m.predict(merge_test_x)
y_pred = np.zeros((len(y_p), 1))
for i in range(len(y_p)):
    y_pred[i, 0] = y_p[i]

estimate_model(y_pred, y_test[0:len(merge_test_x)])