In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, confusion_matrix


In [7]:
max_len = 2048  # maximum number of leading items kept per sample


def crop_exceed_data(data):
    """
    Limit a sequence to at most ``max_len`` items.

    :param data: any sized, sliceable sequence (bytes, list, ...)
    :return: ``data`` itself when it already fits, otherwise its first ``max_len`` items
    """
    return data[:max_len] if len(data) > max_len else data


def get_bytes_array(data):
    """
    Parse a comma-separated string of byte values into a cropped list of ints.

    :param data: string of integers separated by ","; every value must fit in one byte
        because the values are packed into a ``bytes`` object first
    :return: list of ints, shortened by ``crop_exceed_data``
    """
    packed = bytes(int(token) for token in data.split(","))
    # Iterating over a bytes object yields plain ints.
    return list(crop_exceed_data(packed))

In [8]:
# NOTE: max_len and the two helpers below repeat the definitions of the previous cell.
max_len = 2048


def crop_exceed_data(data):
    """
    Truncate ``data`` to its first ``max_len`` items when it is longer than that.

    :param data: any sized, sliceable sequence
    :return: the first ``max_len`` items when ``data`` is too long, else ``data`` itself
    """
    if len(data) > max_len:
        return data[:max_len]
    return data


def get_bytes_array(data):
    """
    Turn a comma-separated string of byte values into a list of ints.

    :param data: string such as "77,90,144"; values are packed into ``bytes``,
        so each one has to be a valid byte value
    :return: list of ints after cropping with ``crop_exceed_data``
    """
    tokens = data.split(",")
    cropped = crop_exceed_data(bytes(map(int, tokens)))
    return [*cropped]


def reverse_bytes(original):
    """
    Return a copy of a sliceable sequence with its items in reverse order.

    :param original: sequence supporting extended slicing (bytes, list, str, ...)
    :return: reversed sequence of the same type
    """
    backwards = slice(None, None, -1)
    return original[backwards]


def convert_int(str_bytes):
    """
    Interpret a byte sequence as an unsigned big-endian integer.

    :param str_bytes: bytes-like object (or iterable of byte values)
    :return: the decoded non-negative int; 0 for empty input
    """
    # from_bytes is unsigned unless signed=True is requested.
    return int.from_bytes(str_bytes, "big")


def decode_rich_sign(rich_sign):
    """
    XOR-decode a rich sign using its own trailing bytes as the key.

    The last (up to) four bytes form the key; the byte at position ``p`` is XORed
    with ``key[p % 4]``, key bytes included.

    :param rich_sign: bytes-like object ending with the key
    :return: decoded data as ``bytes``, same length as the input
    """
    xor_key = rich_sign[-4:]
    return bytes(value ^ xor_key[pos % 4] for pos, value in enumerate(rich_sign))


def get_fixed_head(data):
    """
    Select the useful parts of a PE head and return them as a flat list of ints.

    Parts are concatenated in this order: bytes 0-63 (MZ head), bytes 64-127
    (DOS sub), the decoded rich sign, the 24 bytes starting at the "PE" + two zero
    bytes signature, the image optional head (96 bytes when it starts with the
    0x0b 0x01 magic, otherwise 112), the next 128 bytes (data directory) and, for
    every section, bytes 0-27 and 36-39 of its 40-byte entry.

    Both markers are searched from offset 128 onwards. A missing marker is handled
    explicitly instead of letting the -1 returned by ``find`` turn into a bogus
    offset: without a "Rich" marker the rich-sign part is empty, and without a PE
    signature only the parts collected so far are returned.

    :param data: comma-separated string of byte values
    :return: list of ints, one per selected byte
    """
    bytes_data = bytes(map(int, data.split(",")))
    # mz head
    mz_head = bytes_data[0:64]
    # dos sub
    ms_dos_sub = bytes_data[64:128]

    # decode rich sign: the span runs from offset 128 through the "Rich" marker
    # plus the 4 bytes after it, which decode_rich_sign uses as the XOR key
    rich_marker = bytes_data.find(b'\x52\x69\x63\x68', 128)
    if rich_marker == -1:
        rich_sign = b''
    else:
        rich_sign = decode_rich_sign(bytes_data[128:rich_marker + 8])

    fixed_head = mz_head + ms_dos_sub + rich_sign

    # pe head
    pe_head_start = bytes_data.find(b'\x50\x45\x00\x00', 128)
    if pe_head_start == -1:
        # Nothing below can be located without the PE signature.
        return list(fixed_head)
    pe_head = bytes_data[pe_head_start: pe_head_start + 24]

    # there are two types of image optional head, PE 32 and PE 32+
    other_head = bytes_data[pe_head_start + 24:]
    if other_head[0:2] == b'\x0b\x01':
        image_optional_head_end = 96
    else:
        image_optional_head_end = 112
    image_optional_head = other_head[0:image_optional_head_end]
    # data directory
    data_directory = other_head[image_optional_head_end: image_optional_head_end + 128]
    fixed_head += pe_head + image_optional_head + data_directory

    # The section count is stored little-endian in bytes 6-7 of the PE head.
    number_of_sections = int.from_bytes(pe_head[6:8], byteorder='little')
    section_table_start = image_optional_head_end + 128
    for offset in range(number_of_sections):
        offset_sections_start = section_table_start + 40 * offset
        if offset_sections_start >= len(other_head):
            # Remaining entries lie past the available data and would add nothing.
            break
        section_tail = other_head[offset_sections_start + 36:offset_sections_start + 40]
        fixed_head += other_head[offset_sections_start: offset_sections_start + 28] + section_tail
        # NOTE(review): bytes 36-39 are appended a second time, exactly as before;
        # kept so the produced feature layout does not change.
        fixed_head += section_tail
    return list(fixed_head)

In [9]:
# Read the training samples: "|" is used as separator so each line lands in the
# single 'row_data' column, which is then expanded into one numeric column per byte.
# NOTE(review): error_bad_lines=False drops unparsable lines, which could leave
# features and labels with different row counts — compare the printed shapes.
train_x = pd.read_csv(
    "./input/1_train.csv",
    header=None,
    sep="|",
    names=['row_data'],
    error_bad_lines=False,
)
tmp_v = train_x["row_data"].apply(get_bytes_array)
train_x = pd.DataFrame(tmp_v.tolist(), dtype=float)
del tmp_v  # the intermediate Series of lists is no longer needed

train_y = pd.read_csv("./input/1_train_label.csv", header=None, error_bad_lines=False)

print('Shape of the train_x data: ', train_x.shape)
print('Shape of the train_y data: ', train_y.shape)

Shape of the train_x data:  (113133, 2048)
Shape of the train_y data:  (113133, 1)


In [10]:
# Read the test samples the same way as the training samples: one line per sample
# in the 'row_data' column, expanded into one numeric column per byte.
# NOTE(review): error_bad_lines=False drops unparsable lines, which could leave
# features and labels with different row counts — compare the printed shapes.
test_x = pd.read_csv(
    "./input/test.csv",
    header=None,
    sep="|",
    names=['row_data'],
    error_bad_lines=False,
)
tmp_v = test_x["row_data"].apply(get_bytes_array)
test_x = pd.DataFrame(tmp_v.tolist(), dtype=float)
del tmp_v  # the intermediate Series of lists is no longer needed

test_y = pd.read_csv("./input/test_label.csv", header=None, error_bad_lines=False)

print('Shape of the test_x data: ', test_x.shape)
print('Shape of the test_y data: ', test_y.shape)

Shape of the test_x data:  (143180, 2048)
Shape of the test_y data:  (143180, 1)


In [11]:
# Rebuild train_x from the selected PE-head bytes: one zero-initialised row of
# max_len columns per sample, filled from the left with the get_fixed_head output.
tmp_v = pd.read_csv(
    "./input/1_train.csv",
    header=None,
    sep="|",
    names=['row_data'],
    error_bad_lines=False,
)
train_y = pd.read_csv("./input/1_train_label.csv", header=None, error_bad_lines=False)
train_x = np.zeros((len(tmp_v), max_len), dtype=int)

for i, item in enumerate(tmp_v["row_data"]):
    bytes_data = get_fixed_head(item)
    # Longer heads are cut to max_len; shorter ones leave the row's tail at zero.
    train_x[i, :min(len(bytes_data), max_len)] = bytes_data[:max_len]

del tmp_v
print('Shape of the train_x data: ', train_x.shape)
print('Shape of the train_y data: ', train_y.shape)

Shape of the train_x data:  (113133, 2048)
Shape of the train_y data:  (113133, 1)


In [12]:
# Rebuild test_x from the selected PE-head bytes: one zero-initialised row of
# max_len columns per sample, filled from the left with the get_fixed_head output.
tmp_v = pd.read_csv(
    "./input/test.csv",
    header=None,
    sep="|",
    names=['row_data'],
    error_bad_lines=False,
)
test_y = pd.read_csv("./input/test_label.csv", header=None, error_bad_lines=False)
test_x = np.zeros((len(tmp_v), max_len), dtype=int)

for i, item in enumerate(tmp_v["row_data"]):
    bytes_data = get_fixed_head(item)
    # Longer heads are cut to max_len; shorter ones leave the row's tail at zero.
    test_x[i, :min(len(bytes_data), max_len)] = bytes_data[:max_len]

del tmp_v
print('Shape of the test_x data: ', test_x.shape)
print('Shape of the test_y data: ', test_y.shape)

Shape of the test_x data:  (143180, 2048)
Shape of the test_y data:  (143180, 1)


In [13]:
def get_model(data, label):
    """
    Train a binary LightGBM model with a 5% hold-out used for early stopping.

    :param data: feature matrix accepted by ``train_test_split`` and ``lgb.Dataset``
    :param label: labels exposing ``.values`` (e.g. a pandas frame); flattened to 1-D
    :return: the booster returned by ``lgb.train``
    """
    x_fit, x_hold, y_fit, y_hold = train_test_split(
        data, label, test_size=0.05, random_state=5242
    )
    fit_set = lgb.Dataset(x_fit, label=y_fit.values.ravel())
    hold_set = lgb.Dataset(x_hold, label=y_hold.values.ravel())

    # The round limit is deliberately huge; training is expected to end through
    # early stopping once the hold-out score has not improved for 10 rounds.
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # lgb.train only in some LightGBM releases — confirm the installed version.
    return lgb.train(
        {'application': 'binary'},
        fit_set,
        num_boost_round=100000,
        valid_sets=hold_set,
        early_stopping_rounds=10,
        verbose_eval=True,
    )

In [5]:
# Train on the prepared training matrix. This needs get_model, train_x and train_y
# to be defined already, so the cells above must have been run in this kernel.
model = get_model(train_x, train_y)
print('Plot feature importances...')
# Show the 21 features with the highest 'gain' importance.
ax = lgb.plot_importance(model, max_num_features=21, figsize=(10, 10), importance_type='gain')
plt.show()

NameError: name 'get_model' is not defined

In [None]:
def estimate_model(model, test_x, test_y):
    """
    Evaluate ``model`` on a labelled test set, print the metrics and return three of them.

    Log loss, AUC and accuracy (scores cut at 0.5) are printed first. Then a decision
    threshold is taken from the scores of the negative samples (label 0): the score at
    sorted position ``ceil(0.999 * n_neg)``, so that roughly at most 0.1% of the
    negatives score strictly above it. Samples scoring strictly above the threshold
    are labelled 1, and the resulting false-positive rate and recall are printed.

    :param model: object whose ``predict(test_x)`` returns one score per sample
    :param test_x: features passed to ``model.predict``
    :param test_y: binary labels; assumed to be a single-column frame or 1-D array
    :return: tuple ``(auc, loss, recall_rate)``
    """
    # Column vector of scores, the shape the metric calls below have always received.
    y_pred = np.asarray(model.predict(test_x), dtype=float).reshape(-1, 1)

    loss = log_loss(test_y, y_pred)
    auc = roc_auc_score(test_y, y_pred)
    acc = accuracy_score(test_y, (y_pred > 0.5).astype(int))
    print("loss : %.5f" % loss)
    print("auc score : %.5f" % auc)
    print("accuracy score : %.5f" % acc)

    # Ascending scores of the negative samples only.
    labels = np.asarray(test_y).reshape(-1)
    sorted_pred_prob = np.sort(y_pred[labels == 0, 0])
    fp_np = sorted_pred_prob.shape[0]

    # With fewer than 1000 negatives ceil(0.999 * n) equals n, one past the last valid
    # position; clamp it so the threshold becomes the highest negative score.
    thre_index = min(int(np.ceil(fp_np - fp_np * 0.001)), fp_np - 1)
    thre = sorted_pred_prob[thre_index]
    if thre == 1:
        # No score can exceed 1, so fall back to the largest negative score below 1
        # (when there is one) to keep positive predictions possible.
        below_one = sorted_pred_prob[sorted_pred_prob != 1]
        if below_one.size:
            thre = below_one.max()

    # Strictly above the threshold -> 1; ties stay 0.
    y_pred_label = (y_pred[:, 0] > thre).astype(int)

    tn, fp, fn, tp = confusion_matrix(test_y, y_pred_label).ravel()
    fp_rate = fp / (fp + tn)
    recall_rate = tp / (tp + fn)

    print("thre: %.5f"%  thre)
    print("fp:  %.5f"%  fp_rate)
    print("recall:  %.5f"%  recall_rate)

    return auc, loss, recall_rate

In [None]:
# Evaluate the trained model on the test set; estimate_model prints its metrics
# and returns (auc, loss, recall_rate).
auc, loss, recall = estimate_model(model, test_x, test_y)