In [1]:
import math
from tqdm import tqdm
import collections
import os
import zipfile
import pickle
import numpy as np

FILE_PATH = r'/home/cuckoo/word/virusign'
SAVE_PATH = r'/home/cuckoo/word/feature'

LEN_OF_DF = 4096

In [2]:
def create_file_list(PATH):
    file_list = []
    for root, dirs, files in os.walk(PATH):
        for file in files:
            file_list.append(os.path.join(root, file))
    return file_list


def set_count(file_list):
    m_set = set()
    for i in file_list:
        try:
            # document will be the filetype zipfile.ZipFile
            document = zipfile.ZipFile(i)
            name_list = document.namelist()
            for i in name_list:
                items = i.split('/')
                for i in range(0, len(items)):
                    m_set.add('/'.join(items[i:]))
        except zipfile.BadZipfile:
            continue
    return m_set


def make_table(file_list):
    df_dict = {}
    for i in file_list:
        try:
            document = zipfile.ZipFile(i)
            name_list = document.namelist()

            for i in name_list:
                items = i.split('/')
                for i in range(0, len(items)):
                    df_dict['/'.join(items[i:])] = df_dict.get('/'.join(items[i:]), 0) + 1

        except zipfile.BadZipfile:
            continue

    df_rank_list = sorted(df_dict.items(), key=lambda x: x[1], reverse=True)[:1024]

    df_rank_dict = dict()
    rank = 0

    for k, _ in df_rank_list:
        df_rank_dict[k] = rank
        rank += 1

    return df_rank_dict


def get_entropy(data):
    """Calculate the entropy of a chunk of data."""

    if len(data) == 0:
        return 0.0

    occurences = collections.Counter(bytearray(data))

    entropy = 0
    for x in occurences.values():
        p_x = float(x) / len(data)
        entropy -= p_x * math.log(p_x, 2)

    return entropy


def make_feature_vec(df_rank_dict, file_list):
    for path in file_list:
        try:
            feature_vector = [0 for _ in range(1024)]
            entropy_list = []
            file_size = []

            with zipfile.ZipFile(path) as document:
                name_list = document.namelist()

                for name in name_list:
                    items = name.split('/')
                    for i in range(0, len(items)):
                        k = '/'.join(items[i:])
                        if k in df_rank_dict:
                            feature_vector[df_rank_dict[k]] = 1



#                 for name in name_list:
#                     with document.open(name) as f:
#                         data = f.read()
#                         entropy_list.append(get_entropy(data))
#                         file_size.append(len(data))

#                 feature_vector += [min(entropy_list), max(entropy_list), np.mean(entropy_list), 
#                                    min(file_size), max(file_size), np.mean(file_size)]

            with open(os.path.join(SAVE_PATH, str(os.path.basename(path).split('.')[0])) + '.pkl', 'wb') as f:
                pickle.dump(feature_vector, f)

        except zipfile.BadZipfile:
            continue

In [3]:
label_dict = dict()

with open("/home/cuckoo/word/word_virusign_label.csv", "r") as f:
    next(iter(f))
    
    for line in f.readlines():
        md5, label = line.strip().split(",")
        
        label_dict[md5] = int(label)

In [4]:
import os
import glob

file_path = glob.glob("/home/cuckoo/word/virusign/*.vir")

mal_file_path = []
ben_file_path = []

for path in file_path:
    md5 = os.path.basename(path).split(".")[0]
    
    if md5 in label_dict:
        if label_dict[md5] == 1:
            mal_file_path.append(path)
        else:
            ben_file_path.append(path)

mal_file_path = mal_file_path[:500]
ben_file_path = ben_file_path[:500]

In [5]:
len(mal_file_path)

500

In [6]:
len(ben_file_path)

500

In [7]:
mal_df_dict = make_table(mal_file_path)


In [8]:
ben_df_dict = make_table(ben_file_path)

In [9]:
mal_df_rank = sorted(mal_df_dict.items(), key=lambda x: x[1], reverse=True)
ben_df_rank = sorted(ben_df_dict.items(), key=lambda x: x[1], reverse=True)


In [10]:
for k, v in mal_df_rank[:10]:
    print(k, v)

HwPXR/HwPXR/HwPXRManager.aIr 1023
.LuYw 1022
_LuYw/.LuYw 1021
[Content_Types].aIr 1020
thRQGManager.nmE.CdTW 1019
_CdTW/thRQGManager.nmE.CdTW 1018
thRQG/_CdTW/thRQGManager.nmE.CdTW 1017
thRQG/thRQG/_CdTW/thRQGManager.nmE.CdTW 1016
thRQG1.nmE 1015
thRQG/thRQG1.nmE 1014


In [11]:
for k, v in ben_df_rank[:10]:
    print(k, v)

item1.xml.rels 87
_rels/item1.xml.rels 86
customXml/_rels/item1.xml.rels 85
itemProps1.xml 84
customXml/itemProps1.xml 83
item1.xml 82
customXml/item1.xml 81
custom.xml 80
docProps/custom.xml 79
webSettings.xml 78


In [12]:
mal_df = [x for x, _ in mal_df_rank]
ben_df = [x for x, _ in ben_df_rank]

mal_df = set(mal_df)
ben_df = set(ben_df)

In [13]:
len(mal_df)

1024

In [14]:
len(ben_df)

88

In [15]:
len(mal_df.intersection(ben_df))

20

In [16]:
all_df = dict()

for k, v in mal_df_dict.items():
    all_df[k] = v

for k, v in ben_df_dict.items():
    all_df[k] = all_df.get(k, 0) + v

all_df_rank = sorted(all_df.items(), key=lambda x: x[1], reverse=True)

all_df_rank_dict = dict()
rank = 0

for k, _ in all_df_rank[:LEN_OF_DF]:
    all_df_rank_dict[k] = rank
    rank += 1

In [17]:
for k, v in all_df_rank[:100]:
    print(k, v)

HwPXR/HwPXR/HwPXRManager.aIr 1023
.LuYw 1022
_LuYw/.LuYw 1021
[Content_Types].aIr 1020
thRQGManager.nmE.CdTW 1019
_CdTW/thRQGManager.nmE.CdTW 1018
thRQG/_CdTW/thRQGManager.nmE.CdTW 1017
thRQG/thRQG/_CdTW/thRQGManager.nmE.CdTW 1016
thRQG1.nmE 1015
thRQG/thRQG1.nmE 1014
thRQG/thRQG/thRQG1.nmE 1013
thRQGManager.nmE 1012
thRQG/thRQGManager.nmE 1011
thRQG/thRQG/thRQGManager.nmE 1010
.CdTW 1009
_CdTW/.CdTW 1008
[Content_Types].nmE 1007
DjdSpManager.HRq.fENH 1006
_fENH/DjdSpManager.HRq.fENH 1005
DjdSp/_fENH/DjdSpManager.HRq.fENH 1004
DjdSp/DjdSp/_fENH/DjdSpManager.HRq.fENH 1003
DjdSp1.HRq 1002
DjdSp/DjdSp1.HRq 1001
DjdSp/DjdSp/DjdSp1.HRq 1000
DjdSpManager.HRq 999
DjdSp/DjdSpManager.HRq 998
DjdSp/DjdSp/DjdSpManager.HRq 997
.fENH 996
_fENH/.fENH 995
[Content_Types].HRq 994
downrevGYfl 993
drs/downrevGYfl 992
e2oDocGYfl 991
drs/e2oDocGYfl 990
.nizh 989
_nizh/.nizh 988
lfiSCCPNYjddmjIGYfl 987
.aiVf 986
_aiVf/.aiVf 985
ipojjUVFYPwjhlH.xml 984
DXztvManager.awV.EMFG 983
_EMFG/DXztvManager.awV.EMFG 9

In [18]:
all_df_rank_dict

{'HwPXR/HwPXR/HwPXRManager.aIr': 0,
 '.LuYw': 1,
 '_LuYw/.LuYw': 2,
 '[Content_Types].aIr': 3,
 'thRQGManager.nmE.CdTW': 4,
 '_CdTW/thRQGManager.nmE.CdTW': 5,
 'thRQG/_CdTW/thRQGManager.nmE.CdTW': 6,
 'thRQG/thRQG/_CdTW/thRQGManager.nmE.CdTW': 7,
 'thRQG1.nmE': 8,
 'thRQG/thRQG1.nmE': 9,
 'thRQG/thRQG/thRQG1.nmE': 10,
 'thRQGManager.nmE': 11,
 'thRQG/thRQGManager.nmE': 12,
 'thRQG/thRQG/thRQGManager.nmE': 13,
 '.CdTW': 14,
 '_CdTW/.CdTW': 15,
 '[Content_Types].nmE': 16,
 'DjdSpManager.HRq.fENH': 17,
 '_fENH/DjdSpManager.HRq.fENH': 18,
 'DjdSp/_fENH/DjdSpManager.HRq.fENH': 19,
 'DjdSp/DjdSp/_fENH/DjdSpManager.HRq.fENH': 20,
 'DjdSp1.HRq': 21,
 'DjdSp/DjdSp1.HRq': 22,
 'DjdSp/DjdSp/DjdSp1.HRq': 23,
 'DjdSpManager.HRq': 24,
 'DjdSp/DjdSpManager.HRq': 25,
 'DjdSp/DjdSp/DjdSpManager.HRq': 26,
 '.fENH': 27,
 '_fENH/.fENH': 28,
 '[Content_Types].HRq': 29,
 'downrevGYfl': 30,
 'drs/downrevGYfl': 31,
 'e2oDocGYfl': 32,
 'drs/e2oDocGYfl': 33,
 '.nizh': 34,
 '_nizh/.nizh': 35,
 'lfiSCCPNYjddmjIGY

In [19]:
make_feature_vec(all_df_rank_dict, file_path)

In [20]:
import lightgbm as lgb
import glob

features_path = glob.glob("/home/cuckoo/word/feature/*")



In [21]:
features = []
target = []

for path in features_path:
    md5 = os.path.basename(path).split(".")[0]
    if md5 in label_dict:
        with open(path, "rb") as f:
            features.append(pickle.load(f))
            target.append(label_dict[md5])

In [22]:
len(features)

1037

In [23]:
param = {'num_leaves': 64,
         'min_data_in_leaf': 64,
         'objective':'binary',
         'nthread': 1,
         'max_depth': -1,
         'learning_rate': 0.05,
         "boosting": "gbdt",
         "feature_fraction": 0.7,
         "bagging_freq": 1,
         "bagging_fraction": 0.7 ,
         "bagging_seed": 11,
         "metric": ['auc','binary_logloss'],
         "lambda_l1": 0.05,
         "random_state": 24,
         "verbosity": -1}

In [24]:

max(features[0])

7076

In [25]:
# print(len(feature), len(feature[0]), len(target))

feature = np.array(features)
trn = lgb.Dataset(feature, label=target)

for ele in features:
    if len(ele) != 1030:
        print(ele)

In [26]:
import numpy as np


bst = lgb.train(param, 
                trn,
                num_boost_round=10000,
                valid_sets=trn,
                verbose_eval=100,
                early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.157751	training's auc: 0.973976
[200]	training's binary_logloss: 0.140591	training's auc: 0.977386
[300]	training's binary_logloss: 0.132068	training's auc: 0.979184
[400]	training's binary_logloss: 0.126769	training's auc: 0.981491
[500]	training's binary_logloss: 0.123479	training's auc: 0.982075
[600]	training's binary_logloss: 0.121299	training's auc: 0.983138
[700]	training's binary_logloss: 0.119902	training's auc: 0.983437
[800]	training's binary_logloss: 0.118633	training's auc: 0.983833
[900]	training's binary_logloss: 0.117542	training's auc: 0.984355
[1000]	training's binary_logloss: 0.116515	training's auc: 0.984708
[1100]	training's binary_logloss: 0.115743	training's auc: 0.984812
[1200]	training's binary_logloss: 0.115098	training's auc: 0.984798
[1300]	training's binary_logloss: 0.114648	training's auc: 0.985132
[1400]	training's binary_logloss: 0.114065	training's auc: 0.9

In [27]:
cv_result = lgb.cv(param, trn, verbose_eval=20, early_stopping_rounds=100)
cv_result["auc-mean"][-1]


[20]	cv_agg's binary_logloss: 0.279267 + 0.0206139	cv_agg's auc: 0.968722 + 0.0172731
[40]	cv_agg's binary_logloss: 0.209477 + 0.0341476	cv_agg's auc: 0.970355 + 0.0175887
[60]	cv_agg's binary_logloss: 0.190154 + 0.0417237	cv_agg's auc: 0.971164 + 0.0178093
[80]	cv_agg's binary_logloss: 0.183304 + 0.0461351	cv_agg's auc: 0.971283 + 0.0180661
[100]	cv_agg's binary_logloss: 0.17859 + 0.0479965	cv_agg's auc: 0.971182 + 0.0179539


0.971181897755892

In [28]:
test_vir_path = glob.glob("/home/cuckoo/word/virusshare/*")
test_vir_path += glob.glob("/home/cuckoo/word/google/*")

make_feature_vec(all_df_rank_dict, test_vir_path)

In [29]:
test_label = dict()
with open("/home/cuckoo/word/word_virusshare_label.csv", "r") as f:
    next(iter(f))
    
    for line in f.readlines():
        md5, label = line.strip().split(",")
        
        test_label[md5] = int(label)
    
with open("/home/cuckoo/word/google_label.csv", "r") as f:
    next(iter(f))
    
    for line in f.readlines():
        md5, label = line.strip().split(",")
        
        test_label[md5] = int(label)

In [30]:
test_feature = []
test_target = []
test_md5 = []

test_feature_path = glob.glob("/home/cuckoo/word/feature/*")

for path in test_feature_path:
    md5 = os.path.basename(path).split(".")[0]
    if md5 in test_label:
        with open(path, "rb") as f:
            test_feature.append(pickle.load(f))
        test_target.append(test_label[md5])
        test_md5.append(md5)

In [31]:
prediction = bst.predict(test_feature, num_iteration=bst.best_iteration)
prediction

array([0.98736531, 0.99795724, 0.99795724, ..., 0.19524909, 0.98736531,
       0.98192365])

In [32]:
acc = 0

for i in range(len(prediction)):
    if prediction[i] >= 0.5:
        prediction[i] = 1
    else :
        prediction[i] = 0
    
    if prediction[i] == test_label[test_md5[i]]:
        acc += 1
        print(prediction[i], test_label[test_md5[i]])

    else:
        print(prediction[i], test_label[test_md5[i]])

print(acc / len(prediction) * 100)

1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
0.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
0.0 0
1.0 0
0.0 0
0.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
0.0 0
0.0 0
1.0 0
0.0 0
0.0 0
0.0 0
1.0 0
0.0 0
1.0 0
0.0 0
0.0 0
1.0 0
0.0 0
1.0 0
0.0 0
0.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
0.0 0
0.0 0
0.0 0
0.0 0
1.0 0
0.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
0.0 0
0.0 0
1.0 0
0.0 0
1.0 0
0.0 0
0.0 0
1.0 0
1.0 0
1.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
0.0 0
0.0 0
1.0 0
1.0 0
0.0 0
1.0 0
0.0 0
0.0 0
0.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
0.0 0
0.0 0
1.0 0
1.0 0
0.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
1.0 0
0.0 0
1.0 0
0.0 0
1.0 0
0.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 1
0.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 0
0.0 0
1.0 0
1.0 0
1.0 0
1.0 