In [1]:
import math
import collections
import os
import zipfile
import pickle
import numpy as np
import glob
from tqdm import tqdm_notebook as tqdm




In [2]:
label_dict = dict()
for path in glob.glob(r"D:\capstone\word\data\docx\*.csv"):
    with open(path, "r") as f:
        next(iter(f))

        for line in f.readlines():
            md5, label = line.strip().split(",")

            label_dict[md5] = int(label)

In [3]:
SIZE_OF_DF_VECTOR = 4096

def create_file_list(PATH):
    file_list = []
    for root, dirs, files in os.walk(PATH):
        for file in files:
            if file.split(".")[-1] != "csv":
                file_list.append(os.path.join(root, file))
    return file_list


def set_count(file_list):
    m_set = set()
    for i in file_list:
        try:
            # document will be the filetype zipfile.ZipFile
            document = zipfile.ZipFile(i)
            name_list = document.namelist()
            for i in name_list:
                items = i.split('/')
                for i in range(0, len(items)):
                    m_set.add('/'.join(items[i:]))
        except zipfile.BadZipfile:
            continue
    return m_set


def make_table(file_list):
    df_dict = {}
    for i in tqdm(file_list):
        try:
            document = zipfile.ZipFile(i)
            name_list = document.namelist()

            for i in name_list:
                items = i.split('/')
                for i in range(0, len(items)):
                    df_dict['/'.join(items[i:])] = df_dict.get('/'.join(items[i:]), 0) + 1

        except Exception as e:
            continue
    
    df_rank_list = sorted(df_dict.items(), key=lambda x: x[1], reverse=True)[:SIZE_OF_DF_VECTOR]

    df_rank_dict = dict()
    rank = 0

    for k, _ in df_rank_list:
        df_rank_dict[k] = rank
        rank += 1

    return df_rank_dict


def get_entropy(data):
    """Calculate the entropy of a chunk of data."""

    if len(data) == 0:
        return 0.0

    occurences = collections.Counter(bytearray(data))

    entropy = 0
    for x in occurences.values():
        p_x = float(x) / len(data)
        entropy -= p_x * math.log(p_x, 2)

    return entropy


def make_feature_vec(df_rank_dict, file_list, label):
    ret_feature = []
    ret_label = []
    for path in tqdm(file_list):
        md5 = os.path.basename(path).split(".")[0]
        
        if md5 in label:
            try:
                feature_vector = [0 for _ in range(SIZE_OF_DF_VECTOR)]
                entropy_list = []
                file_size = []


                with zipfile.ZipFile(path) as document:
                    name_list = document.namelist()

                    for name in name_list:
                        items = name.split('/')
                        for i in range(0, len(items)):
                            k = '/'.join(items[i:])
                            if k in df_rank_dict:
                                feature_vector[df_rank_dict[k]] = 1



                    for name in name_list:
                        with document.open(name) as f:
                            data = f.read()
                            entropy_list.append(get_entropy(data))
                            file_size.append(len(data))

#                 feature_vector += [min(entropy_list), max(entropy_list), np.mean(entropy_list),
#                                    os.path.getsize(path), min(file_size), max(file_size), np.mean(file_size)]

    #             with open(os.path.join(SAVE_PATH, str(os.path.basename(path).split('.')[0])) + '.pkl', 'wb') as f:
    #                 pickle.dump(feature_vector, f)
                ret_feature.append(feature_vector)
                ret_label.append(label[md5])

            except Exception:
                continue

    return ret_feature, ret_label


In [4]:
FILE_PATH = r'D:\capstone\word\data\virusign'
SAVE_PATH = r''

In [5]:
file_list = create_file_list(r"D:\capstone\word\data\docx")
# file_list += create_file_list(r"D:\capstone\word\data\virusshare")
# file_list += create_file_list(r"D:\capstone\word\data\google_docx")
df_dict = make_table(file_list)


HBox(children=(IntProgress(value=0, max=5457), HTML(value='')))




In [6]:
features, label = make_feature_vec(df_dict, file_list, label_dict)

HBox(children=(IntProgress(value=0, max=5457), HTML(value='')))




In [7]:
param = {'num_leaves': 64,
         'min_data_in_leaf': 64,
         'objective':'binary',
         'nthread': 1,
         'max_depth': -1,
         'learning_rate': 0.05,
         "boosting": "gbdt",
         "feature_fraction": 0.7,
         "bagging_freq": 1,
         "bagging_fraction": 0.7 ,
         "bagging_seed": 11,
         "metric": ['auc','binary_logloss'],
         "lambda_l1": 0.1,
         "random_state": 24,
         "verbosity": -1}

In [8]:
mal, ben = 0, 0

for i in label:
    if i == 1: mal += 1 
    else: ben += 1
        
print(mal, ben)

162 4882


In [9]:
len(label)

5044

In [10]:
import xgboost as xgb

params = {'eta': 1, "lambda":0.7, 'objective': 'binary:logistic', "metric":"auc", "silent":True}

trn_data = xgb.DMatrix(np.array(features), label=label)

cv_results = xgb.cv(params, trn_data, nfold=10, num_boost_round=100, early_stopping_rounds=5, verbose_eval=10, metrics="auc")
print(cv_results)

[0]	train-auc:0.889537+0.0299921	test-auc:0.871425+0.0624036
[10]	train-auc:0.983566+0.00139581	test-auc:0.963781+0.0285423
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.889537       0.029992       0.871425      0.062404
1        0.953708       0.002435       0.949763      0.029374
2        0.968098       0.002167       0.957512      0.025788
3        0.973619       0.002328       0.964926      0.025679
4        0.978001       0.001782       0.963617      0.029024
5        0.979466       0.001373       0.962383      0.031588
6        0.980736       0.001152       0.961773      0.033651
7        0.981602       0.001315       0.961860      0.033808
8        0.982402       0.001253       0.965799      0.027241


In [11]:
import lightgbm as lgb

trn_data = lgb.Dataset(np.array(features), label=label)

cv_results = lgb.cv(param, trn_data, nfold=10, verbose_eval=50, early_stopping_rounds=10, num_boost_round=500)

print('Best num_boost_round:', len(cv_results['auc-mean']))
print('Best CV score:', cv_results['auc-mean'][-1])

[50]	cv_agg's auc: 0.923599 + 0.0321902	cv_agg's binary_logloss: 0.091637 + 0.00928612
Best num_boost_round: 82
Best CV score: 0.9312730472701753


- DF 4096 : CV 0.86
- DF_new 4096 : CV 0.98

In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

models = []
models.append(("LR", LogisticRegression()))
models.append(("DT", DecisionTreeClassifier()))
models.append(("SVM", SVC()))
models.append(("NB", GaussianNB()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("RF", RandomForestClassifier()))


def get_cv(x_train, y_train):
    for name, model in models:
        scores = np.mean(cross_val_score(model, x_train, y_train, cv=10))
        print(name, "'s mean cv 10-fold is ", scores)

get_cv(features, label)

LR 's mean cv 10-fold is  0.9894880481837005
DT 's mean cv 10-fold is  0.988495984691637
SVM 's mean cv 10-fold is  0.9678838070142419
NB 's mean cv 10-fold is  0.35764869188782233
KNN 's mean cv 10-fold is  0.9859228935315892
RF 's mean cv 10-fold is  0.989288851245373


LR 's 10-Fold CV Score :  0.9254725526731571  
DT 's 10-Fold CV Score :  0.9571000986149965  
SVM 's 10-Fold CV Score :  0.9284180185954861  
NB 's 10-Fold CV Score :  0.3311396892139982  
KNN 's 10-Fold CV Score :  0.9482514568334703  
RF 's 10-Fold CV Score :  0.9630061495245068  
LightGBM's 10-Fold CV Score : 0.9842461380757872  

LR 's 10-fold CV score :0.9195729686998744  
DT 's 10-fold CV score : 0.9211851512361253  
SVM 's 10-fold CV score : 0.9190360549808281  
NB 's 10-fold CV score : 0.6133994340544391  
KNN 's 10-fold CV score : 0.9027258945530207  
RF 's 10-fold CV score : 0.9201106031084766  
LightGBM's 10-fold CV score : 0.8612848846386318  

[0]	train-auc:0.938966+0.00847722	test-auc:0.930315+0.0351822  
[10]	train-auc:1+9e-07	test-auc:0.990878+0.0126826  
    train-auc-mean  train-auc-std  test-auc-mean  test-auc-std  
0         0.938966   8.477220e-03       0.930315      0.035182  
1         0.972962   3.824079e-03       0.968325      0.035363  
2         0.991657   2.781738e-03       0.978689      0.025427  
3         0.995503   1.908980e-03       0.983970      0.018244  
4         0.998909   8.547105e-04       0.982017      0.019508  
5         0.999767   1.469840e-04       0.988409      0.014128  
6         0.999948   2.818883e-05       0.988385      0.013744  
7         0.999985   1.876939e-05       0.987859      0.016501  
8         0.999994   9.293008e-06       0.988979      0.016222  
9         0.999998   2.332381e-06       0.989519      0.013905  
10        1.000000   9.000000e-07       0.990878      0.012683  
11        1.000000   0.000000e+00       0.991216      0.012434  