In [1]:
import ember
import pandas as pd
import numpy as np
import json
import datetime

import matplotlib.pyplot as plt

from typing import Type
from ember.features import (
    PEFeatureExtractor,
    ByteHistogram,
    ByteEntropyHistogram,
    SectionInfo,
    ImportsInfo,
    ExportsInfo,
    GeneralFileInfo,
    HeaderFileInfo,
    StringExtractor,
    DataDirectories
)

# Get Feature Map

In [2]:
def histogram_map():
    return [f'histogram-{i+1}' for i in range(256)]

def entropy_map ():
    return [f'entropy-{i+1}' for i in range(256)]

def strings_map():
    features = []
    features.extend(['strings-numstrings', 'strings-avlength', 'strings-printables'])
    features.extend(f'strings-printable_dist-{i+1}' for i in range(96))
    features.extend(['strings-entropy', 'strings-paths', 'strings-urls', 'strings-registry','strings-MZ'])
    return features

def general_map():
    return ['general-size', 'general-vsize', 'general-has_debug', 'general-exports', 'general-imports', 
            'general-has_relocations', 'general-has_resources', 'general-has_signature',
            'general-has_tls', 'general-symbols']

def header_map():
    features = ['header-timestamp']
    features.extend(f'header-machine_{i+1}' for i in range(10))
    features.extend(f'header-charistics_{i+1}' for i in range(10))
    features.extend(f'header-subsystem_{i+1}' for i in range(10))
    features.extend(f'header-dll_charistics_{i+1}' for i in range(10))
    features.extend(f'header-magic_{i+1}' for i in range(10))
    features.extend(['header-major_image_ver', 'header-minor_image_ver', 'header-major_linker_ver', 
                     'header-minor_linker_ver', 'header-major_os_ver', 'header-minor_os_ver', 
                     'header-major_subsystem_ver', 'header-minor_subsystem_ver', 'header-sizeof_code',
                     'header-sizeof_headers', 'header-sizeof_heap_commit'])
    return features

def section_map():
    features = ['section-total_num', 'section-num_nonzero', 'section-num_noname', 'section-num_rx', 'section-num_w']
    features.extend(f'section-sizes_h_{i+1}' for i in range(50))
    features.extend(f'section-entropy_h_{i+1}' for i in range(50))
    features.extend(f'section-vsize_h_{i+1}' for i in range(50))
    features.extend(f'section-name_h_{i+1}' for i in range(50))
    features.extend(f'section-charistics_h_{i+1}' for i in range(50))
    return features

def imports_map():
    features = []
    features.extend(f'imports-libraries_h_{i+1}' for i in range(256))
    features.extend(f'imports-imports_h_{i+1}' for i in range(1024))
    return features

def exports_map():
    features = []
    features.extend(f'exports-hashed_{i+1}' for i in range(128))
    return features

def datadirs_map():
    features = []
    features.extend(f'datadirs-{i+1}' for i in range(30))
    return features


def get_map(feature_type) -> list:
    """ Gets the feature map based on the type """
    
    if isinstance(feature_type, ByteHistogram):
        features = histogram_map()
    
    elif isinstance(feature_type, ByteEntropyHistogram):
        features = entropy_map()
    
    elif isinstance(feature_type, SectionInfo):
        features = section_map()
    
    elif isinstance(feature_type, ImportsInfo):
        features = imports_map()
    
    elif isinstance(feature_type, ExportsInfo):
        features = exports_map()
    
    elif isinstance(feature_type, GeneralFileInfo):
        features = general_map()
    
    elif isinstance(feature_type, HeaderFileInfo):
        features = header_map()
    
    elif isinstance(feature_type, StringExtractor):
        features = strings_map()
    
    elif isinstance(feature_type, DataDirectories):
        features = datadirs_map()
        
    else:
        raise ValueError(f"Type {feature_type.__class__.__name__} is not supported")
    
    assert feature_type.dim == len(features), f"Type {feature_type.__class__.__name__} is missing features"
    return features


def feature_map():
    features = []
    for f in PEFeatureExtractor().features:
        features.extend(get_map(f))
    return features


In [3]:
features = feature_map()
len(features)



2381

# Get Data

1. Get the vectorized features
2. Add the label
3. Remove any entry with label of -1

In [4]:
X_train, y_train, X_test, y_test = ember.read_vectorized_features("ember2018/")



In [5]:
train_df = pd.DataFrame(X_train, columns=features)

In [6]:
train_df['label'] = y_train

In [7]:
train_df.head()

Unnamed: 0,histogram-1,histogram-2,histogram-3,histogram-4,histogram-5,histogram-6,histogram-7,histogram-8,histogram-9,histogram-10,...,datadirs-22,datadirs-23,datadirs-24,datadirs-25,datadirs-26,datadirs-27,datadirs-28,datadirs-29,datadirs-30,label
0,0.014676,0.004222,0.003923,0.004029,0.004007,0.003775,0.003825,0.003887,0.004153,0.003804,...,35240.0,0.0,0.0,660.0,32768.0,0.0,0.0,0.0,0.0,0.0
1,0.184524,0.031308,0.005693,0.005959,0.008144,0.003512,0.005786,0.00855,0.009141,0.001791,...,92936.0,408.0,608.0,2604.0,4096.0,224.0,442296.0,0.0,0.0,0.0
2,0.251737,0.014205,0.006841,0.008556,0.023493,0.002858,0.003401,0.008556,0.010215,0.001176,...,0.0,0.0,0.0,1120.0,4096.0,192.0,37280.0,0.0,0.0,0.0
3,0.008964,0.004055,0.003925,0.003936,0.004037,0.003878,0.003847,0.003946,0.003939,0.003834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.020401,0.005213,0.004519,0.004097,0.00424,0.004029,0.003785,0.004593,0.004875,0.00378,...,0.0,0.0,0.0,520.0,4096.0,0.0,0.0,0.0,0.0,0.0


In [8]:
train_df = train_df.loc[train_df['label'] != -1]

# Analyzing the Dataset

In [9]:
from scipy import stats

In [15]:
for column in train_df.columns:
    alpha = 0.05
    shapiro = stats.shapiro(train_df[column]).pvalue > alpha
    agostino = stats.normaltest(train_df[column]).pvalue > alpha
    if shapiro or agostino:
        print(f"{column} is likely normal")

  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


header-machine_1 is likely normal
header-machine_10 is likely normal
header-charistics_2 is likely normal
header-subsystem_2 is likely normal
header-subsystem_4 is likely normal
header-subsystem_6 is likely normal
header-dll_charistics_3 is likely normal
header-dll_charistics_7 is likely normal
header-dll_charistics_10 is likely normal
header-magic_1 is likely normal
header-magic_3 is likely normal
header-magic_4 is likely normal
header-magic_6 is likely normal
header-magic_7 is likely normal
header-magic_8 is likely normal
header-magic_9 is likely normal
header-magic_10 is likely normal
section-name_h_7 is likely normal
section-name_h_17 is likely normal
section-name_h_20 is likely normal
section-name_h_25 is likely normal
section-name_h_41 is likely normal
section-name_h_49 is likely normal
section-charistics_h_2 is likely normal
section-charistics_h_4 is likely normal
section-charistics_h_6 is likely normal
section-charistics_h_7 is likely normal
section-charistics_h_8 is likely nor

In [9]:
train_df.describe()

Unnamed: 0,histogram-1,histogram-2,histogram-3,histogram-4,histogram-5,histogram-6,histogram-7,histogram-8,histogram-9,histogram-10,...,datadirs-22,datadirs-23,datadirs-24,datadirs-25,datadirs-26,datadirs-27,datadirs-28,datadirs-29,datadirs-30,label
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,...,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,0.212265,0.011521,0.007161,0.005871,0.008077,0.004159,0.004931,0.003519,0.006469,0.003114,...,626483.8,320774.6,119949.5,7577.226,844156.6,420196.7,706795.2,867692.9,221821.6,0.5
std,0.180663,0.011756,0.007484,0.007713,0.012017,0.00542,0.007135,0.003517,0.004845,0.004913,...,34812320.0,30265330.0,14338720.0,4908170.0,40370680.0,38555340.0,40531570.0,41640740.0,20371620.0,0.5
min,0.000104,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.080163,0.004912,0.003834,0.003468,0.004034,0.002538,0.002309,0.002035,0.003802,0.001721,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.169621,0.00865,0.005253,0.00463,0.00618,0.003791,0.003648,0.00335,0.005448,0.002714,...,0.0,0.0,0.0,212.0,8192.0,0.0,0.0,0.0,0.0,0.5
75%,0.28202,0.014275,0.007838,0.006695,0.009899,0.004954,0.004795,0.004043,0.008675,0.003859,...,41592.0,0.0,0.0,680.0,167936.0,0.0,0.0,0.0,0.0,1.0
max,0.999956,0.859942,0.638846,0.482176,0.386876,0.949696,0.361418,0.549687,0.747807,0.417935,...,3590280000.0,4278220000.0,3787571000.0,3800990000.0,3914422000.0,3771926000.0,4106158000.0,3758096000.0,4099570000.0,1.0


In [10]:
train_df.cov()

Unnamed: 0,histogram-1,histogram-2,histogram-3,histogram-4,histogram-5,histogram-6,histogram-7,histogram-8,histogram-9,histogram-10,...,datadirs-22,datadirs-23,datadirs-24,datadirs-25,datadirs-26,datadirs-27,datadirs-28,datadirs-29,datadirs-30,label
histogram-1,3.263910e-02,0.000142,0.000011,-0.000095,-5.745684e-07,-0.000110,-0.000048,-0.000130,-0.000140,-0.000129,...,1.260872e+04,9.803446e+03,-3.200852e+03,-1.313927e+03,7.801030e+03,2.051668e+04,-1.119266e+04,-8.719299e+04,5.832481e+03,-0.014127
histogram-2,1.416713e-04,0.000138,0.000044,0.000022,2.587136e-05,0.000010,0.000020,0.000005,0.000010,0.000003,...,-2.344609e+03,-2.123120e+03,-7.943956e+02,-4.445135e+01,-2.395680e+03,-2.737943e+03,-2.926946e+03,-6.591731e+03,-1.375656e+03,-0.001325
histogram-3,1.077242e-05,0.000044,0.000056,0.000018,2.209341e-05,0.000008,0.000019,0.000006,0.000008,0.000004,...,-5.471382e+02,-8.649704e+02,-3.365686e+02,-2.019585e+01,-1.047296e+03,-1.020030e+03,-7.210996e+02,-2.574988e+03,-5.364752e+02,-0.000722
histogram-4,-9.519297e-05,0.000022,0.000018,0.000059,1.353280e-05,0.000007,0.000011,0.000004,0.000006,0.000003,...,-7.707221e+02,-6.900406e+02,-1.914414e+02,-1.163919e+01,-9.481306e+02,-8.149339e+02,-8.184400e+02,-1.763740e+03,-4.484542e+02,-0.000356
histogram-5,-5.745684e-07,0.000026,0.000022,0.000014,1.444202e-04,0.000006,0.000013,0.000004,0.000009,0.000002,...,-1.488125e+03,-1.389771e+03,-5.168135e+02,-2.749666e+01,-1.745802e+03,-1.807845e+03,-2.088252e+03,-3.566980e+03,-9.092755e+02,-0.000866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
datadirs-27,2.051668e+04,-2737.942858,-1020.030444,-814.933900,-1.807845e+03,-497.026766,-837.218865,-169.698760,-1445.178766,-133.156860,...,1.174323e+15,9.888989e+14,3.222585e+14,1.585368e+13,1.548678e+15,1.486514e+15,1.537150e+15,6.844472e+14,6.590120e+14,210069.375823
datadirs-28,-1.119266e+04,-2926.945530,-721.099590,-818.439985,-2.088252e+03,-440.480070,-854.685951,164.359570,-1102.992126,-53.226293,...,1.216384e+15,1.072190e+15,3.587017e+14,2.250890e+13,1.605603e+15,1.537150e+15,1.642808e+15,7.451534e+14,7.168727e+14,256738.769775
datadirs-29,-8.719299e+04,-6591.730587,-2574.987982,-1763.739755,-3.566980e+03,-1311.651204,-1486.640772,-658.384302,-2651.218559,-240.278447,...,8.276615e+14,6.615452e+14,2.338070e+14,3.947527e+12,7.224181e+14,6.844472e+14,7.451534e+14,1.733952e+15,4.345321e+14,433837.306458
datadirs-30,5.832481e+03,-1375.655970,-536.475209,-448.454176,-9.092755e+02,-228.295741,-360.387208,-63.201094,-726.513679,-23.543989,...,5.241464e+14,6.110557e+14,2.307000e+14,2.755335e+13,6.936357e+14,6.590120e+14,7.168727e+14,4.345321e+14,4.150031e+14,104431.170378


### Spearman's

### Ridge Regression

### Linear Discriminant Analysis

### PCA w/ Grid Search