In [20]:
import ember
import pandas as pd
import numpy as np
import json
import datetime

import matplotlib.pyplot as plt

from typing import Type
from ember.features import (
    PEFeatureExtractor,
    ByteHistogram,
    ByteEntropyHistogram,
    SectionInfo,
    ImportsInfo,
    ExportsInfo,
    GeneralFileInfo,
    HeaderFileInfo,
    StringExtractor,
    DataDirectories
)

In [2]:
# Load the dataset metadata table from the local "ember2018/" directory.
metadata = ember.read_metadata("ember2018/")

In [3]:
# Preview the first rows of the metadata table.
metadata.head()

Unnamed: 0,sha256,appeared,label,avclass,subset
0,0abb4fda7d5b13801d63bee53e5e256be43e141faa077a...,2006-12,0,,train
1,c9cafff8a596ba8a80bafb4ba8ae6f2ef3329d95b85f15...,2007-01,0,,train
2,eac8ddb4970f8af985742973d6f0e06902d42a3684d791...,2007-02,0,,train
3,7f513818bcc276c531af2e641c597744da807e21cc1160...,2007-02,0,,train
4,ca65e1c387a4cc9e7d8a8ce12bf1bcf9f534c9032b9d95...,2007-02,0,,train


In [8]:
# Show the feature groups of a default PEFeatureExtractor; each entry
# displays as name(size) in the output below.
PEFeatureExtractor().features



[histogram(256),
 byteentropy(256),
 strings(104),
 general(10),
 header(62),
 section(255),
 imports(1280),
 exports(128),
 datadirectories(30)]

In [29]:
def histogram_map():
    """Return the 256 ``histogram-*`` column names, numbered from 1 to 256."""
    column_names = [f'histogram-{i+1}' for i in range(256)]
    return column_names

def entropy_map():
    """Return the 256 ``entropy-*`` column names, numbered from 1 to 256."""
    column_names = [f'entropy-{i+1}' for i in range(256)]
    return column_names

def strings_map():
    """Return the 104 column names for the ``strings-*`` feature group.

    The order is: three leading scalar columns, the 96 numbered
    ``strings-printable_dist-*`` columns, then five trailing scalar columns.
    """
    leading = ['strings-numstrings', 'strings-avlength', 'strings-printables']
    printable_dist = [f'strings-printable_dist-{i+1}' for i in range(96)]
    trailing = ['strings-entropy', 'strings-paths', 'strings-urls', 'strings-registry','strings-MZ']
    return leading + printable_dist + trailing

def general_map():
    """Return the 10 column names for the ``general-*`` feature group."""
    column_names = [
        'general-size',
        'general-vsize',
        'general-has_debug',
        'general-exports',
        'general-imports',
        'general-has_relocations',
        'general-has_resources',
        'general-has_signature',
        'general-has_tls',
        'general-symbols',
    ]
    return column_names

def header_map():
    """Return the 62 column names for the ``header-*`` feature group.

    The order is: the timestamp column, five runs of ten numbered columns
    (machine, charistics, subsystem, dll_charistics, magic), then eleven
    scalar version/size columns.
    """
    timestamp = ['header-timestamp']
    machine = [f'header-machine_{i+1}' for i in range(10)]
    charistics = [f'header-charistics_{i+1}' for i in range(10)]
    subsystem = [f'header-subsystem_{i+1}' for i in range(10)]
    dll_charistics = [f'header-dll_charistics_{i+1}' for i in range(10)]
    magic = [f'header-magic_{i+1}' for i in range(10)]
    scalars = [
        'header-major_image_ver',
        'header-minor_image_ver',
        'header-major_linker_ver',
        'header-minor_linker_ver',
        'header-major_os_ver',
        'header-minor_os_ver',
        'header-major_subsystem_ver',
        'header-minor_subsystem_ver',
        'header-sizeof_code',
        'header-sizeof_headers',
        'header-sizeof_heap_commit',
    ]
    return timestamp + machine + charistics + subsystem + dll_charistics + magic + scalars

def section_map():
    """Return the 255 column names for the ``section-*`` feature group.

    The order is: five scalar count columns, then five runs of fifty numbered
    columns (sizes_h, entropy_h, vsize_h, name_h, charistics_h).
    """
    counts = ['section-total_num', 'section-num_nonzero', 'section-num_noname', 'section-num_rx', 'section-num_w']
    sizes = [f'section-sizes_h_{i+1}' for i in range(50)]
    entropy = [f'section-entropy_h_{i+1}' for i in range(50)]
    vsize = [f'section-vsize_h_{i+1}' for i in range(50)]
    name = [f'section-name_h_{i+1}' for i in range(50)]
    charistics = [f'section-charistics_h_{i+1}' for i in range(50)]
    return counts + sizes + entropy + vsize + name + charistics

def imports_map():
    """Return the 1280 column names for the ``imports-*`` feature group.

    The 256 ``imports-libraries_h_*`` columns come first, followed by the
    1024 ``imports-imports_h_*`` columns; both runs are numbered from 1.
    """
    library_cols = [f'imports-libraries_h_{i+1}' for i in range(256)]
    import_cols = [f'imports-imports_h_{i+1}' for i in range(1024)]
    return library_cols + import_cols

def exports_map():
    """Return the 128 ``exports-hashed_*`` column names, numbered from 1."""
    return [f'exports-hashed_{i+1}' for i in range(128)]

def datadirs_map():
    """Return the 30 ``datadirs-*`` column names, numbered from 1."""
    return [f'datadirs-{i+1}' for i in range(30)]


def get_map(feature_type) -> list:
    """Return the column names for one feature-group instance.

    Args:
        feature_type: an instance of one of the ``ember.features`` group
            classes imported by this notebook; its ``dim`` attribute is read
            to validate the result.

    Returns:
        The list of column names produced by the matching ``*_map`` helper.

    Raises:
        ValueError: if ``feature_type`` is not an instance of any supported class.
        AssertionError: if the number of names differs from ``feature_type.dim``.
    """
    # Ordered (class, name-builder) pairs; the first isinstance match wins.
    builders = (
        (ByteHistogram, histogram_map),
        (ByteEntropyHistogram, entropy_map),
        (SectionInfo, section_map),
        (ImportsInfo, imports_map),
        (ExportsInfo, exports_map),
        (GeneralFileInfo, general_map),
        (HeaderFileInfo, header_map),
        (StringExtractor, strings_map),
        (DataDirectories, datadirs_map),
    )

    for group_cls, build_names in builders:
        if isinstance(feature_type, group_cls):
            features = build_names()
            break
    else:
        # No supported class matched.
        raise ValueError(f"Type {feature_type.__class__.__name__} is not supported")

    # Guard against the name list drifting from the extractor's declared width.
    assert feature_type.dim == len(features), f"Type {feature_type.__class__.__name__} is missing features"
    return features


def feature_map():
    """Return the column names for every group of a default PEFeatureExtractor.

    Groups are visited in the order given by the extractor's ``features``
    attribute, and each group's names are appended in the order ``get_map``
    returns them.
    """
    extractor = PEFeatureExtractor()
    return [name for group in extractor.features for name in get_map(group)]


In [30]:
# Build the full list of column names and check how many there are.
features = feature_map()
len(features)



2381

In [31]:
# Read the train/test feature matrices and labels from "ember2018/".
# NOTE(review): presumably requires the vectorized files to already exist in
# that directory — confirm against the ember documentation.
X_train, y_train, X_test, y_test = ember.read_vectorized_features("ember2018/")



In [32]:
# Wrap the training matrix in a DataFrame whose columns are the generated feature names.
train_df = pd.DataFrame(X_train, columns=features)

In [35]:
# Attach the training labels as a 'label' column.
train_df['label'] = y_train

In [36]:
# Preview the first rows of the labelled training frame.
train_df.head()

Unnamed: 0,histogram-1,histogram-2,histogram-3,histogram-4,histogram-5,histogram-6,histogram-7,histogram-8,histogram-9,histogram-10,...,datadirs-22,datadirs-23,datadirs-24,datadirs-25,datadirs-26,datadirs-27,datadirs-28,datadirs-29,datadirs-30,label
0,0.014676,0.004222,0.003923,0.004029,0.004007,0.003775,0.003825,0.003887,0.004153,0.003804,...,35240.0,0.0,0.0,660.0,32768.0,0.0,0.0,0.0,0.0,0.0
1,0.184524,0.031308,0.005693,0.005959,0.008144,0.003512,0.005786,0.00855,0.009141,0.001791,...,92936.0,408.0,608.0,2604.0,4096.0,224.0,442296.0,0.0,0.0,0.0
2,0.251737,0.014205,0.006841,0.008556,0.023493,0.002858,0.003401,0.008556,0.010215,0.001176,...,0.0,0.0,0.0,1120.0,4096.0,192.0,37280.0,0.0,0.0,0.0
3,0.008964,0.004055,0.003925,0.003936,0.004037,0.003878,0.003847,0.003946,0.003939,0.003834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.020401,0.005213,0.004519,0.004097,0.00424,0.004029,0.003785,0.004593,0.004875,0.00378,...,0.0,0.0,0.0,520.0,4096.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Keep only rows whose label is not -1.
# NOTE(review): presumably -1 marks unlabeled samples — confirm against the EMBER docs.
train_df = train_df.loc[train_df['label'] != -1]