# Simple Heuristic

The purpose of this notebook is to test a simple heuristic on the static imports.

It creates three bags of features: one with features seen only in goodware, one with features seen only in malware, and one with features common to both goodware and malware.

This notebook uses Celery, so make sure a worker is running (`celery -A tasks worker --loglevel=info -Ofair`).

In [1]:
import pandas as pd
import numpy as np
import datetime
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from celery import group
from tasks import simple_heuristic


# Settings shared by every CountVectorizer created in this notebook
cv_token_pattern = u'[^;]+'  # a token is any run of characters other than ';'
min_df = 1

# Load the labelled samples and separate them by the value of the `malware` column
samples = pd.read_csv('data/pe32_imports_labeled.csv')
goodware = samples.loc[samples['malware'] == 0]
malware = samples.loc[samples['malware'] == 1]

# Bag creation (all samples)

In [8]:
# Fit one vectorizer per class; only the vocabulary each one learns is used below.
goodware_cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
goodware_cv.fit(goodware.imports)
# Read the tokens from the fitted `vocabulary_` mapping instead of calling
# `get_feature_names()`: that method was removed in scikit-learn 1.2, while
# `vocabulary_` exists on every release and yields the same sorted token list.
goodware_features = sorted(goodware_cv.vocabulary_)
print('Total goodware features: {}'.format(len(goodware_features)))

malware_cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
malware_cv.fit(malware.imports)
malware_features = sorted(malware_cv.vocabulary_)
print('Total malware features: {}'.format(len(malware_features)))

# Build the three bags: shared by both classes, malware-only, goodware-only.
goodware_vocab = set(goodware_features)
malware_vocab = set(malware_features)
common_features = goodware_vocab & malware_vocab
malware_only_features = malware_vocab - goodware_vocab
goodware_only_features = goodware_vocab - malware_vocab

# Save features to use in 1.6
# pd.Series(list(goodware_only_features)).to_csv(path='data/goodware_only_features.csv')
# pd.Series(list(malware_only_features)).to_csv(path='data/malware_only_features.csv')
# pd.Series(list(common_features)).to_csv(path='data/common_features.csv')

print('Common features: {}\nMalware only: {}\nGoodware only:{}'.format(
    len(common_features), len(malware_only_features), len(goodware_only_features)))

Total goodware features: 239545
Total malware features: 34583
Common features: 22031
Malware only: 12552
Goodware only:217514


## Sanity Check

Take every malware and goodware sample and check which bag it gets assigned to.

In [3]:
sanity_validation = pd.concat([goodware, malware])

common_count = 0
malware_count = 0
goodware_count = 0
# Malware mixed with common
# NOTE(review): declared but never incremented in this cell.
mmixed_count = 0
# Goodware mixed with common
# NOTE(review): declared but never incremented in this cell.
gmixed_count = 0

fp_count = 0
tp_count = 0
fn_count = 0
tn_count = 0

# NOTE(review): the bags come from CountVectorizer, which lowercases tokens by
# default, while the split below keeps the original case -- confirm the imports
# column is already lowercase, otherwise samples can fall through every bucket.
for imports, label in zip(sanity_validation.imports, sanity_validation.malware):
    # Features of this sample that are NOT shared by both classes
    non_common = set(imports.split(';')).difference(common_features)

    if not non_common:
        # Every feature of the sample is in the common bag
        common_count += 1
    elif non_common <= malware_only_features:
        # All remaining features are malware-only: sample is flagged as malware
        malware_count += 1
        if label == 1:
            tp_count += 1  # true positive
        else:
            fp_count += 1  # false positive
    elif non_common <= goodware_only_features:
        # All remaining features are goodware-only: sample is flagged as goodware
        goodware_count += 1
        if label == 0:
            tn_count += 1  # true negative
        else:
            fn_count += 1  # false negative

total_samples = len(sanity_validation)
print('Total:\t\t{}'.format(total_samples))
print('Common:\t\t{} ({:.3f})\nMalware:\t{} ({:.3f})\nGoodware:\t{} ({:.3f})'.format(
    common_count, common_count/total_samples,
    malware_count, malware_count/total_samples,
    goodware_count, goodware_count/total_samples))

# fp_rate: share of samples flagged as malware that are labelled goodware.
# fn_rate: share of samples flagged as goodware that are labelled malware.
# Either denominator is zero when no sample landed in that bag; report NaN
# instead of raising ZeroDivisionError.
flagged_malware = fp_count + tp_count
flagged_goodware = fn_count + tn_count
fp_rate = fp_count / flagged_malware if flagged_malware else float('nan')
fn_rate = fn_count / flagged_goodware if flagged_goodware else float('nan')

print('False Positive: {:.2f}\nFalse Negative: {:.2f}'.format(fp_rate, fn_rate))

Total:		102382
Common:		94949 (0.927)
Malware:	3282 (0.032)
Goodware:	4151 (0.041)
False Positive: 0.00
False Negative: 0.00


# Bag creation (subset of samples)

In [4]:
test_size = 0.1

# Ordered (unshuffled) hold-out: the first (1 - test_size) share of each class
# is used to build the bags, the remaining tail is kept for validation.
goodware_cut = int(len(goodware) * (1 - test_size))
malware_cut = int(len(malware) * (1 - test_size))
goodware_split = [goodware[:goodware_cut], goodware[goodware_cut:]]
malware_split = [malware[:malware_cut], malware[malware_cut:]]

goodware_cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
goodware_cv.fit(goodware_split[0].imports)
# Read the tokens from the fitted `vocabulary_` mapping instead of calling
# `get_feature_names()`: that method was removed in scikit-learn 1.2, while
# `vocabulary_` exists on every release and yields the same sorted token list.
goodware_features = sorted(goodware_cv.vocabulary_)
print('Total goodware features: {}'.format(len(goodware_features)))

malware_cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
malware_cv.fit(malware_split[0].imports)
malware_features = sorted(malware_cv.vocabulary_)
print('Total malware features: {}'.format(len(malware_features)))

# Build the three bags from the training part only.
goodware_vocab = set(goodware_features)
malware_vocab = set(malware_features)
common_features = goodware_vocab & malware_vocab
malware_only_features = malware_vocab - goodware_vocab
goodware_only_features = goodware_vocab - malware_vocab

print('Common features: {}\nMalware only: {}\nGoodware only:{}'.format(
    len(common_features), len(malware_only_features), len(goodware_only_features)))

Total goodware features: 219230
Total malware features: 32931
Common features: 21216
Malware only: 11715
Goodware only:198014


In [5]:
validation = pd.concat([goodware_split[1], malware_split[1]])

# Splitting in batches of ~x samples
batch_size = 5000
# np.array_split rejects 0 sections, so always use at least one batch even
# when the validation set holds fewer than `batch_size` rows.
n_batches = max(1, len(validation) // batch_size)
batches = np.array_split(validation, n_batches)

print('[{0}] Starting batches...'.format(datetime.datetime.now()))
print('[{0}] Sending count tasks...'.format(datetime.datetime.now()))
# NOTE(review): the full per-class vocabularies are passed here, not the
# malware_only_features / goodware_only_features sets -- confirm this matches
# what tasks.simple_heuristic expects.
jobs = group([simple_heuristic.s(
    list(common_features),
    list(malware_features),
    list(goodware_features),
    batch.to_json(orient='records')) for batch in batches])
# Building the group does not send anything; apply_async() does, so it has to
# run before "Done sending" is logged.
result = jobs.apply_async()
print('[{0}] Done sending, waiting...'.format(datetime.datetime.now()))
result.join()
print('[{0}] All tasks done.'.format(datetime.datetime.now()))

[2017-03-20 19:50:28.411570] Starting batches...
[2017-03-20 19:50:28.411681] Sending count tasks...
[2017-03-20 19:50:28.515879] Done sending, waiting...
[2017-03-20 19:50:29.934283] All tasks done.


In [6]:
# Fetch the per-batch result dicts once and sum each counter across batches.
batch_results = result.get()

common_count = sum(r['common_count'] for r in batch_results)
unknown_count = sum(r['unknown_count'] for r in batch_results)

fp_count = sum(r['fp_count'] for r in batch_results)
tp_count = sum(r['tp_count'] for r in batch_results)
fn_count = sum(r['fn_count'] for r in batch_results)
tn_count = sum(r['tn_count'] for r in batch_results)

total_samples = len(validation)
flagged_malware = fp_count + tp_count
flagged_goodware = fn_count + tn_count

print('Total:\t\t{}'.format(total_samples))
print('Common:\t\t{} ({:.3f})\nUnknown:\t{} ({:.3f})\nMalware:\t{} ({:.3f})\nGoodware:\t{} ({:.3f})'.format(
    common_count, common_count/total_samples,
    unknown_count, unknown_count/total_samples,
    flagged_malware, flagged_malware/total_samples,
    flagged_goodware, flagged_goodware/total_samples))

# fp_rate: share of samples flagged as malware that were counted as false positives.
# fn_rate: share of samples flagged as goodware that were counted as false negatives.
# Either denominator is zero when no sample was flagged that way; report NaN
# instead of raising ZeroDivisionError.
fp_rate = fp_count / flagged_malware if flagged_malware else float('nan')
fn_rate = fn_count / flagged_goodware if flagged_goodware else float('nan')

print('False Positive: {:.3f}\nFalse Negative: {:.3f}'.format(fp_rate, fn_rate))

Total:		10239
Common:		9465 (0.924)
Unknown:	71 (0.007)
Malware:	226 (0.022)
Goodware:	477 (0.047)
False Positive: 0.044
False Negative: 0.143
