# LR Filtered Dataset

Testing the classifier when using a filtered dataset (selected malware types e.g. virus, trojan, worm)

In [70]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import lib.helpers as jcfg_helpers


# Load the labelled PE32 import data, parse the date column and order the
# samples by date (the date becomes the index).
samples = pd.read_csv('data/pe32_imports_labeled.csv')
samples = (
    samples
    .assign(date=pd.to_datetime(samples['date'], format='%Y/%m/%d'))
    .set_index('date')
    .sort_index()
)

# Split on the `malware` label column: 0 -> goodware, 1 -> malware.
goodware = samples.loc[samples['malware'] == 0]
malware = samples.loc[samples['malware'] == 1]

In [71]:
# Fraction of the size-limited dataset reserved for validation; the remaining
# (1 - test_size) share is used for training.
test_size = 0.1

## Dataset based on Microsoft naming

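Create a dataset with goodware and a single Microsoft malware type (the filter in the code below currently selects `virus`; change it there to test e.g. trojan or worm)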

In [122]:
# Load Microsoft's per-sample malware classification.
# Missing values are replaced by empty strings (without mutating in place, so the
# cell can be re-run safely); rows without a type then show up as their own ''
# group in the per-type counts below.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')

# Malware type kept for this experiment -- change it here to test another type.
malware_type = 'virus'
microsoft_filter = microsoft_class[microsoft_class['type'] == malware_type]
# Keep only the malware samples whose `link` appears among the selected rows.
microsoft_malware = malware[malware['link'].isin(microsoft_filter['link'])]

# Overview: number of classified samples per type (largest first), then the
# total number of classification rows.
display(microsoft_class.groupby('type')['md5'].count().sort_values(ascending=False))
display(len(microsoft_class))

type
trojan              19005
backdoor            16017
worm                11679
trojandownloader     9643
virus                6259
virtool              5796
pws                  5603
adware               3715
trojanspy            2961
ransom               2688
trojandropper        2502
hacktool             1905
rogue                 712
ddos                  413
softwarebundler       395
trojanproxy           384
browsermodifier       241
monitoringtool        206
dialer                136
exploit               136
spammer               127
trojanclicker         107
program                55
constructor            51
joke                   19
tool                   17
settingsmodifier        9
spyware                 8
                        8
dos                     6
remoteaccess            3
misleading              2
Name: md5, dtype: int64

90808

### Training/Validation creation

In [123]:
# Both classes are limited to the size of the smaller one.
lim = min(len(goodware), len(microsoft_malware))

training_size = int(lim * (1 - test_size))
validation_size = int(lim * test_size)
# Malware is taken at half the goodware volume in both splits.
mal_downscale = 0.5

# Positional bounds of the malware slices (rows are taken in the frames'
# existing order; validation rows directly follow the training rows).
mal_train_end = int(training_size * mal_downscale)
mal_valid_end = mal_train_end + int(validation_size * mal_downscale)

training = pd.concat([
    goodware.iloc[:training_size],
    microsoft_malware.iloc[:mal_train_end],
])
validation = pd.concat([
    goodware.iloc[training_size:training_size + validation_size],
    microsoft_malware.iloc[mal_train_end:mal_valid_end],
])

# Report the class balance of each split.
train_labels = training['malware']
valid_labels = validation['malware']
print('Malware in training: {}\nGoodware in training: {}'.format(
    (train_labels == 1).sum(), (train_labels == 0).sum()))
print('Malware in validation: {}\nGoodware in validation: {}'.format(
    (valid_labels == 1).sum(), (valid_labels == 0).sum()))

Malware in training: 1990
Goodware in training: 3981
Malware in validation: 221
Goodware in validation: 442


### Feature Vector w/o Feature Selection

In [124]:
# Every maximal run of non-';' characters is one token
# (presumably one imported symbol per ';'-separated entry -- confirm against the data).
cv_token_pattern = u'[^;]+'
# Ignore tokens that appear in fewer than 2 training documents.
min_df = 2

cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
train_X = cv.fit_transform(training.imports)

# Count the features through the fitted vocabulary mapping:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# while `vocabulary_` is available on old and new versions alike.
print('Total features: {}'.format(len(cv.vocabulary_)))

Total features: 14629


#### LR

In [125]:
# Fit a logistic-regression classifier (library defaults) on the training
# features, using the `malware` column as the target.
train_Y = training['malware']
lr = LogisticRegression()
lr = lr.fit(train_X, train_Y)

#### Scoring

In [126]:
# Evaluate the fitted model on the validation split.
test_X = cv.transform(validation['imports'])
test_Y = validation['malware']

predicted_Y = lr.predict(test_X)
score = confusion_matrix(test_Y, predicted_Y)
# Project helper that turns the confusion matrix into four ratios; the names
# below follow its return order.
fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)


print()
for line_template, rate in (('CC:\t{:.4f}', cc_rate),
                            ('DR:\t{:.4f}', det_rate),
                            ('FP:\t{:.4f}', fp_rate),
                            ('FN:\t{:.4f}', fn_rate)):
    print(line_template.format(rate))


CC:	0.8311
DR:	0.8054
FP:	0.2794
FN:	0.1034


### Feature Vector with class-exclusive features (malware-only and goodware-only)

In [99]:
def _read_feature_series(path):
    """Load a headerless CSV as a Series: first column -> index, next column -> values.

    Replacement for ``pd.Series.from_csv``, which was deprecated in pandas 0.21
    and removed in 1.0. The ``read_csv`` arguments mirror that method's
    defaults (``header=None``, ``index_col=0``, ``parse_dates=True``).
    """
    series = pd.read_csv(path, header=None, index_col=0, parse_dates=True).iloc[:, 0]
    # from_csv returned an unnamed Series with an unnamed index for header=None.
    series.name = None
    series.index.name = None
    return series


malware_features = _read_feature_series('data/malware_only_features.csv')
goodware_features = _read_feature_series('data/goodware_only_features.csv')
features = pd.concat([malware_features, goodware_features])
# Keep only features known to the previously fitted vectorizer `cv`
# (`cv` must already be fitted when this cell runs). `vocabulary_` is used
# instead of `get_feature_names()`, which was removed in scikit-learn 1.2.
features = features[features.isin(list(cv.vocabulary_))]

cv_token_pattern = u'[^;]+'
min_df = 2

# A fixed vocabulary is supplied, so nothing is fitted here; scikit-learn
# ignores `min_df` whenever `vocabulary` is given.
cv2 = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df, vocabulary=features)
# NOTE: this rebinds `train_X`; the LR cell of this section relies on it.
train_X = cv2.transform(training.imports)

# `vocabulary_` is populated by the transform call above.
print('Total features: {}'.format(len(cv2.vocabulary_)))

Total features: 18731


#### LR

In [100]:
# Fit a second logistic-regression classifier (library defaults) on the
# current training features, using the `malware` column as the target.
train_Y = training['malware']
lr2 = LogisticRegression()
lr2 = lr2.fit(train_X, train_Y)

#### Scoring

In [101]:
# Evaluate the second model on the validation split, vectorized with `cv2`.
test_X = cv2.transform(validation['imports'])
test_Y = validation['malware']

predicted_Y = lr2.predict(test_X)
score = confusion_matrix(test_Y, predicted_Y)
# Project helper that turns the confusion matrix into four ratios; the names
# below follow its return order.
fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)


print()
for line_template, rate in (('CC:\t{:.4f}', cc_rate),
                            ('DR:\t{:.4f}', det_rate),
                            ('FP:\t{:.4f}', fp_rate),
                            ('FN:\t{:.4f}', fn_rate)):
    print(line_template.format(rate))


CC:	0.6761
DR:	0.1362
FP:	0.0000
FN:	0.3414
