# LR Filtered Dataset

Testing the logistic regression classifier on a filtered dataset (goodware plus one selected malware type at a time: trojan, backdoor, worm, virus), using feature selection.

In [1]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import lib.helpers as jcfg_helpers


# Load the labelled PE32 import dataset, parse the `date` column and use it
# as a chronologically sorted index.
samples = (
    pd.read_csv('data/pe32_imports_labeled.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y/%m/%d'))
    .set_index('date')
    .sort_index()
)

# Keep only samples whose import string contains more than 5 ';' separators.
enough_imports = samples['imports'].str.count(';') > 5
samples = samples[enough_imports]

# Split by the binary `malware` label (0 = goodware, 1 = malware).
is_malware = samples['malware']
goodware = samples[is_malware == 0]
malware = samples[is_malware == 1]

print('Total goodware: {}'.format(goodware.shape[0]))
print('Total malware: {}'.format(malware.shape[0]))

Total goodware: 27779
Total malware: 54933


## Feature Selection

Build separate feature vocabularies for malware and goodware, then keep the features common to both (exclusive features are not computed in this notebook).

In [2]:
# Tokens are the ';'-separated entries of the `imports` string; a token must
# appear in at least `min_df` samples to be kept in a vocabulary.
cv_token_pattern = u'[^;]+'
min_df = 2

# NOTE(review): both vocabularies are fitted on the complete malware/goodware
# sets, i.e. before any training/validation split is made later in the
# notebook, so validation samples also contribute to the feature selection.

# Feature Vector for malware
cv_malware = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
cv_malware.fit(malware.imports)

# `vocabulary_` maps every retained term to its column index, so its keys are
# the feature names. Unlike `get_feature_names()` (removed in scikit-learn 1.2)
# it is available on every scikit-learn version.
malware_features = set(cv_malware.vocabulary_)
print('Malware total features (with min_df): {}'.format(len(malware_features)))

# Feature Vector for goodware
cv_goodware = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
cv_goodware.fit(goodware.imports)

goodware_features = set(cv_goodware.vocabulary_)
print('Goodware total features (with min_df): {}'.format(len(goodware_features)))

# Do the filtering: keep only the features seen in both classes
common_features = malware_features.intersection(goodware_features)
print('Common features: {}'.format(len(common_features)))

Malware total features (with min_df): 18450
Goodware total features (with min_df): 85770
Common features: 13442


## Dataset for Trojans

In [3]:
# Load the three vendor classification tables; NaN cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows that each vendor labels as a trojan.
microsoft_filter = microsoft_class[microsoft_class['type'] == 'trojan']
symantec_filter = symantec_class[symantec_class['prefix'] == 'trojan']
trendmicro_filter = trendmicro_class[trendmicro_class['prefix'] == 'troj']

# Union of the sample links flagged by at least one vendor.
links = pd.concat(
    [vendor_rows['link'] for vendor_rows in (microsoft_filter, symantec_filter, trendmicro_filter)]
).unique()

# NOTE(review): rows are selected from `samples`, not `malware`; this assumes
# the vendor links only ever reference malware samples -- confirm.
filtered_malware = samples[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 34619


## Testing (common features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), varying the malware/goodware ratio in the training and validation sets.

In [4]:
# Vectorizer restricted to the features shared by malware and goodware.
# It does not depend on the loop variable, so it is built once. `min_df` is
# not passed: scikit-learn ignores it whenever an explicit vocabulary is given.
cv_token_pattern = u'[^;]+'
cv = CountVectorizer(token_pattern=cv_token_pattern, vocabulary=common_features)

# np.linspace yields exactly ten factors from 0.1 to 1.0; with a float step the
# length of an np.arange result is not guaranteed to be numerically stable.
for downscale in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(downscale))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=downscale)

    ################
    # Feature Vector
    train_X = cv.fit_transform(training.imports)
    test_X = cv.transform(validation.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is absent from both the validation labels and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the meaning of each ratio is defined by
    # lib.helpers.calc_ratios, which is not visible here.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9307
DR:	0.3847
FP:	0.2764
FN:	0.0587

		Downscale of 0.2
CC:	0.8857
DR:	0.4676
FP:	0.2471
FN:	0.0989

		Downscale of 0.3
CC:	0.7860
DR:	0.2122
FP:	0.3970
FN:	0.1979

		Downscale of 0.4
CC:	0.8872
DR:	0.7875
FP:	0.1882
FN:	0.0840

		Downscale of 0.5
CC:	0.8613
DR:	0.7601
FP:	0.1882
FN:	0.1163

		Downscale of 0.6
CC:	0.8491
DR:	0.7631
FP:	0.1781
FN:	0.1363

		Downscale of 0.7
CC:	0.8031
DR:	0.6747
FP:	0.1847
FN:	0.2032

		Downscale of 0.8
CC:	0.8039
DR:	0.7077
FP:	0.1738
FN:	0.2098

		Downscale of 0.9
CC:	0.8103
DR:	0.7489
FP:	0.1664
FN:	0.2070

		Downscale of 1.0
CC:	0.7850
DR:	0.7124
FP:	0.1666
FN:	0.2511



## Dataset for Backdoors

In [5]:
# Load the three vendor classification tables; NaN cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows that each vendor labels as a backdoor.
microsoft_filter = microsoft_class[microsoft_class['type'] == 'backdoor']
symantec_filter = symantec_class[symantec_class['prefix'] == 'backdoor']
trendmicro_filter = trendmicro_class[trendmicro_class['prefix'] == 'bkdr']

# Union of the sample links flagged by at least one vendor.
links = pd.concat(
    [vendor_rows['link'] for vendor_rows in (microsoft_filter, symantec_filter, trendmicro_filter)]
).unique()

# NOTE(review): rows are selected from `samples`, not `malware`; this assumes
# the vendor links only ever reference malware samples -- confirm.
filtered_malware = samples[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 8887


## Testing (common features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), varying the malware/goodware ratio in the training and validation sets.

In [6]:
# Vectorizer restricted to the features shared by malware and goodware.
# It does not depend on the loop variable, so it is built once. `min_df` is
# not passed: scikit-learn ignores it whenever an explicit vocabulary is given.
cv_token_pattern = u'[^;]+'
cv = CountVectorizer(token_pattern=cv_token_pattern, vocabulary=common_features)

# np.linspace yields exactly ten factors from 0.1 to 1.0; with a float step the
# length of an np.arange result is not guaranteed to be numerically stable.
for downscale in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(downscale))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=downscale)

    ################
    # Feature Vector
    train_X = cv.fit_transform(training.imports)
    test_X = cv.transform(validation.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is absent from both the validation labels and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the meaning of each ratio is defined by
    # lib.helpers.calc_ratios, which is not visible here.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9366
DR:	0.4369
FP:	0.2362
FN:	0.0540

		Downscale of 0.2
CC:	0.9171
DR:	0.6622
FP:	0.1945
FN:	0.0652

		Downscale of 0.3
CC:	0.9086
DR:	0.8228
FP:	0.2104
FN:	0.0538

		Downscale of 0.4
CC:	0.8752
DR:	0.7286
FP:	0.1851
FN:	0.1041

		Downscale of 0.5
CC:	0.8658
DR:	0.7604
FP:	0.1766
FN:	0.1154

		Downscale of 0.6
CC:	0.8559
DR:	0.7875
FP:	0.1792
FN:	0.1244

		Downscale of 0.7
CC:	0.8551
DR:	0.8205
FP:	0.1737
FN:	0.1250

		Downscale of 0.8
CC:	0.8569
DR:	0.8446
FP:	0.1648
FN:	0.1254

		Downscale of 0.9
CC:	0.8452
DR:	0.8293
FP:	0.1585
FN:	0.1516

		Downscale of 1.0
CC:	0.8217
DR:	0.7897
FP:	0.1563
FN:	0.1976



## Dataset for Worms

In [7]:
# Load the three vendor classification tables; NaN cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows that each vendor labels as a worm. Symantec rows are matched on the
# `suffix` column here, the other two vendors on `type` / `prefix`.
microsoft_filter = microsoft_class[microsoft_class['type'] == 'worm']
symantec_filter = symantec_class[symantec_class['suffix'] == 'worm']
trendmicro_filter = trendmicro_class[trendmicro_class['prefix'] == 'worm']

# Union of the sample links flagged by at least one vendor.
links = pd.concat(
    [vendor_rows['link'] for vendor_rows in (microsoft_filter, symantec_filter, trendmicro_filter)]
).unique()

# NOTE(review): rows are selected from `samples`, not `malware`; this assumes
# the vendor links only ever reference malware samples -- confirm.
filtered_malware = samples[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 7120


## Testing (common features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), varying the malware/goodware ratio in the training and validation sets.

In [8]:
# Vectorizer restricted to the features shared by malware and goodware.
# It does not depend on the loop variable, so it is built once. `min_df` is
# not passed: scikit-learn ignores it whenever an explicit vocabulary is given.
cv_token_pattern = u'[^;]+'
cv = CountVectorizer(token_pattern=cv_token_pattern, vocabulary=common_features)

# np.linspace yields exactly ten factors from 0.1 to 1.0; with a float step the
# length of an np.arange result is not guaranteed to be numerically stable.
for downscale in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(downscale))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=downscale)

    ################
    # Feature Vector
    train_X = cv.fit_transform(training.imports)
    test_X = cv.transform(validation.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is absent from both the validation labels and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the meaning of each ratio is defined by
    # lib.helpers.calc_ratios, which is not visible here.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9142
DR:	0.4157
FP:	0.4638
FN:	0.0571

		Downscale of 0.2
CC:	0.9583
DR:	0.9803
FP:	0.1903
FN:	0.0041

		Downscale of 0.3
CC:	0.9283
DR:	0.8408
FP:	0.1528
FN:	0.0476

		Downscale of 0.4
CC:	0.9250
DR:	0.8806
FP:	0.1399
FN:	0.0482

		Downscale of 0.5
CC:	0.9378
DR:	0.9393
FP:	0.1181
FN:	0.0314

		Downscale of 0.6
CC:	0.9266
DR:	0.9204
FP:	0.1120
FN:	0.0488

		Downscale of 0.7
CC:	0.9032
DR:	0.8732
FP:	0.1104
FN:	0.0876

		Downscale of 0.8
CC:	0.8820
DR:	0.8371
FP:	0.1091
FN:	0.1243

		Downscale of 0.9
CC:	0.8696
DR:	0.8240
FP:	0.1075
FN:	0.1482

		Downscale of 1.0
CC:	0.8610
DR:	0.8146
FP:	0.1022
FN:	0.1697



## Dataset for Virus

In [9]:
# Load the three vendor classification tables; NaN cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows that each vendor labels as a virus.
# NOTE(review): unlike the other malware-type cells, Symantec rows are matched
# on prefix == 'virusdoctor' and Trend Micro rows on the `family` column rather
# than `prefix` -- confirm these selectors really identify viruses in the
# vendor tables.
microsoft_filter = microsoft_class[microsoft_class['type'] == 'virus']
symantec_filter = symantec_class[symantec_class['prefix'] == 'virusdoctor']
trendmicro_filter = trendmicro_class[trendmicro_class['family'] == 'virus']

# Union of the sample links flagged by at least one vendor.
links = pd.concat(
    [vendor_rows['link'] for vendor_rows in (microsoft_filter, symantec_filter, trendmicro_filter)]
).unique()

# NOTE(review): rows are selected from `samples`, not `malware`; this assumes
# the vendor links only ever reference malware samples -- confirm.
filtered_malware = samples[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 3607


## Testing (common features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), varying the malware/goodware ratio in the training and validation sets.

In [10]:
# Vectorizer restricted to the features shared by malware and goodware.
# It does not depend on the loop variable, so it is built once. `min_df` is
# not passed: scikit-learn ignores it whenever an explicit vocabulary is given.
cv_token_pattern = u'[^;]+'
cv = CountVectorizer(token_pattern=cv_token_pattern, vocabulary=common_features)

# np.linspace yields exactly ten factors from 0.1 to 1.0; with a float step the
# length of an np.arange result is not guaranteed to be numerically stable.
for downscale in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(downscale))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=downscale)

    ################
    # Feature Vector
    train_X = cv.fit_transform(training.imports)
    test_X = cv.transform(validation.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is absent from both the validation labels and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the meaning of each ratio is defined by
    # lib.helpers.calc_ratios, which is not visible here.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9062
DR:	0.2444
FP:	0.5319
FN:	0.0720

		Downscale of 0.2
CC:	0.8622
DR:	0.4722
FP:	0.3885
FN:	0.1008

		Downscale of 0.3
CC:	0.8318
DR:	0.5815
FP:	0.3485
FN:	0.1215

		Downscale of 0.4
CC:	0.8176
DR:	0.6750
FP:	0.3174
FN:	0.1293

		Downscale of 0.5
CC:	0.8194
DR:	0.7289
FP:	0.2711
FN:	0.1354

		Downscale of 0.6
CC:	0.7835
DR:	0.6722
FP:	0.2711
FN:	0.1877

		Downscale of 0.7
CC:	0.7524
DR:	0.6254
FP:	0.2663
FN:	0.2374

		Downscale of 0.8
CC:	0.7323
DR:	0.6306
FP:	0.2701
FN:	0.2663

		Downscale of 0.9
CC:	0.7329
DR:	0.6667
FP:	0.2572
FN:	0.2744

		Downscale of 1.0
CC:	0.7653
DR:	0.7580
FP:	0.2309
FN:	0.2385

