# LR Filtered Dataset

Tests a logistic regression (LR) classifier on a filtered dataset, i.e. goodware plus a single selected malware type (trojan, backdoor, worm or virus), using feature selection.

In [1]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import lib.helpers as jcfg_helpers


# Load the labelled import lists, parse the `date` column (YYYY/MM/DD) and
# use it as a chronologically sorted index.
samples = (
    pd.read_csv('data/pe32_imports_labeled.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y/%m/%d'))
    .set_index('date')
    .sort_index()
)

# Split by the binary `malware` label (0 = goodware, 1 = malware).
goodware = samples.loc[samples['malware'] == 0]
malware = samples.loc[samples['malware'] == 1]

print('Total goodware: {}'.format(goodware.shape[0]))
print('Total malware: {}'.format(malware.shape[0]))

Total goodware: 34509
Total malware: 67873


## Feature Selection

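Compute the features that are exclusive to malware and the features that are exclusive to goodware. Features seen in both classes are left out of the vocabulary used for testing below.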

In [2]:
# A token is any maximal run of characters other than ';'
# (presumably the import lists are ';'-separated -- confirm against the CSV).
cv_token_pattern = u'[^;]+'
# Drop tokens that occur in fewer than `min_df` samples of the fitted class.
min_df = 2

# Feature Vector for malware
cv_malware = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
cv_malware.fit(malware.imports)

# The keys of `vocabulary_` are exactly the retained feature names. They are
# used instead of `get_feature_names()`, which was removed in scikit-learn 1.2,
# so the cell runs on both old and current releases.
malware_features = set(cv_malware.vocabulary_)
print('Malware total features (with min_df): {}'.format(len(malware_features)))

# Feature Vector for goodware
cv_goodware = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
cv_goodware.fit(goodware.imports)

goodware_features = set(cv_goodware.vocabulary_)
print('Goodware total features (with min_df): {}'.format(len(goodware_features)))

# Do the filtering: keep only the features seen in exactly one of the classes.
malware_only_features = malware_features.difference(goodware_features)
print('Malware features: {}'.format(len(malware_only_features)))

goodware_only_features = goodware_features.difference(malware_features)
print('Goodware features: {}'.format(len(goodware_only_features)))

Malware total features (with min_df): 18457
Goodware total features (with min_df): 85774
Malware features: 5012
Goodware features: 72329


## Dataset for Trojans

In [3]:
# Per-vendor classification tables; NaN cells are replaced with empty strings.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows tagged as trojan; each vendor stores the tag in its own column/spelling.
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'trojan']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'trojan']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'troj']

# Distinct sample links tagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link appears in that set.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 41870


## Testing (exclusive features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), which varies the malware/goodware ratio in the training and validation sets.

In [4]:
# Sweep the malware downscale factor and train/score one LR model per value.
#
# The vectorizer is built once, outside the loop: its vocabulary is fixed to
# the class-exclusive features, so fitting learns nothing from the training
# text. `min_df` is not passed because CountVectorizer ignores it whenever an
# explicit vocabulary is supplied. `cv_token_pattern` comes from the Feature
# Selection cell, which also defines the two feature sets used here.
cv = CountVectorizer(token_pattern=cv_token_pattern,
                     vocabulary=goodware_only_features.union(malware_only_features))

# linspace pins both endpoints and the number of steps; arange with a float
# step depends on rounding to decide whether the last value (1.0) is produced.
for i in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector (counts over the fixed vocabulary)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the exact definition of each ratio lives in
    # lib.helpers.calc_ratios -- confirm there before interpreting the numbers.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9101
DR:	0.0104
FP:	0.0000
FN:	0.0900

		Downscale of 0.2
CC:	0.8354
DR:	0.0145
FP:	0.1379
FN:	0.1647

		Downscale of 0.3
CC:	0.7753
DR:	0.0278
FP:	0.0526
FN:	0.2259

		Downscale of 0.4
CC:	0.7368
DR:	0.0797
FP:	0.0143
FN:	0.2691

		Downscale of 0.5
CC:	0.6908
DR:	0.0735
FP:	0.0155
FN:	0.3167

		Downscale of 0.6
CC:	0.6470
DR:	0.0599
FP:	0.0190
FN:	0.3608

		Downscale of 0.7
CC:	0.6136
DR:	0.0633
FP:	0.0280
FN:	0.3963

		Downscale of 0.8
CC:	0.5840
DR:	0.0662
FP:	0.0338
FN:	0.4280

		Downscale of 0.9
CC:	0.5542
DR:	0.0618
FP:	0.0457
FN:	0.4585

		Downscale of 1.0
CC:	0.5593
DR:	0.9919
FP:	0.4682
FN:	0.0602



## Dataset for Backdoors

In [5]:
# Per-vendor classification tables; NaN cells are replaced with empty strings.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows tagged as backdoor; each vendor stores the tag in its own column/spelling.
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'backdoor']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'backdoor']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'bkdr']

# Distinct sample links tagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link appears in that set.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 13273


## Testing (exclusive features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), which varies the malware/goodware ratio in the training and validation sets.

In [6]:
# Sweep the malware downscale factor and train/score one LR model per value.
#
# The vectorizer is built once, outside the loop: its vocabulary is fixed to
# the class-exclusive features, so fitting learns nothing from the training
# text. `min_df` is not passed because CountVectorizer ignores it whenever an
# explicit vocabulary is supplied. `cv_token_pattern` comes from the Feature
# Selection cell, which also defines the two feature sets used here.
cv = CountVectorizer(token_pattern=cv_token_pattern,
                     vocabulary=goodware_only_features.union(malware_only_features))

# linspace pins both endpoints and the number of steps; arange with a float
# step depends on rounding to decide whether the last value (1.0) is produced.
for i in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector (counts over the fixed vocabulary)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the exact definition of each ratio lives in
    # lib.helpers.calc_ratios -- confirm there before interpreting the numbers.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9101
DR:	0.0091
FP:	0.0000
FN:	0.0900

		Downscale of 0.2
CC:	0.8367
DR:	0.0196
FP:	0.0000
FN:	0.1638

		Downscale of 0.3
CC:	0.7943
DR:	0.1095
FP:	0.0091
FN:	0.2108

		Downscale of 0.4
CC:	0.7302
DR:	0.0565
FP:	0.0132
FN:	0.2740

		Downscale of 0.5
CC:	0.6767
DR:	0.0313
FP:	0.0370
FN:	0.3264

		Downscale of 0.6
CC:	0.6353
DR:	0.0296
FP:	0.0781
FN:	0.3682

		Downscale of 0.7
CC:	0.5956
DR:	0.0198
FP:	0.0980
FN:	0.4072

		Downscale of 0.8
CC:	0.5638
DR:	0.0207
FP:	0.0984
FN:	0.4397

		Downscale of 0.9
CC:	0.5351
DR:	0.0211
FP:	0.1127
FN:	0.4690

		Downscale of 1.0
CC:	0.5487
DR:	0.9958
FP:	0.4743
FN:	0.0399



## Dataset for Worms

In [7]:
# Per-vendor classification tables; NaN cells are replaced with empty strings.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows tagged as worm; note that Symantec is matched on `suffix`, not `prefix`.
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'worm']
symantec_filter = symantec_class.loc[symantec_class['suffix'] == 'worm']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'worm']

# Distinct sample links tagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link appears in that set.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 9418


## Testing (exclusive features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), which varies the malware/goodware ratio in the training and validation sets.

In [8]:
# Sweep the malware downscale factor and train/score one LR model per value.
#
# The vectorizer is built once, outside the loop: its vocabulary is fixed to
# the class-exclusive features, so fitting learns nothing from the training
# text. `min_df` is not passed because CountVectorizer ignores it whenever an
# explicit vocabulary is supplied. `cv_token_pattern` comes from the Feature
# Selection cell, which also defines the two feature sets used here.
cv = CountVectorizer(token_pattern=cv_token_pattern,
                     vocabulary=goodware_only_features.union(malware_only_features))

# linspace pins both endpoints and the number of steps; arange with a float
# step depends on rounding to decide whether the last value (1.0) is produced.
for i in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector (counts over the fixed vocabulary)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # NOTE(review): the exact definition of each ratio lives in
    # lib.helpers.calc_ratios -- confirm there before interpreting the numbers.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1
CC:	0.9115
DR:	0.0255
FP:	0.0000
FN:	0.0887

		Downscale of 0.2
CC:	0.8360
DR:	0.0170
FP:	0.1111
FN:	0.1641

		Downscale of 0.3
CC:	0.8042
DR:	0.1544
FP:	0.0180
FN:	0.2024

		Downscale of 0.4
CC:	0.7557
DR:	0.1467
FP:	0.0143
FN:	0.2545

		Downscale of 0.5
CC:	0.7057
DR:	0.1189
FP:	0.0141
FN:	0.3060

		Downscale of 0.6
CC:	0.6452
DR:	0.0552
FP:	0.0250
FN:	0.3619

		Downscale of 0.7
CC:	0.6053
DR:	0.0437
FP:	0.0526
FN:	0.4013

		Downscale of 0.8
CC:	0.5858
DR:	0.0712
FP:	0.0429
FN:	0.4269

		Downscale of 0.9
CC:	0.5709
DR:	0.0977
FP:	0.0372
FN:	0.4489

		Downscale of 1.0
CC:	0.5480
DR:	0.1007
FP:	0.0444
FN:	0.4747



## Dataset for Virus

In [9]:
# Per-vendor classification tables; NaN cells are replaced with empty strings.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows tagged as virus; the column and tag differ per vendor
# (Microsoft `type`, Symantec `prefix` == 'virusdoctor', Trend Micro `family`).
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'virus']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'virusdoctor']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['family'] == 'virus']

# Distinct sample links tagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link appears in that set.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(filtered_malware.shape[0]))

Total malware: 4467


## Testing (exclusive features)

Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), which varies the malware/goodware ratio in the training and validation sets.

In [10]:
# Sweep the malware downscale factor and train/score one LR model per value.
#
# The vectorizer is built once, outside the loop: its vocabulary is fixed to
# the class-exclusive features, so fitting learns nothing from the training
# text. `min_df` is not passed because CountVectorizer ignores it whenever an
# explicit vocabulary is supplied. `cv_token_pattern` comes from the Feature
# Selection cell, which also defines the two feature sets used here.
cv = CountVectorizer(token_pattern=cv_token_pattern,
                     vocabulary=goodware_only_features.union(malware_only_features))

# linspace pins both endpoints and the number of steps; arange with a float
# step depends on rounding to decide whether the last value (1.0) is produced.
for i in np.linspace(0.1, 1.0, 10):
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector (counts over the fixed vocabulary)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] keeps the matrix 2x2 (goodware, malware) even if one class
    # is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    # The recorded run shows calc_ratios computing
    # fp_rate = matrix[0][1] / (matrix[0][1] + matrix[1][1]), i.e. dividing by
    # the number of samples predicted as malware. When nothing is predicted as
    # malware that is 0/0, which is why FP prints as nan for small downscales.
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))
    print()

		Downscale of 0.1


  fp_rate = matrix[0][1] / (matrix[0][1] + matrix[1][1])


CC:	0.9095
DR:	0.0000
FP:	nan
FN:	0.0905

		Downscale of 0.2
CC:	0.8335
DR:	0.0000
FP:	nan
FN:	0.1665

		Downscale of 0.3
CC:	0.7703
DR:	0.0030
FP:	0.0000
FN:	0.2298

		Downscale of 0.4
CC:	0.7183
DR:	0.0135
FP:	0.0000
FN:	0.2828

		Downscale of 0.5
CC:	0.6703
DR:	0.0125
FP:	0.1250
FN:	0.3307

		Downscale of 0.6
CC:	0.6280
DR:	0.0105
FP:	0.2222
FN:	0.3727

		Downscale of 0.7
CC:	0.5999
DR:	0.0333
FP:	0.1333
FN:	0.4044

		Downscale of 0.8
CC:	0.5692
DR:	0.0359
FP:	0.1351
FN:	0.4363

		Downscale of 0.9
CC:	0.5335
DR:	0.0229
FP:	0.2581
FN:	0.4696

		Downscale of 1.0
CC:	0.5287
DR:	0.9892
FP:	0.4851
FN:	0.1364

