# LR Filtered Dataset

Testing the logistic regression (LR) classifier on filtered datasets, where the malware side is restricted to a single malware type (trojan, backdoor, worm or virus).

In [2]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import lib.helpers as jcfg_helpers


# Load the labeled PE32 import samples, parse the `date` column
# (YYYY/MM/DD) and use it as a chronologically sorted index.
samples = (
    pd.read_csv('data/pe32_imports_labeled.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y/%m/%d'))
    .set_index('date')
    .sort_index()
)

# Split on the `malware` label column: 0 -> goodware, 1 -> malware.
goodware = samples.loc[samples['malware'] == 0]
malware = samples.loc[samples['malware'] == 1]

print('Total goodware: {}'.format(len(goodware)))
print('Total malware: {}'.format(len(malware)))

Total goodware: 34509
Total malware: 67873


## Dataset for Trojans

In [2]:
# Per-vendor classification tables; missing cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows matching each vendor's trojan token (column and token differ per vendor).
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'trojan']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'trojan']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'troj']

# Distinct links matched by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link is in that union.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 41870


## Testing

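Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), so that the malware/goodware ratio in the training and validation sets varies between runs.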

In [12]:
# Bag-of-imports settings: every ';'-separated token of the `imports` column
# is one feature, kept only if it occurs in at least `min_df` training samples.
cv_token_pattern = u'[^;]+'
min_df = 2

# Sweep the malware downscale factor over 0.1, 0.2, ..., 1.0.
# The grid is derived from integers (k / 10) instead of a float-step
# np.arange: each factor is then the nearest float to the intended decimal
# (0.3 rather than 0.30000000000000004) and the number of steps cannot
# change because of accumulated rounding error.
for i in np.arange(1, 11) / 10:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper from lib.helpers)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector: vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring on the validation split
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    score = confusion_matrix(test_Y, lr.predict(test_X))
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # predict_proba columns follow lr.classes_; look the positions up
    # instead of assuming that label 0 is column 0 and label 1 is column 1.
    class_order = list(lr.classes_)
    malware_col = class_order.index(1)
    goodware_col = class_order.index(0)

    # Probability the model assigns to the *true* class, per true class.
    malware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 1)])
    goodware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 0)])
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, malware_col]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, goodware_col]).describe())
    print()

		Downscale of 0.1
CC:	0.9150
DR:	0.3082
FP:	0.4426
FN:	0.0661

Malware stats:


count    3.310000e+02
mean     3.633775e-01
std      3.564969e-01
min      2.154350e-07
25%      5.470049e-02
50%      2.420007e-01
75%      6.038979e-01
max      9.999999e-01
dtype: float64


Goodware stats:


count    3318.000000
mean        0.942601
std         0.132651
min         0.024514
25%         0.945300
50%         0.989109
75%         0.999479
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.8963
DR:	0.5415
FP:	0.2329
FN:	0.0865

Malware stats:


count    6.630000e+02
mean     5.489624e-01
std      3.640254e-01
min      8.770690e-08
25%      1.589174e-01
50%      6.108304e-01
75%      9.280080e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.912766
std         0.150912
min         0.005487
25%         0.841083
50%         0.982173
75%         0.999632
max         1.000000
dtype: float64


		Downscale of 0.3
CC:	0.8855
DR:	0.6583
FP:	0.1904
FN:	0.0970

Malware stats:


count    9.950000e+02
mean     6.780235e-01
std      3.185661e-01
min      7.130759e-10
25%      3.296162e-01
50%      7.444049e-01
75%      9.825324e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.870535
std         0.191155
min         0.001401
25%         0.670384
50%         0.981594
75%         0.999399
max         1.000000
dtype: float64


		Downscale of 0.4
CC:	0.8637
DR:	0.6534
FP:	0.1663
FN:	0.1276

Malware stats:


count    1327.000000
mean        0.665938
std         0.329939
min         0.000001
25%         0.385686
50%         0.775968
75%         0.988356
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.850287
std         0.212208
min         0.002386
25%         0.614314
50%         0.975537
75%         0.998951
max         1.000000
dtype: float64


		Downscale of 0.5
CC:	0.8583
DR:	0.7179
FP:	0.1660
FN:	0.1319

Malware stats:


count    1.659000e+03
mean     6.869545e-01
std      2.992347e-01
min      2.741793e-09
25%      4.114548e-01
50%      7.588778e-01
75%      9.662130e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.830215
std         0.227503
min         0.000844
25%         0.588545
50%         0.961805
75%         0.998540
max         1.000000
dtype: float64


		Downscale of 0.6
CC:	0.8668
DR:	0.8010
FP:	0.1633
FN:	0.1164

Malware stats:


count    1990.000000
mean        0.730033
std         0.253779
min         0.000011
25%         0.619831
50%         0.798213
75%         0.942779
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.805407
std         0.247039
min         0.000088
25%         0.546928
50%         0.955000
75%         0.997711
max         1.000000
dtype: float64


		Downscale of 0.7
CC:	0.8342
DR:	0.7476
FP:	0.1674
FN:	0.1648

Malware stats:


count    2.322000e+03
mean     7.513755e-01
std      2.661164e-01
min      4.983562e-08
25%      4.784336e-01
50%      8.500344e-01
75%      9.836111e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.796193
std         0.256222
min         0.000030
25%         0.536700
50%         0.947498
75%         0.997998
max         1.000000
dtype: float64


		Downscale of 0.8
CC:	0.7989
DR:	0.6737
FP:	0.1578
FN:	0.2250

Malware stats:


count    2.654000e+03
mean     7.414045e-01
std      2.812362e-01
min      6.415496e-10
25%      4.840918e-01
50%      8.644190e-01
75%      9.941780e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.787304
std         0.262844
min         0.000005
25%         0.515908
50%         0.936831
75%         0.998052
max         1.000000
dtype: float64


		Downscale of 0.9
CC:	0.7824
DR:	0.8684
FP:	0.2741
FN:	0.1439

Malware stats:


count    2.986000e+03
mean     7.367667e-01
std      2.801853e-01
min      1.178909e-09
25%      5.112202e-01
50%      8.667157e-01
75%      9.831193e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.731386e-01
std      2.750734e-01
min      2.318087e-08
25%      4.887798e-01
50%      9.282055e-01
75%      9.980447e-01
max      1.000000e+00
dtype: float64


		Downscale of 1.0
CC:	0.7819
DR:	0.8677
FP:	0.2593
FN:	0.1597

Malware stats:


count    3.318000e+03
mean     7.214340e-01
std      2.671056e-01
min      2.697429e-10
25%      5.520293e-01
50%      8.297241e-01
75%      9.601650e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.575619e-01
std      2.871125e-01
min      1.954325e-08
25%      4.479707e-01
50%      9.216381e-01
75%      9.974889e-01
max      1.000000e+00
dtype: float64




## Dataset for Backdoors

In [20]:
# Per-vendor classification tables; missing cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows matching each vendor's backdoor token (column and token differ per vendor).
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'backdoor']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'backdoor']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'bkdr']

# Distinct links matched by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link is in that union.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 13273


## Testing

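Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), so that the malware/goodware ratio in the training and validation sets varies between runs.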

In [9]:
# Bag-of-imports settings: every ';'-separated token of the `imports` column
# is one feature, kept only if it occurs in at least `min_df` training samples.
cv_token_pattern = u'[^;]+'
min_df = 2

# Sweep the malware downscale factor over 0.1, 0.2, ..., 1.0.
# The grid is derived from integers (k / 10) instead of a float-step
# np.arange: each factor is then the nearest float to the intended decimal
# (0.3 rather than 0.30000000000000004) and the number of steps cannot
# change because of accumulated rounding error.
for i in np.arange(1, 11) / 10:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper from lib.helpers)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector: vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring on the validation split
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    score = confusion_matrix(test_Y, lr.predict(test_X))
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # predict_proba columns follow lr.classes_; look the positions up
    # instead of assuming that label 0 is column 0 and label 1 is column 1.
    class_order = list(lr.classes_)
    malware_col = class_order.index(1)
    goodware_col = class_order.index(0)

    # Probability the model assigns to the *true* class, per true class.
    malware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 1)])
    goodware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 0)])
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, malware_col]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, goodware_col]).describe())
    print()

		Downscale of 0.1
CC:	0.9150
DR:	0.3082
FP:	0.4426
FN:	0.0661

Malware stats:


count    3.310000e+02
mean     3.633775e-01
std      3.564969e-01
min      2.154350e-07
25%      5.470049e-02
50%      2.420007e-01
75%      6.038979e-01
max      9.999999e-01
dtype: float64


Goodware stats:


count    3318.000000
mean        0.942601
std         0.132651
min         0.024514
25%         0.945300
50%         0.989109
75%         0.999479
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.8963
DR:	0.5415
FP:	0.2329
FN:	0.0865

Malware stats:


count    6.630000e+02
mean     5.489624e-01
std      3.640254e-01
min      8.770690e-08
25%      1.589174e-01
50%      6.108304e-01
75%      9.280080e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.912766
std         0.150912
min         0.005487
25%         0.841083
50%         0.982173
75%         0.999632
max         1.000000
dtype: float64


		Downscale of 0.30000000000000004
CC:	0.8855
DR:	0.6583
FP:	0.1904
FN:	0.0970

Malware stats:


count    9.950000e+02
mean     6.780235e-01
std      3.185661e-01
min      7.130759e-10
25%      3.296162e-01
50%      7.444049e-01
75%      9.825324e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.870535
std         0.191155
min         0.001401
25%         0.670384
50%         0.981594
75%         0.999399
max         1.000000
dtype: float64


		Downscale of 0.4
CC:	0.8637
DR:	0.6534
FP:	0.1663
FN:	0.1276

Malware stats:


count    1327.000000
mean        0.665938
std         0.329939
min         0.000001
25%         0.385686
50%         0.775968
75%         0.988356
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.850287
std         0.212208
min         0.002386
25%         0.614314
50%         0.975537
75%         0.998951
max         1.000000
dtype: float64


		Downscale of 0.5
CC:	0.8583
DR:	0.7179
FP:	0.1660
FN:	0.1319

Malware stats:


count    1.659000e+03
mean     6.869545e-01
std      2.992347e-01
min      2.741793e-09
25%      4.114548e-01
50%      7.588778e-01
75%      9.662130e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.830215
std         0.227503
min         0.000844
25%         0.588545
50%         0.961805
75%         0.998540
max         1.000000
dtype: float64


		Downscale of 0.6
CC:	0.8668
DR:	0.8010
FP:	0.1633
FN:	0.1164

Malware stats:


count    1990.000000
mean        0.730033
std         0.253779
min         0.000011
25%         0.619831
50%         0.798213
75%         0.942779
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.805407
std         0.247039
min         0.000088
25%         0.546928
50%         0.955000
75%         0.997711
max         1.000000
dtype: float64


		Downscale of 0.7000000000000001
CC:	0.8342
DR:	0.7476
FP:	0.1674
FN:	0.1648

Malware stats:


count    2.322000e+03
mean     7.513755e-01
std      2.661164e-01
min      4.983562e-08
25%      4.784336e-01
50%      8.500344e-01
75%      9.836111e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.796193
std         0.256222
min         0.000030
25%         0.536700
50%         0.947498
75%         0.997998
max         1.000000
dtype: float64


		Downscale of 0.8
CC:	0.7989
DR:	0.6737
FP:	0.1578
FN:	0.2250

Malware stats:


count    2.654000e+03
mean     7.414045e-01
std      2.812362e-01
min      6.415496e-10
25%      4.840918e-01
50%      8.644190e-01
75%      9.941780e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.787304
std         0.262844
min         0.000005
25%         0.515908
50%         0.936831
75%         0.998052
max         1.000000
dtype: float64


		Downscale of 0.9
CC:	0.7824
DR:	0.8684
FP:	0.2741
FN:	0.1439

Malware stats:


count    2.986000e+03
mean     7.367667e-01
std      2.801853e-01
min      1.178909e-09
25%      5.112202e-01
50%      8.667157e-01
75%      9.831193e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.731386e-01
std      2.750734e-01
min      2.318087e-08
25%      4.887798e-01
50%      9.282055e-01
75%      9.980447e-01
max      1.000000e+00
dtype: float64


		Downscale of 1.0
CC:	0.7819
DR:	0.8677
FP:	0.2593
FN:	0.1597

Malware stats:


count    3.318000e+03
mean     7.214340e-01
std      2.671056e-01
min      2.697429e-10
25%      5.520293e-01
50%      8.297241e-01
75%      9.601650e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.575619e-01
std      2.871125e-01
min      1.954325e-08
25%      4.479707e-01
50%      9.216381e-01
75%      9.974889e-01
max      1.000000e+00
dtype: float64




## Dataset for Worms

In [35]:
# Per-vendor classification tables; missing cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv')
microsoft_class.fillna('', inplace=True)
symantec_class = pd.read_csv('data/symantec_classification.csv')
symantec_class.fillna('', inplace=True)
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv')
trendmicro_class.fillna('', inplace=True)

# Rows matching each vendor's worm token. Symantec is matched on `suffix`
# here, whereas the other sections match Symantec on `prefix`.
microsoft_filter = microsoft_class[microsoft_class.type == 'worm']
symantec_filter = symantec_class[symantec_class.suffix == 'worm']
trendmicro_filter = trendmicro_class[trendmicro_class.prefix == 'worm']

# Links that Microsoft labels as worm but that are absent from the TrendMicro
# worm rows. Vectorized `isin` replaces the former per-element
# `i not in ....values` scan (same result, no quadratic lookup).
# NOTE(review): the other sections use the union of all three vendors, and
# `symantec_filter` is not used below - confirm this difference is intended.
microsoft_only = microsoft_filter.link[
    ~microsoft_filter.link.isin(trendmicro_filter.link)
]
display(len(microsoft_only))

links = microsoft_only.reset_index(drop=True)

# Keep only the samples whose link is in the selection above.
filtered_malware = samples[samples.link.isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

6162

Total malware: 3800


In [36]:
# Show the TrendMicro classification rows whose link is in the current `links` selection.
display(trendmicro_class.loc[trendmicro_class['link'].isin(links)])

Unnamed: 0,date,md5,link,family,prefix,suffix
12,2013-04-16,452e20bda5be890364fc3a9905278b1d,YmRhMmY4YjBkNTM4NDllMTkyZjQzODg3N2ZmNTQwNDE,generic,pak,001
31,2013-04-17,8a41cd0a312259a1996e29c8609fa11b,MzhmNGJkYzc3NWZiNGUwNzlmODNhNDZiMGJhNTJiMWY,spnr,troj,11cp13
36,2013-04-17,f0063edef419009b1564e5fba81157db,Mjk3NmYwOTI4Y2EzNDcxMTk2YjBjNGI2OTA1MDZjOTY,fareit,tspy,tpa
38,2013-04-17,e332e11cdafb7683fc186245460a5f2a,NmVhYTQ3OGM2YTc0NDgyOGIxODcwZjJjMzg3YjZjNzA,gen,troj,rcbochr
51,2013-04-18,8a41cd0a312259a1996e29c8609fa11b,NzI3MGE2YmNlY2MzNGQ3ZWFhZGYxYTliODgyMzE0MGE,spnr,troj,11cp13
57,2013-04-18,f5c927945db809946363775a1ee3992d,ZDk0ZDFjY2QwZTI5NDU1OTg5MDBiMzhlMDIxYTljZjk,spnr,troj,02l111
68,2013-04-18,77591ef66f98d6c7db7892b81b131530,YzA1NzIxMzQ2OTlhNGYwNjlmMDc1Yjg0MWEyOTc2NGI,spnr,troj,0bda13
92,2013-04-19,19c70de08fbb2ce178c25d4afccc7397,NDUyYTdiOWMwZGM1NDg4OGEwODhiYWViODhhYjM3NWI,spnr,troj,02k811
96,2013-04-20,241bb8a2ce3461c1edcf59801b0ef427,NmI0MWFhZmQyNGM5NDRlNmJjYTViYTZhNjEzMTE4MmM,spnr,troj,11j212
99,2013-04-20,9352259f72b8ef603ab644de70bc09c4,OWM0NWIwMjkzMGZhNDFlYTg4NDg0MzRiZGMyNTE3MDU,gen,troj,r27cccl


## Testing

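Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), so that the malware/goodware ratio in the training and validation sets varies between runs.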

In [25]:
# Bag-of-imports settings: every ';'-separated token of the `imports` column
# is one feature, kept only if it occurs in at least `min_df` training samples.
cv_token_pattern = u'[^;]+'
min_df = 2

# Sweep the malware downscale factor over 0.1, 0.2, ..., 1.0.
# The grid is derived from integers (k / 10) instead of a float-step
# np.arange: each factor is then the nearest float to the intended decimal
# (0.3 rather than 0.30000000000000004) and the number of steps cannot
# change because of accumulated rounding error.
for i in np.arange(1, 11) / 10:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper from lib.helpers)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector: vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring on the validation split
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    score = confusion_matrix(test_Y, lr.predict(test_X))
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # predict_proba columns follow lr.classes_; look the positions up
    # instead of assuming that label 0 is column 0 and label 1 is column 1.
    class_order = list(lr.classes_)
    malware_col = class_order.index(1)
    goodware_col = class_order.index(0)

    # Probability the model assigns to the *true* class, per true class.
    malware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 1)])
    goodware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 0)])
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, malware_col]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, goodware_col]).describe())
    print()

		Downscale of 0.1
CC:	0.9733
DR:	0.9670
FP:	0.2143
FN:	0.0033

Malware stats:


count    91.000000
mean      0.923375
std       0.125254
min       0.343109
25%       0.911687
50%       0.961884
75%       0.999242
max       0.999976
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.551196e-01
std      1.291835e-01
min      4.145330e-07
25%      9.889712e-01
50%      9.986451e-01
75%      9.999345e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.2
CC:	0.9728
DR:	0.9672
FP:	0.1194
FN:	0.0067

Malware stats:


count    183.000000
mean       0.946750
std        0.133062
min        0.166689
25%        0.975240
50%        0.989928
75%        0.999792
max        0.999998
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.537578e-01
std      1.336838e-01
min      5.324228e-07
25%      9.884742e-01
50%      9.987501e-01
75%      9.999301e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.3
CC:	0.9405
DR:	0.8436
FP:	0.1077
FN:	0.0460

Malware stats:


count    275.000000
mean       0.784474
std        0.258038
min        0.000040
25%        0.665766
50%        0.937410
75%        0.977976
max        0.999993
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.507944e-01
std      1.400493e-01
min      2.474486e-07
25%      9.878421e-01
50%      9.988241e-01
75%      9.999410e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.4
CC:	0.9425
DR:	0.8965
FP:	0.0986
FN:	0.0413

Malware stats:


count    367.000000
mean       0.812444
std        0.214908
min        0.000109
25%        0.733665
50%        0.891672
75%        0.974384
max        0.999998
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.424077e-01
std      1.557841e-01
min      1.157998e-07
25%      9.868106e-01
50%      9.986945e-01
75%      9.999302e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.5
CC:	0.9557
DR:	0.9586
FP:	0.0871
FN:	0.0212

Malware stats:


count    459.000000
mean       0.885942
std        0.154557
min        0.000092
25%        0.847300
50%        0.943381
75%        0.986608
max        0.999994
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.350745e-01
std      1.704479e-01
min      2.504779e-08
25%      9.857028e-01
50%      9.983330e-01
75%      9.999145e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.6
CC:	0.9619
DR:	0.9855
FP:	0.0812
FN:	0.0091

Malware stats:


count    551.000000
mean       0.928139
std        0.131975
min        0.003943
25%        0.941743
50%        0.987624
75%        0.989748
max        0.999967
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.322786e-01
std      1.775133e-01
min      1.995510e-09
25%      9.847260e-01
50%      9.966825e-01
75%      9.999166e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.7
CC:	0.9481
DR:	0.9549
FP:	0.0781
FN:	0.0324

Malware stats:


count    643.000000
mean       0.911130
std        0.188441
min        0.000003
25%        0.934826
50%        0.989122
75%        0.993658
max        0.999999
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.296589e-01
std      1.834051e-01
min      1.601904e-09
25%      9.844658e-01
50%      9.967028e-01
75%      9.999298e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.8
CC:	0.9359
DR:	0.9306
FP:	0.0744
FN:	0.0557

Malware stats:


count    735.000000
mean       0.865953
std        0.224399
min        0.000002
25%        0.869629
50%        0.957270
75%        0.992581
max        0.999999
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.268092e-01
std      1.878498e-01
min      3.122876e-09
25%      9.842688e-01
50%      9.965614e-01
75%      9.999373e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.9
CC:	0.9192
DR:	0.9081
FP:	0.0797
FN:	0.0817

Malware stats:


count    8.270000e+02
mean     8.538701e-01
std      2.352986e-01
min      7.891192e-07
25%      8.636570e-01
50%      9.501600e-01
75%      9.862588e-01
max      9.999992e-01
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.198976e-01
std      2.008439e-01
min      9.149481e-09
25%      9.840403e-01
50%      9.967235e-01
75%      9.999305e-01
max      1.000000e+00
dtype: float64


		Downscale of 1.0
CC:	0.9173
DR:	0.9064
FP:	0.0734
FN:	0.0916

Malware stats:


count    9.190000e+02
mean     8.527681e-01
std      2.500917e-01
min      4.674451e-07
25%      8.359314e-01
50%      9.682761e-01
75%      9.892080e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    9.190000e+02
mean     9.181464e-01
std      2.040309e-01
min      1.733618e-08
25%      9.839213e-01
50%      9.963814e-01
75%      9.999386e-01
max      1.000000e+00
dtype: float64




## Dataset for Viruses

In [18]:
# Per-vendor classification tables; missing cells are replaced by ''.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows selected per vendor for the virus dataset.
# NOTE(review): Symantec is matched on the 'virusdoctor' prefix and TrendMicro
# on the `family` column (not `prefix`, as in the other sections) - confirm
# that both filters really select virus-type samples.
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'virus']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'virusdoctor']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['family'] == 'virus']

# Distinct links matched by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# Keep only the samples whose link is in that union.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 4467


## Testing

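Train one logistic regression (LR) model per malware downscale factor (0.1 to 1.0 in steps of 0.1), so that the malware/goodware ratio in the training and validation sets varies between runs.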

In [19]:
# Bag-of-imports settings: every ';'-separated token of the `imports` column
# is one feature, kept only if it occurs in at least `min_df` training samples.
cv_token_pattern = u'[^;]+'
min_df = 2

# Sweep the malware downscale factor over 0.1, 0.2, ..., 1.0.
# The grid is derived from integers (k / 10) instead of a float-step
# np.arange: each factor is then the nearest float to the intended decimal
# (0.3 rather than 0.30000000000000004) and the number of steps cannot
# change because of accumulated rounding error.
for i in np.arange(1, 11) / 10:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper from lib.helpers)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector: vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring on the validation split
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    score = confusion_matrix(test_Y, lr.predict(test_X))
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # predict_proba columns follow lr.classes_; look the positions up
    # instead of assuming that label 0 is column 0 and label 1 is column 1.
    class_order = list(lr.classes_)
    malware_col = class_order.index(1)
    goodware_col = class_order.index(0)

    # Probability the model assigns to the *true* class, per true class.
    malware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 1)])
    goodware_only = lr.predict_proba(test_X[np.flatnonzero(test_Y == 0)])
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, malware_col]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, goodware_col]).describe())
    print()

		Downscale of 0.1
CC:	0.8998
DR:	0.2432
FP:	0.5909
FN:	0.0724

Malware stats:


count    111.000000
mean       0.277960
std        0.294748
min        0.000009
25%        0.016794
50%        0.199848
75%        0.466603
max        0.940911
dtype: float64


Goodware stats:


count    1116.000000
mean        0.928899
std         0.151273
min         0.006975
25%         0.945831
50%         0.987215
75%         0.996882
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.8656
DR:	0.5202
FP:	0.3862
FN:	0.0930

Malware stats:


count    223.000000
mean       0.507310
std        0.369305
min        0.001326
25%        0.109795
50%        0.620596
75%        0.872086
max        0.999913
dtype: float64


Goodware stats:


count    1116.000000
mean        0.889235
std         0.201934
min         0.001365
25%         0.899302
50%         0.985725
75%         0.994126
max         1.000000
dtype: float64


		Downscale of 0.3
CC:	0.8359
DR:	0.6138
FP:	0.3471
FN:	0.1136

Malware stats:


count    334.000000
mean       0.584147
std        0.329397
min        0.000032
25%        0.330507
50%        0.675649
75%        0.862627
max        0.999997
dtype: float64


Goodware stats:


count    1116.000000
mean        0.858803
std         0.227991
min         0.000291
25%         0.858174
50%         0.965826
75%         0.990903
max         1.000000
dtype: float64


		Downscale of 0.4
CC:	0.8342
DR:	0.7623
FP:	0.3103
FN:	0.0992

Malware stats:


count    446.000000
mean       0.696722
std        0.317618
min        0.000630
25%        0.549515
50%        0.816548
75%        0.952437
max        0.999998
dtype: float64


Goodware stats:


count    1116.000000
mean        0.831571
std         0.255777
min         0.000317
25%         0.803936
50%         0.963056
75%         0.987075
max         1.000000
dtype: float64


		Downscale of 0.5
CC:	0.8369
DR:	0.7939
FP:	0.2629
FN:	0.1072

Malware stats:


count    558.000000
mean       0.732025
std        0.299664
min        0.000379
25%        0.627684
50%        0.820537
75%        0.965282
max        1.000000
dtype: float64


Goodware stats:


count    1116.000000
mean        0.818433
std         0.265190
min         0.000121
25%         0.767200
50%         0.955143
75%         0.986774
max         1.000000
dtype: float64


		Downscale of 0.6
CC:	0.8101
DR:	0.7549
FP:	0.2574
FN:	0.1484

Malware stats:


count    669.000000
mean       0.689506
std        0.321380
min        0.000062
25%        0.509943
50%        0.828011
75%        0.932222
max        1.000000
dtype: float64


Goodware stats:


count    1116.000000
mean        0.806276
std         0.271520
min         0.000093
25%         0.738359
50%         0.947137
75%         0.983236
max         1.000000
dtype: float64


		Downscale of 0.7
CC:	0.7723
DR:	0.6991
FP:	0.2651
FN:	0.2036

Malware stats:


count    7.810000e+02
mean     6.517612e-01
std      3.506729e-01
min      3.827735e-10
25%      3.251641e-01
50%      8.201366e-01
75%      9.357463e-01
max      9.999986e-01
dtype: float64


Goodware stats:


count    1116.000000
mean        0.793152
std         0.282060
min         0.000031
25%         0.718443
50%         0.936643
75%         0.977178
max         1.000000
dtype: float64


		Downscale of 0.8
CC:	0.7535
DR:	0.6917
FP:	0.2628
FN:	0.2348

Malware stats:


count    8.920000e+02
mean     6.365694e-01
std      3.531786e-01
min      1.814998e-09
25%      3.170079e-01
50%      8.356416e-01
75%      9.383500e-01
max      9.998306e-01
dtype: float64


Goodware stats:


count    1116.000000
mean        0.774701
std         0.298423
min         0.000116
25%         0.683425
50%         0.934347
75%         0.974287
max         1.000000
dtype: float64


		Downscale of 0.9
CC:	0.7594
DR:	0.7211
FP:	0.2411
FN:	0.2401

Malware stats:


count    1.004000e+03
mean     6.677321e-01
std      3.424561e-01
min      8.378664e-09
25%      3.875589e-01
50%      8.498956e-01
75%      9.613432e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    1116.000000
mean        0.766688
std         0.302338
min         0.000190
25%         0.661756
50%         0.924666
75%         0.973224
max         1.000000
dtype: float64


		Downscale of 1.0
CC:	0.7836
DR:	0.7894
FP:	0.2197
FN:	0.2131

Malware stats:


count    1.116000e+03
mean     7.321270e-01
std      3.007564e-01
min      3.605102e-07
25%      5.908487e-01
50%      8.649940e-01
75%      9.676963e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    1116.000000
mean        0.746229
std         0.310891
min         0.000042
25%         0.616377
50%         0.895339
75%         0.969030
max         1.000000
dtype: float64


