# LR Filtered Dataset

Tests the logistic-regression classifier on datasets filtered to a single malware type at a time (trojan, backdoor, worm, virus), each combined with the full goodware set.

In [2]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import lib.helpers as jcfg_helpers


# Load the labelled PE32 import table, parse the `date` column and use it as a
# chronologically sorted index.
samples = (
    pd.read_csv('data/pe32_imports_labeled.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y/%m/%d'))
    .set_index('date')
    .sort_index()
)

# Partition on the `malware` label: rows with 0 are goodware, rows with 1 are malware.
goodware = samples.loc[samples['malware'] == 0]
malware = samples.loc[samples['malware'] == 1]

print('Total goodware: {}'.format(goodware.shape[0]))
print('Total malware: {}'.format(malware.shape[0]))

Total goodware: 34509
Total malware: 67873


## Dataset for Trojans

In [2]:
# Vendor classification tables; missing cells become '' so the equality
# filters below never compare against NaN.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows each vendor labels as a trojan (each vendor uses its own column and spelling).
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'trojan']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'trojan']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'troj']

# Distinct sample links flagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# NOTE(review): rows are selected from `samples`, not from `malware` --
# presumably only malware rows carry a vendor link; confirm.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 41870


## Testing

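Train one logistic-regression model per malware downscale factor (0.1 to 1.0), varying the malware/goodware ratio in the training and validation sets.

**Note:** the output saved below is identical, number for number, to the Backdoors run further down, and its validation size (3318 goodware samples) matches 25% of the 13273 backdoor samples rather than of the 41870 trojans. It appears to have been produced while `filtered_malware` still held the backdoor subset; re-run the notebook from top to bottom before drawing conclusions about trojans.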

In [12]:
# Train and score one logistic-regression model per malware downscale factor.

# Vectorizer settings do not depend on the downscale factor, so set them once.
cv_token_pattern = u'[^;]+'  # a token is any run of characters other than ';'
min_df = 2                   # ignore tokens present in fewer than 2 training rows

# Exact tenths 0.1 .. 1.0. np.arange with a float step yields drifted values
# (e.g. 0.30000000000000004) and does not guarantee the number of steps.
for i in np.arange(1, 11) / 10.0:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper, called once per factor)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector -- the vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] pins the matrix to 2x2 in (goodware, malware) order even
    # if one class is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # Predicted probabilities for the validation rows of each true class.
    malware_rows = np.flatnonzero(np.asarray(test_Y) == 1)
    goodware_rows = np.flatnonzero(np.asarray(test_Y) == 0)
    malware_only = lr.predict_proba(test_X[malware_rows])
    goodware_only = lr.predict_proba(test_X[goodware_rows])

    # Columns follow lr.classes_; assumes classes are [0, 1], so column 1 is
    # P(malware) and column 0 is P(goodware) -- TODO confirm.
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, 1]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, 0]).describe())
    print()

		Downscale of 0.1
CC:	0.9150
DR:	0.3082
FP:	0.4426
FN:	0.0661

Malware stats:


count    3.310000e+02
mean     3.633775e-01
std      3.564969e-01
min      2.154350e-07
25%      5.470049e-02
50%      2.420007e-01
75%      6.038979e-01
max      9.999999e-01
dtype: float64


Goodware stats:


count    3318.000000
mean        0.942601
std         0.132651
min         0.024514
25%         0.945300
50%         0.989109
75%         0.999479
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.8963
DR:	0.5415
FP:	0.2329
FN:	0.0865

Malware stats:


count    6.630000e+02
mean     5.489624e-01
std      3.640254e-01
min      8.770690e-08
25%      1.589174e-01
50%      6.108304e-01
75%      9.280080e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.912766
std         0.150912
min         0.005487
25%         0.841083
50%         0.982173
75%         0.999632
max         1.000000
dtype: float64


		Downscale of 0.3
CC:	0.8855
DR:	0.6583
FP:	0.1904
FN:	0.0970

Malware stats:


count    9.950000e+02
mean     6.780235e-01
std      3.185661e-01
min      7.130759e-10
25%      3.296162e-01
50%      7.444049e-01
75%      9.825324e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.870535
std         0.191155
min         0.001401
25%         0.670384
50%         0.981594
75%         0.999399
max         1.000000
dtype: float64


		Downscale of 0.4
CC:	0.8637
DR:	0.6534
FP:	0.1663
FN:	0.1276

Malware stats:


count    1327.000000
mean        0.665938
std         0.329939
min         0.000001
25%         0.385686
50%         0.775968
75%         0.988356
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.850287
std         0.212208
min         0.002386
25%         0.614314
50%         0.975537
75%         0.998951
max         1.000000
dtype: float64


		Downscale of 0.5
CC:	0.8583
DR:	0.7179
FP:	0.1660
FN:	0.1319

Malware stats:


count    1.659000e+03
mean     6.869545e-01
std      2.992347e-01
min      2.741793e-09
25%      4.114548e-01
50%      7.588778e-01
75%      9.662130e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.830215
std         0.227503
min         0.000844
25%         0.588545
50%         0.961805
75%         0.998540
max         1.000000
dtype: float64


		Downscale of 0.6
CC:	0.8668
DR:	0.8010
FP:	0.1633
FN:	0.1164

Malware stats:


count    1990.000000
mean        0.730033
std         0.253779
min         0.000011
25%         0.619831
50%         0.798213
75%         0.942779
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.805407
std         0.247039
min         0.000088
25%         0.546928
50%         0.955000
75%         0.997711
max         1.000000
dtype: float64


		Downscale of 0.7
CC:	0.8342
DR:	0.7476
FP:	0.1674
FN:	0.1648

Malware stats:


count    2.322000e+03
mean     7.513755e-01
std      2.661164e-01
min      4.983562e-08
25%      4.784336e-01
50%      8.500344e-01
75%      9.836111e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.796193
std         0.256222
min         0.000030
25%         0.536700
50%         0.947498
75%         0.997998
max         1.000000
dtype: float64


		Downscale of 0.8
CC:	0.7989
DR:	0.6737
FP:	0.1578
FN:	0.2250

Malware stats:


count    2.654000e+03
mean     7.414045e-01
std      2.812362e-01
min      6.415496e-10
25%      4.840918e-01
50%      8.644190e-01
75%      9.941780e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.787304
std         0.262844
min         0.000005
25%         0.515908
50%         0.936831
75%         0.998052
max         1.000000
dtype: float64


		Downscale of 0.9
CC:	0.7824
DR:	0.8684
FP:	0.2741
FN:	0.1439

Malware stats:


count    2.986000e+03
mean     7.367667e-01
std      2.801853e-01
min      1.178909e-09
25%      5.112202e-01
50%      8.667157e-01
75%      9.831193e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.731386e-01
std      2.750734e-01
min      2.318087e-08
25%      4.887798e-01
50%      9.282055e-01
75%      9.980447e-01
max      1.000000e+00
dtype: float64


		Downscale of 1.0
CC:	0.7819
DR:	0.8677
FP:	0.2593
FN:	0.1597

Malware stats:


count    3.318000e+03
mean     7.214340e-01
std      2.671056e-01
min      2.697429e-10
25%      5.520293e-01
50%      8.297241e-01
75%      9.601650e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.575619e-01
std      2.871125e-01
min      1.954325e-08
25%      4.479707e-01
50%      9.216381e-01
75%      9.974889e-01
max      1.000000e+00
dtype: float64




## Dataset for Backdoors

In [20]:
# Vendor classification tables; missing cells become '' so the equality
# filters below never compare against NaN.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows each vendor labels as a backdoor (each vendor uses its own column and spelling).
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'backdoor']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'backdoor']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'bkdr']

# Distinct sample links flagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# NOTE(review): rows are selected from `samples`, not from `malware` --
# presumably only malware rows carry a vendor link; confirm.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 13273


## Testing

Train one logistic-regression model per malware downscale factor (0.1 to 1.0), varying the malware/goodware ratio in the training and validation sets.

In [9]:
# Train and score one logistic-regression model per malware downscale factor.

# Vectorizer settings do not depend on the downscale factor, so set them once.
cv_token_pattern = u'[^;]+'  # a token is any run of characters other than ';'
min_df = 2                   # ignore tokens present in fewer than 2 training rows

# Exact tenths 0.1 .. 1.0. np.arange with a float step yields drifted values
# (e.g. 0.30000000000000004) and does not guarantee the number of steps.
for i in np.arange(1, 11) / 10.0:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper, called once per factor)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector -- the vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] pins the matrix to 2x2 in (goodware, malware) order even
    # if one class is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # Predicted probabilities for the validation rows of each true class.
    malware_rows = np.flatnonzero(np.asarray(test_Y) == 1)
    goodware_rows = np.flatnonzero(np.asarray(test_Y) == 0)
    malware_only = lr.predict_proba(test_X[malware_rows])
    goodware_only = lr.predict_proba(test_X[goodware_rows])

    # Columns follow lr.classes_; assumes classes are [0, 1], so column 1 is
    # P(malware) and column 0 is P(goodware) -- TODO confirm.
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, 1]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, 0]).describe())
    print()

		Downscale of 0.1
CC:	0.9150
DR:	0.3082
FP:	0.4426
FN:	0.0661

Malware stats:


count    3.310000e+02
mean     3.633775e-01
std      3.564969e-01
min      2.154350e-07
25%      5.470049e-02
50%      2.420007e-01
75%      6.038979e-01
max      9.999999e-01
dtype: float64


Goodware stats:


count    3318.000000
mean        0.942601
std         0.132651
min         0.024514
25%         0.945300
50%         0.989109
75%         0.999479
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.8963
DR:	0.5415
FP:	0.2329
FN:	0.0865

Malware stats:


count    6.630000e+02
mean     5.489624e-01
std      3.640254e-01
min      8.770690e-08
25%      1.589174e-01
50%      6.108304e-01
75%      9.280080e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.912766
std         0.150912
min         0.005487
25%         0.841083
50%         0.982173
75%         0.999632
max         1.000000
dtype: float64


		Downscale of 0.30000000000000004
CC:	0.8855
DR:	0.6583
FP:	0.1904
FN:	0.0970

Malware stats:


count    9.950000e+02
mean     6.780235e-01
std      3.185661e-01
min      7.130759e-10
25%      3.296162e-01
50%      7.444049e-01
75%      9.825324e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.870535
std         0.191155
min         0.001401
25%         0.670384
50%         0.981594
75%         0.999399
max         1.000000
dtype: float64


		Downscale of 0.4
CC:	0.8637
DR:	0.6534
FP:	0.1663
FN:	0.1276

Malware stats:


count    1327.000000
mean        0.665938
std         0.329939
min         0.000001
25%         0.385686
50%         0.775968
75%         0.988356
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.850287
std         0.212208
min         0.002386
25%         0.614314
50%         0.975537
75%         0.998951
max         1.000000
dtype: float64


		Downscale of 0.5
CC:	0.8583
DR:	0.7179
FP:	0.1660
FN:	0.1319

Malware stats:


count    1.659000e+03
mean     6.869545e-01
std      2.992347e-01
min      2.741793e-09
25%      4.114548e-01
50%      7.588778e-01
75%      9.662130e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.830215
std         0.227503
min         0.000844
25%         0.588545
50%         0.961805
75%         0.998540
max         1.000000
dtype: float64


		Downscale of 0.6
CC:	0.8668
DR:	0.8010
FP:	0.1633
FN:	0.1164

Malware stats:


count    1990.000000
mean        0.730033
std         0.253779
min         0.000011
25%         0.619831
50%         0.798213
75%         0.942779
max         1.000000
dtype: float64


Goodware stats:


count    3318.000000
mean        0.805407
std         0.247039
min         0.000088
25%         0.546928
50%         0.955000
75%         0.997711
max         1.000000
dtype: float64


		Downscale of 0.7000000000000001
CC:	0.8342
DR:	0.7476
FP:	0.1674
FN:	0.1648

Malware stats:


count    2.322000e+03
mean     7.513755e-01
std      2.661164e-01
min      4.983562e-08
25%      4.784336e-01
50%      8.500344e-01
75%      9.836111e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.796193
std         0.256222
min         0.000030
25%         0.536700
50%         0.947498
75%         0.997998
max         1.000000
dtype: float64


		Downscale of 0.8
CC:	0.7989
DR:	0.6737
FP:	0.1578
FN:	0.2250

Malware stats:


count    2.654000e+03
mean     7.414045e-01
std      2.812362e-01
min      6.415496e-10
25%      4.840918e-01
50%      8.644190e-01
75%      9.941780e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3318.000000
mean        0.787304
std         0.262844
min         0.000005
25%         0.515908
50%         0.936831
75%         0.998052
max         1.000000
dtype: float64


		Downscale of 0.9
CC:	0.7824
DR:	0.8684
FP:	0.2741
FN:	0.1439

Malware stats:


count    2.986000e+03
mean     7.367667e-01
std      2.801853e-01
min      1.178909e-09
25%      5.112202e-01
50%      8.667157e-01
75%      9.831193e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.731386e-01
std      2.750734e-01
min      2.318087e-08
25%      4.887798e-01
50%      9.282055e-01
75%      9.980447e-01
max      1.000000e+00
dtype: float64


		Downscale of 1.0
CC:	0.7819
DR:	0.8677
FP:	0.2593
FN:	0.1597

Malware stats:


count    3.318000e+03
mean     7.214340e-01
std      2.671056e-01
min      2.697429e-10
25%      5.520293e-01
50%      8.297241e-01
75%      9.601650e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    3.318000e+03
mean     7.575619e-01
std      2.871125e-01
min      1.954325e-08
25%      4.479707e-01
50%      9.216381e-01
75%      9.974889e-01
max      1.000000e+00
dtype: float64




## Dataset for Worms

In [15]:
# Vendor classification tables; missing cells become '' so the equality
# filters below never compare against NaN.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows each vendor labels as a worm.
# NOTE(review): Symantec is matched on the `suffix` column here, not `prefix`
# -- presumably that is where Symantec names carry the worm marker; confirm.
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'worm']
symantec_filter = symantec_class.loc[symantec_class['suffix'] == 'worm']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['prefix'] == 'worm']

# Distinct sample links flagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# NOTE(review): rows are selected from `samples`, not from `malware` --
# presumably only malware rows carry a vendor link; confirm.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 9418


## Testing

Train one logistic-regression model per malware downscale factor (0.1 to 1.0), varying the malware/goodware ratio in the training and validation sets.

In [16]:
# Train and score one logistic-regression model per malware downscale factor.

# Vectorizer settings do not depend on the downscale factor, so set them once.
cv_token_pattern = u'[^;]+'  # a token is any run of characters other than ';'
min_df = 2                   # ignore tokens present in fewer than 2 training rows

# Exact tenths 0.1 .. 1.0. np.arange with a float step yields drifted values
# (e.g. 0.30000000000000004) and does not guarantee the number of steps.
for i in np.arange(1, 11) / 10.0:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper, called once per factor)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector -- the vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] pins the matrix to 2x2 in (goodware, malware) order even
    # if one class is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # Predicted probabilities for the validation rows of each true class.
    malware_rows = np.flatnonzero(np.asarray(test_Y) == 1)
    goodware_rows = np.flatnonzero(np.asarray(test_Y) == 0)
    malware_only = lr.predict_proba(test_X[malware_rows])
    goodware_only = lr.predict_proba(test_X[goodware_rows])

    # Columns follow lr.classes_; assumes classes are [0, 1], so column 1 is
    # P(malware) and column 0 is P(goodware) -- TODO confirm.
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, 1]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, 0]).describe())
    print()

		Downscale of 0.1
CC:	0.9289
DR:	0.4596
FP:	0.3455
FN:	0.0524

Malware stats:


count    235.000000
mean       0.458184
std        0.354555
min        0.000011
25%        0.100127
50%        0.464396
75%        0.795255
max        0.999970
dtype: float64


Goodware stats:


count    2354.000000
mean        0.953813
std         0.126771
min         0.000100
25%         0.971733
50%         0.996608
75%         0.999780
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.9359
DR:	0.8043
FP:	0.1906
FN:	0.0390

Malware stats:


count    470.000000
mean       0.785789
std        0.298460
min        0.003958
25%        0.644103
50%        0.960739
75%        0.995203
max        1.000000
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.415555e-01
std      1.480760e-01
min      5.405406e-08
25%      9.652621e-01
50%      9.964590e-01
75%      9.997266e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.3
CC:	0.9402
DR:	0.8909
FP:	0.1442
FN:	0.0331

Malware stats:


count    7.060000e+02
mean     7.919611e-01
std      2.518206e-01
min      4.732057e-08
25%      6.992951e-01
50%      8.934974e-01
75%      9.864473e-01
max      9.999998e-01
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.334788e-01
std      1.656524e-01
min      1.070076e-09
25%      9.627645e-01
50%      9.950599e-01
75%      9.997185e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.4
CC:	0.9381
DR:	0.9171
FP:	0.1274
FN:	0.0338

Malware stats:


count    9.410000e+02
mean     8.307211e-01
std      2.111391e-01
min      7.631981e-07
25%      8.015404e-01
50%      8.853044e-01
75%      9.818147e-01
max      9.999999e-01
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.227005e-01
std      1.827241e-01
min      4.060330e-10
25%      9.545321e-01
50%      9.941348e-01
75%      9.996809e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.5
CC:	0.9439
DR:	0.9482
FP:	0.1093
FN:	0.0268

Malware stats:


count    1177.000000
mean        0.886166
std         0.172799
min         0.000037
25%         0.890899
50%         0.932763
75%         0.988963
max         1.000000
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.162957e-01
std      1.926773e-01
min      3.016032e-12
25%      9.473496e-01
50%      9.940020e-01
75%      9.996967e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.6
CC:	0.9373
DR:	0.9476
FP:	0.1080
FN:	0.0327

Malware stats:


count    1412.000000
mean        0.874777
std         0.190467
min         0.000002
25%         0.829165
50%         0.925224
75%         0.992236
max         1.000000
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.117562e-01
std      2.005343e-01
min      1.154410e-12
25%      9.443304e-01
50%      9.944057e-01
75%      9.997307e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.7
CC:	0.9053
DR:	0.8731
FP:	0.1057
FN:	0.0873

Malware stats:


count    1.647000e+03
mean     7.949810e-01
std      2.558930e-01
min      1.854896e-08
25%      7.906064e-01
50%      8.765433e-01
75%      9.546134e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.083963e-01
std      2.059550e-01
min      3.272937e-13
25%      9.386038e-01
50%      9.938040e-01
75%      9.997438e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.8
CC:	0.8867
DR:	0.8338
FP:	0.0961
FN:	0.1252

Malware stats:


count    1.883000e+03
mean     7.713083e-01
std      2.799545e-01
min      9.965926e-08
25%      7.662966e-01
50%      8.681887e-01
75%      9.602606e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    2.354000e+03
mean     9.040380e-01
std      2.121797e-01
min      1.439737e-12
25%      9.346576e-01
50%      9.939272e-01
75%      9.997099e-01
max      1.000000e+00
dtype: float64


		Downscale of 0.9
CC:	0.8904
DR:	0.8602
FP:	0.0962
FN:	0.1205

Malware stats:


count    2118.000000
mean        0.791400
std         0.283141
min         0.000006
25%         0.787496
50%         0.907688
75%         0.985182
max         1.000000
dtype: float64


Goodware stats:


count    2.354000e+03
mean     8.915860e-01
std      2.229390e-01
min      2.915446e-13
25%      9.152757e-01
50%      9.907384e-01
75%      9.996770e-01
max      1.000000e+00
dtype: float64


		Downscale of 1.0
CC:	0.8717
DR:	0.8309
FP:	0.0953
FN:	0.1563

Malware stats:


count    2.354000e+03
mean     7.933651e-01
std      3.097255e-01
min      3.634418e-09
25%      7.942247e-01
50%      9.336490e-01
75%      9.880253e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    2.354000e+03
mean     8.842927e-01
std      2.302343e-01
min      1.227907e-13
25%      9.018958e-01
50%      9.897749e-01
75%      9.996078e-01
max      1.000000e+00
dtype: float64




## Dataset for Virus

In [18]:
# Vendor classification tables; missing cells become '' so the equality
# filters below never compare against NaN.
microsoft_class = pd.read_csv('data/microsoft_classification.csv').fillna('')
symantec_class = pd.read_csv('data/symantec_classification.csv').fillna('')
trendmicro_class = pd.read_csv('data/trendmicro_classification.csv').fillna('')

# Rows each vendor labels as a virus.
# NOTE(review): Symantec is matched on prefix 'virusdoctor' and Trend Micro on
# the `family` column (not `prefix`) -- confirm that both really select
# viruses rather than a single named family.
microsoft_filter = microsoft_class.loc[microsoft_class['type'] == 'virus']
symantec_filter = symantec_class.loc[symantec_class['prefix'] == 'virusdoctor']
trendmicro_filter = trendmicro_class.loc[trendmicro_class['family'] == 'virus']

# Distinct sample links flagged by at least one of the three vendors.
links = pd.concat([
    microsoft_filter['link'],
    symantec_filter['link'],
    trendmicro_filter['link'],
]).unique()

# NOTE(review): rows are selected from `samples`, not from `malware` --
# presumably only malware rows carry a vendor link; confirm.
filtered_malware = samples.loc[samples['link'].isin(links)]

print('Total malware: {}'.format(len(filtered_malware)))

Total malware: 4467


## Testing

Train one logistic-regression model per malware downscale factor (0.1 to 1.0), varying the malware/goodware ratio in the training and validation sets.

In [19]:
# Train and score one logistic-regression model per malware downscale factor.

# Vectorizer settings do not depend on the downscale factor, so set them once.
cv_token_pattern = u'[^;]+'  # a token is any run of characters other than ';'
min_df = 2                   # ignore tokens present in fewer than 2 training rows

# Exact tenths 0.1 .. 1.0. np.arange with a float step yields drifted values
# (e.g. 0.30000000000000004) and does not guarantee the number of steps.
for i in np.arange(1, 11) / 10.0:
    print('\t\tDownscale of {:.1f}'.format(i))

    ################
    # Training/Validation creation (project helper, called once per factor)
    training, validation = jcfg_helpers.train_test_split(
        pd.concat([goodware, filtered_malware]),
        validation_size=0.25,
        balanced=True,
        malware_downscale=i)

    ################
    # Feature Vector -- the vocabulary is fitted on the training split only
    cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
    train_X = cv.fit_transform(training.imports)

    ################
    # Logistic Regression
    train_Y = training.malware
    lr = LogisticRegression().fit(train_X, train_Y)

    ################
    # Scoring
    test_X = cv.transform(validation.imports)
    test_Y = validation.malware
    # labels=[0, 1] pins the matrix to 2x2 in (goodware, malware) order even
    # if one class is missing from both the truth and the predictions.
    score = confusion_matrix(test_Y, lr.predict(test_X), labels=[0, 1])
    fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
    print('CC:\t{:.4f}'.format(cc_rate))
    print('DR:\t{:.4f}'.format(det_rate))
    print('FP:\t{:.4f}'.format(fp_rate))
    print('FN:\t{:.4f}'.format(fn_rate))

    # Predicted probabilities for the validation rows of each true class.
    malware_rows = np.flatnonzero(np.asarray(test_Y) == 1)
    goodware_rows = np.flatnonzero(np.asarray(test_Y) == 0)
    malware_only = lr.predict_proba(test_X[malware_rows])
    goodware_only = lr.predict_proba(test_X[goodware_rows])

    # Columns follow lr.classes_; assumes classes are [0, 1], so column 1 is
    # P(malware) and column 0 is P(goodware) -- TODO confirm.
    print('\nMalware stats:')
    display(pd.Series(malware_only[:, 1]).describe())
    print('\nGoodware stats:')
    display(pd.Series(goodware_only[:, 0]).describe())
    print()

		Downscale of 0.1
CC:	0.8998
DR:	0.2432
FP:	0.5909
FN:	0.0724

Malware stats:


count    111.000000
mean       0.277960
std        0.294748
min        0.000009
25%        0.016794
50%        0.199848
75%        0.466603
max        0.940911
dtype: float64


Goodware stats:


count    1116.000000
mean        0.928899
std         0.151273
min         0.006975
25%         0.945831
50%         0.987215
75%         0.996882
max         1.000000
dtype: float64


		Downscale of 0.2
CC:	0.8656
DR:	0.5202
FP:	0.3862
FN:	0.0930

Malware stats:


count    223.000000
mean       0.507310
std        0.369305
min        0.001326
25%        0.109795
50%        0.620596
75%        0.872086
max        0.999913
dtype: float64


Goodware stats:


count    1116.000000
mean        0.889235
std         0.201934
min         0.001365
25%         0.899302
50%         0.985725
75%         0.994126
max         1.000000
dtype: float64


		Downscale of 0.3
CC:	0.8359
DR:	0.6138
FP:	0.3471
FN:	0.1136

Malware stats:


count    334.000000
mean       0.584147
std        0.329397
min        0.000032
25%        0.330507
50%        0.675649
75%        0.862627
max        0.999997
dtype: float64


Goodware stats:


count    1116.000000
mean        0.858803
std         0.227991
min         0.000291
25%         0.858174
50%         0.965826
75%         0.990903
max         1.000000
dtype: float64


		Downscale of 0.4
CC:	0.8342
DR:	0.7623
FP:	0.3103
FN:	0.0992

Malware stats:


count    446.000000
mean       0.696722
std        0.317618
min        0.000630
25%        0.549515
50%        0.816548
75%        0.952437
max        0.999998
dtype: float64


Goodware stats:


count    1116.000000
mean        0.831571
std         0.255777
min         0.000317
25%         0.803936
50%         0.963056
75%         0.987075
max         1.000000
dtype: float64


		Downscale of 0.5
CC:	0.8369
DR:	0.7939
FP:	0.2629
FN:	0.1072

Malware stats:


count    558.000000
mean       0.732025
std        0.299664
min        0.000379
25%        0.627684
50%        0.820537
75%        0.965282
max        1.000000
dtype: float64


Goodware stats:


count    1116.000000
mean        0.818433
std         0.265190
min         0.000121
25%         0.767200
50%         0.955143
75%         0.986774
max         1.000000
dtype: float64


		Downscale of 0.6
CC:	0.8101
DR:	0.7549
FP:	0.2574
FN:	0.1484

Malware stats:


count    669.000000
mean       0.689506
std        0.321380
min        0.000062
25%        0.509943
50%        0.828011
75%        0.932222
max        1.000000
dtype: float64


Goodware stats:


count    1116.000000
mean        0.806276
std         0.271520
min         0.000093
25%         0.738359
50%         0.947137
75%         0.983236
max         1.000000
dtype: float64


		Downscale of 0.7
CC:	0.7723
DR:	0.6991
FP:	0.2651
FN:	0.2036

Malware stats:


count    7.810000e+02
mean     6.517612e-01
std      3.506729e-01
min      3.827735e-10
25%      3.251641e-01
50%      8.201366e-01
75%      9.357463e-01
max      9.999986e-01
dtype: float64


Goodware stats:


count    1116.000000
mean        0.793152
std         0.282060
min         0.000031
25%         0.718443
50%         0.936643
75%         0.977178
max         1.000000
dtype: float64


		Downscale of 0.8
CC:	0.7535
DR:	0.6917
FP:	0.2628
FN:	0.2348

Malware stats:


count    8.920000e+02
mean     6.365694e-01
std      3.531786e-01
min      1.814998e-09
25%      3.170079e-01
50%      8.356416e-01
75%      9.383500e-01
max      9.998306e-01
dtype: float64


Goodware stats:


count    1116.000000
mean        0.774701
std         0.298423
min         0.000116
25%         0.683425
50%         0.934347
75%         0.974287
max         1.000000
dtype: float64


		Downscale of 0.9
CC:	0.7594
DR:	0.7211
FP:	0.2411
FN:	0.2401

Malware stats:


count    1.004000e+03
mean     6.677321e-01
std      3.424561e-01
min      8.378664e-09
25%      3.875589e-01
50%      8.498956e-01
75%      9.613432e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    1116.000000
mean        0.766688
std         0.302338
min         0.000190
25%         0.661756
50%         0.924666
75%         0.973224
max         1.000000
dtype: float64


		Downscale of 1.0
CC:	0.7836
DR:	0.7894
FP:	0.2197
FN:	0.2131

Malware stats:


count    1.116000e+03
mean     7.321270e-01
std      3.007564e-01
min      3.605102e-07
25%      5.908487e-01
50%      8.649940e-01
75%      9.676963e-01
max      1.000000e+00
dtype: float64


Goodware stats:


count    1116.000000
mean        0.746229
std         0.310891
min         0.000042
25%         0.616377
50%         0.895339
75%         0.969030
max         1.000000
dtype: float64


