# Extract Specific AV Samples

Extract the samples that a specific AV vendor has classified, i.e. the samples that vendor labels as either goodware or malware.

In [1]:
import pandas as pd
from IPython.display import display

# Read every column of the AV classification CSV as text, then key the rows
# by the 'link' column.
av_class = pd.read_csv('data/mined_data/pe32_static_av.csv', dtype=str).set_index('link')

## Microsoft Samples

In [2]:
# Microsoft labels are parsed as `type:platform/family[.variant][!info]`.
# - The variant character class excludes '!' so that a trailing `!info` suffix is
#   captured in its own column instead of being swallowed by a greedy variant.
# - The optional wrappers are non-capturing, so str.extract() returns exactly the
#   five named columns and no positional columns have to be dropped afterwards.
microsoft_re = (
    r'(?P<type>\w+):(?P<platform>\w+)/(?P<family>\w+)'
    r'(?:\.(?P<variant>[^\s!]+))?'
    r'(?:!(?P<info>\S+))?'
)

# Keep only samples for which the 'microsoft' column holds a verdict.
microsoft_samples = av_class.filter(items=['microsoft']).dropna()
is_clean = microsoft_samples.microsoft == 'clean'
microsoft_clean = microsoft_samples[is_clean]
microsoft_malware = microsoft_samples[~is_clean]

# Guard the denominator so an empty vendor column reports 0.00% instead of
# raising ZeroDivisionError.
total_samples = len(microsoft_samples)
pct_denominator = max(total_samples, 1)

print('Microsoft total samples: {}'.format(total_samples))
print('Malware count: {} ({:.2f}%)'.format(len(microsoft_malware), len(microsoft_malware) / pct_denominator * 100))
print('Goodware count: {} ({:.2f}%)'.format(len(microsoft_clean), len(microsoft_clean) / pct_denominator * 100))

# Split every malware label into its naming components (one column per named
# group; labels that do not match the pattern yield NaN in every column).
microsoft_naming = microsoft_malware.microsoft.str.extract(microsoft_re, expand=True)

# Shares are relative to all malware samples, so labels that failed to parse
# still count in the denominator.
print('\nTypes:')
display(microsoft_naming['type'].value_counts() / len(microsoft_malware))
print('\nPlatforms:')
display(microsoft_naming['platform'].value_counts() / len(microsoft_malware))


# Save the naming together with the clean samples; after dropping the raw
# 'microsoft' label column, clean rows carry NaN in every naming column.
microsoft_classification = pd.concat([microsoft_clean, microsoft_naming]).drop(columns=['microsoft'])
microsoft_classification.to_csv(path_or_buf='data/mined_data/microsoft_classification.csv')

Microsoft total samples: 283282
Malware count: 90808 (32.06%)
Goodware count: 192474 (67.94%)

Types:


trojan              0.209288
backdoor            0.176383
worm                0.128612
trojandownloader    0.106191
virus               0.068926
virtool             0.063827
pws                 0.061702
adware              0.040910
trojanspy           0.032607
ransom              0.029601
trojandropper       0.027553
hacktool            0.020978
rogue               0.007841
ddos                0.004548
softwarebundler     0.004350
trojanproxy         0.004229
browsermodifier     0.002654
monitoringtool      0.002269
dialer              0.001498
exploit             0.001498
spammer             0.001399
trojanclicker       0.001178
program             0.000606
constructor         0.000562
joke                0.000209
tool                0.000187
settingsmodifier    0.000099
spyware             0.000088
dos                 0.000066
remoteaccess        0.000033
misleading          0.000022
Name: type, dtype: float64


Platforms:


win32         0.918388
msil          0.070071
winnt         0.005803
autoit        0.002368
bat           0.000870
vbs           0.000529
win95         0.000341
win64         0.000264
js            0.000253
inf           0.000176
html          0.000154
irc           0.000121
dos           0.000121
o97m          0.000110
androidos     0.000077
python        0.000077
java          0.000066
powershell    0.000033
w97m          0.000033
php           0.000022
wm            0.000011
winreg        0.000011
win16         0.000011
Name: platform, dtype: float64