# Normalize AV Classification

This notebook works out the naming convention each vendor uses for its detection labels, which allows finer-grained filtering when selecting malware samples.

In [49]:
import pandas as pd
import numpy as np
import datetime
import re
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer

# Load the per-sample AV labels. Every column is read as a string so that
# labels are never coerced to numbers; the date column is parsed explicitly
# and becomes the (sorted) index.
pe32_av = pd.read_csv('data/pe32_static_av.csv', dtype=str)
pe32_av['date'] = pd.to_datetime(pe32_av['date'], format='%Y/%m/%d')
pe32_av = pe32_av.set_index('date').sort_index()

# Filter vendors with high presence.
# `pd.Series.from_csv` was deprecated in pandas 0.21 and removed in 1.0, so the
# header-less two-column file is read with `read_csv` and its single data
# column is taken as the Series (index = first column).
vendors_presence = pd.read_csv('data/vendors_presence.csv', header=None, index_col=0).iloc[:, 0]
# Match what `from_csv` returned for header-less files: unnamed Series and index.
vendors_presence.index.name = vendors_presence.name = None
vendors = vendors_presence[vendors_presence > 0.7].keys()

# Remove unused columns: the first two columns are always kept (presumably the
# md5 and link identifiers used by the later cells), every other column is kept
# only when it belongs to a high-presence vendor.
unused_columns = [v for v in pe32_av.columns[2:] if v not in vendors]
pe32_av = pe32_av.drop(unused_columns, axis=1)
# A missing label is recorded as the sentinel string 'clean'.
pe32_av = pe32_av.fillna('clean')

print('Total samples: {}'.format(len(pe32_av)))

Total samples: 388513


In [51]:
# Show the retained vendors as a single comma-separated line.
vendor_listing = ', '.join(vendors)
display(vendor_listing)

'ahnlab-v3, antiy-avl, avast, avg, bitdefender, cat-quickheal, clamav, comodo, drweb, emsisoft, eset-nod32, f-prot, f-secure, fortinet, gdata, ikarus, jiangmin, k7antivirus, k7gw, kaspersky, malwarebytes, mcafee, mcafee-gw-edition, microsoft, microworld-escan, nano-antivirus, nprotect, panda, sophos, superantispyware, symantec, thehacker, trendmicro, trendmicro-housecall, vba32, vipre, virobot'

## Known Naming

Start with vendors that provide their naming convention

### Microsoft

`TYPE DELIM PLATFORM DELIM FAMILY DELIM VARIANT DELIM INFO`

e.g.: `Backdoor:Win32/Caphaw.D!lnk`

In [169]:
# Keep the two leading columns plus the Microsoft label column.
microsoft_columns = list(pe32_av.columns[:2]) + ['microsoft']
microsoft_samples = pe32_av[microsoft_columns]

# Cells equal to the 'clean' sentinel are masked to NaN, then every row that
# contains a NaN is discarded.
not_clean = microsoft_samples != 'clean'
microsoft_samples = microsoft_samples[not_clean].dropna()

# Microsoft label layout: [TYPE:]PLATFORM/FAMILY[.VARIANT][!INFO]
microsoft_re = re.compile(r'((?P<type>[^:]+):)?(?P<platform>[^/]+)/(?P<family>[^.!]+)(\.(?P<variant>[^!]+))?(!(?P<info>.+))?')

def microsoft_parse_naming(x):
    """Split a Microsoft detection label into its naming components.

    Parameters
    ----------
    x : str
        Detection label, e.g. ``'Backdoor:Win32/Caphaw.D!lnk'``.

    Returns
    -------
    dict
        Keys ``'platform'``, ``'type'``, ``'family'``, ``'variant'`` and
        ``'info'``. A component that is absent from the label maps to
        ``None``. A label that does not follow the layout at all (for
        example one without a ``/``) yields a dict whose values are all
        ``None`` instead of raising.
    """
    attr = ['platform', 'type', 'family', 'variant', 'info']
    match = microsoft_re.match(x)
    if match is None:
        # Unparseable label: keep the row, with every component left empty.
        return dict.fromkeys(attr)
    return {a: match.group(a) or None for a in attr}

# Parse every Microsoft label; the resulting frame has one row per sample and
# is indexed by the sample's link.
microsoft_records = [microsoft_parse_naming(label) for label in microsoft_samples['microsoft']]
parsed_micro = pd.DataFrame(microsoft_records, index=microsoft_samples['link'])

# Attach the parsed components through the link column, then discard the raw label.
# NOTE(review): joining on 'link' assumes links are unique per row - confirm.
microsoft_joined = microsoft_samples.join(parsed_micro, on='link')
microsoft_samples = microsoft_joined.drop(['microsoft'], axis=1)

microsoft_samples.to_csv(path_or_buf='data/microsoft_classification.csv')

# Samples per type (counted through the md5 column), most frequent first.
microsoft_type_counts = microsoft_samples.groupby('type')['md5'].count()
display(microsoft_type_counts.sort_values(ascending=False))
display(len(microsoft_samples))

type
trojan              19005
backdoor            16017
worm                11679
trojandownloader     9643
virus                6259
virtool              5796
pws                  5603
adware               3715
trojanspy            2961
ransom               2688
trojandropper        2502
hacktool             1905
rogue                 712
ddos                  413
softwarebundler       395
trojanproxy           384
browsermodifier       241
monitoringtool        206
dialer                136
exploit               136
spammer               127
trojanclicker         107
program                55
constructor            51
joke                   19
tool                   17
settingsmodifier        9
spyware                 8
dos                     6
remoteaccess            3
misleading              2
Name: md5, dtype: int64

90808

### Trendmicro

`PREFIX_THREATNAME.SUFFIX`

e.g.: `troj_fakeav.smby`

In [170]:
# Keep the two leading columns plus the Trendmicro label column.
trendmicro_columns = list(pe32_av.columns[:2]) + ['trendmicro']
trendmicro_samples = pe32_av[trendmicro_columns]

# Mask the 'clean' sentinel to NaN and drop every row that then has a NaN.
has_label = trendmicro_samples != 'clean'
trendmicro_samples = trendmicro_samples[has_label].dropna()

# Trendmicro label layout: [PREFIX_]FAMILY[.SUFFIX]
trendmicro_re = re.compile(r'((?P<prefix>[^_]+)_)?(?P<family>[^.]+)(\.(?P<suffix>(.+)))?')

def trendmicro_parse_naming(x):
    """Split a Trendmicro detection label into its naming components.

    Parameters
    ----------
    x : str
        Detection label, e.g. ``'troj_fakeav.smby'``.

    Returns
    -------
    dict
        Keys ``'prefix'``, ``'family'`` and ``'suffix'``. A component that
        is absent from the label maps to ``None``. A label the pattern
        cannot match (an empty string, or one starting with ``.``) yields a
        dict whose values are all ``None`` instead of raising.
    """
    attr = ['prefix', 'family', 'suffix']
    match = trendmicro_re.match(x)
    if match is None:
        # Unparseable label: keep the row, with every component left empty.
        return dict.fromkeys(attr)
    return {a: match.group(a) or None for a in attr}

# Parse every Trendmicro label into a frame indexed by the sample's link.
trendmicro_records = [trendmicro_parse_naming(label) for label in trendmicro_samples['trendmicro']]
parsed_trend = pd.DataFrame(trendmicro_records, index=trendmicro_samples['link'])

# Attach the parsed components through the link column, then discard the raw label.
# NOTE(review): joining on 'link' assumes links are unique per row - confirm.
trendmicro_joined = trendmicro_samples.join(parsed_trend, on='link')
trendmicro_samples = trendmicro_joined.drop(['trendmicro'], axis=1)

trendmicro_samples.to_csv(path_or_buf='data/trendmicro_classification.csv')

# Samples per prefix (counted through the md5 column), most frequent first.
trendmicro_prefix_counts = trendmicro_samples.groupby('prefix')['md5'].count()
display(trendmicro_prefix_counts.sort_values(ascending=False))
display(len(trendmicro_samples))

prefix
troj            52525
bkdr             7760
worm             7703
pe               6209
tspy             4905
mal              4177
pak              3826
adw              1274
ransom           1087
hktl              993
cryp              530
crck              497
possible          227
rtkt              182
dial              136
pua               134
heurspy           128
hb                114
hs                 59
gray               51
joke               43
spyw               40
ht                 35
ddos               24
h2                 24
ptch               22
adware             20
dialer             14
vbs                13
hackingtools       13
heur               12
test               11
crackingapps        8
spyware             7
js                  6
rap                 5
freeloader          4
apw                 3
brex                3
jokeprograms        3
java                3
expl                3
reg                 2
w2km                2
lnk                 1
bat

92865

### Symantec

`PREFIX[.FAMILY][.SUFFIX][!VARIANT]`

e.g.: `trojan.zbot!gen1`

In [166]:
# Keep the two leading columns plus the Symantec label column.
symantec_columns = list(pe32_av.columns[:2]) + ['symantec']
symantec_samples = pe32_av[symantec_columns]

# Mask the 'clean' sentinel to NaN and drop every row that then has a NaN.
detected = symantec_samples != 'clean'
symantec_samples = symantec_samples[detected].dropna()

# Symantec label layout: PREFIX[.FAMILY][.SUFFIX][!VARIANT]
# The suffix must stop at '!': a greedy `.+` there would swallow the
# `!VARIANT` tail, leaving the variant group permanently empty whenever a
# suffix is present.
symantec_re = re.compile(r'(?P<prefix>[^.!]+)(\.(?P<family>[^!.]+))?(\.(?P<suffix>[^!]+))?(!(?P<variant>.+))?')

def symantec_parse_naming(x):
    """Split a Symantec detection label into its naming components.

    Parameters
    ----------
    x : str
        Detection label, e.g. ``'trojan.zbot!gen1'``.

    Returns
    -------
    dict
        Keys ``'prefix'``, ``'family'``, ``'variant'`` and ``'suffix'``. A
        component that is absent from the label maps to ``None``. A label
        the pattern cannot match (an empty string, or one starting with
        ``.`` or ``!``) yields a dict whose values are all ``None`` instead
        of raising.
    """
    attr = ['prefix', 'family', 'variant', 'suffix']
    match = symantec_re.match(x)
    if match is None:
        # Unparseable label: keep the row, with every component left empty.
        return dict.fromkeys(attr)
    return {a: match.group(a) or None for a in attr}

# Parse every Symantec label into a frame indexed by the sample's link.
symantec_records = [symantec_parse_naming(label) for label in symantec_samples['symantec']]
parsed_syman = pd.DataFrame(symantec_records, index=symantec_samples['link'])

# Attach the parsed components through the link column, then discard the raw label.
# NOTE(review): joining on 'link' assumes links are unique per row - confirm.
symantec_joined = symantec_samples.join(parsed_syman, on='link')
symantec_samples = symantec_joined.drop(['symantec'], axis=1)

symantec_samples.to_csv(path_or_buf='data/symantec_classification.csv')

# Samples per prefix (counted through the md5 column), most frequent first.
symantec_prefix_counts = symantec_samples.groupby('prefix')['md5'].count()
display(symantec_prefix_counts.sort_values(ascending=False))
display(len(symantec_samples))

prefix
trojan                34065
ws                    20860
w32                   17004
backdoor               9622
suspicious             7009
infostealer            5928
downloader             5527
trojan horse           5212
adware                 3958
packed                 2903
sape                   2111
heur                   1789
hacktool                778
pua                     668
smg                     371
bloodhound              313
spyware                 310
securityrisk            265
dialer                  198
ransom                   96
passwordrevealer         72
w95                      64
yontoo                   63
download                 50
meterpreter              43
searchprotect            37
secshieldfraud           33
netcat                   32
virusdoctor              29
bitcoinminer             29
                      ...  
fake bleem trojan         1
torntv                    1
iminent                   1
iframe                    1
iepatch      

120092