# Normalize AV Classifcation

Trying to figure out the naming convention each vendor uses, this allows a finer filter on choosing malware.

In [22]:
import pandas as pd
import numpy as np
import datetime
import re
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer

pe32_av = pd.read_csv('data/pe32_static_av.csv', dtype=str)
pe32_av['date'] = pd.to_datetime(pe32_av['date'], format='%Y/%m/%d')
pe32_av = pe32_av.set_index('date').sort_index()

# Filter vendors with high presence
vendors_presence = pd.Series.from_csv('data/vendors_presence.csv')
vendors = vendors_presence[vendors_presence > 0.7].keys()

# Remove unused columns
pe32_av.drop([v for v in pe32_av.columns if v not in vendors], axis=1, inplace=True)
pe32_av.fillna('clean', inplace=True)

print('Total samples: {}'.format(len(pe32_av)))

Total samples: 388513


In [130]:
# Vendors to filter
display(vendors_presence.sort_values())
display(', '.join(vendors))

 win32_virut_xp (v)     0.000003
pandab3                 0.000010
drwebse                 0.000013
antivir7                0.000015
t3                      0.000026
f-prot4                 0.000033
etrust-inoculateit      0.000039
una                     0.000059
escan                   0.000142
fortinetbeta            0.000152
mcafeebeta              0.000160
fileadvisor             0.000198
savmail                 0.000237
command                 0.000335
secureweb-gateway       0.000345
webwasher-gateway       0.000618
nod32v2                 0.000677
ahnlab                  0.000695
ewido                   0.000777
pandabeta               0.000852
nod32beta               0.001372
prevx1                  0.001506
mcafee+artemis          0.004978
avast5                  0.005058
a-squared               0.005897
sunbelt                 0.007596
authentium              0.007660
prevx                   0.010435
crowdstrike             0.013565
etrust-vet              0.013925
          

'ahnlab-v3, antiy-avl, avast, avg, bitdefender, cat-quickheal, clamav, comodo, drweb, emsisoft, eset-nod32, f-prot, f-secure, fortinet, gdata, ikarus, jiangmin, k7antivirus, k7gw, kaspersky, malwarebytes, mcafee, mcafee-gw-edition, microsoft, microworld-escan, nano-antivirus, nprotect, panda, sophos, superantispyware, symantec, thehacker, trendmicro, trendmicro-housecall, vba32, vipre, virobot'

In [45]:
# Testing the CV on naming
sample = pe32_av[:1]
token_pattern = r'\b[a-z0-9@_~-]{2,}\b'
stop_words = ['clean', 'heuristic', 'variant', 'lookslike', 'suspicious', 'file', 'heur', 'malware',
              'behaveslike', 'win32', 'win', 'w32', 'possible', 'threat', 'mal']

relevant_types = ['trojan', 'backdoor', 'worm']

sample = list(map(lambda x: ' '.join(x), sample.values))
# sample = [' '.join(sample.values)]
# display(sample)

cv = CountVectorizer(token_pattern=token_pattern, stop_words=stop_words)
vector = cv.fit_transform(sample)
occurrences = sorted(zip(cv.get_feature_names(),
                         np.asarray(vector.sum(axis=0)).ravel()), key=lambda x: x[1], reverse=True)
display(occurrences)

[('runouce', 12),
 ('chir', 11),
 ('b@mm', 10),
 ('worm', 5),
 ('virus', 4),
 ('emailworm', 3),
 ('runonce', 3),
 ('10748', 2),
 ('b-o', 2),
 ('email-worm', 2),
 ('pe_chir', 2),
 ('6652', 1),
 ('agent', 1),
 ('brontok', 1),
 ('bxafx', 1),
 ('chihack', 1),
 ('chir-a', 1),
 ('cnpeace', 1),
 ('dannado', 1),
 ('gen', 1),
 ('gen-runonce', 1),
 ('suspiciouspe', 1),
 ('thecid', 1),
 ('trj', 1),
 ('trojan', 1),
 ('v001', 1)]

## Known Naming

Start with vendors that provide their naming convention

### Microsoft

`TYPE DELIM PLATFORM DELIM FAMILY DELIM VARIANT DELIM INFO`

e.g.: `Backdoor:Win32/Caphaw.D!lnk`

In [117]:
microsoft_samples = pe32_av['microsoft']
microsoft_samples = microsoft_samples[microsoft_samples != 'clean'].dropna()

display(microsoft_samples[:10])

def microsoft_get_types(x):
    return x.split(':')[0]

def microsoft_get_platf(x):
    return x.split('/')[0].split(':')[-1]

def microsoft_get_families(x):
    return x.split('/')[-1].split('.')[0]

types = set(map(microsoft_get_types, microsoft_samples.values))
platf = set(map(microsoft_get_platf, microsoft_samples.values))
famls = set(map(microsoft_get_families, microsoft_samples.values))

date
2013-04-16              virus:win32/chir.b@mm
2013-04-16             trojan:win32/sefnit.aj
2013-04-16           backdoor:win32/poison.bf
2013-04-16           backdoor:win32/ursap!rts
2013-04-16                worm:win32/boychi.a
2013-04-16    trojandownloader:win32/kuluoz.b
2013-04-16              trojan:win32/meredrop
2013-04-16    trojandownloader:win32/dofoil.r
2013-04-16              virus:win32/chir.b@mm
2013-04-16              hacktool:win32/wpepro
Name: microsoft, dtype: object

## Ahnlab-v3

In [131]:
ahnlab_samples = pe32_av[vendors[0]]
# ahnlab_samples = pe32_av['bitdefender']
# Remove clean and NaN samples
ahnlab_samples = ahnlab_samples[ahnlab_samples != 'clean'].dropna()

display(ahnlab_samples[:10])

# Filter the delimiters
# '/' seems like the first one
replaceables = ['win-', 'i-']
delims = ['-', '.', '/']

def filter_func(x):
    # x = x.split('.')[0]
    
    for d in delims:
        x = x.replace(d, ' DELIM ')

    for t in types:
        x = x.replace(t + ' ', 'TYPE ')
    
    for p in platf:
        x = x.replace(p + ' ', 'PLAT ')
        
    for f in list(famls)[:1000]:
        x = x.replace(f + ' ', 'FAM ')
        
    return x

display(set(map(filter_func, ahnlab_samples.values)))

date
2013-04-16                      win32.runouce.b@mm
2013-04-16                  gen:variant.kazy.44812
2013-04-16               trojan.generic.kdv.380263
2013-04-16                   gen:variant.symmi.689
2013-04-16                  gen:variant.kazy.61786
2013-04-16               trojan.generic.kdv.787211
2013-04-16                  gen:variant.symmi.3565
2013-04-16                 gen:variant.kazy.161265
2013-04-16                      win32.runouce.b@mm
2013-04-16    dropped:application.sniffer.wpepro.e
Name: bitdefender, dtype: object

{'TYPE DELIM generic DELIM 12847796',
 'TYPE DELIM generickd DELIM 1167723',
 'TYPE DELIM TYPE DELIM agent DELIM scw',
 'gen:variant DELIM symmi DELIM 41940',
 'gen:variant DELIM kazy DELIM 8023',
 'gen:variant DELIM razy DELIM 13238',
 'TYPE DELIM dropper DELIM vrm',
 'gen:variant DELIM kazy DELIM 324841',
 'gen:variant DELIM kazy DELIM 546085',
 'TYPE DELIM generickd DELIM 2058223',
 'generic DELIM malware DELIM sp!pk!g DELIM e605539d',
 'TYPE DELIM downloader DELIM zlob DELIM acmz',
 'gen:variant DELIM barys DELIM 9064',
 'TYPE DELIM generic DELIM kdz DELIM 10704',
 'PLAT DELIM hllo DELIM nwu',
 'gen:variant DELIM graftor DELIM 208988',
 'TYPE DELIM agent DELIM nxr',
 'TYPE DELIM generic DELIM kd DELIM 21607',
 'TYPE DELIM generic DELIM 13407588',
 'gen:variant DELIM symmi DELIM 50307',
 'gen:variant DELIM graftor DELIM 194161',
 'generic DELIM poisonivy DELIM df6c8887',
 'memscan:TYPE DELIM sdbot DELIM dflt',
 'TYPE DELIM generickd DELIM 2901541',
 'TYPE DELIM htmclip DELIM a',
 'g

In [None]:
test = pe32_av[:10000]
test.fillna(' ', inplace=True)
trojan = ['trojan', 'troj']
generic = ['generic', 'genetic']
weird = ['not-a-virus']

names = []
for _, sample in test.iterrows():
    classification = ' '.join(sample[vendors].values)
    for trj in trojan:
        classification = classification.replace(trj, 'trj')
    classification
    for gen in generic:
        classification = classification.replace(gen, 'gen')
    for w in weird:
        classification = classification.replace(w, '')
    names.append(classification)

# vendors_naming = dict()
# for vendor in vendors:
#     vendors_naming[vendor] = np.unique(pe32_av[vendor])
# unique_names = np.unique(np.concatenate(list(vendors_naming.values())))

In [None]:
cv_token_pattern = r'\b[a-zA-Z]{3,}\b'
# cv_token_pattern = r'\b\w{3,}\b'
stop_words = ['clean', 'heuristic', 'variant', 'lookslike', 'suspicious', 'file', 'heur', 'malware',
              'behaveslike', 'win32', 'win', 'w32', 'gen', 'possible', 'threat', 'mal']
cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df, stop_words=stop_words, ngram_range=(2, 2))
cv.fit(names)

display(cv.get_feature_names())

In [None]:
first_vendor = pe32_av['kaspersky'].dropna()
first_vendor = first_vendor[first_vendor != 'clean']
display(first_vendor)

In [None]:
irrelevant = ['generic', 'gen']
mtype = ['email-worm', 'trojan-downloader', 'trojan', 'adware', 'worm', 'backdoor', 'ransom', 'dangerousobject',
        'trojan-dropper', 'virus']
# Sort by length so that longer matches are replaced first
mtype.sort(key=len, reverse=True)
platf = ['win32']
info = ['heur', 'uds', 'not-a-virus']
# func = ['downloader']
delim = re.compile(r'[^a-zA-Z0-9-]')
spaces= re.compile(r'\s+')

def tokenize(x):
    x = x.lower()
    
    tmp = x.split(':')
    if len(tmp) == 2:
        tmp[0] = 'INFO'
        tmp = [tmp[0]] + tmp[1].split('.')
    else:
        tmp = x.split('.')

    for i, t in enumerate(tmp):
        if t in irrelevant:
            tmp[i] = ''
            continue
        if t in mtype:
            tmp[i] = 'TYPE'
            continue
        if t in platf:
            tmp[i] = 'PLAT'
    
    # tmp = x.split('.')
    # for i in irrelevant:
#         x = x.replace(i, '')
        
    # x = re.sub(delim, ' DELIM ', x)
    
    # for t in mtype:
#         x = x.replace(t, ' TYPE ')
#     for p in platf:
#         x = x.replace(p, ' PLAT ')
#     for i in info:
#         x = x.replace(m, ' INFO ')
        
#     x = re.sub(spaces, ' ', x).strip()
    
  #   tmp = x.split(' ')
    # if len(tmp) >= 5 and tmp[0] == 'TYPE' and tmp[2] == 'PLAT':
#         tmp[4] = 'FAMILY'
#         if len(tmp) == 7:
#             tmp[6] = 'VARIANT'
    return ' '.join(tmp)

tmp = first_vendor.apply(tokenize).values
display(set(tmp))

# stop_words = ['delim', 'type', 'plat', 'meth', 'func']
# vocabulary = ['delim', 'type', 'plat', 'meth', 'func']
# cv = CountVectorizer(vocabulary=vocabulary, ngram_range=(2, 2))
# cv.fit(tmp)
display(first_vendor)
# display(cv.get_feature_names())