# Malware names filtering

This notebook focuses on filtering malware names: removing generic tokens such as "gen" and "generic", and type words such as "virus" and "worm" that appear in the middle of family names.

In [1]:
from IPython.display import display
import pandas as pd
import numpy as np
import re
import itertools

# Input location: the folder prefix and the gzipped CSV of selected vendors
data_folder = 'data/'
vendors_selected_file = ''.join([data_folder, 'vendors_selected.csv.gz'])

In [2]:
# Read the per-vendor names as strings, key the rows by 'link',
# and treat the 'Clean' verdict as a missing value
vendors = (
    pd.read_csv(vendors_selected_file, dtype=str)
    .set_index('link')
    .replace('Clean', np.nan)
)

In [88]:
# Name filter function

# Splits a raw label into tokens on every non-word character
_TOKEN_SPLIT_RE = re.compile(r'\W')
# Matches tokens that start with "<char><digit>" or end with "<digit><char>"
_DIGIT_EDGE_RE = re.compile(r'^\S\d|\d\S$')
# Matches tokens made only of three or more hexadecimal digits
_HEX_TOKEN_RE = re.compile(r'^[A-Fa-f0-9]{3,}$')
# Generic keywords; pass them as `drop_keywords` to strip them from names
GENERIC_KEYWORDS = ('gen', 'generic', 'variant', 'of', 'heur', 'pe', 'potentially', 'unsafe')


def filter_func(name, drop_keywords=()):
    """Normalize one raw detection name into a dot-joined token string.

    The name is lower-cased and split on non-word characters. A token is
    dropped when it is shorter than two characters, when it starts with
    "<char><digit>" or ends with "<digit><char>", or when it consists only
    of three or more hexadecimal digits. The surviving tokens are joined
    with '.'.

    Parameters
    ----------
    name : object
        Raw name; anything that is not missing is converted with ``str``.
    drop_keywords : iterable of str, optional
        Lower-case tokens to remove as well (e.g. ``GENERIC_KEYWORDS``).
        Empty by default, so no keyword is removed.

    Returns
    -------
    str or float
        The filtered name (possibly the empty string when no token
        survives), or ``np.nan`` when ``name`` is missing.
    """
    # Treat None and every float NaN as missing. An identity check against
    # np.nan misses NaN objects that are not that exact singleton, which
    # would otherwise be turned into the literal names 'nan' / 'none'.
    if name is None or (isinstance(name, float) and name != name):
        return np.nan

    excluded = frozenset(drop_keywords)
    tokens = _TOKEN_SPLIT_RE.split(str(name).lower())
    # Tokens produced by the \W split contain no whitespace and the length
    # test already rejects empty strings, so no strip / non-empty pass is needed.
    kept = [
        token for token in tokens
        if len(token) > 1
        and not _DIGIT_EDGE_RE.search(token)
        and not _HEX_TOKEN_RE.search(token)
        and token not in excluded
    ]
    return '.'.join(kept)

# Redundant labels, grouped by the replacement they are meant for
# (the replace calls that would use these lists are currently commented out)
malware = [
    'malware',
    'unwanted.program',
    'mal',
    'unclassifiedmalware',
    'ws.reputation',
    'artemis',
]
trojan = [
    'troj_gen',
    'trojan.generickd',
    'trojan.bt',
    'trj.genetic',
]

In [89]:
%%time
# Run the name filter over every cell of the first 10000 rows
vendors_filtered = vendors.iloc[:10000].applymap(filter_func)
# Collapsing redundant labels is disabled for now:
# a bare 'malware' label gives no information about the sample
# vendors_filtered.replace(malware, np.nan, inplace=True)
# vendors_filtered.replace(trojan, 'trojan', inplace=True)

CPU times: user 1.49 s, sys: 25 µs, total: 1.49 s
Wall time: 1.49 s


In [90]:
# Show the filtered names (one row per 'link', one column per vendor)
display(vendors_filtered)

Unnamed: 0_level_0,Rising,NANO-Antivirus,Microsoft,VBA32,TrendMicro,Comodo,AhnLab-V3,Panda,Malwarebytes,K7GW,...,Avast,BitDefender,AVG,Kaspersky,GData,VIPRE,Sophos,McAfee,McAfee-GW-Edition,ESET-NOD32
link,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MDRlOGFlOTRjMzNjNGJhNmEwMWQ0MWFiMDYzYmJhNmI,pe.backdoor.undef,trojan.inject.sang,trojan.pugeju,bscope.trojan.kidalo,troj_gen,trojware.trojan.agent.gen,,trj.downloader.tcn,,riskware,...,obfuscated.ehl.trj,gen.explorerhijack.bmhfagxume,small.bo,heur.trojan.invader,gen.explorerhijack.bmhfagxume,trojan.generic.bt,mal.behav,generic.bg.exq,generic.bg.exq,variant.of.obfuscated.nax
YzE2YTA3ZmNkOTIyNDkyMTk1YmY5ZTEyNTkzZTc5ODI,,,,,,,,suspicious.file,rootkit.0access,,...,,gen.variant.symmi,,packed.katusha,gen.variant.symmi,,,,,variant.of.kryptik.bkbb
MjEwYmY3OGExMGMyNGM0Y2JhN2NlNjg2ZjY1MzAxMTI,,trojan.fraudrop.cwxrdl,,,,,,,,,...,msil.genmalicious.gg.trj,,,,,,troj.msil.bht,,behaveslike.suspect.vc,variant.of.msil.injector.cuz
YzViZWZlMWJhNjU5NDIyMWFjNWU1NDY2OWRmYTM1ZWM,pe.trojan.generic,trojan.xdr.czpdjp,virtool.vbinder,sscope.trojan.zbot.gen,,,,trj.chgt,,trojan,...,malware.gen,gen.trojan.heur.zuz.xwr.jwii,backdoor.wjc,heur.trojan.generic,gen.trojan.heur.zuz.xwr.jwii,detect.trojan.small.nmm,mal.generic,artemis,,trojandropper.small.nmm
MGM2OTY4ZGRhM2U0NGFjYWFmOGJkODk2ODU1NzI1Y2I,,,,,,,,,,,...,,,,,,,,,,
YjA2YzRiODk5YmZkNDY4NTliMjI4ZDFmMGUxYWVjYWY,,trojan.injector.dobjgo,,,troj_gen,,,trj.genetic.gen,,unwanted.program,...,malware.gen,trojan.generic,crypt3.bwef,trojan.dropper.injector.lezv,trojan.generic,trojan.generic.bt,mal.generic,rdn.generic.hra.ce,rdn.generic.hra.ce,variant.of.kryptik.cwcw
M2Y4MTg2YjZmODVhNGUyZGJhZTQ4YmZkZjRjZGJmNzI,,,,suspected.of.trojan.downloader.gen,,,,,,,...,,gen.variant.symmi,,heur.trojan.generic,gen.variant.symmi,,,,,variant.of.trojandownloader.banload.uro
Y2Y4MTJjODk2N2Y3NGI2ODgzYTZkNWZjMDViYzQzODQ,,trojan.click2.dfnbfk,,,,,,,,,...,malware.gen,,omc,,,,,artemis,behaveslike.rontokbro.tc,
YmU1YzRhYzZhZDkwNDdmZjhiY2NjOTAyNWMxOWZjMGQ,pe.trojan.spammer.aew,trojan.agent.tjzy,pws.fignotok,trojan.dropper.resgen,bkdr_bifrose.smn,trojware.psw.agent.szx,trojan.agent,trj.downloader.mdw,trojan.pws,trojan,...,spyware.gen.spy,trojan.agent.aolf,agent4.ajsr,trojan.agent.cccr,trojan.agent.aolf,trojan.agent.cccr,troj.spy.ed,generic.dropper.ny,heuristic.lookslike.suspicious,psw.agent.nly
ZjA4ZWNhMWQwODIxNGYyZTg4Y2E1ZDMzMjM4YzgxM2U,,,,,,,,,,,...,,,,,,,,,,variant.of.msil.surveyer


In [91]:
# Pool the non-missing filtered names of every vendor column into one flat
# list (duplicates are kept so that they can be counted afterwards)
unique_names = []

for vendor in vendors_filtered:
    unique_names.extend(vendors_filtered[vendor].dropna().tolist())

In [92]:
# Frequency of each pooled name, skipping the 10 most frequent entries
display(pd.Series(unique_names).value_counts().iloc[10:])

mal.generic                           1016
trojan.gen                             955
unclassifiedmalware                    891
riskware                               869
trj.genetic.gen                        799
ws.reputation                          734
gen.variant.graftor                    732
heur.trojan.generic                    685
gen.variant.zusy                       626
trj.ci                                 624
adware.bettersurf                      607
trojan.downloader                      586
pe.trojan.generic                      549
backdoor                               528
gen.variant.barys                      485
trojan.agent                           483
troj_spnr                              477
trojan.packed                          470
uds.dangerousobject.multi.generic      415
generic.malware                        379
worm.generic                           377
install.core.click.run.software        322
installcore.fs                         320
virus      