# Extracting malware/goodware

Extract MD5s and links of malware and goodware, unique submissions only.

Malware submissions must have an AV score of 0.8 or more, and goodware submissions must have an AV score of exactly 0.0.

The following cell takes some time to run because of `jcfg_data_loading.check_link`. At the end, the frame is saved to a file, so there is no need to re-run the cell.

In [None]:
import pandas as pd
import lib.data_loading as jcfg_data_loading
import lib.helpers as jcfg_helpers
from IPython.display import display

# All other frames are based on this one.
# The code below reads its file_type, md5, antivirus, file_name and link columns.
headers = jcfg_data_loading.loadHeadersCSV('data/header_analyses.csv')

# Antivirus score to classify sample as malware
AV_MLW_THRESHOLD = 0.8
# Antivirus score to classify sample as goodware
AV_GDW_THRESHOLD = 0.0

# Filter PE32 samples only.
# str(x) keeps non-string values (e.g. missing entries) from raising in startswith.
crit_pe32 = headers.file_type.map(lambda x: str(x).startswith('PE32 '))
# Take an explicit copy so later modifications never touch `headers`.
# This replaces the former `pe32.is_copy = False`: the `is_copy` attribute was
# deprecated and later removed from pandas, where setting it only creates a
# stray attribute and emits a UserWarning.
pe32 = headers[crit_pe32].copy()

# De-duplicate submissions so that each md5 appears exactly once.
# The index (the submission day; no full timestamp is available) is copied into
# a helper column so it can be used for sorting and duplicate detection.
pe32 = (
    pe32.assign(colIndex=pe32.index)
    # Order by day, then md5, then AV score, so the highest score comes last
    # within each (day, md5) group.
    .sort_values(by=['colIndex', 'md5', 'antivirus'])
    # Same md5 submitted several times on one day: keep the highest AV score.
    # This may not always be the right pick, but it does not matter for the
    # current test.
    .drop_duplicates(subset=['md5', 'colIndex'], keep='last')
    # Same md5 submitted on several days: keep the first row in sort order.
    .drop_duplicates(subset=['md5'])
    # Drop the helper column together with the columns that are no longer needed.
    .drop(['colIndex', 'file_name', 'file_type'], axis=1)
)

# Normalize the AV classification
pe32['antivirus'] = pe32.antivirus.apply(jcfg_helpers.normalize)

## Used criteria
# Unknown submissions (normalized score of -1)
crit_unknown = pe32.antivirus == -1
# Goodware submissions
crit_goodware = pe32.antivirus == AV_GDW_THRESHOLD
# Malware submissions
crit_malware = pe32.antivirus >= AV_MLW_THRESHOLD

# Keep only the samples that are clearly goodware or clearly malware.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
pe32 = pe32[~crit_unknown & (crit_goodware | crit_malware)].copy()

# Remove full link, just keep submission ID (third '/'-separated component)
pe32['link'] = pe32.link.apply(lambda x: x.split('/')[2])

# Assign malware class and drop antivirus score.
# After the filter above every remaining row is either malware (>= threshold)
# or goodware, so a single mask is enough: 1 for malware, 0 for goodware.
# The column is kept as float (1.0 / 0.0) so the saved CSV is unchanged.
pe32['malware'] = (pe32.antivirus >= AV_MLW_THRESHOLD).astype(float)
pe32 = pe32.drop(['antivirus'], axis=1)

# Drop every sample whose link is reported back by check_link.
# NOTE(review): presumably check_link returns the links that were not
# downloaded (for some unknown reason) - confirm against lib.data_loading.
flagged_links = jcfg_data_loading.check_link(pe32.link.values)
is_flagged = pe32.link.isin(flagged_links)
pe32 = pe32[~is_flagged]

# Persist the result (index included) so this slow cell need not be re-run
pe32.to_csv('md5_links.csv')

# Extract the imports

Running the following code directly in Python takes at least 30 minutes on my laptop.

In [None]:
import numpy as np
import pandas as pd
import gevent
import datetime
from gevent import monkey; monkey.patch_all()
import lib.data_loading as jcfg_data_loading
from IPython.display import display

samples = pd.read_csv('md5_links.csv')
samples['date'] = pd.to_datetime(samples['date'], format='%Y/%m/%d')
# Set date as index
samples = samples.set_index('date')


def _with_imports(bucket):
    """Return a copy of ``bucket`` with an added ``imports`` column.

    Each value is the ';'-joined result of
    ``jcfg_data_loading.parse_static_imports`` for that row's ``link``.
    """
    return bucket.assign(imports=[
        ';'.join(jcfg_data_loading.parse_static_imports(link))
        for link in bucket.link.values
    ])


# Split into 100 workers and load imports
n_workers = 100
# Split the row positions rather than the frame itself: this yields the same
# contiguous, near-equal buckets without relying on numpy's handling of
# DataFrame objects.
buckets = [samples.iloc[rows]
           for rows in np.array_split(np.arange(len(samples)), n_workers)]
print('[{0}] Spawning jobs...'.format(datetime.datetime.now()))
jobs = [gevent.spawn(_with_imports, bucket) for bucket in buckets]
print('[{0}] Done spawning.'.format(datetime.datetime.now()))
# raise_error=True re-raises a worker's exception here. Without it a failed
# greenlet leaves `.value` as None, and pd.concat silently drops None entries,
# so a whole bucket of samples would disappear from the output unnoticed.
gevent.joinall(jobs, raise_error=True)
print('[{0}] All jobs done.'.format(datetime.datetime.now()))
result = pd.concat([job.value for job in jobs])
print('[{0}] Concat done.'.format(datetime.datetime.now()))
result.to_csv(path_or_buf='data/imports.csv')