# Malware classifier - Part 1

The following creates a dataset based on static imports. Samples are extracted up to the last considered submission of sample **bbb445901d3ec280951ac12132afd87c** (its final submission is excluded) and are split by the date of each submission of that target sample.

The following code prepares the dataframe for use.

In [1]:
import lib.data_loading as jcfg_data_loading
import lib.helpers as jcfg_helpers
import pandas as pd
from IPython.display import display

# Target sample
target_md5 = 'bbb445901d3ec280951ac12132afd87c'

# All other frames are based on this one
headers = jcfg_data_loading.loadHeadersCSV('data/header_analyses.csv')

# Antivirus score to classify sample as malware
AV_MLW_THRESHOLD = 0.8
# Antivirus score to classify sample as goodware
AV_GDW_THRESHOLD = 0.0

# Filter PE32 samples only (non-string values are stringified first, so they simply don't match)
crit_pe32 = headers.file_type.map(lambda x: str(x).startswith('PE32 '))
# Work on an explicit copy so the column assignment below targets an independent
# frame, instead of silencing pandas' chained-assignment warning through the
# deprecated `is_copy` attribute.
pe32 = headers[crit_pe32].copy()

# Normalize the AV classification
pe32['antivirus'] = pe32.antivirus.apply(jcfg_helpers.normalize)

# No full timestamp is available, so for same day submissions keep the one with the highest
# AV score, might not always be true, but for the current test is not relevant.
# The index is mirrored into a temporary column so it can take part in sorting/deduplication
# (presumably the index holds the submission date - confirm against loadHeadersCSV).
pe32 = pe32.assign(colIndex=pe32.index)
# Ascending sort puts the highest AV score last within each (colIndex, md5) group ...
pe32.sort_values(by=['colIndex', 'md5', 'antivirus'], inplace=True)
# ... so keep='last' retains exactly that row.
pe32.drop_duplicates(subset=['md5', 'colIndex'], inplace=True, keep='last')
# Remove unneeded columns
pe32.drop(['colIndex', 'file_name', 'file_type'], axis=1, inplace=True)

Next we generate the dataset, split by the submissions of the sample to test

In [2]:
# Target submissions (remove last submission, AV score doesn't change
# significantly and time difference is high )
target_submissions = pe32[pe32.md5 == target_md5][:-1].index
# display(target_submissions)
if len(target_submissions) == 0:
    # Fail with a clear message instead of an IndexError on target_submissions[-1] below
    raise ValueError('No usable submissions found for target sample ' + target_md5)

## Used criteria
# Unknown submissions
crit_unknown = pe32.antivirus == -1
# Goodware submissions
crit_goodware = pe32.antivirus == AV_GDW_THRESHOLD
# Malware submissions
crit_malware = pe32.antivirus >= AV_MLW_THRESHOLD
# Submissions no later than the last considered submission of the target sample
crit_in_range = pe32.index <= target_submissions[-1]

# Start filtering; take an explicit copy so the `link` assignment below targets an
# independent frame (replaces the deprecated `is_copy = False` hack)
pe32_relevant = pe32[~crit_unknown & (crit_goodware | crit_malware) & crit_in_range].copy()
# Remove full link, just keep submission ID (third '/'-separated component)
pe32_relevant['link'] = pe32_relevant.link.apply(lambda x: x.split('/')[2])
# Remove samples that were not downloaded (for some unknown reason);
# presumably check_link returns the links that are missing locally - confirm in lib.data_loading
pe32_relevant = pe32_relevant[~pe32_relevant.link.isin(jcfg_data_loading.check_link(pe32_relevant.link.values))]

# Add class: every row defaults to 0 (goodware), malware rows are then flagged with 1.
# The criteria are recomputed because the frame has been filtered since they were first built.
pe32_relevant = pe32_relevant.assign(malware=0)
crit_goodware = pe32_relevant.antivirus == AV_GDW_THRESHOLD
crit_malware = pe32_relevant.antivirus >= AV_MLW_THRESHOLD
pe32_relevant.loc[crit_malware, 'malware'] = 1

# Generate a list of disjoint time slices: position 0 holds every sample up to (and including)
# the first submission of the target sample; position n holds the samples submitted after the
# (n-1)th and up to (and including) the nth submission of the target sample
pe32_by_date = [pe32_relevant[pe32_relevant.index <= target_submissions[0]]]
for prev_date, date in zip(target_submissions[:-1], target_submissions[1:]):
    in_window = (pe32_relevant.index > prev_date) & (pe32_relevant.index <= date)
    pe32_by_date.append(pe32_relevant[in_window])

Balance each dataset so that the sizes of the malware and goodware classes differ by no more than 20% (relative to the smaller class).

In [3]:
# Rebalance every time slice whose class sizes differ by more than 20% of the smaller
# class, by randomly down-sampling the larger class to the size of the smaller one.
# NOTE: sampling is unseeded, so the selected rows differ between runs.
for idx, data in enumerate(pe32_by_date):
    goodware = data[data.antivirus == AV_GDW_THRESHOLD]
    malware = data[data.antivirus >= AV_MLW_THRESHOLD]
    good_size = len(goodware)
    mal_size = len(malware)
    smaller_size = min(good_size, mal_size)
    if smaller_size == 0:
        # One class is absent: the relative difference is undefined (division by zero)
        # and down-sampling to size 0 would empty the slice, so leave it untouched.
        continue
    diff = abs(good_size - mal_size) / smaller_size
    if diff > 0.2:
        # Take a sample from goodware with same size as malware
        if good_size > mal_size:
            temp = goodware.sample(mal_size)
            temp2 = malware
        # Take a sample from malware with same size as goodware
        # (previously goodware was sampled here, which left the slice unbalanced)
        else:
            temp = malware.sample(good_size)
            temp2 = goodware
        pe32_by_date[idx] = pd.concat([temp, temp2])

Extract the imports and set the data ready for training

In [4]:
# Attach an `imports` column to every time slice: the static imports parsed for each
# sample's link, joined into a single ';'-separated string.
pe32_by_date = [
    frame.assign(imports=[
        ';'.join(jcfg_data_loading.parse_static_imports(link))
        for link in frame.link.values
    ])
    for frame in pe32_by_date
]
# Index each slice by md5 and discard the columns that are not needed for training.
for data in pe32_by_date:
    data.index = data.md5
    data.drop(['link', 'antivirus', 'md5'], axis=1, inplace=True)

Save the datasets for good measure

In [5]:
# Write each time slice to its own file: dataset0.csv, dataset1.csv, ...
for idx, data in enumerate(pe32_by_date):
    csv_path = 'dataset{}.csv'.format(idx)
    data.to_csv(path_or_buf=csv_path)