# Labeled Dataset

This notebook creates a usable dataset for malware classification. It combines the AV classifications with the imports of each sample, labelling a sample as clean if all vendors with high presence say it's clean, and as malware if the majority of vendors with high presence say it's malware. Samples that satisfy neither rule are left out of the dataset.

For duplicated submissions, only the last submission is taken into account.

In [18]:
import numpy as np
import pandas as pd
from IPython.display import display

# Minimum presence value a vendor needs in order to be used for labelling.
MIN_VENDOR_PRESENCE = 0.7

# AV classifications, one row per submission; everything is read as text
# and only the date column is converted afterwards.
pe32_av = pd.read_csv('data/pe32_static_av.csv', dtype=str)
pe32_av['date'] = pd.to_datetime(pe32_av['date'], format='%Y/%m/%d')

# Imports per submission. 'md5' and 'date' are dropped here so they do not
# clash with the columns of the same name in pe32_av when merging on 'link'.
pe32_imports = pd.read_csv('data/pe32_static_imports.csv', dtype=str)
pe32_imports = pe32_imports.drop(['md5', 'date'], axis=1)

# Just keep last submission of duplicates.
# NOTE(review): keep='last' keeps the last row in file order; this is only
# the latest submission if the CSV is sorted chronologically -- confirm.
pe32_av = pe32_av.drop_duplicates(subset=['md5'], keep='last')

# Filter vendors with high presence.
# Series.from_csv was removed in pandas 1.0; read the header-less two-column
# file with read_csv instead and take the single value column as a Series
# indexed by the first column.
vendors_presence = pd.read_csv(
    'data/vendors_presence.csv', header=None, index_col=0
).iloc[:, 0]
vendors = vendors_presence[vendors_presence > MIN_VENDOR_PRESENCE].keys()

# Remove unused columns: every column from the fourth onwards that is not a
# high-presence vendor.
# NOTE(review): assumes the first three columns are non-vendor metadata -- confirm.
unused_columns = [col for col in pe32_av.columns[3:] if col not in vendors]
pe32_av = pe32_av.drop(unused_columns, axis=1)

# Just keep samples where all vendors gave a classification
pe32_av = pe32_av.dropna(how='any', subset=list(vendors))

# Join classification with imports
pe32 = pd.merge(pe32_av, pe32_imports, on=['link'])
pe32 = pe32.set_index(['date', 'md5'])

In [19]:
def label_samples(samples, vendor_columns, label):
    """Turn a selection of rows of ``pe32`` into a labelled dataset.

    Parameters
    ----------
    samples : pandas.DataFrame
        Rows selected from ``pe32``; must have 'date' among its index
        levels and contain an 'imports' column plus the vendor columns.
    vendor_columns : list of str
        Vendor classification columns to remove from the output.
    label : int
        Value written to the new ``malware`` column (0 = clean, 1 = malware).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'date', without the vendor columns and with
        the ``malware`` label column added. The input is not modified.
    """
    # Every step returns a new frame, so no chained-assignment warnings are
    # triggered and the deprecated ``is_copy`` attribute is not needed.
    labeled = samples.dropna()
    # Discard samples with an empty import list. Applied to both classes so
    # that clean and malware samples go through the same filtering.
    labeled = labeled[labeled['imports'] != '']
    labeled = labeled.drop(vendor_columns, axis=1).assign(malware=label)
    return labeled.reset_index().set_index('date')


vendor_columns = list(vendors)
# Count 'clean' verdicts per sample, looking only at the vendor columns so
# that other columns (link, imports, ...) can never be counted as votes.
clean_votes = (pe32[vendor_columns] == 'clean').sum(axis=1)

# Filter samples where all vendors say it's clean
clean_samples = label_samples(
    pe32[clean_votes == len(vendor_columns)], vendor_columns, 0
)

# Filter samples where the majority say it's malware
# (i.e. fewer than half of the vendors say it's clean)
malware_samples = label_samples(
    pe32[clean_votes / len(vendor_columns) < 0.5], vendor_columns, 1
)

result = pd.concat([clean_samples, malware_samples])
result.to_csv(path_or_buf='data/pe32_imports_labeled.csv')