# Classifier - Imports as Features

The remaining step before being able to train a classifier is to choose the features for the samples. In this case we'll use the static imports (DLL imports) from the binary files.

We start with a simple analysis of the number of features.

In [1]:
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Detection threshold used by the malware filter further down
malware_threshold = 5

# Input files, all located under the same data folder
data_folder = '../data/'
vendors_file, reports_file, dlls_file = (
    data_folder + file_name
    for file_name in ('vendors.csv.gz', 'reports.csv.gz', 'dlls.csv.gz')
)

# From notebook 2
top_vendors = [
    "ESET-NOD32",
    "McAfee-GW-Edition",
    "McAfee",
    "Sophos",
    "VIPRE",
    "GData",
    "Kaspersky",
    "AVG",
    "BitDefender",
    "Avast",
    "TrendMicro-HouseCall",
    "Fortinet",
    "Emsisoft",
    "DrWeb",
    "Ikarus",
    "F-Secure",
    "MicroWorld-eScan",
    "Symantec",
    "K7AntiVirus",
    "K7GW",
]

# A token is any run of characters that does not contain ';'
tfidf_token_pattern = '[^;]+'

In [2]:
# Report headers, indexed by link
reports = pd.read_csv(reports_file).set_index('link')
# Sanity check: every file_type must start with 'PE32'
non_pe32 = reports[~reports.file_type.str.startswith('PE32')]
assert len(non_pe32) == 0
# Keep only the columns used later on: reports = (link, md5, date)
reports = reports.drop(['file_name', 'file_size', 'file_type'], axis=1)
# NOTE(review): infer_datetime_format is deprecated in pandas 2.x - confirm
# the target pandas version before upgrading.
reports['date'] = pd.to_datetime(reports['date'], infer_datetime_format=True)

# Vendor classifications, indexed by link; rows where every column is NaN
# are discarded
vendors = (
    pd.read_csv(vendors_file, dtype=str)
    .set_index('link')
    .dropna(how='all')
)

# DLL imports, indexed by link
imports = pd.read_csv(dlls_file).set_index('link')

# Inner join on the shared link index
# reports_vendors = (link, md5, date, ..vendors..)
reports_vendors = reports.join(vendors, how='inner')

In [3]:
# md5 and date, followed by the top vendors only
vendor_columns = ['md5', 'date'] + top_vendors
reports_vendors = (
    reports_vendors[vendor_columns]
    # Order by date so the most recent report of each md5 comes last
    .sort_values(by='date')
    # For duplicated md5 values keep only that last report
    .drop_duplicates(subset='md5', keep='last')
    # A 'Clean' verdict becomes NaN
    .replace('Clean', np.nan)
)

### Filter malware/goodware

In [4]:
# Number of top vendors that flagged each sample ('Clean' verdicts are NaN
# at this point, so count() skips them).
#
# The count is taken over the vendor columns only. reports_vendors holds just
# md5, date and the vendor columns (link is the index), so counting whole rows
# and comparing against a fixed offset of 3 was off by one: samples with
# exactly one detection were labeled goodware, fully clean samples were
# labeled nothing, and malware needed malware_threshold + 1 detections.
detections = reports_vendors[top_vendors].count(axis=1)

# Goodware: no top vendor flags the sample
goodware = reports_vendors[detections == 0].index
# Malware: at least malware_threshold top vendors flag the sample
malware = reports_vendors[detections >= malware_threshold].index

In [16]:
# Keep only the imports of samples labeled as goodware or malware,
# and discard rows with missing values
labeled_links = list(goodware) + list(malware)
is_labeled = imports.index.isin(labeled_links)
imports = imports[is_labeled].dropna()

In [27]:
# Build the vectorizer and fit it on the ';'-separated DLL strings;
# fit() returns the estimator itself, so the chained result is the fitted model
tfidf = TfidfVectorizer(
    token_pattern=tfidf_token_pattern,
    max_features=1000,
    max_df=0.8,
).fit(imports.dlls)
# Show the fitted estimator as the cell output
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='[^;]+', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
# Feature names ordered by their column index. They are read from the fitted
# vocabulary_ mapping (term -> column index) instead of get_feature_names(),
# which newer scikit-learn releases removed in favour of
# get_feature_names_out(); this form works on both old and new versions and
# yields the same plain list of strings.
display(sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get))

['*invalid*',
 'abfwpubg.bpl',
 'abpubg.bpl',
 'ace.dll',
 'acledit.dll',
 'aclui.dll',
 'activeds.dll',
 'adbwinapi.dll',
 'adsldpc.dll',
 'advapi32',
 'advapi32.dll',
 'advpack.dll',
 'afbase.dll',
 'afcore.dll',
 'afutil.dll',
 'agentinvqry.bpl',
 'altiumcomponents.bpl',
 'altiumcore.bpl',
 'altiummisc.bpl',
 'aowc.bpl',
 'api-ms-win-core-apiquery-l1-1-0.dll',
 'api-ms-win-core-appcompat-l1-1-1.dll',
 'api-ms-win-core-atoms-l1-1-0.dll',
 'api-ms-win-core-com-l1-1-0.dll',
 'api-ms-win-core-com-l1-1-1.dll',
 'api-ms-win-core-com-private-l1-1-0.dll',
 'api-ms-win-core-console-l1-1-0.dll',
 'api-ms-win-core-console-l2-1-0.dll',
 'api-ms-win-core-crt-l1-1-0.dll',
 'api-ms-win-core-crt-l2-1-0.dll',
 'api-ms-win-core-datetime-l1-1-1.dll',
 'api-ms-win-core-debug-l1-1-0.dll',
 'api-ms-win-core-debug-l1-1-1.dll',
 'api-ms-win-core-delayload-l1-1-0.dll',
 'api-ms-win-core-delayload-l1-1-1.dll',
 'api-ms-win-core-errorhandling-l1-1-0.dll',
 'api-ms-win-core-errorhandling-l1-1-1.dll',
 'api-ms-