In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer

try:
    # joblib is a standalone package; scikit-learn removed its bundled copy
    # (sklearn.externals.joblib) in release 0.23.
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [2]:
# Directory and file suffix shared by every CSV this notebook reads.
data_folder = '../data/csv_data/'
extension = '.csv.gz'


def build_path(x):
    """Return the path of the dataset named ``x``.

    The result is ``data_folder + x + extension``. Both globals are looked up
    at call time, so rebinding them later changes subsequent results.
    """
    return data_folder + x + extension


# Token pattern handed to CountVectorizer: a token is any maximal run of
# characters other than ';'.
cv_token_pattern = u'[^;]+'

# Reference table indexed by 'link', with every column cast to float.
# Later cells read one column per class from it and use its index to
# restrict the feature tables.
real_dataset = (
    pd.read_csv(build_path('real_dataset'))
    .set_index('link')
    .astype(float)
)

# Static Imports Pipeline

In [3]:
# Imports table: indexed by 'link', rows containing any missing value dropped,
# then restricted to links that also appear in real_dataset.
df = (
    pd.read_csv(build_path('malwr_imports'))
    .set_index('link')
    .dropna()
    .loc[lambda frame: frame.index.isin(real_dataset.index)]
)

# Fixed vocabulary for the imports vectorizer. The position in this list is
# the feature column index, so the order must not change. The vectorizer is
# built with lowercase=False, which is why case variants of the same DLL name
# are listed as separate entries.
vocabulary = [
    'ADVAPI32.DLL', 'ADVAPI32.dll',
    'COMCTL32.DLL', 'COMCTL32.dll', 'COMDLG32.dll',
    'CRYPT32.dll',
    'GDI32.DLL', 'GDI32.dll',
    'IMM32.dll', 'IPHLPAPI.DLL',
    'KERNEL32.DLL', 'KERNEL32.dll',
    'MFC42.DLL', 'MPR.dll', 'MSIMG32.dll', 'MSVBVM60.DLL', 'MSVCRT.dll',
    'NETAPI32.dll',
    'OLEAUT32.dll',
    'PSAPI.DLL',
    'RPCRT4.dll',
    'SETUPAPI.dll', 'SHELL32.DLL', 'SHELL32.dll', 'SHFolder.dll', 'SHLWAPI.dll',
    'URLMON.DLL', 'USER32.DLL', 'USER32.dll', 'USERENV.dll', 'UxTheme.dll',
    'VERSION.dll',
    'WINHTTP.dll', 'WININET.dll', 'WINMM.dll', 'WINSPOOL.DRV',
    'WS2_32.DLL', 'WS2_32.dll', 'WSOCK32.dll', 'WTSAPI32.dll',
    'advapi32.dll',
    'comctl32.dll', 'comdlg32.dll',
    'gdi32.dll', 'gdiplus.dll',
    'kernel32.dll',
    'msacm32.dll', 'mscoree.dll', 'msimg32.dll', 'msvcrt.dll',
    'netapi32.dll', 'ntdll.dll',
    'ole32.dll', 'oleaut32.dll', 'oledlg.dll',
    'shell32.dll', 'shlwapi.dll',
    'urlmon.dll', 'user32.dll',
    'version.dll',
    'wininet.dll', 'winmm.dll', 'winspool.drv', 'wsock32.dll',
]

# Single-step pipeline: binary (presence/absence) token features over the
# fixed `vocabulary`. Tokens are delimited per `cv_token_pattern`, and case is
# preserved (lowercase=False).
static_pipe = Pipeline(steps=[
    ('cv', CountVectorizer(
        vocabulary=vocabulary,
        token_pattern=cv_token_pattern,
        binary=True,
        lowercase=False)),
])

In [4]:
# Fit on the 'imports' column, then persist the fitted pipeline
# (maximum joblib compression, pickle protocol 2).
static_pipe.fit(df['imports'])
joblib.dump(static_pipe, 'static_pipe.pk', protocol=2, compress=9)

['static_pipe.pk']

# Categories Pipeline

In [3]:
# Table read from 'malwr_behav_categories': indexed by 'link', missing values
# replaced by 0, restricted to links that also appear in real_dataset.
df2 = (
    pd.read_csv(build_path('malwr_behav_categories'))
    .set_index('link')
    .fillna(0)
    .loc[lambda frame: frame.index.isin(real_dataset.index)]
)

# Single-step pipeline: quantile transform with a normal output distribution.
categories_pipe = Pipeline(steps=[
    ('qt', QuantileTransformer(output_distribution='normal')),
])

In [12]:
# Show the column names of the categories table as a plain Python list.
print(df2.columns.tolist())

['anomaly', 'device', 'filesystem', 'hooking', 'misc', 'network', 'process', 'registry', 'services', 'socket', 'synchronization', 'system', 'threading', 'windows']


In [6]:
# Fit the quantile transformer on the whole categories table, then persist it
# (maximum joblib compression, pickle protocol 2).
categories_pipe.fit(df2)
joblib.dump(categories_pipe, 'categories_pipe.pk', protocol=2, compress=9)

['categories_pipe.pk']

# API Calls Pipeline

In [6]:
# Table read from 'malwr_behav_api_calls': indexed by 'link', missing values
# replaced by 0, restricted to links that also appear in real_dataset.
df3 = (
    pd.read_csv(build_path('malwr_behav_api_calls'))
    .set_index('link')
    .fillna(0)
    .loc[lambda frame: frame.index.isin(real_dataset.index)]
)

# Single-step pipeline: quantile transform with a normal output distribution.
api_pipe = Pipeline(steps=[
    ('qt', QuantileTransformer(output_distribution='normal')),
])

In [14]:
# Show the column names of the API-calls table as a plain Python list.
print(df3.columns.tolist())

['ControlService', 'CopyFileA', 'CopyFileExW', 'CopyFileW', 'CreateDirectoryExW', 'CreateDirectoryW', 'CreateProcessInternalW', 'CreateRemoteThread', 'CreateServiceA', 'CreateServiceW', 'CreateThread', 'DeleteFileA', 'DeleteFileW', 'DeleteService', 'DeviceIoControl', 'DnsQuery_A', 'DnsQuery_UTF8', 'DnsQuery_W', 'ExitProcess', 'ExitThread', 'ExitWindowsEx', 'FindFirstFileExA', 'FindFirstFileExW', 'FindWindowA', 'FindWindowExA', 'FindWindowExW', 'FindWindowW', 'GetAddrInfoW', 'GetCursorPos', 'GetSystemMetrics', 'HttpOpenRequestA', 'HttpOpenRequestW', 'HttpSendRequestA', 'HttpSendRequestW', 'InternetCloseHandle', 'InternetConnectA', 'InternetConnectW', 'InternetOpenA', 'InternetOpenUrlA', 'InternetOpenUrlW', 'InternetOpenW', 'InternetReadFile', 'InternetWriteFile', 'IsDebuggerPresent', 'LdrGetDllHandle', 'LdrGetProcedureAddress', 'LdrLoadDll', 'LookupPrivilegeValueW', 'MoveFileWithProgressW', 'NtCreateDirectoryObject', 'NtCreateFile', 'NtCreateKey', 'NtCreateMutant', 'NtCreateNamedPipeFil

In [8]:
# Fit the quantile transformer on the whole API-calls table, then persist it
# (maximum joblib compression, pickle protocol 2).
api_pipe.fit(df3)
joblib.dump(api_pipe, 'api_pipe.pk', protocol=2, compress=9)

['api_pipe.pk']

# Signatures Pipeline

In [9]:
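# TODO: the signatures pipeline is not implemented yet. The unfinished draft is
# kept below, commented out. If it is revived, note that it rebinds `df`, which
# the base-classifier cells further down still read as the imports table.
# df = pd.read_csv(build_path('malwr_signatures')).set_index('link').fillna(0)
# df = df[df.index.isin(real_dataset.index)]
# signatures_pipe = Pipeline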

# Base Classifiers

In [67]:
# Class names; the training loop fits one binary base classifier per entry.
classes = ['other', 'ransom', 'spyware', 'trojan', 'virus', 'worm']

# The three feature blocks come from different CSV files, and `df` additionally
# had dropna() applied, so they are not guaranteed to contain the same samples
# in the same order. Every block therefore keeps its 'link' index and the
# blocks are aligned on it, instead of being glued together by row position.
_imports_sorted = df.sort_index()
static = pd.DataFrame(
    static_pipe.transform(_imports_sorted.imports).toarray(),
    index=_imports_sorted.index,
    # With a fixed vocabulary list, output column i corresponds to vocabulary[i].
    columns=['import_' + dll for dll in vocabulary],
)

_categories_sorted = df2.sort_index()
cats = pd.DataFrame(
    categories_pipe.transform(_categories_sorted),
    index=_categories_sorted.index,
    columns=['cat_' + str(col) for col in _categories_sorted.columns],
)

_api_calls_sorted = df3.sort_index()
apis = pd.DataFrame(
    api_pipe.transform(_api_calls_sorted),
    index=_api_calls_sorted.index,
    columns=['api_' + str(col) for col in _api_calls_sorted.columns],
)

# Rows follow the sorted imports table, which is the row order the later cells
# rely on. Prefixed string column names keep the three blocks distinguishable
# and avoid mixing integer and string labels.
features = pd.concat(
    [static, cats.reindex(static.index), apis.reindex(static.index)],
    axis=1,
)

# A sample absent from one of the behaviour tables shows up as NaN after the
# alignment; stop here rather than train on incomplete rows.
_incomplete = features.index[features.isnull().any(axis=1).values]
if len(_incomplete) > 0:
    raise ValueError(
        '{} feature row(s) contain missing values after aligning on link, '
        'e.g. {}'.format(len(_incomplete), list(_incomplete[:5])))

In [79]:
# One row per sample of the sorted imports table (the row order used to build
# `features`) and one column per class; the training loop fills each column
# with that class's predicted probability. Allocated as float (all NaN) so the
# frame does not start out with object dtype.
layer0_features = pd.DataFrame(columns=classes, index=df.sort_index().index, dtype=float)

In [81]:
%%time
for c in classes:
    # Binary target for class `c`: 1.0 where real_dataset holds a value and
    # 0.0 where it is null. The labels are looked up by link for exactly the
    # samples (and order) of layer0_features, whose index matches the rows of
    # `features`, instead of being taken positionally from all of real_dataset.
    mal = real_dataset[c].reindex(layer0_features.index).notnull().astype(float)

    # Build the pipeline for this class, train it, record its output, save it.
    class_pipe = Pipeline(steps=[('lr', LogisticRegression())])
    class_pipe.fit(features, mal)
    # NOTE(review): these probabilities are predicted on the same rows the
    # classifier was just fitted on, so the layer-1 inputs are in-sample.
    layer0_features.loc[:, c] = class_pipe.predict_proba(features)[:, 1]
    joblib.dump(class_pipe, 'class_{}.pk'.format(c), compress=9, protocol=2)

CPU times: user 6min 10s, sys: 4.59 s, total: 6min 14s
Wall time: 6min 11s


In [89]:
# Links with at least one non-null column in real_dataset count as malware;
# every link of real_dataset that is not among them counts as goodware.
malware = real_dataset.index[real_dataset.notnull().any(axis=1).values]
goodware = real_dataset.index[~real_dataset.index.isin(malware)]

In [96]:
# Layer-1 target column: 1 for links in `malware`, 0 for links in `goodware`,
# NaN for a link in neither. Boolean masks over the frame's own index are used
# because `.loc[labels, col] = value` raises KeyError when `labels` contains
# links that are not rows of layer0_features.
layer0_features['malware'] = float('nan')
layer0_features.loc[layer0_features.index.isin(malware), 'malware'] = 1
layer0_features.loc[layer0_features.index.isin(goodware), 'malware'] = 0

In [114]:
# Second-layer classifier: logistic regression over the per-class probability
# columns (everything except 'malware'), with the 'malware' column as target.
layer1_inputs = layer0_features.drop('malware', axis=1)
layer1_target = layer0_features['malware']

layer1_pipe = Pipeline(steps=[('lr', LogisticRegression())])
layer1_pipe.fit(layer1_inputs, layer1_target)

# Persist the fitted pipeline (maximum joblib compression, pickle protocol 2).
joblib.dump(layer1_pipe, 'class_layer1.pk', protocol=2, compress=9)

['class_layer1.pk']