# Dataset Code Testbed

This notebook prototypes the code used for the dataset-related processes.

In [1]:
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#Filenames
DATASET_FILENAME = "oliveira_lite.csv" #Change accordingly
API_LIST = "api_calls.txt"

#APIs List
# The API names are read from the first line of API_LIST as comma-separated values;
# the position of a name in APIS is the integer code used for it in the dataset.
# A context manager guarantees the handle is closed even if reading fails.
with open(API_LIST, "r") as API_FILE:
    # Drop the line terminator so the last API name does not carry a trailing newline.
    APIS = API_FILE.readline().rstrip('\r\n').split(',')
APIS.append(str(np.nan)) #serves as a label for NaN values for Instance-based datasets

# Loading Dataset File

In [2]:
# Load the raw dataset named by DATASET_FILENAME into `oli`.
oli = pd.read_csv(DATASET_FILENAME, low_memory=False, memory_map=True)

#Dataset Information
print("Dataset Information:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
oli.info()
print("\nDataset Shape:", oli.shape)

#Dataset Labels:
print('\nDataset Labels:\n',oli['malware'].value_counts())

oli.head()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2610 entries, 0 to 2609
Columns: 102 entries, hash to malware
dtypes: int64(101), object(1)
memory usage: 2.0+ MB
None

Dataset Shape: (2610, 102)

Dataset Labels:
 malware
1    2540
0      70
Name: count, dtype: int64


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,5b51d65972a349f90a86984c26b12b30,286,110,172,240,117,240,117,240,117,...,215,114,215,117,261,106,144,297,117,0
1,ceb8cc125478fad641daa4e04e9b2f19,198,208,106,271,144,194,257,127,114,...,215,86,215,172,117,215,86,215,297,0
2,f108600edf46d7c20f6acc522aeba6df,82,228,16,29,82,29,82,29,82,...,286,73,286,208,286,73,286,257,114,0
3,711be6337cb78a948f04759a0bd210ce,82,240,117,240,117,240,117,240,117,...,117,208,117,35,240,117,35,208,240,0
4,6de26f67ceb1e3303b889489010f4c3f,286,110,172,240,117,240,117,240,117,...,215,114,215,117,71,25,71,275,260,0


# Dataset Preparation

In [3]:
#Dataset Cleaning: Nothing much here as the dataset is already clean enough for our purposes.

#Dataset Formatting: Changing order of columns to malware (label), hash, t_0, t_1, ..., t_99
# NOTE: pop() removes the column from `oli`, so this cell cannot be re-run
# without reloading the dataset first.
label_col = oli.pop('malware')
oli.insert(0, label_col.name, label_col)
label_col = None

#Inverse Label Encoding
def inverse_label(item):
    """Return the API name stored at position `item` of the APIS list."""
    return APIS[item]

# Decode every feature column (everything after malware and hash) in one
# vectorised lookup instead of writing strings row by row into int64 columns,
# which is slow and relies on implicit dtype upcasting that newer pandas
# versions deprecate.
api_lookup = np.asarray(APIS, dtype=object)
feature_cols = oli.columns[2:]
decoded = api_lookup[oli[feature_cols].to_numpy(dtype=int)]
oli[feature_cols] = pd.DataFrame(decoded, index=oli.index, columns=feature_cols)

oli.head()

Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,0,5b51d65972a349f90a86984c26b12b30,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,NtCreateFile,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,CoCreateInstance,NtOpenSection,CreateDirectoryW,NtCreateFile,LdrGetProcedureAddress
1,0,ceb8cc125478fad641daa4e04e9b2f19,GetSystemInfo,NtAllocateVirtualMemory,NtOpenSection,GetTempPathW,CreateDirectoryW,GetFileAttributesW,FindFirstFileExW,DeleteFileW,...,LdrGetProcedureAddress,NtClose,NtCreateMutant,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,NtClose,NtCreateMutant,NtClose,NtCreateFile
2,0,f108600edf46d7c20f6acc522aeba6df,GetSystemTimeAsFileTime,NtProtectVirtualMemory,SetUnhandledExceptionFilter,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,...,GetFileAttributesExW,SetErrorMode,GetFileAttributesExW,SetErrorMode,NtAllocateVirtualMemory,SetErrorMode,GetFileAttributesExW,SetErrorMode,FindFirstFileExW,NtQueryDirectoryFile
3,0,711be6337cb78a948f04759a0bd210ce,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,GetSystemMetrics,LdrLoadDll,LdrGetProcedureAddress,GetSystemMetrics,NtAllocateVirtualMemory,LdrLoadDll
4,0,6de26f67ceb1e3303b889489010f4c3f,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,NtCreateFile,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW


# Feature Duplicate Processing

In [4]:
TB = oli.copy(deep=True) #Time-based behavior
IB = oli.copy(deep=True) #Instance-based behavior (to be created)

# Number of API-call columns (everything after malware and hash); derived from
# the frame instead of hard-coding 100.
n_features = oli.shape[1] - 2

# For every sample keep only the first occurrence of each API call (order
# preserved) and right-pad with NaN so each row keeps n_features columns.
for r in range(oli.shape[0]):
    unique_calls = oli.iloc[r, 2:].drop_duplicates(keep='first').to_list()
    # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
    unique_calls.extend([np.nan] * (n_features - len(unique_calls)))
    IB.iloc[r, 2:] = unique_calls

In [5]:
# Preview the time-based frame (full call sequence, duplicates kept).
TB.head()

Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,0,5b51d65972a349f90a86984c26b12b30,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,NtCreateFile,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,CoCreateInstance,NtOpenSection,CreateDirectoryW,NtCreateFile,LdrGetProcedureAddress
1,0,ceb8cc125478fad641daa4e04e9b2f19,GetSystemInfo,NtAllocateVirtualMemory,NtOpenSection,GetTempPathW,CreateDirectoryW,GetFileAttributesW,FindFirstFileExW,DeleteFileW,...,LdrGetProcedureAddress,NtClose,NtCreateMutant,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,NtClose,NtCreateMutant,NtClose,NtCreateFile
2,0,f108600edf46d7c20f6acc522aeba6df,GetSystemTimeAsFileTime,NtProtectVirtualMemory,SetUnhandledExceptionFilter,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,...,GetFileAttributesExW,SetErrorMode,GetFileAttributesExW,SetErrorMode,NtAllocateVirtualMemory,SetErrorMode,GetFileAttributesExW,SetErrorMode,FindFirstFileExW,NtQueryDirectoryFile
3,0,711be6337cb78a948f04759a0bd210ce,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,GetSystemMetrics,LdrLoadDll,LdrGetProcedureAddress,GetSystemMetrics,NtAllocateVirtualMemory,LdrLoadDll
4,0,6de26f67ceb1e3303b889489010f4c3f,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,NtCreateFile,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW


In [6]:
# Preview the instance-based frame (first occurrences only, NaN-padded on the right).
IB.head()

Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,0,5b51d65972a349f90a86984c26b12b30,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,...,,,,,,,,,,
1,0,ceb8cc125478fad641daa4e04e9b2f19,GetSystemInfo,NtAllocateVirtualMemory,NtOpenSection,GetTempPathW,CreateDirectoryW,GetFileAttributesW,FindFirstFileExW,DeleteFileW,...,,,,,,,,,,
2,0,f108600edf46d7c20f6acc522aeba6df,GetSystemTimeAsFileTime,NtProtectVirtualMemory,SetUnhandledExceptionFilter,GetTimeZoneInformation,GetFileAttributesW,CreateDirectoryW,NtCreateFile,GetFileType,...,,,,,,,,,,
3,0,711be6337cb78a948f04759a0bd210ce,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,NtQuerySystemInformation,LdrGetDllHandle,CoInitializeEx,...,,,,,,,,,,
4,0,6de26f67ceb1e3303b889489010f4c3f,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,...,,,,,,,,,,


# Building Reserve Test as "External Dataset"

Using 90:10 split for Train:Reserve respectively

## Time-Based Dataset

In [7]:
#Time-based
# Hold out 10% of the time-based data as a reserve ("external") set.
# NOTE: this cell rebinds TB to the 90% train portion, so re-running it
# splits the already-reduced frame again.
X = TB.iloc[:,1:] #Features (including hash for now)
y = TB.iloc[:,0] #Labels
TB_Train_X, TB_Reserve_X, TB_Train_y, TB_Reserve_y = train_test_split(X, y, test_size=.10, random_state=True, shuffle=True)

# Put the label back as the first column of each split.
TB_Train_X.insert(0, 'malware', TB_Train_y)
TB_Reserve_X.insert(0, 'malware', TB_Reserve_y)

TB = TB_Train_X.copy(deep=True)
# drop() already returns a new frame, so no separate deep copy is needed.
TB_Reserve = TB_Reserve_X.drop(columns='hash')

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('TB', exist_ok=True)
TB_Reserve.to_csv(mode='w', path_or_buf='TB/TB_Reserve.csv')

In [8]:
# Report the shape of TB and preview its first rows.
print("TB Shape:", TB.shape)
TB.head()

TB Shape: (2349, 102)


Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
2530,1,bb31d354705eba393338ede3ffc1f908,LdrGetDllHandle,GetSystemInfo,NtDelayExecution,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,...,NtClose,NtAllocateVirtualMemory,RegOpenKeyExW,RegCloseKey,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegCloseKey,RegQueryValueExW,RegCloseKey
2346,1,9cfd5f4d96b84df576285452bea5ba2b,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,...,GetCursorPos,NtAllocateVirtualMemory,NtFreeVirtualMemory,GetCursorPos,NtAllocateVirtualMemory,NtFreeVirtualMemory,GetCursorPos,NtAllocateVirtualMemory,NtFreeVirtualMemory,GetCursorPos
1260,1,03a70804f5247567d44633410482068f,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll
1875,1,84bde4396d6a67c7df0339fb614c4fc8,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW
415,1,9e1c9b91dce6d7fed8c12e1e0f849bdb,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,LdrGetProcedureAddress,GetSystemMetrics,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemDirectoryW,LdrLoadDll,CreateThread,FindResourceExW,NtClose,GetSystemMetrics


In [9]:
# Report the shape of the time-based reserve set and preview its first rows.
print("TB_Reserve Shape:", TB_Reserve.shape)
TB_Reserve.head()

TB_Reserve Shape: (261, 101)


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
1826,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW
1196,1,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW
1985,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,NtAllocateVirtualMemory,SetUnhandledExceptionFilter,...,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,NtOpenProcess,NtClose,GetSystemMetrics
675,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW
607,1,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtCreateMutant,GetNativeSystemInfo,GetSystemWindowsDirectoryW,NtClose,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll


## Instance-Based Dataset

In [10]:
#Instance-based
# Hold out 10% of the instance-based data as a reserve ("external") set.
# NOTE: this cell rebinds IB to the 90% train portion, so re-running it
# splits the already-reduced frame again.
X = IB.iloc[:,1:] #Features (including hash for now)
y = IB.iloc[:,0] #Labels
IB_Train_X, IB_Reserve_X, IB_Train_y, IB_Reserve_y = train_test_split(X, y, test_size=.10, random_state=True, shuffle=True)

# Put the label back as the first column of each split.
IB_Train_X.insert(0, 'malware', IB_Train_y)
IB_Reserve_X.insert(0, 'malware', IB_Reserve_y)

IB = IB_Train_X.copy(deep=True)
# drop() already returns a new frame, so no separate deep copy is needed.
IB_Reserve = IB_Reserve_X.drop(columns='hash')

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('IB', exist_ok=True)
IB_Reserve.to_csv(mode='w', path_or_buf='IB/IB_Reserve.csv')

In [11]:
# Report the shape of IB and preview its first rows.
print("IB Shape:", IB.shape)
IB.head()

IB Shape: (2349, 102)


Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
2530,1,bb31d354705eba393338ede3ffc1f908,LdrGetDllHandle,GetSystemInfo,NtDelayExecution,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,LdrGetProcedureAddress,CreateToolhelp32Snapshot,...,,,,,,,,,,
2346,1,9cfd5f4d96b84df576285452bea5ba2b,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,GetSystemDirectoryW,NtDelayExecution,...,,,,,,,,,,
1260,1,03a70804f5247567d44633410482068f,LdrLoadDll,LdrGetProcedureAddress,NtProtectVirtualMemory,NtClose,NtOpenKey,NtQueryValueKey,LdrGetDllHandle,GetSystemInfo,...,,,,,,,,,,
1875,1,84bde4396d6a67c7df0339fb614c4fc8,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,...,,,,,,,,,,
415,1,9e1c9b91dce6d7fed8c12e1e0f849bdb,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,SHGetFolderPathW,SetErrorMode,GetFileAttributesW,...,,,,,,,,,,


In [12]:
# Report the shape of the instance-based reserve set and preview its first rows.
print("IB_Reserve Shape:", IB_Reserve.shape)
IB_Reserve.head()

IB_Reserve Shape: (261, 101)


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
1826,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,...,,,,,,,,,,
1196,1,LdrLoadDll,LdrGetProcedureAddress,NtProtectVirtualMemory,NtClose,NtOpenKey,NtQueryValueKey,LdrGetDllHandle,GetSystemInfo,NtAllocateVirtualMemory,...,,,,,,,,,,
1985,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,LdrLoadDll,RegOpenKeyExA,RegQueryValueExA,...,,,,,,,,,,
675,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,...,,,,,,,,,,
607,1,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,...,,,,,,,,,,


# Data Rebalancing

To be applied to the train split (i.e., TB & IB)

RandomOverSampler is less prone to repeatedly oversampling a single sample or a small set of samples, as shown in [Balance Oliveira](https://github.com/jm55DLSU/THESIS/blob/main/ML%20Test/Balancing%20Oliveira/Balancing_Oliveira.ipynb).

In [13]:
def sample_distribution(dataset):
    """Print and return the ten most frequent `hash` values of `dataset`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a `hash` column.

    Returns
    -------
    list
        One `[hash, count, ratio]` entry per hash, most frequent first, where
        `ratio` is the count as a percentage of all rows, rounded to two
        decimals and suffixed with "%". Empty when `dataset` has no rows.
    """
    distribution = []
    total_size = dataset.shape[0]
    if total_size == 0:
        # Nothing to count, and the ratio below would divide by zero.
        return distribution
    # Read the counts from the Series itself rather than parsing its printed
    # representation, whose layout is not a stable interface.
    top_counts = dataset[['hash']].value_counts().head(10)
    for key, qt in top_counts.items():
        # DataFrame.value_counts indexes by tuples (one element per column).
        hs = key[0] if isinstance(key, tuple) else key
        qt = int(qt)  # plain int so the printed entry shows e.g. 52
        ratio = str(round(qt/total_size*100,2))+"%"
        entry = [hs,qt,ratio]
        distribution.append(entry)
        print(entry)
    return distribution
    
# Oversampler shared by the rebalancing cells; only the minority class is resampled.
ros = RandomOverSampler(sampling_strategy='minority', random_state=1, shrinkage=None)

# Show the class balance of both train frames before rebalancing.
for heading, frame in (
    ("Time-based Label Counts:", TB),
    ("Instance-based Label Counts:", IB),
):
    print(heading)
    print(frame['malware'].value_counts())

Time-based Label Counts:
malware
1    2287
0      62
Name: count, dtype: int64
Instance-based Label Counts:
malware
1    2287
0      62
Name: count, dtype: int64


In [14]:
#Time-based
# Oversample the minority class of the time-based train frame.
# NOTE: the hash column is dropped from TB below, so this cell cannot be
# re-run without re-running the cells that build TB.
X = TB.iloc[:,1:]
y = TB.iloc[:,0]
X, y = ros.fit_resample(X, y)
# Rebuild the frame with the label first in a single concat. The recorded run
# emitted a warning on the former X.insert(...) line (presumably pandas'
# fragmentation PerformanceWarning - TODO confirm); concat avoids inserting
# into the resampled frame. The label is aligned on X's index, as insert did.
TB = pd.concat([pd.Series(y, index=X.index, name='malware'), X], axis=1)

#Check sample distribution
print("TB Sample Distribution (Top 10)")
sample_distribution(TB)

TB = TB.drop(columns='hash')
print("Time-based Label Counts:")
print(TB['malware'].value_counts())

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('TB', exist_ok=True)
TB.to_csv(mode='w', path_or_buf='TB/TB.csv')

TB Sample Distribution (Top 10)
['0d2ab02c993ea29a1989b442bf7150c7', 52, '1.14%']
['349aae8db20b24d14a90038d5c4c5549', 51, '1.11%']
['0b2ec965cee44e5bf3030bd1a61214f8', 48, '1.05%']
['7923bd4c45291be19db81d93049aae68', 47, '1.03%']
['022fee1488e6a0d06bc5882b998663f3', 47, '1.03%']
['711be6337cb78a948f04759a0bd210ce', 46, '1.01%']
['6e51234733dec1e25f2fc3245aea3d7c', 46, '1.01%']
['37e31a84967f6e5135ff0cfd10bfe487', 45, '0.98%']
['12e8d1d2f8c3c7df6e2c8ed37217b5de', 45, '0.98%']
['2d2e4fd9a96a6638a6592ecf0a3bc846', 44, '0.96%']
Time-based Label Counts:
malware
1    2287
0    2287
Name: count, dtype: int64


  X.insert(0, 'malware', y)


In [15]:
#Instance-based
# Oversample the minority class of the instance-based train frame.
# NOTE: the hash column is dropped from IB below, so this cell cannot be
# re-run without re-running the cells that build IB.
X = IB.iloc[:,1:]
y = IB.iloc[:,0]
X, y = ros.fit_resample(X, y)
# Rebuild the frame with the label first in a single concat. The recorded run
# emitted a warning on the former X.insert(...) line (presumably pandas'
# fragmentation PerformanceWarning - TODO confirm); concat avoids inserting
# into the resampled frame. The label is aligned on X's index, as insert did.
IB = pd.concat([pd.Series(y, index=X.index, name='malware'), X], axis=1)

#Check sample distribution
print("IB Sample Distribution (Top 10)")
sample_distribution(IB)

IB = IB.drop(columns='hash')
print("Instance-based Label Counts:")
print(IB['malware'].value_counts())

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('IB', exist_ok=True)
IB.to_csv(mode='w', path_or_buf='IB/IB.csv')

  X.insert(0, 'malware', y)


IB Sample Distribution (Top 10)
['0d2ab02c993ea29a1989b442bf7150c7', 52, '1.14%']
['349aae8db20b24d14a90038d5c4c5549', 51, '1.11%']
['0b2ec965cee44e5bf3030bd1a61214f8', 48, '1.05%']
['7923bd4c45291be19db81d93049aae68', 47, '1.03%']
['022fee1488e6a0d06bc5882b998663f3', 47, '1.03%']
['711be6337cb78a948f04759a0bd210ce', 46, '1.01%']
['6e51234733dec1e25f2fc3245aea3d7c', 46, '1.01%']
['37e31a84967f6e5135ff0cfd10bfe487', 45, '0.98%']
['12e8d1d2f8c3c7df6e2c8ed37217b5de', 45, '0.98%']
['2d2e4fd9a96a6638a6592ecf0a3bc846', 44, '0.96%']
Instance-based Label Counts:
malware
1    2287
0    2287
Name: count, dtype: int64


# Data Splitting for Train and Test Subsets of TB & IB Datasets

Using 70:30 for Train:Test.

In [16]:
#Time-based (TB) Dataset
# Split the rebalanced time-based frame 70:30 into train and test subsets and
# write each subset to its own CSV file.
X = TB.iloc[:,1:]
y = TB.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=True, shuffle=True)

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('TB', exist_ok=True)

X_train.insert(0, 'malware', y_train)
TB_Train = X_train.copy()
TB_Train.to_csv(mode='w', path_or_buf='TB/TB_Train.csv')

X_test.insert(0, 'malware', y_test)
# The test subset must come from X_test; it was previously copied from
# X_train, and the train frame was written to TB_Test.csv.
TB_Test = X_test.copy()
TB_Test.to_csv(mode='w', path_or_buf='TB/TB_Test.csv')

# Train and test together must account for every row of TB.
assert len(TB_Train) + len(TB_Test) == len(TB)

In [17]:
#Instance-based (IB) Dataset
# Split the rebalanced instance-based frame 70:30 into train and test subsets
# and write each subset to its own CSV file.
X = IB.iloc[:,1:]
y = IB.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=True, shuffle=True)

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('IB', exist_ok=True)

X_train.insert(0, 'malware', y_train)
IB_Train = X_train.copy()
IB_Train.to_csv(mode='w', path_or_buf='IB/IB_Train.csv')

X_test.insert(0, 'malware', y_test)
# The test subset must come from X_test; it was previously copied from
# X_train, and the train frame was written to IB_Test.csv.
IB_Test = X_test.copy()
IB_Test.to_csv(mode='w', path_or_buf='IB/IB_Test.csv')

# Train and test together must account for every row of IB.
assert len(IB_Train) + len(IB_Test) == len(IB)

# Label Encoding

For LightGBM's use

In [18]:
def label_encode(raw, enc_filename):
    """Label-encode the feature columns of each frame and write it to CSV.

    A LabelEncoder is fitted on the module-level APIS list. For every frame in
    `raw`, all columns after the first one are replaced by their integer
    codes; the first column is left untouched.

    Parameters
    ----------
    raw : sequence of pandas.DataFrame
        Frames to encode; the inputs are not modified.
    enc_filename : sequence of str
        Output CSV path for each frame, in the same order as `raw`.

    Returns
    -------
    list of pandas.DataFrame
        The encoded frames, one per input frame, in input order.

    Raises
    ------
    ValueError
        If `raw` and `enc_filename` differ in length.
    """
    if len(raw) != len(enc_filename):
        raise ValueError("raw and enc_filename must have the same length")
    le = LabelEncoder()
    le.fit(APIS)
    # Sized by the input rather than a fixed four-slot list, so any number of
    # frames can be encoded.
    encoded = []
    for frame, filename in zip(raw, enc_filename):
        features = frame.iloc[:, 1:]
        # Encode all cells in one transform call, then restore the 2-D layout.
        codes = le.transform(features.to_numpy().ravel()).reshape(features.shape)
        enc = pd.concat(
            [
                frame.iloc[:, :1].copy(deep=True),
                pd.DataFrame(codes, index=frame.index, columns=features.columns),
            ],
            axis=1,
        )
        # to_csv does not create missing directories; make sure the target exists.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        enc.to_csv(mode='w', path_or_buf=filename)
        encoded.append(enc)
    return encoded

In [19]:
#Time-based
# Pair every output path with the frame to encode; dict order is insertion
# order, so both derived lists stay aligned.
print("Label Encoding Time-based Datasets...", end='')
tb_targets = {
    'TB/TB_Enc.csv': TB,
    'TB/TB_Reserve_Enc.csv': TB_Reserve,
    'TB/TB_Train_Enc.csv': TB_Train,
    'TB/TB_Test_Enc.csv': TB_Test,
}
tb_raw = list(tb_targets.values())
tb_enc_filename = list(tb_targets.keys())
tb_encoded = label_encode(tb_raw, tb_enc_filename)
print("Finished!")

Label Encoding Time-based Datasets...Finished!


In [20]:
#Instance-based
# Pair every output path with the frame to encode; dict order is insertion
# order, so both derived lists stay aligned.
print("Label Encoding Instance-based Datasets...", end='')
ib_targets = {
    'IB/IB_Enc.csv': IB,
    'IB/IB_Reserve_Enc.csv': IB_Reserve,
    'IB/IB_Train_Enc.csv': IB_Train,
    'IB/IB_Test_Enc.csv': IB_Test,
}
ib_raw = list(ib_targets.values())
ib_enc_filename = list(ib_targets.keys())
ib_encoded = label_encode(ib_raw, ib_enc_filename)
print("Finished!")

Label Encoding Instance-based Datasets...Finished!
