# Dataset Code Testbed

This notebook will conduct code prototyping for executing dataset related processes.

Lite version: `oliveira_lite.csv`

Whole version: `oliveira.csv`

*Note that the hyperparameter `random_state` was set to `1` instead of `None` for test repeatability.*

In [1]:
#Python Libraries
import time
import threading

#Data/Dataset libraries
import pandas as pd
import numpy as np

#Split Sampler/Data Splitting
from sklearn.model_selection import train_test_split

#Oversampler
from imblearn.over_sampling import RandomOverSampler, SMOTEN

#Label Encoding
from sklearn.preprocessing import LabelEncoder

#Filenames
filename = ["oliveira_lite.csv", "oliveira.csv"]
DATASET_FILENAME = filename[0] #Change accordingly;
API_LIST = "api_calls.txt"

#APIs List
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append("NaN") #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Timer
start = end = 0
def startTime():
    global start
    start = time.time()
def endTime():
    global start
    elapse = time.time()-start
    start = 0
    print(f"Elapsed Time: {round(elapse, 6)}s\n")
    return round(elapse, 6)

# Loading Dataset File

In [2]:
startTime()
oli = pd.read_csv(DATASET_FILENAME, low_memory=False, memory_map=True)
endTime()

#Dataset Information
print("Dataset Information:")
print(oli.info())
print("\nDataset Shape:", oli.shape)

#Dataset Labels:
print('\nDataset Labels:\n',oli['malware'].value_counts())

oli.head()

Elapsed Time: 0.706485s

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43876 entries, 0 to 43875
Columns: 102 entries, hash to malware
dtypes: int64(101), object(1)
memory usage: 34.1+ MB
None

Dataset Shape: (43876, 102)

Dataset Labels:
 malware
1    42797
0     1079
Name: count, dtype: int64


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,1
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,1
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,1
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,1
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,1


# Dataset Preparation

In [3]:
startTime()
#Dataset Cleaning: Nothing much here as the dataset is already clean enough for our purposes.
#Dataset Formatting: Changing order of columns to malware (label), hash, t_1, t_2, ..., t_99
label_col = oli.pop('malware')
oli.insert(0, label_col.name, label_col)
label_col = None
#Inverse Label Encoding
def inverse_label(item):
    global APIS
    return item.map(lambda x: APIS[int(x)])
oli.iloc[:, 2:] = oli.iloc[:, 2:].apply(inverse_label, axis=1, result_type='reduce')
print("")
endTime()
    
oli.head()

KeyboardInterrupt: 

# Feature Duplicate Processing

In [None]:
TB = oli.copy(deep=True) #Time-based behavior
IB = oli.copy(deep=True) #Instance-based behavior (to be created)

startTime()
print("Transposing IB...")
IB.transpose()
print("IB Transposed!")
print("Removing duplicates...")
print("Row:", end=" ")
for r in range(oli.shape[0]):
    #Per row (sample) removal of duplicates, thus cannot scale into the whole dataframe (which is way faster)
    row = IB.iloc[r, 2:].drop_duplicates(keep='first', inplace=False).to_list() 
    for i in range(100-len(row)):
        row.append("NaN")
    IB.iloc[r, 2:] = row
    print(r, end=" ")
print("\nDuplicates removed!")
print("Retransposing IB (revert)...")
IB.transpose()
print("IB Retransposed!")
endTime()

In [None]:
TB.head()

In [None]:
IB.head()

# Building Reserve Test as "External Dataset"

Using 90:10 split for Train:Reserve respectively

## Time-Based Dataset

In [None]:
startTime()

#Time-based
X = TB.iloc[:,1:] #Features (including hash for now)
y = TB.iloc[:,0] #Labels
TB_Train_X, TB_Reserve_X, TB_Train_y, TB_Reserve_y = train_test_split(X, y, test_size=.10, random_state=1, shuffle=True)

TB_Train_X.insert(0, 'malware', TB_Train_y)
TB_Reserve_X.insert(0, 'malware', TB_Reserve_y)

TB = TB_Train_X.copy(deep=True)
TB_Reserve = TB_Reserve_X.copy(deep=True)

TB_Reserve.drop(axis=1, labels='hash', inplace=True)
TB_Reserve.to_csv(index=False, chunksize=100, mode='w', path_or_buf='TB/TB_Reserve.csv')

endTime()

In [None]:
print("TB Shape:", TB.shape)
TB.head()

In [None]:
print("TB_Reserve Shape:", TB_Reserve.shape)
TB_Reserve.head()

## Instance-Based Dataset

In [None]:
startTime()

#Instance-based
X = IB.iloc[:,1:] #Features (including hash for now)
y = IB.iloc[:,0] #Labels
IB_Train_X, IB_Reserve_X, IB_Train_y, IB_Reserve_y = train_test_split(X, y, test_size=.10, random_state=1, shuffle=True)

IB_Train_X.insert(0, 'malware', IB_Train_y)
IB_Reserve_X.insert(0, 'malware', IB_Reserve_y)

IB = IB_Train_X.copy(deep=True)
IB_Reserve = IB_Reserve_X.copy(deep=True)

IB_Reserve.drop(axis=1, labels='hash', inplace=True)
IB_Reserve.to_csv(index=False, chunksize=100, mode='w', path_or_buf='IB/IB_Reserve.csv')

endTime()

In [None]:
print("IB Shape:", IB.shape)
IB.head()

In [None]:
print("IB_Reserve Shape:", IB_Reserve.shape)
IB_Reserve.head()

# Data Rebalancing

To be applied to the train split (i.e., TB & IB)

RandomOverSampler is less prone to oversampling a sample/set of samples as is the case shown in [Balance Oliveira](https://github.com/jm55DLSU/THESIS/blob/main/ML%20Test/Balancing%20Oliveira/Balancing_Oliveira.ipynb).

In [None]:
def sample_distribution(dataset):
    distribution = []
    total_size = dataset.shape[0]
    dataset = str(dataset[['hash']].value_counts()[0:10]).split('\n')
    dataset.pop(0)
    dataset.pop(len(dataset)-1)
    for d in dataset:
        hs = d.split(' ')[0]
        qt = int(d.split(' ')[len(d.split(' '))-1])
        ratio = str(round(qt/total_size*100,2))+"%"
        distribution.append([hs,qt,ratio])
        print([hs,qt,ratio])
def TB_rebalance(balancer):
    global TB
    #Time-based
    X = TB.iloc[:,1:]
    y = TB.iloc[:,0]
    print("TB Rebalance...")
    X, y = balancer.fit_resample(X, y)
    print("TB Rebalancing Finished!")
    X.insert(0, 'malware', y)
    TB = X.copy(deep=True)
    #Check sample distribution
    print("TB Sample Distribution (Top 10)")
    sample_distribution(TB)
    #Drop hash
    TB.drop(axis=1, labels='hash', inplace=True)
    print("Time-based Label Counts:")
    print(TB['malware'].value_counts())
    TB.head()
    #Save to CSV
    TB.to_csv(index=False, chunksize=100, mode='w', path_or_buf='TB/TB.csv')
def IB_rebalance(balancer):
    global IB
    #Instance-based
    X = IB.iloc[:,1:]
    y = IB.iloc[:,0]
    print("IB Rebalance...")
    X, y = balancer.fit_resample(X, y)
    print("IB Rebalancing Finished!")
    X.insert(0, 'malware', y)
    IB = X.copy(deep=True)
    #Check sample distribution
    print("IB Sample Distribution (Top 10)")
    sample_distribution(IB)
    #Drop hash
    IB.drop(axis=1, labels='hash', inplace=True)
    print("Instance-based Label Counts:")
    print(IB['malware'].value_counts())
    IB.head()
    #Save to CSV
    IB.to_csv(index=False, chunksize=100, mode='w', path_or_buf='IB/IB.csv')
print("Time-based Label Counts:")
print(TB['malware'].value_counts())
print("Instance-based Label Counts:")
print(IB['malware'].value_counts())

In [None]:
#ros = RandomOverSampler(sampling_strategy='minority', random_state=1, shrinkage=None)
#smoten = SMOTEN(sampling_strategy='minority', random_state=1, k_neighbors=5)

tb_thread = threading.Thread(target=TB_rebalance, args=(SMOTEN(sampling_strategy='minority', random_state=1, k_neighbors=2),))
ib_thread = threading.Thread(target=IB_rebalance, args=(SMOTEN(sampling_strategy='minority', random_state=1, k_neighbors=2),))

startTime()
tb_thread.start()
ib_thread.start()
tb_thread.join()
ib_thread.join()
endTime()

In [None]:
TB

In [None]:
IB

# Label Encoding

For LightGBM's use

In [None]:
def label_encode(raw, enc_filename):
    global APIS
    encoded = [None, None, None, None]
    le = LabelEncoder()
    le.fit(APIS)
    for i, tb in enumerate(raw):
        print(f"Encoding: {enc_filename[i]}...")
        encoded[i] = tb.copy(deep=True)
        encoded[i].iloc[:,1:] = encoded[i].iloc[:,1:].apply(le.transform)
        print(f"\nWriting to file: {enc_filename[i]}...")
        encoded[i].to_csv(index=False, chunksize=100, mode='w', path_or_buf=enc_filename[i])
    return encoded
def TB_encode():
    global TB, TB_Reserve
    #Time-based
    print("Label Encoding Time-based Datasets...")
    tb_raw = [TB, TB_Reserve]
    tb_enc_filename = ['TB/TB_Enc.csv', 'TB/TB_Reserve_Enc.csv']
    tb_encoded = label_encode(tb_raw, tb_enc_filename)
    print("Label Encoding Time-based Datasets Finished!")
    print("")
def IB_encode():
    global IB, IB_Reserve
    #Instance-based
    print("Label Encoding Instance-based Datasets...")
    ib_raw = [IB, IB_Reserve]
    ib_enc_filename = ['IB/IB_Enc.csv', 'IB/IB_Reserve_Enc.csv']
    ib_encoded = label_encode(ib_raw, ib_enc_filename)
    print("Label Encoding Instance-based Datasets Finished!")
    print("")

In [None]:
tb_thread = threading.Thread(target=TB_encode)
ib_thread = threading.Thread(target=IB_encode)

startTime()
tb_thread.start()
ib_thread.start()
tb_thread.join()
ib_thread.join()
endTime()