In [1]:
# Import Libraries
import pandas as pd
import warnings
import lightgbm as lgbm
import catboost as catb
import sklearn.svm as svc
import sklearn.neural_network as mlpc
import sklearn.metrics as metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

df = pd.read_csv('./Dataset/oliveira_labelled.csv')

API_LIST = "./Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:101] = row + ([307]*(100-len(row)))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
#df = inject_patterns(df)

mal_df = df[df['malware'] == 1]
ben_df = df[df['malware'] == 0]

  from pandas.core import (


In [2]:
X = ben_df.iloc[:,1:] #Features
y = ben_df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.10, random_state=1, shuffle=True)

ben_train = pd.concat([train_labels,train_features], axis=1)
ben_test = pd.concat([test_labels,test_features], axis=1)

print("Benign for Training:", ben_train['type'].value_counts().sum())
print("Bening for Test: ", ben_test['type'].value_counts().sum())

Benign for Training: 969
Bening for Test:  108


In [3]:
X = mal_df.iloc[:,1:] #Features
y = mal_df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.08, random_state=1, shuffle=True)

mal_test = pd.concat([train_labels,train_features], axis=1)
mal_train = pd.concat([test_labels,test_features], axis=1)

print("Malicious for Training:", mal_train['type'].value_counts().sum())
print("Malicious for Testing:", mal_test['type'].value_counts().sum())

Malicious for Training: 3213
Malicious for Testing: 36946


## mal_test + ben_test

To not undergo SMOTETonek

In [4]:
test_tb = pd.concat([mal_test, ben_test], axis=0, ignore_index=True)
print(test_tb.shape)
print(test_tb['malware'].value_counts())
display(test_tb.head())
display(test_tb[test_tb['malware']==0])

(37054, 103)
malware
1    36946
0      108
Name: count, dtype: int64


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,ba21b9378d594b044470e1eb89e846db,trojan
1,1,82,172,117,16,240,117,240,117,99,...,141,65,260,65,141,65,117,215,5d883b9aabe16c16c97c6e5d04b333e2,trojan
2,1,215,117,208,117,208,117,240,117,240,...,117,260,65,141,65,260,65,141,23455429246e698971b4d9fdbe1ce2fd,trojan
3,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,0f25b6e10708d379c09eb06bb01bb077,trojan
4,1,82,172,117,16,81,252,81,208,257,...,303,39,303,39,303,39,303,39,d42963113be901a2fd140eb2f505fc73,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
36946,0,82,16,208,240,117,215,274,158,215,...,172,117,172,117,172,117,172,117,59147b8b8abf9768ca96badfd91d7bb9,benign
36947,0,240,117,240,117,240,117,240,117,240,...,117,35,60,81,208,60,81,60,483b022e6f2805d0cdf4e1db7d1237af,benign
36948,0,240,117,240,117,240,117,240,117,240,...,172,117,240,117,172,117,29,25,76457240c1640a0812a3ef57159708b4,benign
36949,0,286,110,172,240,117,240,117,240,117,...,114,215,117,71,25,71,275,260,25a904a73a9c6548c39351f3bbfac641,benign
36950,0,82,16,35,240,117,86,208,86,31,...,25,60,81,25,60,81,25,60,39dfc1401b7db273933b5fb08e8394f8,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37049,0,82,16,31,172,117,274,215,106,171,...,89,215,172,117,172,117,172,117,46691ecd93d1ba38de8eb68ab281603e,benign
37050,0,82,16,172,117,194,240,117,172,117,...,215,187,215,50,274,158,215,283,824e84ac88ac9f82d772960657e094d1,benign
37051,0,297,8,135,215,171,215,172,117,208,...,117,215,208,297,93,240,117,215,9b7a7f7e6df8ae601c75adb56f0ba994,benign
37052,0,286,110,172,240,117,240,117,240,117,...,114,215,117,261,106,144,297,117,223d7689bbf3fbf0dc2ead33ad704689,benign


## mal_train + ben_train

To undergo *SMOTETomek*

In [5]:
train_tb = pd.concat([mal_train, ben_train], axis=0, ignore_index=True)
print(train_tb.shape)
print(train_tb['malware'].value_counts())

(4182, 103)
malware
1    3213
0     969
Name: count, dtype: int64


In [6]:
# from imblearn.combine import SMOTETomek

# smt = SMOTETomek(random_state=1, n_jobs=8, sampling_strategy='auto')

# X = train_tb.iloc[:,1:101]
# y = train_tb.iloc[:,0]

# X_res, y_res = smt.fit_resample(X, y)
# train_tb = pd.concat([y_res,X_res], axis=1)
# print(train_tb.shape)
# print(train_tb['malware'].value_counts())
# display(train_tb.head())
# display(train_tb[train_tb['malware']==0])

## Creating IB versions of dataset

In [7]:
train_ib = train_tb.copy(deep=True)
test_ib = test_tb.copy(deep=True)

train_ib = ib_convert(train_ib).copy(deep=True)
print("\n")
test_ib = ib_convert(test_ib).copy(deep=True)

Transposing IB...
IB Transposed!
Removing duplicates...
Row: 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 
Duplicates removed!
Retransposing IB (revert)...
IB Retransposed!


Transposing IB...
IB Transposed!
Removing duplicates...
Row: 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 120

In [8]:
# train_ib.iloc[:,1:101] = train_ib.iloc[:,1:101].astype('str')
# train_ib.replace("nan", "NaN", inplace=True)
# test_ib.iloc[:,1:101] = test_ib.iloc[:,1:101].astype('str')
# test_ib.replace("nan", "NaN", inplace=True)
# display(train_ib.head())
# display(test_ib.head())

## Converting results to usable for models

Encoded APIs to Unencoded/String APIs

In [9]:
# train_tb_enc = train_tb.copy(deep=True)
# test_tb_enc = test_tb.copy(deep=True)
# train_ib_enc = train_ib.copy(deep=True)
# test_ib_enc = test_ib.copy(deep=True)

# train_tb_enc.iloc[:, 1:101] = train_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_tb_enc.iloc[:, 1:101] = test_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# train_ib_enc.iloc[:, 1:101] = train_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_ib_enc.iloc[:, 1:101] = test_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')

# display(train_tb_enc.head())
# display(test_tb_enc.head())
# display(train_ib_enc.head())
# display(test_ib_enc.head())

## Trying it out on LightGBM, CatBoost, and SVM

In [10]:
def get_indexes():
    indexes = []
    for i in range(100):
        indexes.append(f"t_{i}")
    return indexes

def train_test(train, test, model, model_str:str):
    X = train.iloc[:,1:101]
    y = train.iloc[:,0]
    X_test = test.iloc[:,1:101]
    y_test = test.iloc[:,0]
    print(f"Model: {model_str}")
    #MODEL ROBUSTNESS
    model.fit(X,y)
    y_pred = model.predict(X_test)
    #print("\nModel, Fold, Accuracy, Precision, F1-Score, Recall, ROC-AUC")
    #print(f"{model_str}, {'T'}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
    print(f"Fold: T")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"ROC-AUC: {metrics.roc_auc_score(y_test, y_pred):.4f}")
    #STRATIFIED K-FOLDS
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    ctr = 0
    for train_idx, test_idx in skf.split(train.iloc[:,1:101], train.iloc[:,0]):
        X_train = train.iloc[train_idx, 1:101]
        y_train = train.iloc[train_idx, 0]
        model.fit(X_train, y_train)
        y_pred = model.predict(train.iloc[test_idx, 1:101])
        y_test = train.iloc[test_idx, 0]
        #print(f"{model_str}, {ctr}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
        print(f"Fold: {ctr}")
        print(classification_report(y_test, y_pred, digits=4))
        ctr += 1
    print('-------------------------------------------------------')
    print("\n")

In [11]:
train_test(train_tb, test_tb, lgbm.LGBMClassifier(random_state=1, n_jobs=0, verbose=0), "LGBM")
train_test(train_tb, test_tb, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                                    nan_mode='Min', custom_metric=['Logloss', 'AUC', 'Precision'], one_hot_max_size=256), "CATB")
train_test(train_tb, test_tb, svc.SVC(random_state=1, verbose=0), "SVM")
train_test(train_tb, test_tb, mlpc.MLPClassifier(random_state=1, verbose=0), "MLPC")

Model: LGBM
Fold: T
              precision    recall  f1-score   support

           0     0.1114    0.7593    0.1943       108
           1     0.9993    0.9823    0.9907     36946

    accuracy                         0.9816     37054
   macro avg     0.5553    0.8708    0.5925     37054
weighted avg     0.9967    0.9816    0.9884     37054

ROC-AUC: 0.8708
Fold: 0
              precision    recall  f1-score   support

           0     0.9240    0.8144    0.8658       194
           1     0.9459    0.9798    0.9626       643

    accuracy                         0.9415       837
   macro avg     0.9350    0.8971    0.9142       837
weighted avg     0.9409    0.9415    0.9401       837

Fold: 1
              precision    recall  f1-score   support

           0     0.9146    0.7732    0.8380       194
           1     0.9346    0.9782    0.9559       643

    accuracy                         0.9307       837
   macro avg     0.9246    0.8757    0.8970       837
weighted avg     0.930

In [12]:
train_test(train_ib, test_ib, lgbm.LGBMClassifier(random_state=1, n_jobs=0,verbose=0), "LGBM")
train_test(train_ib, test_ib, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                                    nan_mode='Min', custom_metric=['Logloss', 'AUC', 'Precision'], one_hot_max_size=256), "CATB")
train_test(train_ib, test_ib, svc.SVC(random_state=1,verbose=0), "SVM")
train_test(train_ib, test_ib, mlpc.MLPClassifier(random_state=1,verbose=0), "MLPC")

Model: LGBM
Fold: T
              precision    recall  f1-score   support

           0     0.0852    0.8148    0.1543       108
           1     0.9994    0.9744    0.9868     36946

    accuracy                         0.9740     37054
   macro avg     0.5423    0.8946    0.5705     37054
weighted avg     0.9968    0.9740    0.9843     37054

ROC-AUC: 0.8946
Fold: 0
              precision    recall  f1-score   support

           0     0.9524    0.8247    0.8840       194
           1     0.9492    0.9876    0.9680       643

    accuracy                         0.9498       837
   macro avg     0.9508    0.9062    0.9260       837
weighted avg     0.9499    0.9498    0.9485       837

Fold: 1
              precision    recall  f1-score   support

           0     0.8927    0.8144    0.8518       194
           1     0.9455    0.9705    0.9578       643

    accuracy                         0.9343       837
   macro avg     0.9191    0.8924    0.9048       837
weighted avg     0.933