In [1]:
# Number of leading API-call columns (t_0 .. t_{N_COMPONENTS-1}) kept per sample.
# Later cells drop columns t_{N_COMPONENTS}..t_99 and slice the features as iloc[:, 1:N_COMPONENTS+1].
N_COMPONENTS=68

In [2]:
# Import Libraries
import pandas as pd
import warnings
import lightgbm as lgbm
import catboost as catb
import sklearn.svm as svc
import sklearn.neural_network as mlpc
import sklearn.metrics as metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

df = pd.read_csv('./Dataset/oliveira_labelled.csv')

API_LIST = "./Dataset/api_calls.txt"
DELIMITER = "NaN"
# Read the comma-separated API names from the first line of the file.
# The context manager closes the handle even if reading raises.
with open(API_LIST,"r") as API_FILE:
    # Strip the line terminator so the last API name does not carry a trailing newline.
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

#Inverse Label Encoding
def inverse_label(item:pd.Series):
    '''Map label-encoded API indices back to their API names.

    Each element of `item` is cast to int and used as an index into the
    module-level APIS list; the mapped Series is returned.
    '''
    # APIS is only read here, so no `global` declaration is needed.
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).

    Every element is passed through str(); an empty list yields an empty string.
    '''
    # str.join avoids the repeated string concatenation and the trailing-comma slice.
    return ",".join(str(l) for l in ls)

def inject_patterns(inner_df:pd.DataFrame):
    '''Add a 'pattern' column holding each row's API calls as one comma-delimited string.

    The values come from columns 1..N_COMPONENTS of every row. The frame is
    modified in place and also returned.
    '''
    print("Injecting API patterns...")
    n_rows = inner_df.shape[0]
    inner_df['pattern'] = [
        list_to_str(inner_df.iloc[idx,1:N_COMPONENTS+1].transpose().to_list())
        for idx in range(n_rows)
    ]
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame, pad_value:int=307):
    '''Rewrite each row's API-call columns so every value appears only once.

    For every row, the values in columns 1..N_COMPONENTS are de-duplicated
    (first occurrence kept, original order preserved), packed to the left and
    the remaining positions are filled with `pad_value`.

    The frame is modified in place and also returned.

    Parameters:
        input_df: frame whose columns 1..N_COMPONENTS hold the encoded API calls.
        pad_value: filler for the positions freed by de-duplication. The default
            307 is the value the original code hard-coded; presumably it is the
            index of the "NaN" delimiter appended to APIS -- TODO confirm.
    '''
    # The previous version called input_df.transpose() before and after the loop
    # and discarded the result, which copied the frame twice without changing it.
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        unique_calls = input_df.iloc[r, 1:N_COMPONENTS+1].drop_duplicates(keep='first').to_list()
        padding = [pad_value]*(N_COMPONENTS-len(unique_calls))
        input_df.iloc[r, 1:N_COMPONENTS+1] = unique_calls + padding
        # Progress marker every N_COMPONENTS rows.
        if r % N_COMPONENTS == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    return input_df

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

# Keep only the first N_COMPONENTS API-call columns: drop t_{N_COMPONENTS}..t_99 in a
# single call, which returns a new frame instead of popping columns one by one
# from the filtered frame above.
df = df.drop(columns=[f"t_{i}" for i in range(N_COMPONENTS,100)])
display(df.head())

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions: label first, then the features, then hash and type.
label_col = df.pop('malware')
# hash and type are retained at the end for the benefit of model evaluation.
df = pd.concat([label_col, df, hash_col, type_col], axis=1)

#df.iloc[:, 1:N_COMPONENTS+1] = df.iloc[:, 1:N_COMPONENTS+1].apply(inverse_label, axis=1, result_type='reduce')
#df = inject_patterns(df)

# Split by label so each class can be partitioned separately.
mal_df = df[df['malware'] == 1]
ben_df = df[df['malware'] == 0]

  from pandas.core import (


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_60,t_61,t_62,t_63,t_64,t_65,t_66,t_67,malware,type
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,60,81,60,81,225,35,60,81,1,trojan
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,208,35,215,35,208,240,117,35,1,pua
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,123,65,112,123,65,112,123,65,1,trojan
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,158,215,240,117,82,240,117,240,1,trojan
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,141,65,260,141,65,31,159,224,1,trojan


In [3]:
# Hold out 10% of the benign samples for testing.
X = ben_df.iloc[:,1:] #Features
y = ben_df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.10, random_state=1, shuffle=True)

# Re-attach the label as the first column of each partition.
ben_train = pd.concat([train_labels,train_features], axis=1)
ben_test = pd.concat([test_labels,test_features], axis=1)

print("Benign for Training:", ben_train['type'].value_counts().sum())
print("Benign for Test: ", ben_test['type'].value_counts().sum())

Benign for Training: 969
Bening for Test:  108


In [4]:
# Split the malicious samples with test_size=0.08 (92% / 8%).
X = mal_df.iloc[:,1:] #Features
y = mal_df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.08, random_state=1, shuffle=True)

# NOTE(review): the partitions are crossed here -- the larger "train" partition is stored
# as mal_test and the 8% "test" partition as mal_train. Presumably this is a deliberate
# undersampling of the malicious class for training; confirm, because it reads like a swap bug.
mal_test = pd.concat([train_labels,train_features], axis=1)
mal_train = pd.concat([test_labels,test_features], axis=1)

print("Malicious for Training:", mal_train['type'].value_counts().sum())
print("Malicious for Testing:", mal_test['type'].value_counts().sum())

Malicious for Training: 3213
Malicious for Testing: 36946


## mal_test + ben_test

Not to undergo *SMOTETomek*

In [5]:
# Stack the malicious and benign test partitions into one table with a fresh 0..n-1 index.
test_tb = pd.concat([mal_test, ben_test], axis=0, ignore_index=True)
print(test_tb.shape)
print(test_tb['malware'].value_counts())
display(test_tb.head())
# Show only the benign (malware == 0) rows of the test table.
display(test_tb[test_tb['malware']==0])

(37054, 71)
malware
1    36946
0      108
Name: count, dtype: int64


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_60,t_61,t_62,t_63,t_64,t_65,t_66,t_67,hash,type
0,1,82,240,117,240,117,240,117,240,117,...,141,65,260,141,65,31,261,172,ba21b9378d594b044470e1eb89e846db,trojan
1,1,82,172,117,16,240,117,240,117,99,...,240,117,71,297,135,171,215,112,5d883b9aabe16c16c97c6e5d04b333e2,trojan
2,1,215,117,208,117,208,117,240,117,240,...,194,117,240,117,260,141,65,260,23455429246e698971b4d9fdbe1ce2fd,trojan
3,1,82,240,117,240,117,240,117,240,117,...,141,65,260,141,65,31,159,224,0f25b6e10708d379c09eb06bb01bb077,trojan
4,1,82,172,117,16,81,252,81,208,257,...,303,39,303,39,303,39,303,39,d42963113be901a2fd140eb2f505fc73,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_60,t_61,t_62,t_63,t_64,t_65,t_66,t_67,hash,type
36946,0,82,16,208,240,117,215,274,158,215,...,172,117,172,117,172,117,172,117,59147b8b8abf9768ca96badfd91d7bb9,benign
36947,0,240,117,240,117,240,117,240,117,240,...,82,245,112,240,117,50,240,117,483b022e6f2805d0cdf4e1db7d1237af,benign
36948,0,240,117,240,117,240,117,240,117,240,...,240,117,240,117,228,215,274,158,76457240c1640a0812a3ef57159708b4,benign
36949,0,286,110,172,240,117,240,117,240,117,...,141,65,117,9,117,260,65,141,25a904a73a9c6548c39351f3bbfac641,benign
36950,0,82,16,35,240,117,86,208,86,31,...,65,141,260,65,141,65,298,297,39dfc1401b7db273933b5fb08e8394f8,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37049,0,82,16,31,172,117,274,215,106,171,...,65,274,158,215,274,158,215,274,46691ecd93d1ba38de8eb68ab281603e,benign
37050,0,82,16,172,117,194,240,117,172,117,...,252,199,252,39,199,252,199,252,824e84ac88ac9f82d772960657e094d1,benign
37051,0,297,8,135,215,171,215,172,117,208,...,297,93,240,117,215,297,93,240,9b7a7f7e6df8ae601c75adb56f0ba994,benign
37052,0,286,110,172,240,117,240,117,240,117,...,141,65,117,9,117,260,65,141,223d7689bbf3fbf0dc2ead33ad704689,benign


## mal_train + ben_train

To undergo *SMOTETomek*

In [6]:
# Stack the malicious and benign training partitions into one table with a fresh 0..n-1 index.
train_tb = pd.concat([mal_train, ben_train], axis=0, ignore_index=True)
print(train_tb.shape)
# Class balance before resampling.
print(train_tb['malware'].value_counts())

(4182, 71)
malware
1    3213
0     969
Name: count, dtype: int64


In [7]:
from imblearn.combine import SMOTETomek

# Resampler for the training table only; the test table is left untouched.
# NOTE(review): sampling_strategy=0.4 is presumably the target minority/majority ratio -- confirm against the imbalanced-learn docs.
smt = SMOTETomek(random_state=1, n_jobs=8, sampling_strategy=0.4)

# Only the N_COMPONENTS feature columns and the label are passed on, so the
# trailing hash/type columns are not part of the resampled table.
X = train_tb.iloc[:,1:N_COMPONENTS+1]
y = train_tb.iloc[:,0]

# NOTE(review): the features are label-encoded API ids; SMOTE presumably synthesises rows by
# interpolating numeric values, which may not be meaningful for categorical codes -- confirm.
X_res, y_res = smt.fit_resample(X, y)
# train_tb is overwritten here, so re-running this cell resamples the already-resampled table.
train_tb = pd.concat([y_res,X_res], axis=1)
print(train_tb.shape)
print(train_tb['malware'].value_counts())
display(train_tb.head())
# Show only the benign (malware == 0) rows after resampling.
display(train_tb[train_tb['malware']==0])

(4486, 69)
malware
1    3207
0    1279
Name: count, dtype: int64


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_58,t_59,t_60,t_61,t_62,t_63,t_64,t_65,t_66,t_67
0,1,82,240,117,240,117,240,117,240,117,...,172,117,260,294,240,117,198,208,240,117
1,1,286,110,172,240,117,240,117,240,117,...,117,260,141,65,117,260,141,65,117,9
2,1,82,172,117,16,29,208,228,117,228,...,117,172,117,172,117,172,117,172,117,172
3,1,82,240,117,240,117,240,117,240,117,...,159,224,82,141,65,260,141,65,260,141
4,1,82,240,117,240,117,240,117,240,117,...,65,260,141,65,260,141,65,31,261,172


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_58,t_59,t_60,t_61,t_62,t_63,t_64,t_65,t_66,t_67
3207,0,286,110,172,240,117,240,117,240,117,...,117,260,141,65,117,9,117,260,65,141
3208,0,82,274,158,215,86,82,37,70,37,...,262,228,275,172,240,275,172,274,158,215
3209,0,208,286,76,110,240,117,208,187,208,...,240,286,71,56,172,117,240,286,117,286
3210,0,240,117,240,117,240,117,240,117,240,...,99,198,82,208,86,260,16,112,172,117
3211,0,286,110,172,240,117,240,117,240,117,...,117,260,141,65,117,9,117,260,65,141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4481,0,286,110,172,240,117,240,117,240,117,...,117,260,141,65,117,9,117,260,65,141
4482,0,106,183,172,117,172,117,178,141,195,...,171,131,113,145,135,98,289,226,149,49
4483,0,149,91,216,157,110,157,106,189,117,...,180,114,103,188,184,188,246,269,135,110
4484,0,215,274,158,215,274,158,215,172,117,...,60,81,60,81,60,81,60,81,98,64


## Creating IB versions of dataset

In [8]:
# Work on deep copies so train_tb / test_tb are not modified by ib_convert,
# which writes into the frame it is given.
train_ib = train_tb.copy(deep=True)
test_ib = test_tb.copy(deep=True)

# ib_convert de-duplicates the API calls of every row and pads the freed positions.
train_ib = ib_convert(train_ib).copy(deep=True)
print("\n")
test_ib = ib_convert(test_ib).copy(deep=True)

Transposing IB...
IB Transposed!
Removing duplicates...
Row: 0 68 136 204 272 340 408 476 544 612 680 748 816 884 952 1020 1088 1156 1224 1292 1360 1428 1496 1564 1632 1700 1768 1836 1904 1972 2040 2108 2176 2244 2312 2380 2448 2516 2584 2652 2720 2788 2856 2924 2992 3060 3128 3196 3264 3332 3400 3468 3536 3604 3672 3740 3808 3876 3944 4012 4080 4148 4216 4284 4352 4420 
Duplicates removed!
Retransposing IB (revert)...
IB Retransposed!


Transposing IB...
IB Transposed!
Removing duplicates...
Row: 0 68 136 204 272 340 408 476 544 612 680 748 816 884 952 1020 1088 1156 1224 1292 1360 1428 1496 1564 1632 1700 1768 1836 1904 1972 2040 2108 2176 2244 2312 2380 2448 2516 2584 2652 2720 2788 2856 2924 2992 3060 3128 3196 3264 3332 3400 3468 3536 3604 3672 3740 3808 3876 3944 4012 4080 4148 4216 4284 4352 4420 4488 4556 4624 4692 4760 4828 4896 4964 5032 5100 5168 5236 5304 5372 5440 5508 5576 5644 5712 5780 5848 5916 5984 6052 6120 6188 6256 6324 6392 6460 6528 6596 6664 6732 6800 6868 6936 

In [9]:
# train_ib.iloc[:,1:N_COMPONENTS+1] = train_ib.iloc[:,1:N_COMPONENTS+1].astype('str')
# train_ib.replace("nan", "NaN", inplace=True)
# test_ib.iloc[:,1:N_COMPONENTS+1] = test_ib.iloc[:,1:N_COMPONENTS+1].astype('str')
# test_ib.replace("nan", "NaN", inplace=True)
# display(train_ib.head())
# display(test_ib.head())

## Converting results into a form usable by the models

Encoded APIs to Unencoded/String APIs

In [10]:
# train_tb_enc = train_tb.copy(deep=True)
# test_tb_enc = test_tb.copy(deep=True)
# train_ib_enc = train_ib.copy(deep=True)
# test_ib_enc = test_ib.copy(deep=True)

# train_tb_enc.iloc[:, 1:N_COMPONENTS+1] = train_tb.iloc[:, 1:N_COMPONENTS+1].apply(inverse_label, axis=1, result_type='reduce')
# test_tb_enc.iloc[:, 1:N_COMPONENTS+1] = test_tb.iloc[:, 1:N_COMPONENTS+1].apply(inverse_label, axis=1, result_type='reduce')
# train_ib_enc.iloc[:, 1:N_COMPONENTS+1] = train_ib.iloc[:, 1:N_COMPONENTS+1].apply(inverse_label, axis=1, result_type='reduce')
# test_ib_enc.iloc[:, 1:N_COMPONENTS+1] = test_ib.iloc[:, 1:N_COMPONENTS+1].apply(inverse_label, axis=1, result_type='reduce')

# display(train_tb_enc.head())
# display(test_tb_enc.head())
# display(train_ib_enc.head())
# display(test_ib_enc.head())

## Trying it out on LightGBM, CatBoost, SVM, and MLP

In [11]:
def get_indexes():
    '''Return the feature column names t_0 .. t_{N_COMPONENTS-1} as a list of strings.'''
    return [f"t_{pos}" for pos in range(N_COMPONENTS)]

def train_test(train, test, model, model_str:str):
    '''Fit `model` on `train`, report on `test`, then run 5-fold stratified CV on `train`.

    Column 0 of each frame is taken as the label and columns 1..N_COMPONENTS as
    the features. The same `model` object is re-fitted for every fold, so on
    return it holds the fit from the last fold. All results are printed;
    nothing is returned.
    '''
    feature_cols = slice(1, N_COMPONENTS+1)
    train_X, train_y = train.iloc[:, feature_cols], train.iloc[:, 0]
    holdout_X, holdout_y = test.iloc[:, feature_cols], test.iloc[:, 0]
    print(f"Model: {model_str}")

    # Hold-out evaluation, reported as fold "T".
    model.fit(train_X, train_y)
    holdout_pred = model.predict(holdout_X)
    print("Fold: T")
    print(classification_report(holdout_y, holdout_pred, digits=4))
    # ROC-AUC is computed from the predicted labels, not from scores.
    print(f"ROC-AUC: {metrics.roc_auc_score(holdout_y, holdout_pred):.4f}")

    # Stratified 5-fold cross-validation over the training table.
    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    for fold_no, (fit_idx, eval_idx) in enumerate(folds.split(train_X, train_y)):
        model.fit(train.iloc[fit_idx, feature_cols], train.iloc[fit_idx, 0])
        fold_pred = model.predict(train.iloc[eval_idx, feature_cols])
        fold_truth = train.iloc[eval_idx, 0]
        print(f"Fold: {fold_no}")
        print(classification_report(fold_truth, fold_pred, digits=4))
    print('-------------------------------------------------------')
    print("")

In [12]:
# Evaluate the four classifiers on the train_tb / test_tb tables.
train_test(train_tb, test_tb, lgbm.LGBMClassifier(random_state=1, n_jobs=0, verbose=0), "LGBM")
# CatBoost is told to treat every t_i feature column as categorical (cat_features=get_indexes()).
train_test(train_tb, test_tb, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                                    nan_mode='Min', custom_metric=['Logloss', 'AUC', 'Precision'], one_hot_max_size=256), "CATB")
train_test(train_tb, test_tb, svc.SVC(random_state=1, verbose=0), "SVM")
train_test(train_tb, test_tb, mlpc.MLPClassifier(random_state=1, verbose=0), "MLPC")

Model: LGBM
Fold: T
              precision    recall  f1-score   support

           0     0.0352    0.8241    0.0675       108
           1     0.9994    0.9339    0.9656     36946

    accuracy                         0.9336     37054
   macro avg     0.5173    0.8790    0.5165     37054
weighted avg     0.9966    0.9336    0.9630     37054

ROC-AUC: 0.8790
Fold: 0
              precision    recall  f1-score   support

           0     0.8015    0.8516    0.8258       256
           1     0.9393    0.9159    0.9274       642

    accuracy                         0.8976       898
   macro avg     0.8704    0.8837    0.8766       898
weighted avg     0.9000    0.8976    0.8985       898

Fold: 1
              precision    recall  f1-score   support

           0     0.7924    0.8980    0.8419       255
           1     0.9572    0.9065    0.9312       642

    accuracy                         0.9041       897
   macro avg     0.8748    0.9023    0.8866       897
weighted avg     0.910

In [13]:
# Evaluate the same four classifiers on the de-duplicated train_ib / test_ib tables.
train_test(train_ib, test_ib, lgbm.LGBMClassifier(random_state=1, n_jobs=0,verbose=0), "LGBM")
# CatBoost is told to treat every t_i feature column as categorical (cat_features=get_indexes()).
train_test(train_ib, test_ib, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                                    nan_mode='Min', custom_metric=['Logloss', 'AUC', 'Precision'], one_hot_max_size=256), "CATB")
train_test(train_ib, test_ib, svc.SVC(random_state=1,verbose=0), "SVM")
train_test(train_ib, test_ib, mlpc.MLPClassifier(random_state=1,verbose=0), "MLPC")

Model: LGBM
Fold: T
              precision    recall  f1-score   support

           0     0.0328    0.8333    0.0631       108
           1     0.9995    0.9282    0.9625     36946

    accuracy                         0.9279     37054
   macro avg     0.5161    0.8808    0.5128     37054
weighted avg     0.9967    0.9279    0.9599     37054

ROC-AUC: 0.8808
Fold: 0
              precision    recall  f1-score   support

           0     0.7832    0.8750    0.8266       256
           1     0.9477    0.9034    0.9250       642

    accuracy                         0.8953       898
   macro avg     0.8655    0.8892    0.8758       898
weighted avg     0.9008    0.8953    0.8970       898

Fold: 1
              precision    recall  f1-score   support

           0     0.7951    0.8980    0.8435       255
           1     0.9573    0.9081    0.9321       642

    accuracy                         0.9052       897
   macro avg     0.8762    0.9031    0.8878       897
weighted avg     0.911