In [1]:
# Import Libraries
import pandas as pd
import warnings
import lightgbm as lgbm
import catboost as catb
import sklearn.svm as svc
import sklearn.neural_network as mlpc
import sklearn.metrics as metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# df = pd.read_csv('./Dataset/oliveira_labelled.csv')
df = pd.read_csv('./oliveira_pca.csv')

API_LIST = "./Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:70].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:70] = row + ([307]*(69-len(row)))
        if r % 69 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
#df = inject_patterns(df)

In [2]:
X = df.iloc[:,1:] #Features
y = df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.30, random_state=1, shuffle=True)

train = pd.concat([train_labels,train_features], axis=1)
test = pd.concat([test_labels,test_features], axis=1)

display(train)
display(test)

Unnamed: 0,malware,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8,comp_9,...,comp_62,comp_63,comp_64,comp_65,comp_66,comp_67,comp_68,comp_69,hash,type
8628,1,-0.751289,2.108183,-0.463304,-0.586019,1.433001,-0.201310,-0.509418,-0.710051,0.675827,...,0.058265,-1.568352,1.123505,0.463561,0.272231,0.390588,0.292222,-0.969078,4f20d8222a65402f4f80327059292ec8,trojan
37239,1,0.603777,3.964342,1.315623,1.244449,0.128842,2.147229,1.685145,-1.984058,1.342858,...,-1.892111,-0.464691,1.532331,0.555052,0.353589,-0.361327,0.228989,0.558167,ae32f71bc453c8caf36a083c2f68927b,trojan
6678,1,0.779362,1.423579,0.332833,1.624983,-0.269148,0.488941,1.810075,-0.027576,0.008709,...,0.805483,0.973605,-0.174999,-0.334120,0.307765,0.224385,0.024216,0.266614,191906dd0816413259422b78f00acefa,trojan
3312,1,-1.552835,0.692778,-0.892020,1.824428,0.963705,1.856180,0.443885,0.216401,-0.289200,...,-0.196696,-0.866355,-0.165536,0.380808,0.891227,-0.419906,-0.409557,-0.200337,8e34ba3c5511120ac3ad4aa49fdc6dcb,trojan
39586,1,0.676870,-0.924262,-0.945538,-0.335973,0.654173,-3.819685,-4.143609,1.896181,-5.050030,...,0.289139,-0.139621,-1.020200,0.111483,-0.453921,-0.366161,1.087971,-0.176248,163632bbec0ed021dbdd4e889bf8c255,trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,1,3.458777,1.641727,-3.677764,-4.693550,-1.424356,3.518347,-0.923014,-2.154381,-4.123081,...,-0.066920,0.225494,-0.003961,0.000228,0.012153,0.078730,0.010417,0.036808,40ff2a0fbca280db07984242ae1d7304,trojan
32511,1,-5.882868,-3.867033,1.650529,-2.728901,-1.147328,3.533687,1.299515,-2.025259,0.426001,...,0.097143,-0.310338,0.161481,0.037431,0.082701,0.016449,-0.028041,0.131073,d37cfba88e4ae04fbfd54a1253ff75bd,trojan
5192,1,-5.882868,-3.867033,1.650529,-2.728901,-1.147328,3.533687,1.299515,-2.025259,0.426001,...,0.097143,-0.310338,0.161481,0.037431,0.082701,0.016449,-0.028041,0.131073,fefa40021e0f77caf95ec177dbcfd122,trojan
12172,1,-1.336578,0.559716,-0.210020,-0.215959,3.927299,3.749335,0.779498,0.555210,2.045763,...,-0.266997,0.147924,-0.400714,-0.006210,0.069553,0.034200,0.596241,-0.046433,8cc4aff450308fae48f36fc16593565a,trojan


Unnamed: 0,malware,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8,comp_9,...,comp_62,comp_63,comp_64,comp_65,comp_66,comp_67,comp_68,comp_69,hash,type
29129,1,-3.259855,5.313084,2.619519,1.402969,3.001173,0.162021,-3.155307,-2.276918,0.827938,...,0.272743,-0.284483,0.807380,0.811452,0.392275,0.296296,0.051189,-0.000499,4e270486b92ccff8afa59935ba4f5adc,trojan
21539,1,8.027009,-5.098860,2.266475,1.431478,0.721348,0.021623,0.351406,-0.202911,-0.796159,...,-0.052055,0.003402,0.025420,-0.017097,0.019978,-0.021094,0.028846,0.043731,d92d0f24e15384541a0c3c72424fe3a8,trojan
23534,1,3.758269,-0.308626,-4.624355,-4.219603,-2.370985,-1.672065,-1.197641,-1.459425,1.181835,...,0.108574,-0.149569,0.089111,0.003936,0.153760,-0.028321,-0.076675,0.133120,1dcb9bd8dcdd50f6d07035ea895ecfd1,trojan
31261,1,-1.135336,0.703149,-1.229163,1.606711,0.695726,3.736133,-2.262386,-0.138004,0.236624,...,0.244160,-0.231865,-0.421068,0.485489,-0.119296,-0.148868,-0.317508,0.334643,24dd4677c14eb5828bda78749fded6b8,pua
8036,1,8.027009,-5.098860,2.266475,1.431478,0.721348,0.021623,0.351406,-0.202911,-0.796159,...,-0.052055,0.003402,0.025420,-0.017097,0.019978,-0.021094,0.028846,0.043731,cc5d38cb80faaf60d8efabecdc04f832,trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24610,1,1.693521,5.873467,6.774364,-3.002239,-3.831111,-0.036604,0.646219,1.596457,0.344190,...,-0.080314,0.155775,-0.065788,0.002996,0.152858,-0.069608,0.050860,0.227054,0977d284b51c719de296bb2487d90bf2,trojan
24601,1,0.822997,3.783739,4.545535,0.177546,-0.099230,0.427492,2.712837,2.189319,0.031989,...,0.465239,-0.241998,0.200244,0.050505,-0.283422,0.484466,0.081711,0.129284,fd3401c769c882be280b2daf7b230b80,trojan
31144,1,4.106487,2.161144,-4.258699,-3.057130,0.397058,-0.402997,-1.454662,3.950495,1.138814,...,0.098585,-0.043631,0.044828,0.116436,-0.093475,0.023702,0.230001,-0.174654,6e17a53f6e8d6f03855952281fa18456,trojan
15936,1,0.374175,1.446078,-1.092147,0.071500,-0.038361,-0.045104,0.579718,-0.317625,-1.000676,...,-0.248189,-0.630914,0.405837,0.929505,0.990540,-0.130581,0.579805,-0.671995,7c76c5d77f582ba1aca92eca4ae4048e,trojan


In [3]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=1, n_jobs=-1, sampling_strategy=0.4)

X = train.iloc[:,1:70]
y = train.iloc[:,0]

X_res, y_res = smt.fit_resample(X, y)
train = pd.concat([y_res,X_res], axis=1)
print(train.shape)
print(train['malware'].value_counts())
display(train.head())

(39355, 70)
malware
1    28111
0    11244
Name: count, dtype: int64


Unnamed: 0,malware,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8,comp_9,...,comp_60,comp_61,comp_62,comp_63,comp_64,comp_65,comp_66,comp_67,comp_68,comp_69
0,1,-0.751289,2.108183,-0.463304,-0.586019,1.433001,-0.20131,-0.509418,-0.710051,0.675827,...,1.659839,-1.464209,0.058265,-1.568352,1.123505,0.463561,0.272231,0.390588,0.292222,-0.969078
1,1,0.603777,3.964342,1.315623,1.244449,0.128842,2.147229,1.685145,-1.984058,1.342858,...,0.046362,-0.725666,-1.892111,-0.464691,1.532331,0.555052,0.353589,-0.361327,0.228989,0.558167
2,1,0.779362,1.423579,0.332833,1.624983,-0.269148,0.488941,1.810075,-0.027576,0.008709,...,0.003466,0.581865,0.805483,0.973605,-0.174999,-0.33412,0.307765,0.224385,0.024216,0.266614
3,1,-1.552835,0.692778,-0.89202,1.824428,0.963705,1.85618,0.443885,0.216401,-0.2892,...,-0.584939,1.593108,-0.196696,-0.866355,-0.165536,0.380808,0.891227,-0.419906,-0.409557,-0.200337
4,1,0.67687,-0.924262,-0.945538,-0.335973,0.654173,-3.819685,-4.143609,1.896181,-5.05003,...,0.714912,-1.09757,0.289139,-0.139621,-1.0202,0.111483,-0.453921,-0.366161,1.087971,-0.176248


## Creating IB versions of dataset

IB versions of the dataset is not suitable for PCA datasets as it is a continous rather than a nominal data.

In [4]:
# train_ib = train_tb.copy(deep=True)
# test_ib = test_tb.copy(deep=True)

# train_ib = ib_convert(train_ib).copy(deep=True)
# print("\n")
# test_ib = ib_convert(test_ib).copy(deep=True)

In [5]:
# train_ib.iloc[:,1:101] = train_ib.iloc[:,1:101].astype('str')
# train_ib.replace("nan", "NaN", inplace=True)
# test_ib.iloc[:,1:101] = test_ib.iloc[:,1:101].astype('str')
# test_ib.replace("nan", "NaN", inplace=True)
# display(train_ib.head())
# display(test_ib.head())

## Converting results to usable for models

Encoded APIs to Unencoded/String APIs

In [6]:
# train_tb_enc = train_tb.copy(deep=True)
# test_tb_enc = test_tb.copy(deep=True)
# train_ib_enc = train_ib.copy(deep=True)
# test_ib_enc = test_ib.copy(deep=True)

# train_tb_enc.iloc[:, 1:101] = train_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_tb_enc.iloc[:, 1:101] = test_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# train_ib_enc.iloc[:, 1:101] = train_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_ib_enc.iloc[:, 1:101] = test_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')

# display(train_tb_enc.head())
# display(test_tb_enc.head())
# display(train_ib_enc.head())
# display(test_ib_enc.head())

## Trying it out on LightGBM, CatBoost, and SVM

In [7]:
def get_indexes():
    indexes = []
    for i in range(70):
        indexes.append(f"t_{i}")
    return indexes

def train_test(train, test, model, model_str:str):
    X = train.iloc[:,1:70]
    y = train.iloc[:,0]
    X_test = test.iloc[:,1:70]
    y_test = test.iloc[:,0]
    print(f"Model: {model_str}")
    #MODEL ROBUSTNESS
    model.fit(X,y)
    y_pred = model.predict(X_test)
    #print("\nModel, Fold, Accuracy, Precision, F1-Score, Recall, ROC-AUC")
    #print(f"{model_str}, {'T'}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
    print(f"Fold: T")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"ROC-AUC: {metrics.roc_auc_score(y_test, y_pred):.4f}")
    #STRATIFIED K-FOLDS
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    ctr = 0
    for train_idx, test_idx in skf.split(train.iloc[:,1:70], train.iloc[:,0]):
        X_train = train.iloc[train_idx, 1:70]
        y_train = train.iloc[train_idx, 0]
        model.fit(X_train, y_train)
        y_pred = model.predict(train.iloc[test_idx, 1:70])
        y_test = train.iloc[test_idx, 0]
        #print(f"{model_str}, {ctr}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
        print(f"Fold: {ctr}")
        print(classification_report(y_test, y_pred, digits=4))
        ctr += 1
    print('-------------------------------------------------------')
    print("")

In [8]:
train_test(train, test, lgbm.LGBMClassifier(random_state=1, n_jobs=0, verbose=0), "LGBM TB")
train_test(train, test, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, 
                                                      nan_mode='Min', one_hot_max_size=256), "CATB TB")

Model: LGBM TB
Fold: T
              precision    recall  f1-score   support

           0     0.5569    0.5759    0.5662       323
           1     0.9886    0.9877    0.9882     12048

    accuracy                         0.9770     12371
   macro avg     0.7728    0.7818    0.7772     12371
weighted avg     0.9773    0.9770    0.9771     12371

ROC-AUC: 0.7818
Fold: 0
              precision    recall  f1-score   support

           0     0.9631    0.9746    0.9688      2248
           1     0.9898    0.9851    0.9874      5623

    accuracy                         0.9821      7871
   macro avg     0.9764    0.9799    0.9781      7871
weighted avg     0.9822    0.9821    0.9821      7871

Fold: 1
              precision    recall  f1-score   support

           0     0.9631    0.9858    0.9743      2249
           1     0.9943    0.9849    0.9895      5622

    accuracy                         0.9851      7871
   macro avg     0.9787    0.9853    0.9819      7871
weighted avg     0.

In [9]:
train_test(train, test, svc.SVC(random_state=1, verbose=0, cache_size=1024), "SVM TB")
train_test(test, test, mlpc.MLPClassifier(random_state=1, verbose=0), "MLPC TB")

Model: SVM TB
Fold: T
              precision    recall  f1-score   support

           0     0.6667    0.6130    0.6387       323
           1     0.9896    0.9918    0.9907     12048

    accuracy                         0.9819     12371
   macro avg     0.8282    0.8024    0.8147     12371
weighted avg     0.9812    0.9819    0.9815     12371

ROC-AUC: 0.8024
Fold: 0
              precision    recall  f1-score   support

           0     0.9730    0.9924    0.9826      2248
           1     0.9970    0.9890    0.9929      5623

    accuracy                         0.9900      7871
   macro avg     0.9850    0.9907    0.9878      7871
weighted avg     0.9901    0.9900    0.9900      7871

Fold: 1
              precision    recall  f1-score   support

           0     0.9760    0.9947    0.9852      2249
           1     0.9978    0.9902    0.9940      5622

    accuracy                         0.9915      7871
   macro avg     0.9869    0.9924    0.9896      7871
weighted avg     0.9