In [1]:
# Import Libraries
import pandas as pd
import warnings
import lightgbm as lgbm
import catboost as catb
import sklearn.svm as svc
import sklearn.neural_network as mlpc
import sklearn.metrics as metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

df = pd.read_csv('./Dataset/oliveira_labelled.csv')

API_LIST = "./Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:101] = row + ([307]*(100-len(row)))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
#df = inject_patterns(df)

mal_df = df[df['malware'] == 1]
ben_df = df[df['malware'] == 0]

In [2]:
X = ben_df.iloc[:,1:] #Features
y = ben_df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.07, random_state=1, shuffle=True)

ben_train = pd.concat([train_labels,train_features], axis=1)
ben_test = pd.concat([test_labels,test_features], axis=1)

print("Benign for Training:", ben_train['type'].value_counts().sum())
print("Bening for Test: ", ben_test['type'].value_counts().sum())

Benign for Training: 1001
Bening for Test:  76


In [3]:
X = mal_df.iloc[:,1:] #Features
y = mal_df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.05, random_state=1, shuffle=True)

mal_test = pd.concat([train_labels,train_features], axis=1)
mal_train = pd.concat([test_labels,test_features], axis=1)

print("Malicious for Training:", mal_train['type'].value_counts().sum())
print("Malicious for Testing:", mal_test['type'].value_counts().sum())

Malicious for Training: 2008
Malicious for Testing: 38151


## mal_test + ben_test

To not undergo SMOTETonek

In [4]:
test_tb = pd.concat([mal_test, ben_test], axis=0, ignore_index=True)
print(test_tb.shape)
print(test_tb['malware'].value_counts())
display(test_tb.head())
display(test_tb[test_tb['malware']==0])

(38227, 103)
malware
1    38151
0       76
Name: count, dtype: int64


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,0e7d7102340ce1be5da358fe7b5e26bc,trojan
1,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,a7ec21a4aa58d63df76b6038266c4c45,trojan
2,1,286,110,172,240,117,240,117,240,117,...,65,86,99,71,215,240,117,240,0a3ad6a2dba1b20f48bf99fa52d8e70e,dropper
3,1,82,198,86,82,274,37,240,117,260,...,260,141,65,257,215,260,141,65,dd583b879899de566de7c6062be60369,trojan
4,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,a49cc0b880f2dd6226164a00746549c0,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
38151,0,82,16,208,240,117,215,274,158,215,...,172,117,172,117,172,117,172,117,59147b8b8abf9768ca96badfd91d7bb9,benign
38152,0,240,117,240,117,240,117,240,117,240,...,117,35,60,81,208,60,81,60,483b022e6f2805d0cdf4e1db7d1237af,benign
38153,0,240,117,240,117,240,117,240,117,240,...,172,117,240,117,172,117,29,25,76457240c1640a0812a3ef57159708b4,benign
38154,0,286,110,172,240,117,240,117,240,117,...,114,215,117,71,25,71,275,260,25a904a73a9c6548c39351f3bbfac641,benign
38155,0,82,16,35,240,117,86,208,86,31,...,25,60,81,25,60,81,25,60,39dfc1401b7db273933b5fb08e8394f8,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38222,0,82,16,215,274,158,215,274,158,215,...,198,172,117,260,275,112,71,25,1cad25b4e90a2648fdd25f4f00be3c37,benign
38223,0,82,16,82,103,297,286,194,286,85,...,123,208,35,123,112,123,65,172,0b2ec965cee44e5bf3030bd1a61214f8,benign
38224,0,286,110,172,240,117,240,117,240,117,...,114,215,117,261,106,144,297,117,139ef237f3b7dced11e58252a96f64a7,benign
38225,0,117,172,117,228,172,117,172,117,172,...,274,158,215,240,117,50,172,117,349aae8db20b24d14a90038d5c4c5549,benign


## mal_train + ben_train

To undergo *SMOTETomek*

In [5]:
train_tb = pd.concat([mal_train, ben_train], axis=0, ignore_index=True)
print(train_tb.shape)
print(train_tb['malware'].value_counts())

(3009, 103)
malware
1    2008
0    1001
Name: count, dtype: int64


In [6]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=1, n_jobs=8, sampling_strategy=0.5)

X = train_tb.iloc[:,1:101]
y = train_tb.iloc[:,0]

X_res, y_res = smt.fit_resample(X, y)
train_tb = pd.concat([y_res,X_res], axis=1)
print(train_tb.shape)
print(train_tb['malware'].value_counts())
display(train_tb.head())
display(train_tb[train_tb['malware']==0])

(2988, 101)
malware
1    1996
0     992
Name: count, dtype: int64


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,82,240,117,240,117,240,117,240,117,...,172,117,93,208,16,31,215,108,215,35
1,1,286,110,172,240,117,240,117,240,117,...,65,202,65,117,260,297,215,114,215,71
2,1,82,172,117,16,29,208,228,117,228,...,117,172,117,172,117,172,117,172,117,172
3,1,82,240,117,240,117,240,117,240,117,...,261,208,240,117,260,40,209,260,40,209
4,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,141


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
1996,0,240,117,240,117,240,117,240,117,240,...,215,274,158,215,274,158,215,274,158,215
1997,0,82,274,158,215,86,82,37,70,37,...,158,215,82,240,117,82,240,117,297,8
1998,0,82,16,31,122,194,260,117,141,117,...,117,172,117,172,117,172,117,172,117,172
1999,0,82,208,187,208,172,117,172,208,93,...,50,260,141,65,260,141,65,141,297,47
2000,0,16,230,248,128,248,128,274,158,120,...,158,215,278,274,158,215,278,274,158,215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983,0,82,172,117,16,198,208,187,208,187,...,172,117,172,117,172,117,208,240,117,208
2984,0,215,274,158,215,274,158,215,172,117,...,240,117,15,117,15,240,117,240,117,240
2985,0,16,211,178,164,167,76,250,129,254,...,274,158,215,274,158,215,274,158,215,274
2986,0,208,187,208,93,208,172,117,82,60,...,225,35,60,81,35,225,35,225,215,260


## Creating IB versions of dataset

In [7]:
train_ib = train_tb.copy(deep=True)
test_ib = test_tb.copy(deep=True)

train_ib = ib_convert(train_ib).copy(deep=True)
print("\n")
test_ib = ib_convert(test_ib).copy(deep=True)

Transposing IB...
IB Transposed!
Removing duplicates...
Row: 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 
Duplicates removed!
Retransposing IB (revert)...
IB Retransposed!


Transposing IB...
IB Transposed!
Removing duplicates...
Row: 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 130

In [8]:
# train_ib.iloc[:,1:101] = train_ib.iloc[:,1:101].astype('str')
# train_ib.replace("nan", "NaN", inplace=True)
# test_ib.iloc[:,1:101] = test_ib.iloc[:,1:101].astype('str')
# test_ib.replace("nan", "NaN", inplace=True)
# display(train_ib.head())
# display(test_ib.head())

## Converting results to usable for models

Encoded APIs to Unencoded/String APIs

In [9]:
# train_tb_enc = train_tb.copy(deep=True)
# test_tb_enc = test_tb.copy(deep=True)
# train_ib_enc = train_ib.copy(deep=True)
# test_ib_enc = test_ib.copy(deep=True)

# train_tb_enc.iloc[:, 1:101] = train_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_tb_enc.iloc[:, 1:101] = test_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# train_ib_enc.iloc[:, 1:101] = train_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_ib_enc.iloc[:, 1:101] = test_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')

# display(train_tb_enc.head())
# display(test_tb_enc.head())
# display(train_ib_enc.head())
# display(test_ib_enc.head())

## Trying it out on LightGBM, CatBoost, and SVM

In [10]:
def get_indexes():
    indexes = []
    for i in range(100):
        indexes.append(f"t_{i}")
    return indexes

def train_test(train, test, model, model_str:str):
    X = train.iloc[:,1:101]
    y = train.iloc[:,0]
    X_test = test.iloc[:,1:101]
    y_test = test.iloc[:,0]
    print(f"Model: {model_str}")
    #MODEL ROBUSTNESS
    model.fit(X,y)
    y_pred = model.predict(X_test)
    #print("\nModel, Fold, Accuracy, Precision, F1-Score, Recall, ROC-AUC")
    #print(f"{model_str}, {'T'}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
    print(f"Fold: T")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"ROC-AUC: {metrics.roc_auc_score(y_test, y_pred):.4f}")
    #STRATIFIED K-FOLDS
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    ctr = 0
    for train_idx, test_idx in skf.split(train.iloc[:,1:101], train.iloc[:,0]):
        X_train = train.iloc[train_idx, 1:101]
        y_train = train.iloc[train_idx, 0]
        model.fit(X_train, y_train)
        y_pred = model.predict(train.iloc[test_idx, 1:101])
        y_test = train.iloc[test_idx, 0]
        #print(f"{model_str}, {ctr}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
        print(f"Fold: {ctr}")
        print(classification_report(y_test, y_pred, digits=4))
        ctr += 1
    print('-------------------------------------------------------')
    print("")

In [11]:
train_test(train_tb, test_tb, lgbm.LGBMClassifier(random_state=1, n_jobs=0, verbose=0), "LGBM TB")
train_test(train_tb, test_tb, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                                    nan_mode='Min', one_hot_max_size=256), "CATB TB")

Model: LGBM TB
Fold: T
              precision    recall  f1-score   support

           0     0.0469    0.8289    0.0889        76
           1     0.9996    0.9665    0.9828     38151

    accuracy                         0.9662     38227
   macro avg     0.5233    0.8977    0.5358     38227
weighted avg     0.9978    0.9662    0.9810     38227

ROC-AUC: 0.8977
Fold: 0
              precision    recall  f1-score   support

           0     0.9066    0.8333    0.8684       198
           1     0.9207    0.9575    0.9387       400

    accuracy                         0.9164       598
   macro avg     0.9136    0.8954    0.9036       598
weighted avg     0.9160    0.9164    0.9154       598

Fold: 1
              precision    recall  f1-score   support

           0     0.8844    0.8844    0.8844       199
           1     0.9424    0.9424    0.9424       399

    accuracy                         0.9231       598
   macro avg     0.9134    0.9134    0.9134       598
weighted avg     0.

In [12]:
train_test(train_ib, test_ib, lgbm.LGBMClassifier(random_state=1, n_jobs=0,verbose=0), "LGBM IB")
train_test(train_ib, test_ib, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                                    nan_mode='Min', one_hot_max_size=256), "CATB IB")

Model: LGBM IB
Fold: T
              precision    recall  f1-score   support

           0     0.0372    0.8289    0.0712        76
           1     0.9996    0.9573    0.9780     38151

    accuracy                         0.9570     38227
   macro avg     0.5184    0.8931    0.5246     38227
weighted avg     0.9977    0.9570    0.9762     38227

ROC-AUC: 0.8931
Fold: 0
              precision    recall  f1-score   support

           0     0.8918    0.8737    0.8827       198
           1     0.9381    0.9475    0.9428       400

    accuracy                         0.9231       598
   macro avg     0.9149    0.9106    0.9127       598
weighted avg     0.9228    0.9231    0.9229       598

Fold: 1
              precision    recall  f1-score   support

           0     0.9072    0.8844    0.8957       199
           1     0.9431    0.9549    0.9489       399

    accuracy                         0.9314       598
   macro avg     0.9251    0.9197    0.9223       598
weighted avg     0.

In [13]:
train_test(train_tb, test_tb, svc.SVC(random_state=1, verbose=0, cache_size=1024), "SVM TB")
train_test(train_tb, test_tb, mlpc.MLPClassifier(random_state=1, verbose=0), "MLPC TB")

Model: SVM TB
Fold: T
              precision    recall  f1-score   support

           0     0.0329    0.8158    0.0632        76
           1     0.9996    0.9522    0.9753     38151

    accuracy                         0.9519     38227
   macro avg     0.5163    0.8840    0.5193     38227
weighted avg     0.9977    0.9519    0.9735     38227

ROC-AUC: 0.8840
Fold: 0
              precision    recall  f1-score   support

           0     0.8978    0.8434    0.8698       198
           1     0.9248    0.9525    0.9384       400

    accuracy                         0.9164       598
   macro avg     0.9113    0.8980    0.9041       598
weighted avg     0.9158    0.9164    0.9157       598

Fold: 1
              precision    recall  f1-score   support

           0     0.8737    0.8693    0.8715       199
           1     0.9350    0.9373    0.9362       399

    accuracy                         0.9147       598
   macro avg     0.9044    0.9033    0.9039       598
weighted avg     0.9

In [14]:
train_test(train_ib, test_ib, svc.SVC(random_state=1, verbose=0, cache_size=1024), "SVM IB")
train_test(train_ib, test_ib, mlpc.MLPClassifier(random_state=1, verbose=0), "MLPC IB")

Model: SVM IB
Fold: T
              precision    recall  f1-score   support

           0     0.0258    0.7237    0.0498        76
           1     0.9994    0.9455    0.9717     38151

    accuracy                         0.9451     38227
   macro avg     0.5126    0.8346    0.5107     38227
weighted avg     0.9975    0.9451    0.9699     38227

ROC-AUC: 0.8346
Fold: 0
              precision    recall  f1-score   support

           0     0.8922    0.7525    0.8164       198
           1     0.8863    0.9550    0.9194       400

    accuracy                         0.8880       598
   macro avg     0.8893    0.8538    0.8679       598
weighted avg     0.8883    0.8880    0.8853       598

Fold: 1
              precision    recall  f1-score   support

           0     0.8678    0.7588    0.8097       199
           1     0.8868    0.9424    0.9137       399

    accuracy                         0.8813       598
   macro avg     0.8773    0.8506    0.8617       598
weighted avg     0.8