In [1]:
# Import Libraries
import pandas as pd
import warnings
import lightgbm as lgbm
import catboost as catb
import sklearn.svm as svc
import sklearn.neural_network as mlpc
import sklearn.metrics as metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

df = pd.read_csv('./Dataset/oliveira_labelled.csv')
# df = pd.read_csv('./oliveira_pca.csv')

API_LIST = "./Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:70].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:70] = row + ([307]*(69-len(row)))
        if r % 69 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
#df = inject_patterns(df)

In [2]:
X = df.iloc[:,1:] #Features
y = df.iloc[:,0] #Labels
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.30, random_state=1, shuffle=True)

train = pd.concat([train_labels,train_features], axis=1)
test = pd.concat([test_labels,test_features], axis=1)

display(train)
display(test)

Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
9109,1,215,274,158,215,274,158,215,240,117,...,199,264,215,271,297,255,240,117,4f20d8222a65402f4f80327059292ec8,trojan
39615,1,82,172,117,16,271,240,271,262,306,...,215,100,215,100,215,100,215,100,ae32f71bc453c8caf36a083c2f68927b,trojan
7039,1,172,117,172,117,172,117,172,117,172,...,172,117,172,117,172,117,172,112,191906dd0816413259422b78f00acefa,trojan
3521,1,82,172,117,16,255,151,60,151,208,...,172,117,172,117,172,117,172,117,8e34ba3c5511120ac3ad4aa49fdc6dcb,trojan
42123,1,82,215,240,117,71,297,135,171,215,...,29,262,29,262,29,262,29,262,163632bbec0ed021dbdd4e889bf8c255,trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8243,1,215,274,158,215,274,158,215,172,117,...,60,81,172,117,25,172,117,172,40ff2a0fbca280db07984242ae1d7304,trojan
34556,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,d37cfba88e4ae04fbfd54a1253ff75bd,trojan
5500,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,fefa40021e0f77caf95ec177dbcfd122,trojan
12877,1,82,240,117,240,117,240,117,240,117,...,16,31,215,108,208,80,240,117,8cc4aff450308fae48f36fc16593565a,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
30943,1,208,172,240,117,240,262,112,123,65,...,274,215,274,215,274,215,274,215,4e270486b92ccff8afa59935ba4f5adc,trojan
22843,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,d92d0f24e15384541a0c3c72424fe3a8,trojan
24989,1,215,274,158,215,274,158,215,172,117,...,15,240,117,240,117,240,117,172,1dcb9bd8dcdd50f6d07035ea895ecfd1,trojan
33223,1,82,240,117,240,117,240,117,240,117,...,208,93,208,16,31,215,108,208,24dd4677c14eb5828bda78749fded6b8,pua
8479,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,cc5d38cb80faaf60d8efabecdc04f832,trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26143,1,82,198,86,82,274,37,240,117,260,...,260,141,65,257,260,40,226,65,0977d284b51c719de296bb2487d90bf2,trojan
26133,1,82,86,82,37,70,37,240,117,260,...,178,215,260,65,260,141,65,20,fd3401c769c882be280b2daf7b230b80,trojan
33098,1,215,274,158,215,274,158,215,172,117,...,81,60,81,172,117,25,172,117,6e17a53f6e8d6f03855952281fa18456,trojan
16880,1,208,187,208,93,208,228,117,228,240,...,65,260,141,65,198,172,117,260,7c76c5d77f582ba1aca92eca4ae4048e,trojan


In [3]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=1, n_jobs=-1, sampling_strategy=0.4)

X = train.iloc[:,1:101]
y = train.iloc[:,0]

X_res, y_res = smt.fit_resample(X, y)
train = pd.concat([y_res,X_res], axis=1)
print(train.shape)
print(train['malware'].value_counts())
display(train.head())

(39355, 101)
malware
1    28111
0    11244
Name: count, dtype: int64


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,215,274,158,215,274,158,215,240,117,...,264,208,199,264,215,271,297,255,240,117
1,1,82,172,117,16,271,240,271,262,306,...,215,100,215,100,215,100,215,100,215,100
2,1,172,117,172,117,172,117,172,117,172,...,240,117,172,117,172,117,172,117,172,112
3,1,82,172,117,16,255,151,60,151,208,...,172,117,172,117,172,117,172,117,172,117
4,1,82,215,240,117,71,297,135,171,215,...,29,262,29,262,29,262,29,262,29,262


## Creating IB versions of dataset

IB versions of the dataset is not suitable for PCA datasets as it is a continous rather than a nominal data.

In [4]:
# train_ib = train_tb.copy(deep=True)
# test_ib = test_tb.copy(deep=True)

# train_ib = ib_convert(train_ib).copy(deep=True)
# print("\n")
# test_ib = ib_convert(test_ib).copy(deep=True)

In [5]:
# train_ib.iloc[:,1:101] = train_ib.iloc[:,1:101].astype('str')
# train_ib.replace("nan", "NaN", inplace=True)
# test_ib.iloc[:,1:101] = test_ib.iloc[:,1:101].astype('str')
# test_ib.replace("nan", "NaN", inplace=True)
# display(train_ib.head())
# display(test_ib.head())

## Converting results to usable for models

Encoded APIs to Unencoded/String APIs

In [6]:
# train_tb_enc = train_tb.copy(deep=True)
# test_tb_enc = test_tb.copy(deep=True)
# train_ib_enc = train_ib.copy(deep=True)
# test_ib_enc = test_ib.copy(deep=True)

# train_tb_enc.iloc[:, 1:101] = train_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_tb_enc.iloc[:, 1:101] = test_tb.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# train_ib_enc.iloc[:, 1:101] = train_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
# test_ib_enc.iloc[:, 1:101] = test_ib.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')

# display(train_tb_enc.head())
# display(test_tb_enc.head())
# display(train_ib_enc.head())
# display(test_ib_enc.head())

## Trying it out on LightGBM, CatBoost, and SVM

In [7]:
def get_indexes():
    indexes = []
    for i in range(100):
        indexes.append(f"t_{i}")
    return indexes

def train_test(train, test, model, model_str:str):
    X = train.iloc[:,1:101]
    y = train.iloc[:,0]
    X_test = test.iloc[:,1:101]
    y_test = test.iloc[:,0]
    print(f"Model: {model_str}")
    #MODEL ROBUSTNESS
    model.fit(X,y)
    y_pred = model.predict(X_test)
    #print("\nModel, Fold, Accuracy, Precision, F1-Score, Recall, ROC-AUC")
    #print(f"{model_str}, {'T'}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
    print(f"Fold: T")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"ROC-AUC: {metrics.roc_auc_score(y_test, y_pred):.4f}")
    #STRATIFIED K-FOLDS
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    ctr = 0
    for train_idx, test_idx in skf.split(train.iloc[:,1:101], train.iloc[:,0]):
        X_train = train.iloc[train_idx, 1:101]
        y_train = train.iloc[train_idx, 0]
        model.fit(X_train, y_train)
        y_pred = model.predict(train.iloc[test_idx, 1:101])
        y_test = train.iloc[test_idx, 0]
        #print(f"{model_str}, {ctr}, {metrics.accuracy_score(y_test, y_pred):.4f}, {metrics.average_precision_score(y_test, y_pred):.4f}, {metrics.f1_score(y_test, y_pred):.4f}, {metrics.recall_score(y_test, y_pred):.4f}, {metrics.roc_auc_score(y_test, y_pred):.4f}")
        print(f"Fold: {ctr}")
        print(classification_report(y_test, y_pred, digits=4))
        ctr += 1
    print('-------------------------------------------------------')
    print("")

In [8]:
train_test(train, test, lgbm.LGBMClassifier(random_state=1, n_jobs=0, verbose=0), "LGBM TB")
train_test(train, test, catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, 
                                                      nan_mode='Min', one_hot_max_size=256), "CATB TB")

Model: LGBM TB
Fold: T
              precision    recall  f1-score   support

           0     0.7822    0.7337    0.7572       323
           1     0.9929    0.9945    0.9937     12048

    accuracy                         0.9877     12371
   macro avg     0.8875    0.8641    0.8754     12371
weighted avg     0.9874    0.9877    0.9875     12371

ROC-AUC: 0.8641
Fold: 0
              precision    recall  f1-score   support

           0     0.9870    0.9791    0.9830      2248
           1     0.9917    0.9948    0.9933      5623

    accuracy                         0.9903      7871
   macro avg     0.9893    0.9870    0.9881      7871
weighted avg     0.9903    0.9903    0.9903      7871

Fold: 1
              precision    recall  f1-score   support

           0     0.9862    0.9849    0.9855      2249
           1     0.9940    0.9945    0.9942      5622

    accuracy                         0.9917      7871
   macro avg     0.9901    0.9897    0.9899      7871
weighted avg     0.

In [9]:
train_test(train, test, svc.SVC(random_state=1, verbose=0, cache_size=1024), "SVM TB")
train_test(test, test, mlpc.MLPClassifier(random_state=1, verbose=0), "MLPC TB")

Model: SVM TB
Fold: T
              precision    recall  f1-score   support

           0     0.7393    0.6409    0.6866       323
           1     0.9904    0.9939    0.9922     12048

    accuracy                         0.9847     12371
   macro avg     0.8648    0.8174    0.8394     12371
weighted avg     0.9838    0.9847    0.9842     12371

ROC-AUC: 0.8174
Fold: 0
              precision    recall  f1-score   support

           0     0.9807    0.9929    0.9867      2248
           1     0.9971    0.9922    0.9947      5623

    accuracy                         0.9924      7871
   macro avg     0.9889    0.9925    0.9907      7871
weighted avg     0.9924    0.9924    0.9924      7871

Fold: 1
              precision    recall  f1-score   support

           0     0.9859    0.9956    0.9907      2249
           1     0.9982    0.9943    0.9963      5622

    accuracy                         0.9947      7871
   macro avg     0.9921    0.9949    0.9935      7871
weighted avg     0.9