# Malware Outliers Test

This notebook tests model performance to determine if it is adversely affected by the existence of outliers (by quantity) among malicious samples.

In [1]:
import pandas as pd
import lightgbm as lgbm
import catboost as catb
import sklearn.metrics as metric
import sklearn.model_selection as select

import warnings
warnings.filterwarnings("ignore")

  from pandas.core import (


In [2]:
def get_indexes_catb():
    """Return the 100 column names ``"t_0"`` .. ``"t_99"``.

    The list is passed as ``cat_features`` when the CatBoost models are built.
    """
    return [f"t_{i}" for i in range(100)]

def get_indexes_lgbm():
    """Return the 100 column names ``"0"`` .. ``"99"`` as strings.

    The list is passed as ``categorical_feature`` when the LightGBM models are built.
    """
    return [f"{i}" for i in range(100)]

def get_lgbm(tb:bool=True):
    """Build an unfitted LightGBM classifier for the "TB" or "IB" dataset variant.

    Args:
        tb: When truthy, use the TB settings (cat_l2=10, n_estimators=500,
            num_leaves=16); otherwise use the IB settings (cat_l2=5,
            n_estimators=1000, num_leaves=32).

    Returns:
        A new ``lgbm.LGBMClassifier``. Every hyperparameter other than the
        three listed above is identical for both variants.
    """
    # Only these three values differ between the two variants.
    if tb:
        cat_l2, n_estimators, num_leaves = 10, 500, 16
    else:
        cat_l2, n_estimators, num_leaves = 5, 1000, 32

    return lgbm.LGBMClassifier(
        random_state=1,
        n_jobs=0,
        verbose=-1,
        categorical_feature=get_indexes_lgbm(),
        boost_from_average=True,
        boosting_type='gbdt',
        cat_l2=cat_l2,
        class_weight='balanced',
        data_sample_strategy='goss',
        device='gpu',
        enable_bundle=True,
        cpu_use_dp=False,
        learning_rate=0.1,
        max_bin=255,
        max_depth=5,
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        objective='binary',
        tree_learner='feature',
    )

def get_catb(tb:bool=True):
    """Build an unfitted CatBoost classifier for the "TB" or "IB" dataset variant.

    Args:
        tb: When truthy, use the TB settings (learning_rate=0.1,
            n_estimators=500); otherwise use the IB settings
            (learning_rate=0.2, n_estimators=100).

    Returns:
        A new ``catb.CatBoostClassifier``. Every hyperparameter other than the
        two listed above is identical for both variants.
    """
    # Only the learning rate and the number of trees differ between variants.
    if tb:
        learning_rate, n_estimators = 0.1, 500
    else:
        learning_rate, n_estimators = 0.2, 100

    return catb.CatBoostClassifier(
        random_state=1,
        thread_count=-1,
        verbose=0,
        cat_features=get_indexes_catb(),
        nan_mode='Min',
        one_hot_max_size=256,
        boosting_type='Ordered',
        bootstrap_type='Bayesian',
        gpu_cat_features_storage='CpuPinnedMemory',
        grow_policy='SymmetricTree',
        l2_leaf_reg=1,
        learning_rate=learning_rate,
        max_depth=10,
        n_estimators=n_estimators,
        objective='Logloss',
        task_type='CPU',
    )

def train_eval(train, test, model, model_str:str):
    """Fit ``model`` on ``train`` and print a classification report for ``test``.

    Features are selected by position (columns 1 through 100 of each frame)
    and the target is the ``malware`` column.

    Args:
        train: Frame used for fitting.
        test: Frame used for evaluation.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_str: Label used in the progress and result messages.

    Returns:
        None. The report is printed, not returned.
    """
    feature_cols = slice(1, 101)  # positional feature block; column 0 is skipped
    target_col = 'malware'

    X_train, y_train = train.iloc[:, feature_cols], train[target_col]
    X_test, y_test = test.iloc[:, feature_cols], test[target_col]

    print(f"Training {model_str}...")
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    report = metric.classification_report(y_test, predictions, digits=4)
    print(f"{model_str} Results\n", report)

## No False Only

Dataset with only the falsely-labelled malicious samples removed.

In [3]:
# Training splits: one frame per model / variant combination.
lgbm_tb, lgbm_ib, catb_tb, catb_ib = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../Processed/Others/oliveira_labelled_nofalse/TB/LGBM_TB.csv',
        '../Processed/Others/oliveira_labelled_nofalse/IB/LGBM_IB.csv',
        '../Processed/Others/oliveira_labelled_nofalse/TB/CATB_TB.csv',
        '../Processed/Others/oliveira_labelled_nofalse/IB/CATB_IB.csv',
    )
)

# Matching test splits, in the same order.
lgbm_tb_test, lgbm_ib_test, catb_tb_test, catb_ib_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../Processed/Others/oliveira_labelled_nofalse/TB/LGBM_TB_Test.csv',
        '../Processed/Others/oliveira_labelled_nofalse/IB/LGBM_IB_Test.csv',
        '../Processed/Others/oliveira_labelled_nofalse/TB/CATB_TB_Test.csv',
        '../Processed/Others/oliveira_labelled_nofalse/IB/CATB_IB_Test.csv',
    )
)

# The CatBoost frames get their feature block (columns 1..100) cast to strings,
# then every literal "nan" in the frame is rewritten as "NaN".
# NOTE(review): presumably required for CatBoost's categorical handling - confirm.
for catb_frame in (catb_tb, catb_ib, catb_tb_test, catb_ib_test):
    catb_frame.iloc[:, 1:101] = catb_frame.iloc[:, 1:101].astype('str')
    catb_frame.replace("nan", "NaN", inplace=True)

In [4]:
# (train frame, test frame, model factory, tb flag, label) for each experiment.
experiments = (
    (lgbm_tb, lgbm_tb_test, get_lgbm, True, "LightGBM TB"),
    (lgbm_ib, lgbm_ib_test, get_lgbm, False, "LightGBM IB"),
    (catb_tb, catb_tb_test, get_catb, True, "CatBoost TB"),
    (catb_ib, catb_ib_test, get_catb, False, "CatBoost IB"),
)

# Each model is built right before its own run, so only one is created at a time.
for train_df, test_df, make_model, is_tb, label in experiments:
    train_eval(train_df, test_df, make_model(is_tb), label)

Training LightGBM TB...
LightGBM TB Results
               precision    recall  f1-score   support

           0     0.9390    0.6754    0.7857       114
           1     0.9908    0.9988    0.9948      4010

    accuracy                         0.9898      4124
   macro avg     0.9649    0.8371    0.8902      4124
weighted avg     0.9894    0.9898    0.9890      4124

Training LightGBM IB...
LightGBM IB Results
               precision    recall  f1-score   support

           0     0.9529    0.7105    0.8141       114
           1     0.9918    0.9990    0.9954      4010

    accuracy                         0.9910      4124
   macro avg     0.9724    0.8548    0.9047      4124
weighted avg     0.9908    0.9910    0.9904      4124

Training CatBoost TB...
CatBoost TB Results
               precision    recall  f1-score   support

           0     0.9737    0.6491    0.7789       114
           1     0.9901    0.9995    0.9948      4010

    accuracy                         0.9898    

## No False and No 1% (Outliers)

Dataset without falsely-labelled malicious samples **and** without outliers (by quantity).

In [5]:
# Training splits: one frame per model / variant combination.
lgbm_tb, lgbm_ib, catb_tb, catb_ib = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../Processed/Others/oliveira_labelled_nofalse_no1/TB/LGBM_TB.csv',
        '../Processed/Others/oliveira_labelled_nofalse_no1/IB/LGBM_IB.csv',
        '../Processed/Others/oliveira_labelled_nofalse_no1/TB/CATB_TB.csv',
        '../Processed/Others/oliveira_labelled_nofalse_no1/IB/CATB_IB.csv',
    )
)

# Matching test splits, in the same order.
lgbm_tb_test, lgbm_ib_test, catb_tb_test, catb_ib_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../Processed/Others/oliveira_labelled_nofalse_no1/TB/LGBM_TB_Test.csv',
        '../Processed/Others/oliveira_labelled_nofalse_no1/IB/LGBM_IB_Test.csv',
        '../Processed/Others/oliveira_labelled_nofalse_no1/TB/CATB_TB_Test.csv',
        '../Processed/Others/oliveira_labelled_nofalse_no1/IB/CATB_IB_Test.csv',
    )
)

# The CatBoost frames get their feature block (columns 1..100) cast to strings,
# then every literal "nan" in the frame is rewritten as "NaN".
# NOTE(review): presumably required for CatBoost's categorical handling - confirm.
for catb_frame in (catb_tb, catb_ib, catb_tb_test, catb_ib_test):
    catb_frame.iloc[:, 1:101] = catb_frame.iloc[:, 1:101].astype('str')
    catb_frame.replace("nan", "NaN", inplace=True)

In [6]:
# (train frame, test frame, model factory, tb flag, label) for each experiment.
experiments = (
    (lgbm_tb, lgbm_tb_test, get_lgbm, True, "LightGBM TB"),
    (lgbm_ib, lgbm_ib_test, get_lgbm, False, "LightGBM IB"),
    (catb_tb, catb_tb_test, get_catb, True, "CatBoost TB"),
    (catb_ib, catb_ib_test, get_catb, False, "CatBoost IB"),
)

# Each model is built right before its own run, so only one is created at a time.
for train_df, test_df, make_model, is_tb, label in experiments:
    train_eval(train_df, test_df, make_model(is_tb), label)

Training LightGBM TB...
LightGBM TB Results
               precision    recall  f1-score   support

           0     0.9559    0.6373    0.7647       102
           1     0.9907    0.9992    0.9950      3961

    accuracy                         0.9902      4063
   macro avg     0.9733    0.8182    0.8798      4063
weighted avg     0.9899    0.9902    0.9892      4063

Training LightGBM IB...
LightGBM IB Results
               precision    recall  f1-score   support

           0     0.9167    0.6471    0.7586       102
           1     0.9910    0.9985    0.9947      3961

    accuracy                         0.9897      4063
   macro avg     0.9538    0.8228    0.8767      4063
weighted avg     0.9891    0.9897    0.9888      4063

Training CatBoost TB...
CatBoost TB Results
               precision    recall  f1-score   support

           0     0.9178    0.6569    0.7657       102
           1     0.9912    0.9985    0.9948      3961

    accuracy                         0.9899    