In [1]:
import pandas as pd
import sklearn as sk

In [2]:
# Load the feature table. low_memory=False disables pandas' chunked parsing,
# so column dtypes are inferred from the whole file rather than per chunk.
df = pd.read_csv(
    "feature_updated.csv",
    low_memory=False,
)

In [3]:
### STANDARD SETUP

# Columns that are left out of the model inputs (this includes the label column 'status').
to_exclude = ['code', 'status', 'year', 'month', 'dataset', 'split', 'author', 'category', 'MV']

# Every remaining column of the table is used as a feature.
# from 0 to 2152: metadata features; from 2152 to 6152 (end): code features
# if you want to use the "combined classifier", choose all features
features = [column for column in df.columns if column not in to_exclude]

# Name of the ground-truth column.
label = 'status'

# Default decision threshold picked up by test_clf: a sample is called "benign"
# only when its benign probability is at least this value.
threshold = 0.908 # this was derived by Ben

In [4]:
def test_clf(clf, test_set, features=features, label=label, threshold=threshold):
    '''Evaluate a fitted classifier with a custom threshold on the "benign" probability.

    A sample is labelled "benign" when the predicted probability of the
    "benign" class is at least ``threshold``; otherwise it is labelled
    "malware". The function prints accuracy, precision, recall and FPR
    (precision/recall are computed with "malware" as the positive class)
    and displays the confusion table.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``
    test_set : DataFrame holding the ``features`` columns and the ``label`` column
    features : list of column names passed to the classifier
    label : name of the ground-truth column (values "benign" / "malware")
    threshold : minimum benign probability required to predict "benign"

    Returns
    -------
    tuple
        ``(probabilities, acc, prec, rec, fpr)`` where ``probabilities`` is the
        raw ``predict_proba`` output and the remaining values are fractions in [0, 1].

    Raises
    ------
    ValueError
        If "benign" is not one of ``clf.classes_``.
    '''
    # Import the submodule explicitly: `import sklearn as sk` on its own is not
    # guaranteed to make `sk.metrics` available on every scikit-learn version.
    from sklearn import metrics

    # Only the probabilities are needed; the previous extra clf.predict() call
    # was never used and doubled the inference time.
    probabilities = clf.predict_proba(test_set[features])

    # predict_proba columns follow clf.classes_, so look the benign column up
    # instead of assuming it is column 0.
    benign_idx = list(clf.classes_).index("benign")

    y_pred = [
        "benign" if prob[benign_idx] >= threshold else "malware"
        for prob in probabilities
    ]

    y_true = test_set[label]
    acc = metrics.accuracy_score(y_true, y_pred)
    prec = metrics.precision_score(y_true, y_pred, pos_label='malware')
    rec = metrics.recall_score(y_true, y_pred, pos_label='malware')
    # FPR = 1 - recall of the benign class (benign samples flagged as malware).
    fpr = 1 - metrics.recall_score(y_true, y_pred, pos_label='benign')

    print("Accuracy: {:.2f}%,\tPrecision: {:.2f}%\tRecall: {:.2f}%\tFPR: {:.5f}".format(acc*100, prec*100, rec*100, fpr))

    display(pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Pred']))

    return probabilities, acc, prec, rec, fpr

In [5]:
### First, let's re-create the datasets as they should be

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime


# Keep only the rows whose 'year' is 2022 or earlier, then split them by class.
df_2022 = df.loc[df['year'].le(2022)]
df_2022_benign = df_2022.loc[df_2022['status'].eq('benign')]
df_2022_malicious = df_2022.loc[df_2022['status'].eq('malware')]


In [6]:
# EXPERIMENT 1: changing the threshold to reduce false positives (we redo this 5 times)

In [7]:
# One (default-threshold, new-threshold) pair is recorded per round.
fpr_list = []
rec_list = []

threshold_to_test = 0.65

for i in range(5):
    print("\n\nROUND {}\n\n".format(i))
    # Split each class separately (80/20) so both classes appear in train and test;
    # the round index doubles as the random seed.
    train_ben, test_ben = train_test_split(df_2022_benign, test_size=0.2, random_state=i)
    train_mal, test_mal = train_test_split(df_2022_malicious, test_size=0.2, random_state=i)
    train = pd.concat([train_ben, train_mal])
    test = pd.concat([test_ben, test_mal])

    ## Define and train the classifier
    rf = RandomForestClassifier(n_estimators=300, max_features="sqrt", criterion="gini",
                                n_jobs=-2, class_weight="balanced", random_state=i)
    start = datetime.now()
    rf.fit(train[features], train[label])
    # Fixed: the placeholder used to be written as "f{...}", which printed a stray "f"
    # in front of the elapsed time.
    print(f"Training time: {datetime.now() - start}")
    end = datetime.now()

    ## Test the classifier
    # First with the default threshold (also timed), then with the candidate threshold.
    prob, acc_def, prec_def, rec_def, fpr_def = test_clf(rf, test)
    print(f"Test time: {datetime.now() - end}")
    prob, acc_new, prec_new, rec_new, fpr_new = test_clf(rf, test, threshold=threshold_to_test)
    fpr_list.append((fpr_def, fpr_new))
    rec_list.append((rec_def, rec_new))



ROUND 0


Training time: f0:00:25.219978
Accuracy: 98.81%,	Precision: 92.21%	Recall: 96.19%	FPR: 0.00900


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12226,111
malware,52,1314


Test time: f0:00:01.229914
Accuracy: 99.07%,	Precision: 99.68%	Recall: 91.00%	FPR: 0.00032


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12333,4
malware,123,1243




ROUND 1


Training time: f0:00:19.408606
Accuracy: 98.66%,	Precision: 91.50%	Recall: 95.39%	FPR: 0.00981


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12216,121
malware,63,1303


Test time: f0:00:01.114001
Accuracy: 99.01%,	Precision: 99.68%	Recall: 90.34%	FPR: 0.00032


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12333,4
malware,132,1234




ROUND 2


Training time: f0:00:19.509310
Accuracy: 98.77%,	Precision: 92.12%	Recall: 95.83%	FPR: 0.00908


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12225,112
malware,57,1309


Test time: f0:00:01.080616
Accuracy: 99.04%,	Precision: 99.28%	Recall: 91.00%	FPR: 0.00073


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12328,9
malware,123,1243




ROUND 3


Training time: f0:00:19.214404
Accuracy: 98.66%,	Precision: 91.86%	Recall: 94.95%	FPR: 0.00932


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12222,115
malware,69,1297


Test time: f0:00:01.101001
Accuracy: 98.94%,	Precision: 99.51%	Recall: 89.82%	FPR: 0.00049


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12331,6
malware,139,1227




ROUND 4


Training time: f0:00:19.291793
Accuracy: 98.53%,	Precision: 90.54%	Recall: 95.24%	FPR: 0.01102


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12201,136
malware,65,1301


Test time: f0:00:01.008747
Accuracy: 99.01%,	Precision: 99.52%	Recall: 90.56%	FPR: 0.00049


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12331,6
malware,129,1237


In [8]:
# Average the per-round results collected by the Experiment 1 loop.
# Each list entry is a (default-threshold, new-threshold) pair.
# Note: the results are NOT appended again here -- the loop already stored every
# round, and re-appending made this cell grow rec_list on every re-run.
avg_fpr_def = sum(fpr_default for fpr_default, _ in fpr_list) / len(fpr_list)
avg_fpr_new = sum(fpr_changed for _, fpr_changed in fpr_list) / len(fpr_list)

avg_rec_def = sum(rec_default for rec_default, _ in rec_list) / len(rec_list)
avg_rec_new = sum(rec_changed for _, rec_changed in rec_list) / len(rec_list)

print("Original FPR: {:.6f}\tNew FPR: {:.6f}".format(avg_fpr_def, avg_fpr_new))
print("Original TPR: {:.6f}\tNew TPR: {:.6f}".format(avg_rec_def, avg_rec_new))

Original FPR: 0.009646	New FPR: 0.000470
Original TPR: 0.955198	New TPR: 0.905417


In [9]:
# EXPERIMENT 2: subsampling benign extensions to balance the dataset (and hoping for a better baseline performance)

In [10]:
# NOTE: this cell reuses train_ben, train_mal and test exactly as they were left
# by the final round of the Experiment 1 loop, so that cell must be run first.
for i in range(5): # repeat it as many times as you want
    print("\n\nROUND {}\n\n".format(i))
    for j in [0.1, 0.2, 0.4, 0.8]: # specify the size of each draw

        # Draw a fraction j of the benign training samples (seeded by the round
        # index) and keep all malicious training samples.
        sub_train_ben = train_ben.sample(n=int(len(train_ben) * j), random_state=i)
        sub_train = pd.concat([sub_train_ben, train_mal])

        print("\tSubset: {} --> Benign samples: {}\t Malicious samples: {}".format(j, len(sub_train_ben), len(train_mal)))

        ## Define and train the classifier
        # NOTE(review): the forest seed is fixed to 1 here while the sampling uses
        # the round index -- confirm this is intended.
        rf_sub = RandomForestClassifier(n_estimators=300, max_features="sqrt", criterion="gini",
                                        n_jobs=-2, class_weight="balanced", random_state=1)
        start = datetime.now()
        rf_sub.fit(sub_train[features], sub_train[label])
        # Fixed: the placeholder used to be written as "f{...}", which printed a stray "f"
        # in front of the elapsed time.
        print(f"Training time: {datetime.now() - start}")

        ## Test the classifier
        prob_, acc, prec, rec, fpr = test_clf(rf_sub, test)



ROUND 0


	Subset: 0.1 --> Benign samples: 4934	 Malicious samples: 5464
Training time: f0:00:02.535192
Accuracy: 60.94%,	Precision: 20.29%	Recall: 99.63%	FPR: 0.43341


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,6990,5347
malware,5,1361


	Subset: 0.2 --> Benign samples: 9869	 Malicious samples: 5464
Training time: f0:00:04.290366
Accuracy: 81.54%,	Precision: 34.96%	Recall: 99.05%	FPR: 0.20402


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,9820,2517
malware,13,1353


	Subset: 0.4 --> Benign samples: 19738	 Malicious samples: 5464
Training time: f0:00:08.509002
Accuracy: 93.69%,	Precision: 61.58%	Recall: 97.73%	FPR: 0.06752


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,11504,833
malware,31,1335


	Subset: 0.8 --> Benign samples: 39477	 Malicious samples: 5464
Training time: f0:00:17.537782
Accuracy: 98.05%,	Precision: 86.08%	Recall: 95.97%	FPR: 0.01718


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12125,212
malware,55,1311




ROUND 1


	Subset: 0.1 --> Benign samples: 4934	 Malicious samples: 5464
Training time: f0:00:02.415853
Accuracy: 61.02%,	Precision: 20.32%	Recall: 99.63%	FPR: 0.43260


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,7000,5337
malware,5,1361


	Subset: 0.2 --> Benign samples: 9869	 Malicious samples: 5464
Training time: f0:00:04.110822
Accuracy: 81.54%,	Precision: 34.94%	Recall: 98.83%	FPR: 0.20378


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,9823,2514
malware,16,1350


	Subset: 0.4 --> Benign samples: 19738	 Malicious samples: 5464
Training time: f0:00:08.827987
Accuracy: 93.65%,	Precision: 61.41%	Recall: 97.73%	FPR: 0.06801


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,11498,839
malware,31,1335


	Subset: 0.8 --> Benign samples: 39477	 Malicious samples: 5464
Training time: f0:00:18.051117
Accuracy: 98.14%,	Precision: 86.72%	Recall: 96.05%	FPR: 0.01629


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12136,201
malware,54,1312




ROUND 2


	Subset: 0.1 --> Benign samples: 4934	 Malicious samples: 5464
Training time: f0:00:02.742002
Accuracy: 60.46%,	Precision: 20.11%	Recall: 99.78%	FPR: 0.43892


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,6922,5415
malware,3,1363


	Subset: 0.2 --> Benign samples: 9869	 Malicious samples: 5464
Training time: f0:00:05.453038
Accuracy: 81.47%,	Precision: 34.86%	Recall: 98.90%	FPR: 0.20459


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,9813,2524
malware,15,1351


	Subset: 0.4 --> Benign samples: 19738	 Malicious samples: 5464
Training time: f0:00:08.233396
Accuracy: 93.86%,	Precision: 62.24%	Recall: 97.73%	FPR: 0.06566


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,11527,810
malware,31,1335


	Subset: 0.8 --> Benign samples: 39477	 Malicious samples: 5464
Training time: f0:00:16.257787
Accuracy: 98.04%,	Precision: 86.21%	Recall: 95.68%	FPR: 0.01694


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12128,209
malware,59,1307




ROUND 3


	Subset: 0.1 --> Benign samples: 4934	 Malicious samples: 5464
Training time: f0:00:02.408521
Accuracy: 60.81%,	Precision: 20.24%	Recall: 99.71%	FPR: 0.43495


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,6971,5366
malware,4,1362


	Subset: 0.2 --> Benign samples: 9869	 Malicious samples: 5464
Training time: f0:00:04.317991
Accuracy: 81.33%,	Precision: 34.68%	Recall: 98.83%	FPR: 0.20613


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,9794,2543
malware,16,1350


	Subset: 0.4 --> Benign samples: 19738	 Malicious samples: 5464
Training time: f0:00:07.544001
Accuracy: 93.85%,	Precision: 62.21%	Recall: 97.51%	FPR: 0.06558


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,11528,809
malware,34,1332


	Subset: 0.8 --> Benign samples: 39477	 Malicious samples: 5464
Training time: f0:00:15.029589
Accuracy: 98.10%,	Precision: 86.37%	Recall: 96.05%	FPR: 0.01678


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12130,207
malware,54,1312




ROUND 4


	Subset: 0.1 --> Benign samples: 4934	 Malicious samples: 5464
Training time: f0:00:02.319963
Accuracy: 60.22%,	Precision: 19.99%	Recall: 99.63%	FPR: 0.44144


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,6891,5446
malware,5,1361


	Subset: 0.2 --> Benign samples: 9869	 Malicious samples: 5464
Training time: f0:00:04.138001
Accuracy: 81.40%,	Precision: 34.77%	Recall: 98.90%	FPR: 0.20540


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,9803,2534
malware,15,1351


	Subset: 0.4 --> Benign samples: 19738	 Malicious samples: 5464
Training time: f0:00:08.094621
Accuracy: 93.79%,	Precision: 61.97%	Recall: 97.58%	FPR: 0.06630


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,11519,818
malware,33,1333


	Subset: 0.8 --> Benign samples: 39477	 Malicious samples: 5464
Training time: f0:00:15.073698
Accuracy: 98.18%,	Precision: 86.86%	Recall: 96.34%	FPR: 0.01613


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12138,199
malware,50,1316
