In [1]:
# This notebook serves to kickstart some ML experiments. Note that the results may differ slightly from those reported in the paper, because we use different random seeds and every experiment is affected by some degree of randomness.

In [2]:
import pandas as pd
import sklearn as sk

In [3]:
# First, let's load the dataset in a dataframe.
# low_memory=False makes pandas read the whole file before inferring column dtypes
# (instead of processing it in chunks), which avoids mixed-type inference.

df = pd.read_csv("feature_updated.csv", low_memory=False)

In [4]:
## The first 9 columns are NOT used as features in our experiments; they can be considered "headers".
## Here we preview them for the first few rows.

df.iloc[:, :9].head()

Unnamed: 0,code,author,category,MV,year,month,status,dataset,split
0,iaemcpeoioekjbbephpefmdoncmpdcdc,Takahiro Maeda,7_productivity,3.0,2023,8,benign,U,
1,migdhldfbapmodfbmgpofnikfbfpbbon,https://www.highlighty.app,7_productivity,3.0,2023,11,benign,U,
2,haldmdihfigeapcdbibpndmfjpfkgmpo,Order Hàng Trung Quốc,12_shopping,3.0,2023,7,benign,U,
3,gnmcapmdidpeafddbhecmkafinofffdb,GPTDeveloper,7_productivity,3.0,2023,4,benign,U,
4,nlipoenfbbikpbjkfpfillcgkoblgpmj,http://www.awesomescreenshot.com,7_productivity,3.0,2023,11,benign,U,


In [5]:
# To provide more context:
# - the "dataset" column refers to which dataset (either "L" or "U") an extension was put in
# - the "split" column denotes extensions in the "L" dataset that were used in the "train" or "test" portion for the experiments done by Ben Rozenweig
# - the "year" and "month" columns are the year and month of the extension's last update
# - the "status" denotes the ground truth of an extension (for those in dataset "U", the status is "benign" simply because they had not been taken down from the CWS; in practice, we treated all of these as unlabeled)

In [6]:
### STANDARD SETUP

# "Header" columns that must never be fed to a classifier.
to_exclude = ['code', 'status', 'year', 'month', 'dataset', 'split', 'author', 'category', 'MV']
features = [x for x in df.columns if x not in to_exclude]

# Position (within `features`) where the metadata features end and the source-code
# features begin. Defined once so that the two slices below cannot drift apart.
n_metadata_features = 2152

comb_features = features
metadata_features = features[:n_metadata_features]
sourcecode_features = features[n_metadata_features:]
# from 0 to 2152: metadata features; from 2152 to 6152 (end): code features
# if you want to use the "combined classifier", choose all features

# Name of the ground-truth column.
label = 'status'

# Decision thresholds passed to test_clf: a sample is predicted "benign" only when
# its predicted "benign" probability is >= the threshold.
threshold_cb = 0.908  # combined classifier
threshold_md = 0.884  # metadata classifier
threshold_sc = 0.912  # source-code classifier

In [8]:
def test_clf(clf, test_set, features=features, label=label, threshold=threshold_cb):
    '''Evaluate a fitted classifier on `test_set` using a custom decision threshold.

    A sample is predicted "benign" when the classifier's probability for the
    "benign" class is >= `threshold`, and "malware" otherwise. The metrics are
    printed and the confusion matrix is displayed.

    Parameters
    ----------
    clf : fitted classifier exposing `predict_proba` (and, optionally, `classes_`)
    test_set : DataFrame holding the `features` columns and the `label` column
    features : list of column names fed to the classifier
    label : name of the ground-truth column (values "benign" / "malware")
    threshold : minimum "benign" probability required to predict "benign"

    Returns
    -------
    tuple
        (probabilities, acc, prec, rec, fpr): the raw `predict_proba` output,
        followed by accuracy, precision and recall (with "malware" as the
        positive class) and the false positive rate, all as fractions in [0, 1].
    '''
    probabilities = clf.predict_proba(test_set[features])

    # predict_proba columns follow the order of clf.classes_, so look the "benign"
    # column up instead of assuming it is the first one. Fall back to column 0
    # (the historical behaviour) when the classifier does not expose it.
    classes = list(getattr(clf, "classes_", []))
    benign_idx = classes.index("benign") if "benign" in classes else 0

    y_pred = ["benign" if prob[benign_idx] >= threshold else "malware"
              for prob in probabilities]

    y_true = test_set[label]
    acc = sk.metrics.accuracy_score(y_true, y_pred)
    prec = sk.metrics.precision_score(y_true, y_pred, pos_label='malware')
    rec = sk.metrics.recall_score(y_true, y_pred, pos_label='malware')
    # FPR = share of benign samples flagged as malware = 1 - recall of the benign class
    fpr = 1-sk.metrics.recall_score(y_true, y_pred, pos_label='benign')

    print("Accuracy: {:.2f}%,\tPrecision: {:.2f}%\tRecall: {:.2f}%\tFPR: {:.5f}".format(acc*100, prec*100, rec*100, fpr))

    display(pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Pred']))

    return probabilities, acc, prec, rec, fpr

In [9]:
### First, let's re-create the datasets as they should be

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime


# Keep the extensions whose "year" is 2022 or earlier, then separate them by ground truth.
df_2022 = df.loc[df['year'].le(2022)]
df_2022_benign = df_2022.loc[df_2022['status'].eq('benign')]
df_2022_malicious = df_2022.loc[df_2022['status'].eq('malware')]


In [10]:
i = 42 # random state

# Split each class separately (80% train / 20% test) so that both classes keep
# the same train/test proportion, then merge the per-class splits.
train_ben, test_ben = train_test_split(df_2022_benign, test_size=0.2, random_state=i)
train_mal, test_mal = train_test_split(df_2022_malicious, test_size=0.2, random_state=i)
train = pd.concat([train_ben, train_mal])
test = pd.concat([test_ben, test_mal])

## Define and train the classifier (combined: all features)
rf_cb = RandomForestClassifier(n_estimators=300, max_features="sqrt", criterion="gini",
                               n_jobs=-2, class_weight="balanced", random_state=i)
start = datetime.now()
rf_cb.fit(train[comb_features], train[label])
# `end` stays a notebook-level name so that any later cell reading it keeps working.
end = datetime.now()
print(f"Training time: {end - start}")

Training time: f0:00:19.541323


In [11]:
## Test the classifier
# Start a timer in this cell so that "Test time" covers only the evaluation,
# not the idle time elapsed since the training cell finished.
test_start = datetime.now()
prob, acc_new, prec_new, rec_new, fpr_new = test_clf(rf_cb, test, features = comb_features, threshold=threshold_cb)
print(f"Test time: {datetime.now() - test_start}")


Accuracy: 98.72%,	Precision: 91.85%	Recall: 95.68%	FPR: 0.00940


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12221,116
malware,59,1307


Test time: f0:00:01.131951


In [12]:
## Let's assess the metadata classifier

i = 42 # random state

# Same per-class 80/20 split as for the combined classifier (identical random state),
# repeated here so that this cell can be run on its own.
train_ben, test_ben = train_test_split(df_2022_benign, test_size=0.2, random_state=i)
train_mal, test_mal = train_test_split(df_2022_malicious, test_size=0.2, random_state=i)
train = pd.concat([train_ben, train_mal])
test = pd.concat([test_ben, test_mal])

## Define and train the classifier (metadata features only)
rf_md = RandomForestClassifier(n_estimators=300, max_features="sqrt", criterion="gini",
                               n_jobs=-2, class_weight="balanced", random_state=i)
start = datetime.now()
rf_md.fit(train[metadata_features], train[label])
end = datetime.now()
print(f"Training time: {end - start}")

## Test the classifier
prob, acc_new, prec_new, rec_new, fpr_new = test_clf(rf_md, test, features = metadata_features, threshold=threshold_md)
print(f"Test time: {datetime.now() - end}")

Training time: f0:00:11.217091
Accuracy: 98.45%,	Precision: 89.24%	Recall: 95.97%	FPR: 0.01281


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12179,158
malware,55,1311


Test time: f0:00:00.530999


In [13]:
## Let's assess the source-code classifier

i = 42 # random state

# Same per-class 80/20 split as for the combined classifier (identical random state),
# repeated here so that this cell can be run on its own.
train_ben, test_ben = train_test_split(df_2022_benign, test_size=0.2, random_state=i)
train_mal, test_mal = train_test_split(df_2022_malicious, test_size=0.2, random_state=i)
train = pd.concat([train_ben, train_mal])
test = pd.concat([test_ben, test_mal])

## Define and train the classifier (source-code features only)
rf_sc = RandomForestClassifier(n_estimators=300, max_features="sqrt", criterion="gini",
                               n_jobs=-2, class_weight="balanced", random_state=i)
start = datetime.now()
rf_sc.fit(train[sourcecode_features], train[label])
end = datetime.now()
print(f"Training time: {end - start}")

## Test the classifier
prob, acc_new, prec_new, rec_new, fpr_new = test_clf(rf_sc, test, features = sourcecode_features, threshold=threshold_sc)
print(f"Test time: {datetime.now() - end}")

Training time: f0:00:14.357451
Accuracy: 98.42%,	Precision: 90.66%	Recall: 93.78%	FPR: 0.01070


Pred,benign,malware
True,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,12205,132
malware,85,1281


Test time: f0:00:00.804970


In [13]:
## From here on, you can do whatever you want