In [1]:
! pip install pefile
! pip install pandas



In [2]:
import pandas as pd

malData = pd.read_csv("/Users/julie-anneharris/Desktop/MalwareData.csv", sep="|")

legit = malData[0:41323].drop(["legitimate"], axis=1)
mal = malData[41323::].drop(["legitimate"], axis=1)

print("The legitimate dataset contains: %s samples, %s features"%(legit.shape[0],legit.shape[1]))
print("The malware dataset contains: %s samples, %s features"%(mal.shape[0],mal.shape[1]))

The legitimate dataset contains: 41323 samples, 56 features
The malware dataset contains: 96724 samples, 56 features


In [3]:
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [4]:
data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = malData['legitimate'].values
extratrees = ExtraTreesClassifier().fit(data_in,labels)
select = SelectFromModel(extratrees,prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape,data_in_new.shape)

(138047, 54) (138047, 13)


In [5]:
import numpy as np
features = data_in_new.shape[1]
importances = extratrees.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(features):
    print("%d"%(f+1),malData.columns[3+indices[f]],importances[indices[f]])

1 SizeOfStackReserve 0.17929563392587905
2 SizeOfOptionalHeader 0.10375605514795588
3 MajorLinkerVersion 0.08849657871116152
4 DllCharacteristics 0.06928707591477432
5 legitimate 0.06778408026929764
6 SectionAlignment 0.06105527123111167
7 SectionsMeanRawsize 0.050885309060419465
8 MinorSubsystemVersion 0.04522487253656382
9 Characteristics 0.041566249579157916
10 ResourcesMeanSize 0.03477149471614869
11 ResourcesMaxEntropy 0.027005624927164473
12 MinorOperatingSystemVersion 0.02646487588844905
13 SectionsMaxEntropy 0.0238899470469455


In [6]:
from sklearn.ensemble import RandomForestClassifier
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels, test_size=0.2)
classif = RandomForestClassifier(n_estimators=50)

classif.fit(legit_train,mal_train)

RandomForestClassifier(n_estimators=50)

In [7]:
print("The score of the algorithm: ",classif.score(legit_test,mal_test)*100)

The score of the algorithm:  99.40601231437884


In [8]:
from sklearn.metrics import confusion_matrix

result = classif.predict(legit_test)
conf_mat = confusion_matrix(mal_test,result)

In [9]:
print("False positives: ",conf_mat[0][1]/sum(conf_mat[0])*100)
print("False negatives: ",conf_mat[1][0]/sum(conf_mat[1])*100)

False positives:  0.45274771024146543
False negatives:  0.9173218965928045
