In [2]:
! pip install pefile
! pip install pandas



In [3]:
import pandas as pd

malData = pd.read_csv("/Users/julie-anneharris/Desktop/MalwareData.csv", sep="|")

legit = malData[0:41323].drop(["legitimate"], axis=1)
mal = malData[41323::].drop(["legitimate"], axis=1)

print("The legitimate dataset contains: %s samples, %s features"%(legit.shape[0],legit.shape[1]))
print("The malware dataset contains: %s samples, %s features"%(mal.shape[0],mal.shape[1]))

The legitimate dataset contains: 41323 samples, 56 features
The malware dataset contains: 96724 samples, 56 features


In [4]:
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [5]:
data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = malData['legitimate'].values
extratrees = ExtraTreesClassifier().fit(data_in,labels)
select = SelectFromModel(extratrees,prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape,data_in_new.shape)

(138047, 54) (138047, 13)


In [6]:
import numpy as np
features = data_in_new.shape[1]
importances = extratrees.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(features):
    print("%d"%(f+1),malData.columns[3+indices[f]],importances[indices[f]])

1 SizeOfStackReserve 0.151455279502898
2 SizeOfOptionalHeader 0.10516803673638675
3 MajorLinkerVersion 0.08482457035009794
4 SectionsMeanRawsize 0.06103374711272363
5 DllCharacteristics 0.05896833470375248
6 SectionAlignment 0.056437549787553505
7 MinorSubsystemVersion 0.05626213803437774
8 legitimate 0.054404145468217965
9 ResourcesMaxEntropy 0.042419970695734985
10 Characteristics 0.04184289260396536
11 SizeOfStackCommit 0.03664481791294168
12 ResourcesMeanSize 0.03405796337025805
13 MinorOperatingSystemVersion 0.027391534078774118


In [7]:
from sklearn.ensemble import RandomForestClassifier
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels, test_size=0.2)
classif = RandomForestClassifier(n_estimators=50)

classif.fit(legit_train,mal_train)

RandomForestClassifier(n_estimators=50)

In [8]:
print("The score of the algorithm: ",classif.score(legit_test,mal_test)*100)

The score of the algorithm:  99.37341542919232


In [9]:
from sklearn.metrics import confusion_matrix

result = classif.predict(legit_test)
conf_mat = confusion_matrix(mal_test,result)

In [10]:
print("False positives: ",conf_mat[0][1]/sum(conf_mat[0])*100)
print("False negatives: ",conf_mat[1][0]/sum(conf_mat[1])*100)

False positives:  0.44034606019789674
False negatives:  1.0593475382207778


In [11]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train,mal_train)

GradientBoostingClassifier(n_estimators=50)

In [12]:
print("The score of the algorithm is: ", grad_boost.score(legit_test,mal_test)*100)

The score of the algorithm is:  98.77580586743933
