# Malware Detection - binary classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load the malware dataset, using the first CSV column as the row index.
df = pd.read_csv('security_data/malware.csv', index_col=0)
df.head()  # preview the first rows

KeyboardInterrupt: 

In [None]:
# Number of rows and columns in the loaded frame.
df.shape

In [None]:
# Count how many rows fall into each value of the `legitimate` column.
df['legitimate'].value_counts()

In [None]:
# List the column names.
df.columns

- Name : excluded
- md5 : hash value, excluded

In [None]:
# Drop the first two columns by position and keep everything else.
# NOTE(review): presumably these are the Name and md5 columns — confirm
# against the df.columns output, since the slice is positional.
df = df.iloc[:, 2:]

In [None]:
# Shape after the two leading columns were removed.
df.shape

In [None]:
# Remove the `legitimate` column from the frame and use it as the target;
# every column that remains in `df` becomes a feature. The tuple is evaluated
# left to right, so the pop happens before the features are read.
y, X = df.pop('legitimate').to_numpy(), df.to_numpy()

In [None]:
# Shapes of the feature matrix and the target vector.
X.shape, y.shape

In [None]:
# Hold out 20% of the samples for testing. `stratify=y` keeps the class ratio
# of the target the same in both splits, so the test metrics are not skewed
# by an unlucky draw on an imbalanced target. `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Shapes of the four resulting arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Take the input width from the data instead of hard-coding it, so the model
# still builds if the set of feature columns changes.
n_features = X_train.shape[1]

# Two hidden ReLU layers followed by a single sigmoid unit, which outputs a
# value in (0, 1) for the binary target.
model = tf.keras.Sequential()

model.add(Dense(64, input_shape=(n_features,), activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as an additional metric during training and evaluation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs. The test split is passed as validation data, so the
# per-epoch `val_*` values in `history` are computed on the test set.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Evaluate on the test split; with the compile settings above the result
# holds the loss first and the accuracy second.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]

print(model.metrics_names)
print("Test score : {:.2f}".format(test_loss))
print("Test accuracy : {:.2f}".format(test_accuracy))

In [None]:
# The model outputs one sigmoid value per sample; take that single column and
# threshold it at 0.5 to obtain boolean class predictions.
probabilities = model.predict(X_test)
y_pred = probabilities[:, 0] > 0.5
y_pred

In [None]:
# Ground-truth labels of the test split.
y_test

In [None]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
)

# Compute each metric from the thresholded predictions first, then report
# them with two decimals.
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("accuracy: {:.2f}".format(accuracy))
print("balanced_accuracy: {:.2f}".format(balanced_accuracy))
print("precision: {:.2f}".format(precision))
print("recall: {:.2f}".format(recall))

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
# The 'test' series is the validation data passed to fit().
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))
curves = history.history

acc_ax.plot(curves['accuracy'])
acc_ax.plot(curves['val_accuracy'])
acc_ax.set_title('model accuracy')
acc_ax.set_xlabel('epoch')
acc_ax.set_ylabel('accuracy')
acc_ax.legend(['train', 'test'])

loss_ax.plot(curves['loss'])
loss_ax.plot(curves['val_loss'])
loss_ax.set_title('model loss')
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(['train', 'test'])

### confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Values of the `legitimate` target, in the order used for rows and columns.
# Fixing the order keeps the matrix 2x2 and makes the tick labels match the
# cells even if one class is missing from the predictions.
# NOTE(review): assumes the target is encoded as 0/1 — confirm against the
# value_counts() output.
class_labels = [0, 1]

# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel('predicted legitimate')
ax.set_ylabel('true legitimate')

In [None]:
# IPython help lookup: displays the documentation for confusion_matrix.
confusion_matrix?