<a href="https://colab.research.google.com/github/hikmatfarhat-ndu/veronica-thesis/blob/master/malware.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Packages

In [None]:
import tensorflow as tf 
import numpy as np 
import matplotlib.pyplot as plt
from tensorflow.keras import models,layers
from tensorflow.keras.utils import Sequence
#from tensorflow.python.keras.utils import data_utils
import math
import os
import pandas
from PIL import Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Conv3D,Conv2D,InputLayer,MaxPooling3D,Dropout,Flatten,MaxPooling2D,BatchNormalization,LayerNormalization


In [1]:
%%bash
# Download the dataset archive from Google Drive in two requests.
# Note: the id value below carries an "&export=download" suffix, so that
# query parameter ends up in the URL twice.
file_id="1fjB9yNDIlRMm2Y4v2Ta2N60yuDBN2D7r&export=download"
archive="resized.7z"
endpoint='https://docs.google.com/uc?export=download&id='

# Request 1: store the session cookies and scrape the value that follows
# "confirm=" in the response body into confirm.txt.
curl -L -c cookies.txt "${endpoint}${file_id}" \
  | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt

# Request 2: replay the cookies together with the scraped confirm value and
# write the response to the archive file.
curl -L -b cookies.txt -o "${archive}" "${endpoint}${file_id}"'&confirm='$(<confirm.txt)

# The temporary files are no longer needed.
rm -f confirm.txt cookies.txt

## Install 7z

In [None]:
!apt-get -y install p7zip-full

## Uncompress data

In [None]:
!rm -rf test train
!7z x resized.7z

## Read the data from directory

The data are the original code converted to images and then resized to 256x256. They are loaded below as two separate sets: `train` and `test`.

In [None]:
# Size every image is resized to when it is loaded.
image_size = (256, 256)
# Folder that holds the "train" and "test" sub-directories.
dir = "./"

# Training set: batches of 32 single-channel images, reshuffled while iterating.
dataset = tf.keras.preprocessing.image_dataset_from_directory(
    f"{dir}train",
    batch_size=32,
    image_size=image_size,
    shuffle=True,
    color_mode="grayscale",
)

# Test set: same settings, but NOT shuffled, so that labels read from it later
# line up with the predictions computed over it.
test_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    f"{dir}test",
    batch_size=32,
    image_size=image_size,
    shuffle=False,
    color_mode="grayscale",
)

## Build the model

In [None]:
def createModelYuan(input_shape=(256, 256, 1), num_classes=9):
    """Build the (uncompiled) convolutional classifier used in this notebook.

    The network is five convolutional blocks followed by a dense head. Each
    block stacks 3x3, stride-1, 'same'-padded ReLU convolutions and ends with
    a 2x2 max-pooling layer; the (filters, depth) per block are
    (64, 2), (128, 2), (256, 3), (512, 3), (512, 3) -- the same layout as the
    convolutional part of VGG-16. The head is Flatten -> Dense(1024, relu) ->
    Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input image, without the batch dimension.
            Defaults to 256x256 single-channel images.
        num_classes: Number of output classes, i.e. the width of the final
            softmax layer. Defaults to 9.

    Returns:
        A `Sequential` model that outputs class probabilities. It has not
        been compiled.
    """
    # (number of filters, number of stacked Conv2D layers) for each block.
    conv_blocks = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = Sequential()
    model.add(InputLayer(input_shape=input_shape))
    # NOTE: inputs are fed to the first convolution without rescaling; the
    # Rescaling layer below was left disabled in the original notebook.
    #model.add(tf.keras.layers.experimental.preprocessing.Rescaling(1./255))

    # Layers are added in exactly the same order as the hand-written version,
    # so auto-generated layer names (and saved weights) stay compatible.
    for filters, depth in conv_blocks:
        for _ in range(depth):
            model.add(Conv2D(filters, kernel_size=(3, 3), strides=(1, 1),
                             padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    # Softmax output: the loss must therefore be used with from_logits=False.
    model.add(Dense(num_classes, activation='softmax', name='Output'))
    return model


### Instantiate the model

In [None]:

# Build the network with its default settings and print a per-layer summary.
model = createModelYuan()
model.summary()

## Optimization

Keras supports many optimization methods. In this notebook we use the __Adam__ method, which can be loosely described as __adaptive__ gradient descent.

Also, since the labels are __NOT__ one-hot encoded, we use the "sparse" version of the cross-entropy loss: __SparseCategoricalCrossentropy__. Finally, if we specified from_logits=True, the loss function would apply softmax to the model output before computing the loss. Since our model already applies softmax in its last layer, we turn this step off by specifying from_logits=False (which is also the default).

In [None]:
# The model ends in a softmax layer, so the loss receives probabilities and
# from_logits stays False. A model whose output is NOT probabilities would
# need from_logits=True instead.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

# Callback that writes the weights (weights only, not the full model) at the
# end of every epoch; "{epoch}" in the path is the checkpoint's epoch number.
filepath = "checkpoints/cp-{epoch}.ckpt"
cb = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    save_weights_only=True,
    save_freq='epoch',
    verbose=0,
)

# To resume from the most recent checkpoint, uncomment the two lines below.
#latest = tf.train.latest_checkpoint("checkpoints")
#model.load_weights(latest)


### Training

In [None]:
# Train for 30 epochs on the training set; the checkpoint callback `cb` is
# invoked during training. The returned history object is kept in `history`.
history = model.fit(dataset, epochs=30, callbacks=[cb])


### Testing the Accuracy

In [None]:

# evaluate() yields two values here (the loss, then the single compiled
# metric, accuracy); only the accuracy is kept.
_, test_accuracy = model.evaluate(test_dataset)

## Confusion matrix

In [None]:
# One row of class probabilities per test image (softmax output of the model).
probabilities = model.predict(test_dataset)
# Predicted class = index of the largest probability in each row.
predictions = probabilities.argmax(axis=1)


In [None]:
# Gather the true labels batch by batch. test_dataset was created with
# shuffle=False, so this order matches the order of `predictions`.
y = np.concatenate(
    [batch_labels for _batch_images, batch_labels in test_dataset],
    axis=0,
)
# Confusion matrix built from (true labels, predicted labels).
m = tf.math.confusion_matrix(y, predictions)

In [None]:
import seaborn as sb

# Normalise each row of the confusion matrix by its total so every cell shows
# the fraction of that row's samples. keepdims=True keeps the totals as a
# column vector for any number of classes (no hard-coded class count), and
# rows with a zero total are left at 0 instead of producing NaN.
counts = np.asarray(m, dtype=float)
row_totals = counts.sum(axis=1, keepdims=True)
normalized = np.divide(counts, row_totals,
                       out=np.zeros_like(counts), where=row_totals != 0)

plt.figure(figsize=(10, 7))
# Tick labels come from the training dataset's class names.
# NOTE(review): assumes the test directory has the same classes in the same
# order as the training directory -- confirm.
sb.heatmap(normalized,
           xticklabels=dataset.class_names,
           yticklabels=dataset.class_names,
           annot=True, fmt=".2f")
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()