In [1]:
import math
import os
import random
import random as python_random

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras as keras
from PIL import Image

# One seed shared by every random number source used in this notebook.
random_seed = 123456

# Seed Python's `random`, NumPy and TensorFlow with the same value.
python_random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)


import os

# NOTE(review): presumably asks CUDA to enumerate GPUs by PCI bus id so that
# device indices are stable between runs — confirm against the CUDA docs.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import sys
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Flatten, Dense
import argparse

2024-11-20 08:29:44.381549: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-20 08:29:44.427401: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-20 08:29:44.428231: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [9]:
def split_dataset(goodware_dir, malware_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Split goodware/malware image hashes into train/validation/test list files.

    Every ``*.png`` file name found in ``goodware_dir`` and ``malware_dir`` is
    turned into an entry ``"goodware/<name>"`` or ``"malware/<name>"`` (the
    ``.png`` extension removed). Each class is shuffled and split on its own,
    then the per-class parts are concatenated, so every split contains a
    share of both classes. The entries are written one per line to
    ``train.txt``, ``validation.txt`` and ``test.txt`` inside ``output_dir``
    (created if missing), and the resulting split percentages are printed.

    Args:
        goodware_dir: Directory holding the goodware ``.png`` images.
        malware_dir: Directory holding the malware ``.png`` images.
        output_dir: Directory that receives the three ``.txt`` files.
        train_ratio: Fraction of each class assigned to the training split.
        val_ratio: Fraction of each class assigned to the validation split.
        test_ratio: Fraction for the test split. Only used in the sum check;
            the test split is whatever remains after train and validation.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within floating
            point tolerance).
    """
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999 and would be rejected by an exact `== 1.0` test.
    assert math.isclose(train_ratio + val_ratio + test_ratio, 1.0), "Ratios must sum to 1"

    def list_hashes(directory, prefix):
        # sorted(): os.listdir returns entries in arbitrary order, so the
        # seeded shuffle below is only reproducible from a fixed start order.
        # splitext(): strip just the final ".png" so names that contain
        # additional dots are kept whole.
        return [
            prefix + os.path.splitext(name)[0]
            for name in sorted(os.listdir(directory))
            if name.endswith('.png')
        ]

    goodware_hashes = list_hashes(goodware_dir, "goodware/")
    malware_hashes = list_hashes(malware_dir, "malware/")

    # Use a private generator: calling random.seed(42) here would silently
    # overwrite the global seed configured elsewhere in the notebook.
    rng = random.Random(42)  # For reproducibility
    rng.shuffle(goodware_hashes)
    rng.shuffle(malware_hashes)

    def split_hashes(hashes, train_ratio, val_ratio):
        # Sizes are truncated to ints; the test part absorbs the remainder.
        n = len(hashes)
        train_end = int(n * train_ratio)
        val_end = train_end + int(n * val_ratio)
        return hashes[:train_end], hashes[train_end:val_end], hashes[val_end:]

    goodware_train, goodware_val, goodware_test = split_hashes(goodware_hashes, train_ratio, val_ratio)
    malware_train, malware_val, malware_test = split_hashes(malware_hashes, train_ratio, val_ratio)

    # Combine goodware and malware for each split
    train_set = goodware_train + malware_train
    val_set = goodware_val + malware_val
    test_set = goodware_test + malware_test

    # Write one entry per line to each split file
    os.makedirs(output_dir, exist_ok=True)
    for filename, data in zip(
        ['train.txt', 'validation.txt', 'test.txt'],
        [train_set, val_set, test_set]
    ):
        with open(os.path.join(output_dir, filename), 'w') as f:
            f.write('\n'.join(data))

    # Print percentages (guarding against a division by zero on empty input)
    total_samples = len(goodware_hashes) + len(malware_hashes)
    if total_samples == 0:
        print("No .png images found; the split files are empty.")
        return
    print("Training: {:.2f}%".format(len(train_set) / total_samples * 100))
    print("Validation: {:.2f}%".format(len(val_set) / total_samples * 100))
    print("Testing: {:.2f}%".format(len(test_set) / total_samples * 100))

In [10]:
# Input image folders (one per class) and the folder that receives the
# train/validation/test list files.
goodware_dir = "/scratch/users/mbenali/download_apk/images/goodware"
malware_dir = "/scratch/users/mbenali/download_apk/images/malware"
output_dir = "/scratch/users/mbenali/DexRay/data_splits"

# Default 70/15/15 ratios are used.
split_dataset(
    goodware_dir=goodware_dir,
    malware_dir=malware_dir,
    output_dir=output_dir,
)

Training: 70.00%
Validation: 14.99%
Testing: 15.01%


In [4]:
# Training configuration used by the cells below.

# Maximum number of epochs handed to model.fit.
EPOCHS = 200
# NOTE: train_model uses len(split) // BATCH_SIZE as the number of samples per
# batch, so this value acts as an (approximate) number of batches per split,
# not as a per-batch sample count.
BATCH_SIZE = 10
# Each image is resampled to IMG_SIZE * IMG_SIZE values (see decode_img).
IMG_SIZE = 128
# Directory holding train.txt, validation.txt and test.txt.
PATH_FILES = "/scratch/users/mbenali/DexRay/data_splits"

In [11]:
def get_label(file_path):
    """Derive the class label from the directory that contains ``file_path``.

    The path is split on ``os.path.sep`` and the second-to-last component
    (the directory directly holding the file) is compared with ``"goodware"``.

    Returns:
        ``[0]`` when that directory is ``"goodware"``, ``[1]`` for any other
        directory name.
    """
    parts = tf.strings.split(file_path, os.path.sep)
    # NOTE(review): this function is reached through Dataset.map (via
    # process_path), so the `if` below is presumably evaluated on a tensor
    # under AutoGraph — keep the explicit if/else form unless re-verified.
    if parts[-2] == "goodware":
        return [0]
    else:
        return [1]


def get_image(path_img):
    """Open the image file at ``path_img`` and return its pixels as a tensor.

    The file is read with PIL, converted to a NumPy array and then wrapped
    with ``tf.convert_to_tensor`` (dtype left for TensorFlow to infer).
    """
    pil_image = Image.open(path_img)
    pixels = np.asarray(pil_image)
    return tf.convert_to_tensor(pixels)


def get_shape(image):
    """Return the size of ``image`` along its first axis."""
    dims = image.shape
    return dims[0]


def decode_img(path_img):
    """Load the image at ``path_img`` as a ``[IMG_SIZE * IMG_SIZE, 1]`` float tensor.

    The target length is taken from the module-level ``IMG_SIZE``.
    """
    target_len = IMG_SIZE * IMG_SIZE

    # Both helpers are plain Python, so they are wrapped in numpy_function.
    raw = tf.numpy_function(get_image, [path_img], tf.uint8)
    n_rows = tf.numpy_function(get_shape, [raw], tf.int64)

    # View the pixels as an n_rows x 1 image with a single channel.
    # assumes the stored image holds exactly n_rows values — TODO confirm
    column = tf.reshape(raw, [n_rows, 1, 1])
    as_float = tf.image.convert_image_dtype(column, tf.float32)

    # Resample along the first axis to the fixed length, then drop one axis.
    resized = tf.image.resize(as_float, [target_len, 1])
    return tf.reshape(resized, [target_len, 1])


def process_path(file_path):
    """Map an image path to an ``(image, label)`` pair.

    The label comes from ``get_label`` and the image from ``decode_img``,
    both applied to the same ``file_path``.
    """
    target = get_label(file_path)
    pixels = decode_img(file_path)
    return pixels, target

def _read_split_paths(split_file, path_images):
    """Return the image paths for the hashes listed one per line in ``split_file``."""
    with open(split_file) as f:
        return [
            os.path.join(path_images, image_hash + ".png")
            for image_hash in f.read().splitlines()
            if image_hash  # a blank line would otherwise become a bogus ".png" path
        ]


def _build_dataset(image_paths, n_batches, split_name):
    """Create the cached, shuffled and batched ``tf.data`` pipeline for one split."""
    n_samples = len(image_paths)
    if n_samples == 0:
        raise ValueError("The %s split contains no images" % split_name)

    # Samples per batch. Clamped to 1 so that a split with fewer samples than
    # n_batches does not request a batch size of 0.
    samples_per_batch = max(1, n_samples // n_batches)

    dataset = tf.data.Dataset.from_tensor_slices(image_paths)
    dataset = dataset.map(
        process_path, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    dataset = dataset.cache()
    dataset = dataset.shuffle(
        buffer_size=n_samples, seed=random_seed, reshuffle_each_iteration=False
    )
    dataset = dataset.batch(samples_per_batch)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)


def train_model(
    path_images, # path to directory containing images
    dir_name, # path where to save the model
    file_name, # results file
    EPOCHS,
    BATCH_SIZE,
    IMG_SIZE,
    PATH_FILES,
):
    """Train the 1-D CNN on the stored splits, evaluate it and record the scores.

    The image lists are read from ``train.txt``, ``validation.txt`` and
    ``test.txt`` in ``PATH_FILES``; each line is joined with ``path_images``
    and suffixed with ``.png``. The model is trained with early stopping on
    ``val_accuracy`` (patience 50, best weights restored), the best weights
    are checkpointed to ``<dir_name>/cp``, the final model is saved to
    ``<dir_name>/model`` and the test-set accuracy, precision, recall and
    F1-score are written to ``file_name``.

    Args:
        path_images: Directory that the entries of the split files are
            relative to.
        dir_name: Directory receiving the checkpoint and the saved model.
        file_name: Path of the text file the scores are written to.
        EPOCHS: Maximum number of training epochs.
        BATCH_SIZE: Each split is batched with ``len(split) // BATCH_SIZE``
            samples per batch, so this is roughly the number of batches per
            split rather than the number of samples in a batch.
        IMG_SIZE: The model input has ``IMG_SIZE * IMG_SIZE`` values.
            NOTE: ``decode_img`` reads the module-level ``IMG_SIZE``, so this
            argument has to match that global.
        PATH_FILES: Directory containing the three split files.

    Raises:
        ValueError: If one of the splits lists no images.
    """
    model_architecture = Sequential()
    model_architecture.add(
        Conv1D(
            filters=64,
            kernel_size=12,
            activation="relu",
            input_shape=(IMG_SIZE * IMG_SIZE, 1),
        )
    )
    model_architecture.add(MaxPooling1D(pool_size=12))
    model_architecture.add(Conv1D(filters=128, kernel_size=12, activation="relu"))
    model_architecture.add(MaxPooling1D(pool_size=12))
    model_architecture.add(Flatten())
    model_architecture.add(Dense(64, activation="sigmoid"))
    model_architecture.add(Dense(1, activation="sigmoid"))

    # `with` guarantees the results file is flushed and closed even when
    # training or evaluation raises.
    with open(file_name, "w") as file_results:
        file_results.write(
            "Scores of the performance evaluation are: Accuracy, Precision, Recall, F1-score\n"
        )
        file_results.write("Run:\n")
        print("Run:")

        train_imgs = _read_split_paths(os.path.join(PATH_FILES, "train.txt"), path_images)
        valid_imgs = _read_split_paths(os.path.join(PATH_FILES, "validation.txt"), path_images)
        test_imgs = _read_split_paths(os.path.join(PATH_FILES, "test.txt"), path_images)

        train_dataset = _build_dataset(train_imgs, BATCH_SIZE, "train")
        valid_dataset = _build_dataset(valid_imgs, BATCH_SIZE, "validation")
        test_dataset = _build_dataset(test_imgs, BATCH_SIZE, "test")

        model = keras.models.clone_model(model_architecture)
        model.compile(
            optimizer="adam",
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=[
                "accuracy",
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                tfa.metrics.F1Score(num_classes=2, average="micro", threshold=0.5),
            ],
        )

        es_callback = tf.keras.callbacks.EarlyStopping(
            monitor="val_accuracy", patience=50, restore_best_weights=True
        )
        cp_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(dir_name, "cp"),
            save_weights_only=True,
            monitor="val_accuracy",
            mode="max",
            save_best_only=True,
        )

        # No `shuffle` argument: Keras ignores it for tf.data.Dataset input,
        # the sample order is fixed by the dataset's own shuffle step.
        model.fit(
            train_dataset,
            validation_data=valid_dataset,
            epochs=EPOCHS,
            callbacks=[es_callback, cp_callback],
            verbose=2,
        )
        model.save(os.path.join(dir_name, "model"))

        print("Evaluate the model")
        # evaluate() returns [loss, accuracy, precision, recall, f1], following
        # the order of the metrics passed to compile().
        evaluation_scores = model.evaluate(test_dataset, verbose=2)
        file_results.write("%s  \n" % evaluation_scores[1:])
        file_results.write("#" * 50 + "\n")

        # Only one run is performed, so the "average" equals that run's scores.
        accuracy, precision, recall, f1 = evaluation_scores[1:5]
        file_results.write(
            "Average scores: %f %f %f %f" % (accuracy, precision, recall, f1)
        )

In [12]:
# Train on the splits listed in PATH_FILES. The configuration constant is used
# for the split directory instead of repeating its literal value here.
train_model(
    "/scratch/users/mbenali/download_apk/images",  # root of the goodware/ and malware/ image folders
    'model',  # checkpoint and saved model go below this directory
    '/scratch/users/mbenali/DexRay/results/scores.txt',  # evaluation scores
    EPOCHS,
    BATCH_SIZE,
    IMG_SIZE,
    PATH_FILES,
)

Run:
Epoch 1/200


2024-11-20 08:35:33.122797: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 40 of 7613
2024-11-20 08:35:44.334910: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 99 of 7613
2024-11-20 08:35:50.683722: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 146 of 7613
2024-11-20 08:36:02.279286: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 197 of 7613
2024-11-20 08:36:10.822636: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 243 of 7613
2024-11-20 08:36:21.643453: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 312 of 7613
2024-11-20 08:36:31.652002: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buf

11/11 - 1690s - loss: 0.5080 - accuracy: 0.8048 - precision_2: 0.0000e+00 - recall_2: 0.0000e+00 - f1_score: 0.0000e+00 - val_loss: 0.4952 - val_accuracy: 0.8049 - val_precision_2: 0.0000e+00 - val_recall_2: 0.0000e+00 - val_f1_score: 0.0000e+00 - 1690s/epoch - 154s/step
Epoch 2/200
11/11 - 36s - loss: 0.4787 - accuracy: 0.8048 - precision_2: 0.0000e+00 - recall_2: 0.0000e+00 - f1_score: 0.0000e+00 - val_loss: 0.4858 - val_accuracy: 0.8049 - val_precision_2: 0.0000e+00 - val_recall_2: 0.0000e+00 - val_f1_score: 0.0000e+00 - 36s/epoch - 3s/step
Epoch 3/200
11/11 - 36s - loss: 0.4598 - accuracy: 0.8183 - precision_2: 0.7004 - recall_2: 0.1211 - f1_score: 0.2065 - val_loss: 0.4818 - val_accuracy: 0.8110 - val_precision_2: 1.0000 - val_recall_2: 0.0314 - val_f1_score: 0.0610 - 36s/epoch - 3s/step
Epoch 4/200
11/11 - 36s - loss: 0.4581 - accuracy: 0.8248 - precision_2: 0.9524 - recall_2: 0.1077 - f1_score: 0.1935 - val_loss: 0.4429 - val_accuracy: 0.8221 - val_precision_2: 0.9118 - val_reca

INFO:tensorflow:Assets written to: model/model/assets


Evaluate the model


2024-11-20 10:25:40.258750: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 59 of 1633
2024-11-20 10:25:53.370407: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 83 of 1633
2024-11-20 10:26:00.501677: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 121 of 1633
2024-11-20 10:26:10.498590: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 186 of 1633
2024-11-20 10:26:20.920356: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 215 of 1633
2024-11-20 10:26:30.751963: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 288 of 1633
2024-11-20 10:26:40.162276: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buf

11/11 - 331s - loss: 0.7212 - accuracy: 0.8592 - precision_2: 0.7730 - recall_2: 0.3950 - f1_score: 0.5228 - 331s/epoch - 30s/step


In [None]:
from tensorflow.keras.models import load_model

# Restore the model from 'model/model', i.e. the path train_model saves to
# when it is called with dir_name='model'.
loaded_model = load_model(filepath='model/model')