# Compare DexRay with and without Tesseract split

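This notebook loads the two DexRay models trained on the 50k dataset (one with the Tesseract split, one without) and compares their performance on their respective test sets.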

In [1]:
import csv
import json
import os
from datetime import datetime

import numpy as np
import tensorflow.keras as keras
from sklearn.metrics import f1_score
from tqdm import tqdm

from tesseract import evaluation, temporal, metrics
from tesseract.evaluation import predict

# Data locations. Each one can be overridden with the environment variable named
# in the lookup, so the notebook is not tied to a single user's scratch space;
# when the variable is unset the original path is used.
APK_METADATA_PATH = os.environ.get(
    "DEXRAY_APK_METADATA_PATH", "/scratch/users/mbenali/metadata.csv"
)
NUMPY_FILES_DIR = os.environ.get(
    "DEXRAY_NUMPY_FILES_DIR", "/scratch/users/mbenali/download_apk/100k_download/npy"
)
# Where all training and testing data is saved after splitting
DEXRAY_TESSERACT_DATA_DIR = os.environ.get(
    "DEXRAY_TESSERACT_DATA_DIR", "./data/dexray_tesseract"
)
DEXRAY_BASE_DATA_DIR = os.environ.get("DEXRAY_BASE_DATA_DIR", "./data/dexray_base")
# NOTE(review): presumably the year range covered by the dataset; not used in the
# cells visible here, so confirm before relying on it.
YEAR_START = 2010
YEAR_END = 2022

2024-11-29 07:09:34.940342: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-29 07:09:35.043912: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-29 07:09:35.045955: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### DexRay with Tesseract

In [7]:
# Restore the saved DexRay model that was trained on the Tesseract split.
dexray_tesseract_model = keras.saving.load_model(
    "../models/model-50k-tesseract"
)

Define a custom prediction function so that Tesseract's `predict` can obtain binary labels from the Keras classifier.

In [8]:
def predict_keras(X_test, model=None, threshold=0.5):
    """Return hard 0/1 labels predicted by a Keras classifier.

    Args:
        X_test: Batch of inputs accepted by ``model.predict``.
        model: Object exposing ``predict(X, verbose=0)``. When omitted, the
            notebook-level ``dexray_tesseract_model`` is used; it is looked up
            at call time, so it only has to exist before the first call.
        threshold: Scores strictly greater than this value are labelled 1,
            everything else 0.

    Returns:
        A 1-D integer NumPy array of labels.
    """
    if model is None:
        model = dexray_tesseract_model
    probabilities = model.predict(X_test, verbose=0)
    # Flatten so callers get a 1-D label vector instead of a column of scores.
    return (np.asarray(probabilities) > threshold).astype(int).flatten()

In [9]:
# Load saved arrays
X_train = np.load(os.path.join(DEXRAY_TESSERACT_DATA_DIR, 'X_train.npy'), allow_pickle=True)
X_test = np.load(os.path.join(DEXRAY_TESSERACT_DATA_DIR, 'X_test.npy'), allow_pickle=True)
y_train = np.load(os.path.join(DEXRAY_TESSERACT_DATA_DIR, 'y_train.npy'), allow_pickle=True)
y_test = np.load(os.path.join(DEXRAY_TESSERACT_DATA_DIR, 'y_test.npy'), allow_pickle=True)
t_train = np.load(os.path.join(DEXRAY_TESSERACT_DATA_DIR, 't_train.npy'), allow_pickle=True)
t_test = np.load(os.path.join(DEXRAY_TESSERACT_DATA_DIR, 't_test.npy'), allow_pickle=True)

In [10]:
# Run Tesseract's predict helper on the test data, handing it predict_keras as
# the prediction function for the Keras model.
y_preds = predict(dexray_tesseract_model, X_tests=X_test, predict_function=predict_keras)

100%|██████████| 3/3 [00:08<00:00,  2.67s/it]


In [13]:
# Compute the evaluation metrics from the true and predicted test labels.
# NOTE(review): periods=-1 presumably means "all test periods" - confirm against
# the Tesseract version in use.
results = metrics.calculate_metrics(
    y_test,
    y_preds,
    periods=-1,
)

In [15]:
# Display the raw metrics dictionary returned by calculate_metrics.
results

defaultdict(list,
            {'tp': [849, 1755, 167],
             'fp': [39, 61, 8],
             'tn': [2258, 2163, 271],
             'fn': [35, 62, 4],
             'p': [884, 1817, 171],
             'n': [2297, 2224, 279],
             'tot': [3181, 4041, 450],
             'tp_cumu': [849, 2604, 2771],
             'fp_cumu': [39, 100, 108],
             'tn_cumu': [2258, 4421, 4692],
             'fn_cumu': [35, 97, 101],
             'p_cumu': [884, 2701, 2872],
             'n_cumu': [2297, 4521, 4800],
             'tot_cumu': [3181, 7222, 7672],
             'tpr': [0.9604072398190046,
              0.9658778205833792,
              0.9766081871345029],
             'fnr': [0.03959276018099547,
              0.0341221794166208,
              0.023391812865497075],
             'fpr': [0.01697866782760122,
              0.027428057553956834,
              0.02867383512544803],
             'tnr': [0.9830213321723987,
              0.9725719424460432,
              0.9713261

In [14]:
# Print the per-period metrics as a formatted table.
metrics.print_metrics(results)

------------+---------------------
Test period |     1      2      3   
------------+---------------------
Actual pos  |    884   1817    171
Actual neg  |   2297   2224    279
Total       |   3181   4041    450
------------+---------------------
TPR         |  0.960  0.966  0.977
FPR         |  0.017  0.027  0.029
TNR         |  0.983  0.973  0.971
FNR         |  0.040  0.034  0.023
------------+---------------------
Precision   |  0.956  0.966  0.954
Recall      |  0.960  0.966  0.977
F1          |  0.958  0.966  0.965
------------+---------------------


### DexRay without Tesseract

In [2]:
# Restore the saved DexRay model that was trained without the Tesseract split.
dexray_base_model = keras.saving.load_model(
    "../models/model-50k-base"
)

In [3]:
# Load the base split's test inputs and labels. This rebinds X_test and y_test,
# so from here on they no longer refer to the Tesseract-split arrays.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# these files must come from a trusted source.
X_test, y_test = (
    np.load(os.path.join(DEXRAY_BASE_DATA_DIR, f"{array_name}.npy"), allow_pickle=True)
    for array_name in ("X_test", "y_test")
)

In [None]:
# Reshape to (n_samples, 16384, 1).
# NOTE(review): presumably the input shape the base model expects - confirm.
X_test = X_test.reshape((X_test.shape[0], 16384, 1))

In [5]:
# Check the shape of X_test after the reshape.
X_test.shape

(10000, 16384, 1)

In [6]:
# Score every test sample with the base model. verbose=0 keeps the Keras
# progress bar out of the cell output, matching predict_keras.
base_scores = dexray_base_model.predict(X_test, verbose=0)
# Scores strictly above 0.5 become label 1; flatten to a 1-D label vector.
y_pred = (base_scores > 0.5).astype(int).flatten()

# Compute the F1 score over the base test set
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

F1 Score: 0.9212026956972524
