In [None]:
#Import Libraries
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Bidirectional, GRU, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
pip install lime

In [None]:
#Import Libraries
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Load the opcode dataset from the mounted Google Drive.
file_path = "/content/drive//MyDrive/ColabNotebooks/Android_Opcode_Sequences.csv"
data = pd.read_csv(file_path)

# Each row stores its opcode sequence as a single space-separated string;
# split it into a list of individual opcode tokens.
data['opcodes'] = data['opcodes'].apply(lambda x: x.split())

# Separate features and labels
opcode_sequences = data['opcodes'].tolist()
y = data['labels'].values

# Train a CBOW (sg=0) Word2Vec model, treating every opcode sequence as one sentence.
# A fixed seed makes the embedding initialisation repeatable; with workers > 1 the
# thread scheduling may still introduce small run-to-run differences.
# NOTE(review): the embeddings are trained on every row, including rows that later
# end up in the test split - confirm this is acceptable for the evaluation.
word2vec_model = Word2Vec(sentences=opcode_sequences, vector_size=10, window=5, min_count=1, workers=4, sg=0, seed=42)
word2vec_model.save("word2vec.model")

# Convert every opcode sequence into a (sequence_length, vector_size) float array.
# One bulk lookup per sequence avoids building a Python list of per-token arrays.
# An empty sequence maps to a (0, vector_size) array, which pad_sequences leaves as all zeros.
embedding_dim = word2vec_model.vector_size
vectorized_sequences = [
    word2vec_model.wv[sequence] if sequence else np.empty((0, embedding_dim), dtype='float32')
    for sequence in opcode_sequences
]

# Find the maximum sequence length
max_length = max(len(sequence) for sequence in vectorized_sequences)

# Zero-pad at the end of each sequence so all samples share the shape (max_length, vector_size).
X = pad_sequences(vectorized_sequences, maxlen=max_length, dtype='float32', padding='post')

In [None]:
# Project the learned opcode embeddings onto two principal components and label
# every point, to check visually whether related opcodes sit close together.

words = list(word2vec_model.wv.index_to_key)
vectors = [word2vec_model.wv[token] for token in words]

pca = PCA(n_components=2)
result = pca.fit_transform(vectors)

xs, ys = result[:, 0], result[:, 1]
plt.scatter(xs, ys)
# Annotate each point with the opcode it represents.
for word, x_coord, y_coord in zip(words, xs, ys):
    plt.annotate(word, xy=(x_coord, y_coord))
plt.show()


In [None]:
# Split data into training (80%) and testing (20%) sets.
# stratify=y keeps the benign/malware ratio the same in both splits, so the test
# metrics are not skewed by an unlucky class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Check the shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Define the model: Conv1D feature extractor -> stacked bidirectional GRUs -> dense classifier head.
model = Sequential([
    # Convolutional Layer (64 filters).
    # The input shape (timesteps, embedding size) is read from the training tensor so it
    # always matches the padded length and Word2Vec vector size used to build X.
    Conv1D(64, kernel_size=3, activation='relu', input_shape=X_train.shape[1:]),
    MaxPooling1D(pool_size=2),

    # Bi-directional GRU Layers
    Bidirectional(GRU(64, return_sequences=True)),
    Bidirectional(GRU(32)),

    # Flatten Layer (the preceding GRU does not return sequences, so its output is already 2-D)
    Flatten(),

    # Fully Connected Neural Network Module
    Dense(64, activation='relu'),
    Dropout(0.2),  # Dropout Regularization
    Dense(32, activation='relu'),
    Dropout(0.2),  # Dropout Regularization

    # Output Layer
    Dense(1, activation='sigmoid')  # Sigmoid Activation for Binary Classification
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model; 20% of the training split is held out for validation each epoch.
batch_size = 64
epochs = 100
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate on the held-out test split so the reported figure really is test accuracy.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

In [None]:

# Score the trained model on the held-out test split.
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

# Threshold the sigmoid outputs at 0.5 and flatten (n, 1) -> (n,) so the predictions
# line up element-wise with y_test.
test_predictions = (model.predict(X_test) > 0.5).astype(int).ravel()

# Compute the confusion matrix.
# labels=[0, 1] forces a 2x2 matrix even if one class is absent from the test data,
# so the four-way unpacking below cannot fail.
conf_matrix = confusion_matrix(y_test, test_predictions, labels=[0, 1])
true_negative, false_positive, false_negative, true_positive = conf_matrix.ravel()

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Print individual components of the confusion matrix
# Benign samples correctly classified as benign.
print("True Negative (TN):", true_negative)
# Benign samples wrongly classified as malware.
print("False Positive (FP):", false_positive)
# Malware samples wrongly classified as benign.
print("False Negative (FN):", false_negative)
# Malware samples correctly classified as malware.
print("True Positive (TP):", true_positive)

In [None]:
# List every misclassified test sample as "<predicted> (expected <actual>)".
# Iterating over the arrays themselves covers the whole test set regardless of its size;
# np.ravel accepts predictions shaped either (n,) or (n, 1).
for predicted, expected in zip(np.ravel(test_predictions), y_test):
    if predicted != expected:
        print('%d (expected %d)' % (predicted, expected))


In [None]:
# Create a LimeTextExplainer.
# split_expression=r'\s+' makes LIME tokenise on whitespace only, matching how the
# instance is built below (opcodes joined by single spaces). The default r'\W+' would
# also split on punctuation inside a token.
# random_state fixes LIME's perturbation sampling so the explanation is repeatable.
explainer = lime_text.LimeTextExplainer(
    class_names=['Benign(0)', 'Malware(1)'],
    split_expression=r'\s+',
    random_state=42,
)

# idx is the index of the instance in the dataset you want to explain
idx = 100
# Convert the list of opcodes back into a space-separated string, the format LIME perturbs.
instance = ' '.join(opcode_sequences[idx])
# Define a prediction function
def predictor(texts):
    """Return class probabilities for a batch of space-separated opcode strings.

    Each text is split on whitespace, tokens missing from the Word2Vec vocabulary
    are skipped, and the remaining tokens are embedded into a zero-padded tensor of
    shape (len(texts), max_length, vector_size) before being passed to the model.

    Args:
        texts: iterable of strings, each a whitespace-separated opcode sequence.

    Returns:
        Array of shape (len(texts), 2) with columns [P(class 0), P(class 1)].
    """
    keyed_vectors = word2vec_model.wv
    vocabulary = keyed_vectors.key_to_index  # dict: O(1) membership test per token

    # Pre-allocate the zero-padded batch. Texts with no known tokens stay all-zero,
    # so the batch keeps a valid 3-D shape even when every text is empty.
    batch = np.zeros((len(texts), max_length, keyed_vectors.vector_size), dtype='float32')

    for row, text in enumerate(texts):
        known_tokens = [token for token in text.split() if token in vocabulary]
        # Keep the last max_length tokens, as pad_sequences does by default (truncating='pre').
        known_tokens = known_tokens[-max_length:]
        if known_tokens:
            # Post-padding: embeddings fill the front of the row, zeros remain at the end.
            batch[row, :len(known_tokens)] = keyed_vectors[known_tokens]

    # Get predictions from the model: sigmoid output = P(class 1).
    predictions = model.predict(batch)

    # Stack into [P(class 0), P(class 1)] columns.
    return np.hstack((1 - predictions, predictions))
# Ask LIME for the 8 most influential opcodes behind the model's prediction for the chosen instance.
exp = explainer.explain_instance(
    text_instance=instance,
    classifier_fn=predictor,
    num_features=8,
)

# Render the explanation, including the highlighted text, inside the notebook.
exp.show_in_notebook(text=True)


In [None]:
# Plot training vs. validation accuracy and loss over the epochs.
# `history` is the return value of model.fit(...).
plt.figure(figsize=(12, 6))

# Left panel: accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Right panel: loss. The figure is laid out as a 1x2 grid, so this fills the second slot.
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

