In [2]:
# Import pandas libraries
import pandas as pd

# Load the dataset from the csv file
def load_dataset_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df

# Load the dataset from the csv file
loaded_dataset = load_dataset_from_csv('vectorized_dataset.csv')
print("\nLoaded dataset from csv file:")
print(loaded_dataset.head())

# Print keys of the loaded dataset
print("\nKeys of the loaded dataset:")
print(loaded_dataset.keys())

# Convert all columns to numpy arrays
dataset_hashes = loaded_dataset['hash'].tolist()
dataset_labels = loaded_dataset['label'].tolist()
dataset_families = loaded_dataset['family'].tolist()

# Process each vector in the dataset by removing the brackets
dataset_vectors = []
for vector in loaded_dataset['vector']:
    vector = vector.strip('[]').split(' ')
    # Typecast the vector of strings to int
    vector = [int(i) for i in vector]
    dataset_vectors.append(vector)
dataset_vectors = dataset_vectors

# Print the first 5 samples in the dataset along with their hashes and labels
print("\nFirst 5 samples in the dataset:")
for i in range(5):
    print(f"Sample {i}: {dataset_hashes[i]}")
    print(f"\t{dataset_vectors[i]}, {dataset_labels[i]}, {dataset_families[i]}")


Loaded dataset from csv file:
                                                hash  label     family  \
0  00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693...      1  GinMaster   
1  000068216bdb459df847bfdd67dd11069c3c50166db1ea...      0     benign   
2  0000764713b286cfe7e8e76c7038c92312977712d9c5a8...      1     Opfake   
3  0000962c2c34de1ca0c329b18be7847459da2d9d14b6b2...      0     benign   
4  000167f1ff061ea91440c40659c11c2af160342fd2e493...      0     benign   

                                              vector  
0  [1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0...  
1  [1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...  
2  [1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0...  
3  [1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0...  
4  [1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0...  

Keys of the loaded dataset:
Index(['hash', 'label', 'family', 'vector'], dtype='object')

First 5 samples in the dataset:
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb

In [3]:
# Print the number of families in the dataset
print("\nNumber of families in the dataset:", len(set(dataset_families)))

# Print each family in the dataset and its count, sorted by count
family_counts = pd.Series(dataset_families).value_counts()
print("\nFamily counts in the dataset:")
for family, count in family_counts.items():
    print(f"{family}: {count}")


Number of families in the dataset: 180

Family counts in the dataset:
benign: 123453
FakeInstaller: 925
DroidKungFu: 667
Plankton: 625
Opfake: 613
GinMaster: 339
BaseBridge: 330
Iconosys: 152
Kmin: 147
FakeDoc: 132
Geinimi: 92
Adrd: 91
DroidDream: 81
ExploitLinuxLotoor: 70
Glodream: 69
MobileTx: 69
FakeRun: 61
SendPay: 59
Gappusin: 58
Imlog: 43
SMSreg: 41
Yzhc: 37
Jifake: 29
Hamob: 28
Boxer: 27
Fakelogo: 19
Penetho: 19
Nyleaker: 18
Xsider: 18
FakePlayer: 17
Dougalek: 17
Fatakr: 17
Vdloader: 16
FoCobers: 15
Stealer: 14
SerBG: 14
Typstu: 14
Mobilespy: 14
Steek: 14
Zitmo: 14
Nandrobox: 13
TrojanSMS.Hippo: 13
Fakengry: 13
SpyHasb: 13
Copycat: 12
FakeTimer: 12
Nickspy: 12
Placms: 12
Cosha: 11
DroidSheep: 11
Spitmo: 11
Biige: 10
AccuTrack: 10
SMSZombie: 10
Raden: 10
Kiser: 9
Stiniter: 9
Zsone: 8
Mobinauten: 8
Spyset: 8
Coogos: 8
BeanBot: 8
Sakezon: 8
RootSmart: 7
Gapev: 7
Ceshark: 7
Gamex: 7
Mania: 6
Lemon: 6
Ksapp: 6
SeaWeth: 6
Kidlogger: 6
Fjcon: 6
QPlus: 6
Trackplus: 6
Aks: 5
FarMap: 5
T

In [None]:
import random

# Generate a one-vs-all training set for each family in the dataset with more than n occurrences
# Set to 0 to create a dataset for all families
min_occurrences = 10

# Create a dictionary to store the training sets for each family
training_sets_samples = {}
training_sets_labels = {}

# Iterate through each family and create a one-vs-all training set
for family in family_counts.index:
    if family_counts[family] >= min_occurrences:
        training_set_samples = []
        training_set_labels = []
        # Append malware samples for the current family
        malware_indexes = [i for i in range(len(dataset_families)) if dataset_families[i] == family]
        index = 0
        # Create a balanced dataset with 50,000 malware samples and 50,000 benign samples
        while len(training_set_samples) < 50000:
            training_set_samples.append(dataset_vectors[malware_indexes[index]])
            training_set_labels.append(1)
            index += 1
            if index >= len(malware_indexes):
                index = 0
        # Append benign samples for the current family
        benign_indexes = [i for i in range(len(dataset_families)) if dataset_families[i] != family]
        index = 0
        while len(training_set_samples) < 100000:
            training_set_samples.append(dataset_vectors[benign_indexes[index]])
            training_set_labels.append(0)
            index += 1
            if index >= len(benign_indexes):
                index = 0
        training_sets_samples[family] = training_set_samples
        training_sets_labels[family] = training_set_labels

# Print the number of training sets created
print("\nNumber of training sets created:", len(training_sets_samples))



Number of training sets created: 55


In [6]:
# Print the number of malware and benign samples in each training set
print("\nNumber of malware and benign samples in each training set:")
for family, labels in training_sets_labels.items():
    print(f"{family}: {labels.count(1)} malware samples, {labels.count(0)} benign samples")


Number of malware and benign samples in each training set:
benign: 50000 malware samples, 50000 benign samples
FakeInstaller: 50000 malware samples, 50000 benign samples
DroidKungFu: 50000 malware samples, 50000 benign samples
Plankton: 50000 malware samples, 50000 benign samples
Opfake: 50000 malware samples, 50000 benign samples
GinMaster: 50000 malware samples, 50000 benign samples
BaseBridge: 50000 malware samples, 50000 benign samples
Iconosys: 50000 malware samples, 50000 benign samples
Kmin: 50000 malware samples, 50000 benign samples
FakeDoc: 50000 malware samples, 50000 benign samples
Geinimi: 50000 malware samples, 50000 benign samples
Adrd: 50000 malware samples, 50000 benign samples
DroidDream: 50000 malware samples, 50000 benign samples
ExploitLinuxLotoor: 50000 malware samples, 50000 benign samples
Glodream: 50000 malware samples, 50000 benign samples
MobileTx: 50000 malware samples, 50000 benign samples
FakeRun: 50000 malware samples, 50000 benign samples
SendPay: 50000

In [7]:
# Print the first 5 samples of the training set for the family at index idx
idx = 1
family = family_counts.index[idx]
print(f"\nFirst 5 samples of the training set for family '{family}':")
for i in range(5):
    print(f"Sample {i}: {dataset_hashes[i]}")
    print(f"\t{training_sets_samples[family][i]}, {training_sets_labels[family][i]}")


First 5 samples of the training set for family 'FakeInstaller':
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	[1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
Sample 1: 000068216bdb459df847bfdd67dd11069c3c50166db1ea8772cdc9250d948bcf
	[1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
Sample 2: 0000764713b286cfe7e8e76c7038c92312977712d9c5a86d504be54f3c1d025a
	[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
Sample 3: 0000962c2c34de1ca0c329b18be7847459da2d9d14b6b23a21cbc6427522403c
	[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
Sample 4: 000167f1ff061ea91440c40659c11c2af160342fd2e493d609e4996b8820e78f
	[1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0

In [8]:
"""ChatGPT was used to help convert Numpy SVM to PyTorch SVM using automatic optimizer"""

import torch

class SVM:
    def __init__(self, epochs=1000, learning_rate=0.001):
        # Set learning rate and number of epochs/iterations
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Initiate weights and biases to None, assigned based on the size of the first training point
        self.weights = None
        self.bias = None
    
    def fit(self, samples, raw_labels, regularization_term):
        # Convert samples and labels to PyTorch tensors
        samples = torch.tensor(samples, dtype=torch.float32)
        raw_labels = torch.tensor(raw_labels, dtype=torch.float32).view(-1, 1)

        # Update labels from (0 or 1) to (-1 or 1) for hinge loss calculation
        labels = torch.where(raw_labels == 1, torch.tensor(1.0, dtype=torch.float32), torch.tensor(-1.0, dtype=torch.float32))

        # Get the number of samples and number of features per sample
        num_samples, num_features = samples.shape

        # Initialize n weights to 0, where n is the number of features
        # requires_grad set to True to allow automatic tuning by PyTorch optimizer
        self.weights = torch.zeros((num_features, 1), dtype=torch.float32, requires_grad=True)

        # Set the initial bias to 0 (tensor of size 1)
        # requires_grad set to True to allow automatic tuning by PyTorch optimizer
        self.bias = torch.zeros(1, dtype=torch.float32, requires_grad=True)

        # Specify Stocastic Gradient Descent as the optimizer
        optimizer = torch.optim.SGD([self.weights, self.bias], lr=self.learning_rate)

        # Training function
        for epoch_index in range(self.epochs):
            # Reset gradients
            optimizer.zero_grad()

            # Predict the score of the sample (same as predict function w/o function call overhead)
            prediction = torch.matmul(samples, self.weights) - self.bias

            # Compute the margin by applying the labels to the corresponding predictions
            margin = prediction * labels

            # Compute the average hinge loss
            hinge_loss = torch.mean(torch.clamp(1 - margin, min=0))

            # Add regularization to the cost function
            reg_cost = regularization_term * torch.norm(self.weights, p=2) / 2

            # Calculate total loss
            loss = hinge_loss + reg_cost

            # Use PyTorch to automatically compute gradients
            loss.backward()

            # Update parameters
            optimizer.step()
        
    def predict(self, samples):
        # Compute the score/prediction by finding the dot product between the data points and the weights, including the bias
        return torch.sigmoid(torch.matmul(samples, self.weights) + self.bias)

In [48]:
# Create an SVM object
svm = SVM(epochs=5000, learning_rate=0.005)

# Train the SVM model using the training set for the family at index idx
# The training set at idx 0 is benign
idx = 18
family = family_counts.index[idx]

# Train the single class SVM model on the training set
svm.fit(training_sets_samples[family], training_sets_labels[family], regularization_term=0)

# Print the weights and bias of the trained SVM model
print("\nTrained SVM model:")
# print(f"Weights: {svm.weights}")
print(f"Bias: {svm.bias}")

# Print the first 5 samples of the training set for the family where the label is 1
print(f"\nFirst 5 samples of the training set for family '{family}' where the label is 1:")
count = 0
for i in range(len(training_sets_labels[family])):
    if training_sets_labels[family][i] == 1:
        print(f"Sample {i}: {dataset_hashes[i]}")
        print(f"\t{training_sets_samples[family][i]}, {training_sets_labels[family][i]}")
        print(f"\tPrediction: {svm.predict(torch.tensor(training_sets_samples[family][i], dtype=torch.float32))}")
        count += 1
        if count == 5:
            break

# Print the first 5 samples of the training set for the family where the label is 0
print(f"\nFirst 5 samples of the training set for family '{family}' where the label is 0:")
count = 0
for i in range(len(training_sets_labels[family])):
    if training_sets_labels[family][i] == 0:
        print(f"Sample {i}: {dataset_hashes[i]}")
        print(f"\t{training_sets_samples[family][i]}, {training_sets_labels[family][i]}")
        print(f"\tPrediction: {svm.predict(torch.tensor(training_sets_samples[family][i], dtype=torch.float32))}")
        count += 1
        if count == 5:
            break

# Get the accuracy of the model on the training set
correct_predictions = 0
for i in range(len(training_sets_labels[family])):
    prediction = svm.predict(torch.tensor(training_sets_samples[family][i], dtype=torch.float32))
    if (prediction >= 0.5 and training_sets_labels[family][i] == 1) or (prediction < 0.5 and training_sets_labels[family][i] == 0):
        correct_predictions += 1
accuracy = correct_predictions / len(training_sets_labels[family])
print(f"\nAccuracy of the model on the training set for family '{family}': {accuracy}")


Trained SVM model:
Bias: tensor([0.9808], requires_grad=True)

First 5 samples of the training set for family 'Gappusin' where the label is 1:
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
	Prediction: tensor([0.9508], grad_fn=<SigmoidBackward0>)
Sample 1: 000068216bdb459df847bfdd67dd11069c3c50166db1ea8772cdc9250d948bcf
	[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
	Prediction: tensor([0.9669], grad_fn=<SigmoidBackward0>)
Sample 2: 0000764713b286cfe7e8e76c7038c92312977712d9c5a86d504be54f3c1d025a
	[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
	Prediction: tensor([0.9315], grad_fn=<SigmoidBackward0>)
Sample 3: 0000962c2c34de1ca0c329b18be7847459da2d9d14b6b23a21cbc6427522403c
	[1, 1,

In [20]:
# Create an ensemble classifier which initializes multiple one vs all SVM classifiers, including one for benign samples
# Each SVM classifier makes a prediction on the sample data and outputs a score
# The classifier with the highest score classifies the sample

class Ensemble_SVM:
    def __init__(self, epochs=1000, learning_rate=0.001):
        # Store the number of epochs and learning rate
        self.epochs = epochs
        self.learning_rate = learning_rate
        # Set the number of models in the ensemble
        self.num_models = None
        # Initialize a list to store the models and a list for the corresponding family names
        self.models = []
        self.model_families = []
    
    def fit(self, families, sample_sets, label_sets, regularization_term=0.5):
        # Get the number of models
        self.num_models = len(families)
        for index in range(len(families)):
            self.model_families.append(families[index])
            # Create an SVM model for each family and train it
            model = SVM(epochs=self.epochs, learning_rate=self.learning_rate)
            model.fit(sample_sets[index], label_sets[index], regularization_term)
            self.models.append(model)
    
    def predict(self, samples):
        # Predict the class of the sample using voting (highest score)
        predictions = []
        for model in self.models:
            predictions.append(model.predict(samples))
        # Get the index of the model with the highest prediction
        max_index = predictions.index(max(predictions))
        return self.model_families[max_index]

In [36]:
# Train an emsemble SVM model using the top 20 families with the most occurrences
top_families = family_counts[:20].index.tolist()
top_sample_sets = []
top_label_sets = []
for family in top_families:
    top_sample_sets.append(training_sets_samples[family])
    top_label_sets.append(training_sets_labels[family])

ensemble_svm = Ensemble_SVM(epochs=2000, learning_rate=0.005)
ensemble_svm.fit(top_families, top_sample_sets, top_label_sets, regularization_term=0)

# Print the top 20 families used in the ensemble model
print("\nTop 20 families used in the ensemble model:")
print(top_families)


Top 20 families used in the ensemble model:
['benign', 'FakeInstaller', 'DroidKungFu', 'Plankton', 'Opfake', 'GinMaster', 'BaseBridge', 'Iconosys', 'Kmin', 'FakeDoc', 'Geinimi', 'Adrd', 'DroidDream', 'ExploitLinuxLotoor', 'Glodream', 'MobileTx', 'FakeRun', 'SendPay', 'Gappusin', 'Imlog']


In [37]:
# Classify the first 5 samples in the dataset using the ensemble model
print("\nClassifying the first 5 samples in the dataset using the ensemble model:")
for i in range(50):
    family = ensemble_svm.predict(torch.tensor(dataset_vectors[i], dtype=torch.float32))
    print(f"Sample {i}: {dataset_hashes[i]}")
    print(f"\tFamily: {family}, Actual Family: {dataset_families[i]}")


Classifying the first 5 samples in the dataset using the ensemble model:
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	Family: GinMaster, Actual Family: GinMaster
Sample 1: 000068216bdb459df847bfdd67dd11069c3c50166db1ea8772cdc9250d948bcf
	Family: Adrd, Actual Family: benign
Sample 2: 0000764713b286cfe7e8e76c7038c92312977712d9c5a86d504be54f3c1d025a
	Family: Adrd, Actual Family: Opfake
Sample 3: 0000962c2c34de1ca0c329b18be7847459da2d9d14b6b23a21cbc6427522403c
	Family: benign, Actual Family: benign
Sample 4: 000167f1ff061ea91440c40659c11c2af160342fd2e493d609e4996b8820e78f
	Family: Imlog, Actual Family: benign
Sample 5: 00017ed2c044caf7b1047184673ec3e11ab10ac0e64fb7e7bccaca0deb13198a
	Family: Plankton, Actual Family: benign
Sample 6: 000189f3a91b1c19f15e2838995f80d6bb40d74aa8135f6d3e4fdbb80a0bdee7
	Family: Adrd, Actual Family: benign
Sample 7: 0003043c7e2af5e07a2638fbf2391802b0c9ff1926e5d04d06df06992147a325
	Family: Plankton, Actual Family: benign
Sample 8: 0

In [45]:
index = 18
family = ensemble_svm.model_families[index]
training_set_samples = top_sample_sets[index]
training_set_labels = top_label_sets[index]
svm = ensemble_svm.models[index]

# Print the first 5 samples of the training set for the family where the label is 1
print(f"\nFirst 5 samples of the training set for family '{family}' where the label is 1:")
count = 0
for i in range(len(training_sets_labels[family])):
    if training_sets_labels[family][i] == 1:
        print(f"Sample {i}: {dataset_hashes[i]}")
        print(f"\t{training_sets_samples[family][i]}, {training_sets_labels[family][i]}")
        print(f"\tPrediction: {svm.predict(torch.tensor(training_sets_samples[family][i], dtype=torch.float32))}")
        count += 1
        if count == 5:
            break

# Print the first 5 samples of the training set for the family where the label is 0
print(f"\nFirst 5 samples of the training set for family '{family}' where the label is 0:")
count = 0
for i in range(len(training_sets_labels[family])):
    if training_sets_labels[family][i] == 0:
        print(f"Sample {i}: {dataset_hashes[i]}")
        print(f"\t{training_sets_samples[family][i]}, {training_sets_labels[family][i]}")
        print(f"\tPrediction: {svm.predict(torch.tensor(training_sets_samples[family][i], dtype=torch.float32))}")
        count += 1
        if count == 5:
            break

# Get the accuracy of the model on the training set
correct_predictions = 0
for i in range(len(training_sets_labels[family])):
    prediction = svm.predict(torch.tensor(training_sets_samples[family][i], dtype=torch.float32))
    if (prediction >= 0.5 and training_sets_labels[family][i] == 1) or (prediction < 0.5 and training_sets_labels[family][i] == 0):
        correct_predictions += 1
accuracy = correct_predictions / len(training_sets_labels[family])
print(f"\nAccuracy of the model on the training set for family '{family}': {accuracy}")


First 5 samples of the training set for family 'Gappusin' where the label is 1:
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
	Prediction: tensor([0.9186], grad_fn=<SigmoidBackward0>)
Sample 1: 000068216bdb459df847bfdd67dd11069c3c50166db1ea8772cdc9250d948bcf
	[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
	Prediction: tensor([0.9274], grad_fn=<SigmoidBackward0>)
Sample 2: 0000764713b286cfe7e8e76c7038c92312977712d9c5a86d504be54f3c1d025a
	[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 1
	Prediction: tensor([0.8914], grad_fn=<SigmoidBackward0>)
Sample 3: 0000962c2c34de1ca0c329b18be7847459da2d9d14b6b23a21cbc6427522403c
	[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,

In [43]:
# Get the accuracy of the ensemble model on the training set
correct_predictions = 0
for i in range(len(dataset_vectors)):
    family = ensemble_svm.predict(torch.tensor(dataset_vectors[i], dtype=torch.float32))
    if family == dataset_families[i]:
        correct_predictions += 1
accuracy = correct_predictions / len(dataset_vectors)
print(f"\nAccuracy of the ensemble model on the training set: {accuracy}")


Accuracy of the ensemble model on the training set: 0.31331726260144327
