In [28]:
# Import pandas libraries
import pandas as pd

# Load the dataset from the csv file
def load_dataset_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df

# Load the dataset from the csv file
loaded_dataset = load_dataset_from_csv('vectorized_dataset.csv')
print("\nLoaded dataset from csv file:")
print(loaded_dataset.head())

# Print keys of the loaded dataset
print("\nKeys of the loaded dataset:")
print(loaded_dataset.keys())

# Convert all columns to numpy arrays
dataset_hashes = loaded_dataset['hash'].tolist()
dataset_labels = loaded_dataset['label'].tolist()
dataset_families = loaded_dataset['family'].tolist()

# Process each vector in the dataset by removing the brackets
dataset_vectors = []
for vector in loaded_dataset['vector']:
    vector = vector.strip('[]').split(' ')
    # Typecast the vector of strings to int
    vector = [int(i) for i in vector]
    dataset_vectors.append(vector)
dataset_vectors = dataset_vectors

# Print the first 5 samples in the dataset along with their hashes and labels
print("\nFirst 5 samples in the dataset:")
for i in range(5):
    print(f"Sample {i}: {dataset_hashes[i]}")
    print(f"\t{dataset_vectors[i]}, {dataset_labels[i]}, {dataset_families[i]}")


Loaded dataset from csv file:
                                                hash  label     family  \
0  00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693...      1  GinMaster   
1  000068216bdb459df847bfdd67dd11069c3c50166db1ea...      0     benign   
2  0000764713b286cfe7e8e76c7038c92312977712d9c5a8...      1     Opfake   
3  0000962c2c34de1ca0c329b18be7847459da2d9d14b6b2...      0     benign   
4  000167f1ff061ea91440c40659c11c2af160342fd2e493...      0     benign   

                                      vector  
0  [1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0]  
1  [1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]  
2  [1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0]  
3  [1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]  
4  [1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]  

Keys of the loaded dataset:
Index(['hash', 'label', 'family', 'vector'], dtype='object')

First 5 samples in the dataset:
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 

In [29]:
# Print the number of families in the dataset
print("\nNumber of families in the dataset:", len(set(dataset_families)))

# Print each family in the dataset and its count, sorted by count
family_counts = pd.Series(dataset_families).value_counts()
print("\nFamily counts in the dataset:")
for family, count in family_counts.items():
    print(f"{family}: {count}")


Number of families in the dataset: 180

Family counts in the dataset:
benign: 123453
FakeInstaller: 925
DroidKungFu: 667
Plankton: 625
Opfake: 613
GinMaster: 339
BaseBridge: 330
Iconosys: 152
Kmin: 147
FakeDoc: 132
Geinimi: 92
Adrd: 91
DroidDream: 81
ExploitLinuxLotoor: 70
Glodream: 69
MobileTx: 69
FakeRun: 61
SendPay: 59
Gappusin: 58
Imlog: 43
SMSreg: 41
Yzhc: 37
Jifake: 29
Hamob: 28
Boxer: 27
Fakelogo: 19
Penetho: 19
Nyleaker: 18
Xsider: 18
FakePlayer: 17
Dougalek: 17
Fatakr: 17
Vdloader: 16
FoCobers: 15
Stealer: 14
SerBG: 14
Typstu: 14
Mobilespy: 14
Steek: 14
Zitmo: 14
Nandrobox: 13
TrojanSMS.Hippo: 13
Fakengry: 13
SpyHasb: 13
Copycat: 12
FakeTimer: 12
Nickspy: 12
Placms: 12
Cosha: 11
DroidSheep: 11
Spitmo: 11
Biige: 10
AccuTrack: 10
SMSZombie: 10
Raden: 10
Kiser: 9
Stiniter: 9
Zsone: 8
Mobinauten: 8
Spyset: 8
Coogos: 8
BeanBot: 8
Sakezon: 8
RootSmart: 7
Gapev: 7
Ceshark: 7
Gamex: 7
Mania: 6
Lemon: 6
Ksapp: 6
SeaWeth: 6
Kidlogger: 6
Fjcon: 6
QPlus: 6
Trackplus: 6
Aks: 5
FarMap: 5
T

In [30]:
# Generate a one-vs-all training set for each family in the dataset with more than n occurrences
# Set to 0 to create a dataset for all families
min_occurrences = 10

# Create a dictionary to store the training sets for each family
training_sets_samples = {}
training_sets_labels = {}

# Iterate through each family and create a one-vs-all training set
for family in family_counts.index:
    if family_counts[family] >= min_occurrences:
        training_set_samples = []
        training_set_labels = []
        for i in range(len(dataset_families)):
            if dataset_families[i] == family:
                training_set_samples.append(dataset_vectors[i])
                training_set_labels.append(1)
            else:
                training_set_samples.append(dataset_vectors[i])
                training_set_labels.append(0)
        training_sets_samples[family] = training_set_samples
        training_sets_labels[family] = training_set_labels

# Print the number of training sets created
print("\nNumber of training sets created:", len(training_sets_samples))



Number of training sets created: 55


In [31]:
# Print the first 5 samples of the training set for the family at index idx
idx = 1
family = family_counts.index[idx]
print(f"\nFirst 5 samples of the training set for family '{family}':")
for i in range(5):
    print(f"Sample {i}: {dataset_hashes[i]}")
    print(f"\t{training_sets_samples[family][i]}, {training_sets_labels[family][i]}")


First 5 samples of the training set for family 'FakeInstaller':
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0
Sample 1: 000068216bdb459df847bfdd67dd11069c3c50166db1ea8772cdc9250d948bcf
	[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0
Sample 2: 0000764713b286cfe7e8e76c7038c92312977712d9c5a86d504be54f3c1d025a
	[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], 0
Sample 3: 0000962c2c34de1ca0c329b18be7847459da2d9d14b6b23a21cbc6427522403c
	[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0
Sample 4: 000167f1ff061ea91440c40659c11c2af160342fd2e493d609e4996b8820e78f
	[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0


In [34]:
"""ChatGPT was used to help convert Numpy SVM to PyTorch SVM using automatic optimizer"""

import torch

class SVM:
    def __init__(self, epochs=100, learning_rate=0.001):
        # Set learning rate and number of epochs/iterations
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Initiate weights and biases to None, assigned based on the size of the first training point
        self.weights = None
        self.bias = None
    
    def fit(self, samples, raw_labels, regularization_term):
        # Convert samples and labels to PyTorch tensors
        samples = torch.tensor(samples, dtype=torch.float32)
        raw_labels = torch.tensor(raw_labels, dtype=torch.float32).view(-1, 1)

        # Update labels from (0 or 1) to (-1 or 1)
        labels = torch.where(raw_labels == 0, -1, 1)

        # Get the number of samples and number of features per sample
        num_samples, num_features = samples.shape

        # Initialize n weights to 0, where n is the number of features
        # requires_grad set to True to allow automatic tuning by PyTorch optimizer
        self.weights = torch.zeros((num_features, 1), dtype=torch.float32, requires_grad=True)

        # Set the initial bias to 0
        # requires_grad set to True to allow automatic tuning by PyTorch optimizer
        self.bias = torch.zeros(1, dtype=torch.float32, requires_grad=True)

        # Specify Stocastic Gradient Descent as the optimizer
        optimizer = torch.optim.SGD([self.weights, self.bias], lr=self.learning_rate)

        # Training function
        for epoch_index in range(self.epochs):
            # Reset gradients
            optimizer.zero_grad()

            # Predict the score of the sample (same as predict function w/o function call overhead)
            prediction = torch.matmul(samples, self.weights) - self.bias

            # Compute the margin by applying the labels to the corresponding predictions
            margin = prediction * labels

            # Compute the average hinge loss
            hinge_loss = torch.mean(torch.clamp(1 - margin, min=0))

            # Add regularization to the cost function
            reg_cost = regularization_term * torch.norm(self.weights, p=2) / 2

            # Calculate total loss
            loss = hinge_loss + reg_cost

            # Use PyTorch to automatically compute gradients
            loss.backward()

            # Update parameters
            optimizer.step()
        
    def predict(self, samples):
        # Compute the score/prediction by finding the dot product between the data points and the weights, subtracting the bias
        return torch.matmul(samples, self.weights) - self.bias

In [35]:
# Create an SVM object
svm = SVM(epochs=100, learning_rate=0.001)

# Train the SVM model using the training set for the family at index idx
# The training set at idx 0 is benign
idx = 1
family = family_counts.index[idx]

# Print the family and its count
print(f"\nTraining SVM for family '{family}' with {len(training_sets_samples[family])} samples")

# Print the number of features in the training set
num_features = len(training_sets_samples[family][0])
print(f"Number of features in the training set: {num_features}")

# Test prints
print(training_sets_samples[family][0], training_sets_labels[family][0])

# Train the single class SVM model on the training set
svm.fit(training_sets_samples[family], training_sets_labels[family][0], regularization_term=0)

# Print the weights and bias of the trained SVM model
print("\nTrained SVM model:")
print(f"Weights: {svm.weights}")
print(f"Bias: {svm.bias}")


Training SVM for family 'FakeInstaller' with 129013 samples
Number of features in the training set: 20
[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] 0

Trained SVM model:
Weights: tensor([[-0.0997],
        [-0.0839],
        [-0.0388],
        [-0.0379],
        [-0.0523],
        [-0.0158],
        [-0.0061],
        [-0.0089],
        [-0.0139],
        [-0.0141],
        [-0.0417],
        [-0.0293],
        [-0.0234],
        [-0.0233],
        [-0.0225],
        [-0.0225],
        [-0.0199],
        [-0.0151],
        [-0.0131],
        [-0.0091]], requires_grad=True)
Bias: tensor([0.1000], requires_grad=True)
