In [3]:
# Import pandas libraries
import pandas as pd

# Load the dataset from the csv file
def load_dataset_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df

# Load the dataset from the csv file
loaded_dataset = load_dataset_from_csv('vectorized_dataset.csv')
print("\nLoaded dataset from csv file:")
print(loaded_dataset.head())

# Print keys of the loaded dataset
print("\nKeys of the loaded dataset:")
print(loaded_dataset.keys())

# Convert all columns to numpy arrays
dataset_hashes = loaded_dataset['hash'].tolist()
dataset_labels = loaded_dataset['label'].tolist()
dataset_families = loaded_dataset['family'].tolist()

# Process each vector in the dataset by removing the brackets
dataset_vectors = []
for vector in loaded_dataset['vector']:
    vector = vector.strip('[]').split(' ')
    dataset_vectors.append(vector)
dataset_vectors = dataset_vectors

# Print the first 5 samples in the dataset along with their hashes and labels
print("\nFirst 5 samples in the dataset:")
for i in range(5):
    print(f"Sample {i}: {dataset_hashes[i]}")
    print(f"\t{dataset_vectors[i]}, {dataset_labels[i]}, {dataset_families[i]}")


Loaded dataset from csv file:
                                                hash  label     family  \
0  00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693...      1  GinMaster   
1  000068216bdb459df847bfdd67dd11069c3c50166db1ea...      0     benign   
2  0000764713b286cfe7e8e76c7038c92312977712d9c5a8...      1     Opfake   
3  0000962c2c34de1ca0c329b18be7847459da2d9d14b6b2...      0     benign   
4  000167f1ff061ea91440c40659c11c2af160342fd2e493...      0     benign   

                                      vector  
0  [1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0]  
1  [1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]  
2  [1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0]  
3  [1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]  
4  [1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]  

Keys of the loaded dataset:
Index(['hash', 'label', 'family', 'vector'], dtype='object')

First 5 samples in the dataset:
Sample 0: 00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4
	['1', '1', '1', '1', '1', '0', '0', '1', '0

In [4]:
"""ChatGPT was used to help convert Numpy SVM to PyTorch SVM using automatic optimizer"""

import torch

class SVM:
    def __init__(self, epochs=100, learning_rate=0.001):
        # Set learning rate and number of epochs/iterations
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Initiate weights and biases to None, assigned based on the size of the first training point
        self.weights = None
        self.bias = None
        
    def predict(self, samples):
        # Compute the score/prediction by finding the dot product between the data points and the weights, subtracting the bias
        prediction = torch.matmul(samples, self.weights) - self.bias
        # Use sign to set the output as 1 if score > 0, -1 if score < 0, or 0
        return torch.sign(prediction)
    
    def fit(self, samples, raw_labels, regularization_term):
        # Convert samples and labels to PyTorch tensors
        samples = torch.tensor(samples, dtype=torch.float32)
        raw_labels = torch.tensor(raw_labels, dtype=torch.float32).view(-1, 1)

        # Update labels from (0 or 1) to (-1 or 1)
        labels = torch.where(raw_labels == 0, -1, 1)

        # Get the number of samples and number of features per sample
        num_samples, num_features = samples.shape

        # Initialize n weights to 0, where n is the number of features
        # requires_grad set to True to allow automatic tuning by PyTorch optimizer
        self.weights = torch.zeros((num_features, 1), dtype=torch.float32, requires_grad=True)

        # Set the initial bias to 0
        # requires_grad set to True to allow automatic tuning by PyTorch optimizer
        self.bias = torch.zeros(1, dtype=torch.float32, requires_grad=True)

        # Specify Stocastic Gradient Descent as the optimizer
        optimizer = torch.optim.SGD([self.weights, self.bias], lr=self.lr)

        # Training function
        for epoch_index in range(self.epochs):
            # Reset gradients
            optimizer.zero_grad()

            # Predict the score of the sample (same as predict function w/o function call overhead)
            prediction = torch.matmul(samples, self.weights) - self.bias

            # Compute the margin by applying the labels to the corresponding predictions
            margin = prediction * labels

            # Compute the average hinge loss
            hinge_loss = torch.mean(torch.clamp(1 - margin, min=0))

            # Add regularization to the cost function
            reg_cost = regularization_term * torch.norm(self.weights, p=2) / 2

            # Calculate total loss
            loss = hinge_loss + reg_cost

            # Use PyTorch to automatically compute gradients
            loss.backward()

            # Update parameters
            optimizer.step()