In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection
%ls

/content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection
 AdaBoostClassifier.joblib
 classifiers_recall_scores.joblib
 CNNClassifier.joblib
 DecisionTreeClassifier.joblib
 EDA.ipynb
 features_k_1000.joblib
 features_k_100.joblib
 features_k_10.joblib
 features_k_1200.joblib
 features_k_1500.joblib
 features_k_2000.joblib
 features_k_2500.joblib
 features_k_3000.joblib
 features_k_500.joblib
 features_k_50.joblib
 gnbClassifier.joblib
 GradientBoostingClassifier.joblib
 knnClassifier.joblib
 LSTMClassifier.joblib
'Macro Malware Detection using Machine Learning Techniques A New Approach '
 mlpClasifier.joblib
 [0m[01;34m__pycache__[0m/
 randomForestClassifier.joblib
 recall_scores.joblib
 RobertaClassifier.joblib
 svmClassifier.joblib
 test_dataset.csv
 test_loader.joblib
 test_loader.pkl
 tfidf_1000.joblib
 tfidf_100.joblib
 tfidf_10.joblib
 tfidf_1200.joblib
 tfidf_1500.joblib
 tfidf_2000.joblib
 tfidf_2500.joblib
 tfidf_3000.joblib
 tfidf_500.joblib
 tfidf_50.joblib
 train_

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from joblib import dump, load
from collections import Counter
from utils import save_loader

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, precision_score, recall_score, f1_score

In [4]:
def calculate_vocabulary_size(datasets):
    vocab_counter = Counter()
    for dataset in datasets:
        for code in dataset['vba_code']:
            tokens = code.split()  # Assuming whitespace tokenization
            vocab_counter.update(tokens)
    return len(vocab_counter), vocab_counter

In [5]:
class datasetForLstm(Dataset):
    def __init__(self, dataset, vocab_counter, max_seq_length=100):
        self.text = dataset['vba_code']
        self.label = dataset['label']
        self.vocab_counter = vocab_counter
        self.max_seq_length = max_seq_length
        self.word2idx = {word: idx for idx, (word, _) in enumerate(vocab_counter.items(), 1)}

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        tokens = self.text[idx].split()  # Tokenize the VBA code
        indices = [self.word2idx.get(token, 0) for token in tokens]
        if len(indices) < self.max_seq_length:
            indices += [0] * (self.max_seq_length - len(indices))
        else:
            indices = indices[:self.max_seq_length]
        return torch.tensor(indices, dtype=torch.long), torch.tensor(self.label[idx], dtype=torch.long)

In [6]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        h_0 = torch.zeros(num_layers, x.size(0), self.hidden_size).to(x.device)
        c_0 = torch.zeros(num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out

class LstmTrainer:

  def __init__(self, model, train_loader, validation_loader, criterion, optimizer, num_epochs, device):
    self.model = model
    self.train_loader = train_loader
    self.validation_loader = validation_loader
    self.criterion = criterion
    self.optimizer = optimizer
    self.device = device
    self.num_epochs = num_epochs

  def train_one_epoch(self):
    self.model.train()
    total_loss = 0
    for x_batch, y_batch in self.train_loader:
        x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
        optimizer.zero_grad()
        outputs = self.model(x_batch)
        loss = self.criterion(outputs, y_batch)
        loss.backward()
        self.optimizer.step()
        total_loss += loss.item()

    return total_loss/len(train_loader)

  def evaluate(self, data_loader):
      self.model.eval()
      all_preds = []
      all_labels = []

      with torch.no_grad():
          for x_batch, y_batch in data_loader:
              x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
              outputs = self.model(x_batch)
              _, preds = torch.max(outputs, dim=1)
              all_preds.extend(preds.cpu().numpy())
              all_labels.extend(y_batch.cpu().numpy())

      accuracy = accuracy_score(all_labels, all_preds)
      precision = precision_score(all_labels, all_preds)
      recall = recall_score(all_labels, all_preds)
      f1 = f1_score(all_labels, all_preds)
      conf_matrix = confusion_matrix(all_labels, all_preds)

      return accuracy, precision, recall, f1, conf_matrix

  def train(self):
    all_losses = []
    for epoch in range(self.num_epochs):
      train_loss = self.train_one_epoch()
      validation_accuracy, validation_precision, validation_recall, validation_f1, validation_conf_matrix = self.evaluate(self.validation_loader)
      print(f'Epoch {epoch+1}/{self.num_epochs}, Train Loss: {train_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f},  Precision: {validation_precision}, Recall: {validation_recall}, F1-score: {validation_f1}')
      all_losses.append(train_loss)
    return all_losses

  def test(self, test_loader):
      test_accuracy, precision, recall, f1_score, conf_matrix = self.evaluate(test_loader)
      print(f'Test Accuracy: {test_accuracy:.2f}%, Precision: {precision}, Recall: {recall}, F1-score: {f1_score}')
      return test_accuracy, precision, recall, f1_score, conf_matrix


In [7]:
train_set = pd.read_csv('train_dataset.csv', encoding='utf-16le')[['vba_code', 'label']]
val_set = pd.read_csv('validation_dataset.csv', encoding='utf-16le')[['vba_code', 'label']]
test_set = pd.read_csv('test_dataset.csv', encoding='utf-16le')[['vba_code', 'label']]

mapper = {'white': 1, 'mal': 0}
train_set['label'] = train_set['label'].map(mapper)
val_set['label'] = val_set['label'].map(mapper)
test_set['label'] = test_set['label'].map(mapper)

vocab_size, vocab_counter = calculate_vocabulary_size([train_set, val_set, test_set])

train_set = datasetForLstm(train_set, vocab_counter)
val_set = datasetForLstm(val_set, vocab_counter)
test_set = datasetForLstm(test_set, vocab_counter)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
validation_loader = DataLoader(val_set, batch_size=32)
test_loader = DataLoader(test_set, batch_size=32)

In [8]:
hidden_size = 128
output_size = 2
num_layers = 2
dropout = 0.5
num_epochs = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(vocab_size + 1, hidden_size, output_size, num_layers, dropout).to(device)  # Add 1 for padding index
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train and evaluate the model
model_trainer  = LstmTrainer(model, train_loader, validation_loader, criterion, optimizer, num_epochs, device)
all_losses = model_trainer.train()

Epoch 1/10, Train Loss: 0.4502, Validation Accuracy: 0.8950,  Precision: 0.8319873317498021, Recall: 0.9896402335656432, F1-score: 0.9039917412250517
Epoch 2/10, Train Loss: 0.3931, Validation Accuracy: 0.9136,  Precision: 0.9522142121524202, Recall: 0.8707854586551139, F1-score: 0.9096812278630461
Epoch 3/10, Train Loss: 0.4071, Validation Accuracy: 0.9161,  Precision: 0.9322763750244666, Recall: 0.8971557732152948, F1-score: 0.9143789594931848
Epoch 4/10, Train Loss: 0.3898, Validation Accuracy: 0.9224,  Precision: 0.9328185328185328, Recall: 0.9101525711056696, F1-score: 0.9213461721803794
Epoch 5/10, Train Loss: 0.3798, Validation Accuracy: 0.9407,  Precision: 0.9385192127460169, Recall: 0.9431154643058957, F1-score: 0.9408117249154453
Epoch 6/10, Train Loss: 0.3964, Validation Accuracy: 0.9251,  Precision: 0.9506690633113641, Recall: 0.8965906950461481, F1-score: 0.9228383094222566
Epoch 7/10, Train Loss: 0.3893, Validation Accuracy: 0.9346,  Precision: 0.9448515233320478, Recall:

In [9]:
test_accuracy, precision, recall, f1_score, conf_matrix = model_trainer.test(test_loader)

Test Accuracy: 0.98%, Precision: 0.9798303487276154, Recall: 0.9770676691729323, F1-score: 0.9784470588235294


In [10]:
print(conf_matrix)

[[5203  107]
 [ 122 5198]]


In [11]:
save_loader('/content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection/LSTMClassifier.joblib', model)

LSTMModel(
  (embedding): Embedding(304987, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
) saved sucessfuly
