In [1]:
# Mount Google Drive into the Colab filesystem so that the notebook can read
# the files it references by relative path further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch to the project folder on the mounted Drive; the CSV and .joblib files
# loaded by later cells are referenced by bare file name relative to it.
%cd /content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection

/content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection


In [3]:
# Standard library
import random
import re
import string
from collections import Counter

# Third-party
import nltk
import pandas as pd
import torch
from joblib import load
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK data used by nltk.word_tokenize ('punkt') and
# nltk.corpus.stopwords ('stopwords') in the feature-extraction step.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
class VBAPermutationsGenerator:
    """
    Produce word-shuffled, keyword-rewritten text variants of a VBA macro.

    The pipeline is: strip comments / normalise whitespace and case, shuffle the
    whitespace-separated tokens ``num_permutations`` times, apply a fixed set of
    keyword substitutions, and finally drop every "<word> sub" occurrence.
    Token order in the output is random, so the variants are text samples for a
    downstream feature extractor rather than structured VBA.
    """

    # Matches either a double-quoted VBA string literal (group 1; "" is the
    # embedded-quote escape) or an apostrophe comment running to end of line.
    # The literal alternative comes first so that an apostrophe inside a string
    # is not mistaken for the start of a comment.
    _COMMENT_OR_STRING_RE = re.compile(r'("(?:[^"\r\n]|"")*")|\'[^\r\n]*')

    # Phrase -> replacement pairs applied by preserve_malicious_intent().
    _KEYWORD_REPLACEMENTS = {
        'private sub': 'public function',
        'end sub': 'return statement',
        'option explicit': 'constant declaration'
    }

    def __init__(self, num_permutations=10):
        """
        Initialize the VBAPermutationsGenerator with the number of permutations.

        Parameters:
        - num_permutations (int): Number of permutations to generate (default: 10).
        """
        self.num_permutations = num_permutations

    def preprocess_code(self, code):
        """
        Preprocess VBA code by removing comments, extra spaces, and normalizing case.

        Apostrophe comments are removed from the apostrophe to the end of the
        line. Apostrophes inside double-quoted string literals are left alone.
        ``Rem`` comments are not handled.

        Input:
        - code (str): Raw VBA code.

        Output:
        - preprocessed code (str): Lower-cased code with comments removed and
          every run of whitespace (including newlines) collapsed to one space.
        """
        # Keep string literals verbatim (group 1); replace comments with nothing.
        code = self._COMMENT_OR_STRING_RE.sub(lambda m: m.group(1) or '', code)
        code = re.sub(r'\s+', ' ', code)
        code = code.lower()

        return code

    def extract_features(self, code):
        """
        Extract features such as token frequency and code length from VBA code.

        Requires the NLTK 'punkt' and 'stopwords' resources to be available.

        Input:
        - code (str): Preprocessed VBA code.

        Output:
        - word_freq (Counter): Frequency of each purely alphabetic token that is
          not an English stop word.
        - code_length (int): Number of whitespace-separated words in the code
          (counted before any token filtering).
        """
        tokens = nltk.word_tokenize(code)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
        word_freq = Counter(tokens)
        code_length = len(code.split())

        return word_freq, code_length

    def generate_permutations(self, code):
        """
        Generate random permutations of the given VBA code by shuffling its words.

        Uses the global ``random`` module state; seed it beforehand for
        reproducible output.

        Input:
        - code (str): Preprocessed VBA code.

        Output:
        - permutations (list): ``self.num_permutations`` strings, each a random
          reordering of the whitespace-separated words joined by single spaces.
        """
        # Split once; the word list is the same for every permutation.
        words = code.split()
        return [
            ' '.join(random.sample(words, len(words)))
            for _ in range(self.num_permutations)
        ]

    def preserve_malicious_intent(self, permutation, original_code):
        """
        Apply the fixed keyword substitutions to a permutation.

        Each phrase is matched literally with a single space between its two
        words, so on shuffled text a phrase is only replaced when the shuffle
        happened to leave those two words adjacent and in order.

        Input:
        - permutation (str): Permutated code.
        - original_code (str): Original VBA code. Currently unused; kept so
          existing callers keep working.

        Output:
        - modified permutation (str): The permutation with every matched phrase
          replaced.
        """
        for key, value in self._KEYWORD_REPLACEMENTS.items():
            permutation = re.sub(r'\b' + key + r'\b', value, permutation)
        return permutation

    def output_permutations(self, permutations):
        """
        Remove every "<word> sub" occurrence from each permutation.

        The pattern matches a run of word characters, optional whitespace and
        then ``sub`` at a word boundary, so identifiers that merely end in
        "sub" are removed as well. Surrounding whitespace is left in place.

        Input:
        - permutations (list): List of permutations after keyword replacement.

        Output:
        - benign_permutations (list): The permutations with the matches removed.
        """
        return [
            re.sub(r'\b' + r'\w+\s*sub\b', '', permutation)
            for permutation in permutations
        ]

    def generate_benign_permutation(self, malicious_code):
        """
        Run the full pipeline on one macro: preprocess, shuffle, rewrite, filter.

        extract_features() is not called here because its result was never used
        by the pipeline; call it directly when the statistics are needed.

        Input:
        - malicious_code (str): Raw VBA code.

        Output:
        - benign_permutations (list): ``self.num_permutations`` rewritten strings.
        """
        preprocessed_code = self.preprocess_code(malicious_code)
        permutations = self.generate_permutations(preprocessed_code)
        preserved_permutations = [
            self.preserve_malicious_intent(permutation, preprocessed_code)
            for permutation in permutations
        ]
        return self.output_permutations(preserved_permutations)

    def create_benign_permutations(self, malicious_macro, num_permutations=None):
        """
        Entry point for creating permutations from VBA code.

        Parameters:
        - malicious_macro (str): Raw VBA code.
        - num_permutations (int | None): Number of permutations to generate.
          When given, it is stored on the instance and used for this and later
          calls. When omitted (None), the value passed to the constructor (or
          set by a previous call) is used instead of being overwritten.

        Output:
        - benign_permutations (list): List of rewritten permutations.
        """
        if num_permutations is not None:
            self.num_permutations = num_permutations
        return self.generate_benign_permutation(malicious_macro)


In [5]:
# Integer encoding for the string labels in the 'label' column.
# Series.map leaves NaN for any label that is not a key here.
mapper = {'white': 1, 'mal': 0}

# The CSV is read as UTF-16 little-endian text.
train_set = pd.read_csv('train_dataset.csv', encoding='utf-16le')

x_train = train_set['vba_code']
y_train = train_set['label'].map(mapper)

In [6]:
# How many permutations to generate for the sample macro.
NUM_PERMUTATIONS = 5

# NOTE(review): row 2 is assumed to hold a 'mal' sample — confirm against
# train_set['label'] before relying on this index.
mal_code = train_set.iloc[2]['vba_code']

generator = VBAPermutationsGenerator(num_permutations=NUM_PERMUTATIONS)
# Pass the count explicitly so the requested number is what gets generated,
# independent of the method's own default for num_permutations.
benign_permutations = generator.create_benign_permutations(
    mal_code, num_permutations=NUM_PERMUTATIONS
)

print(benign_permutations)



In [None]:
# SECURITY: joblib.load and torch.load unpickle their input, which can execute
# arbitrary code. Only load artefacts that were produced by this project.

# Classifiers persisted with joblib (file names as saved on Drive).
rf_clf = load('randomForestClassifier.joblib')
adaboost_clf = load('AdaBoostClassifier.joblib')
dt_clf = load('DecisionTreeClassifier.joblib')
gb_clf = load('GradientBoostingClassifier.joblib')
knn_clf = load('knnClassifier.joblib')
svm_clf = load('svmClassifier.joblib')
# NOTE(review): 'mlpClasifier' is spelled with one 's'; left as-is on the
# assumption that it matches the file on disk — confirm.
mlp_clf = load('mlpClasifier.joblib')
models = [rf_clf, adaboost_clf, dt_clf, gb_clf, knn_clf, svm_clf, mlp_clf]

# NOTE(review): these two are read with torch.load while the LSTM below is read
# with joblib.load, although all three use a .joblib suffix — confirm that each
# loader matches how the file was saved.
roberta_clf = torch.load('RobertaClassifier.joblib')
cnn_clf = torch.load('CNNClassifier.joblib')
lstm_clf = load('LSTMClassifier.joblib')

xtfidf_1000 = load('x_train_1000.joblib')
# The original call was load() with no argument, which always raises TypeError.
# NOTE(review): file name inferred from the sibling 'x_train_1000.joblib' —
# confirm the actual name of the saved label array.
y_1000 = load('y_train_1000.joblib')
tfidf_1000 = load('tfidf_1000.joblib')

In [None]:
# Vectorise the generated permutations with the object loaded from
# 'tfidf_1000.joblib' (presumably a fitted TfidfVectorizer — confirm), so they
# are in a form the loaded classifiers can consume.
benign_permutations_tfidf = tfidf_1000.transform(benign_permutations)