In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on Drive; the CSV files
# are later read with relative paths, so they resolve against this directory.
%cd /content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection

/content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection


In [None]:
import pandas as pd
import re

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.preprocessing import KBinsDiscretizer

### Loading the Data

In [None]:
# String label -> integer class id.
# NOTE(review): 'white' presumably means benign. With this encoding, sklearn
# metrics left at their default pos_label=1 describe the 'white' class rather
# than 'mal' - confirm that this is the intended orientation.
mapper = {
    'white' : 1,
    'mal' : 0
}


def load_labelled_dataset(path, label_map=mapper, encoding='utf-16le'):
    """
    Read one dataset CSV and encode its 'label' column as integers.

    Input:
    - path (str): Location of the CSV file.
    - label_map (dict): Mapping from the raw label strings to class ids.
    - encoding (str): Text encoding of the CSV file.

    Output:
    - (pandas DataFrame): The loaded frame with 'label' replaced by its class id.

    Raises:
    - ValueError: If the file contains a label that `label_map` does not cover.
      `Series.map` would otherwise turn such rows into NaN without any warning.
    """
    frame = pd.read_csv(path, encoding=encoding)
    encoded = frame['label'].map(label_map)

    unmapped = frame.loc[encoded.isna(), 'label']
    if not unmapped.empty:
        raise ValueError(
            f"{path}: labels not covered by the mapping: {unmapped.unique().tolist()}"
        )

    frame['label'] = encoded
    return frame


train_set = load_labelled_dataset('train_dataset.csv')
val_set = load_labelled_dataset('validation_dataset.csv')
test_set = load_labelled_dataset('test_dataset.csv')

### Feature Extraction methods

In [None]:
# Helper function to extract length of the VBA code as a proxy for size
def extract_code_length(vba_code):
    """Return the number of characters in `vba_code`, used as a size proxy."""
    n_chars = len(vba_code)
    return n_chars

# Helper function to detect presence of auto-execution keywords
def contains_autoexec_words(vba_code):
    """
    Report whether the code mentions any auto-execution entry point.

    Matching is case-insensitive: VBA identifiers are not case-sensitive, so
    'autoopen' triggers automatic execution just like 'AutoOpen'.

    Input:
    - vba_code (str): The VBA source to inspect.

    Output:
    - (bool): True if at least one auto-execution keyword occurs in the code.
    """
    autoexec_words = ['AutoExec', 'AutoOpen', 'DocumentOpen']
    lowered_code = vba_code.lower()  # lower once instead of once per keyword
    return any(word.lower() in lowered_code for word in autoexec_words)

# Helper function to detect API access invocations
def contains_api_access(vba_code):
    """
    Report whether the code references any of the tracked Windows API names.

    Matching is case-insensitive because VBA identifiers are not case-sensitive
    and the same library may be spelled 'KERNEL32' or 'Kernel32'.

    Input:
    - vba_code (str): The VBA source to inspect.

    Output:
    - (bool): True if at least one API keyword occurs in the code.
    """
    api_keywords = ['kernel32', 'user32', 'CreateFile', 'WriteFile']
    lowered_code = vba_code.lower()  # lower once instead of once per keyword
    return any(api.lower() in lowered_code for api in api_keywords)

# Helper function to count the number of macros in the VBA code
def count_macros(vba_code):
    """
    Count the Sub/Function procedures defined in the VBA code.

    Only lines that open a procedure are counted: optional access modifiers
    followed by the `Sub` or `Function` keyword at the start of a line. A plain
    substring count of 'sub' would also count 'End Sub', 'Exit Sub' and words
    such as 'Subject', inflating the result.

    Input:
    - vba_code (str): The VBA source to inspect.

    Output:
    - (int): Number of procedure definitions found.
    """
    definition_pattern = (
        r'^[ \t]*(?:(?:Public|Private|Friend|Static)[ \t]+)*(?:Sub|Function)\b'
    )
    return len(re.findall(definition_pattern, vba_code, re.IGNORECASE | re.MULTILINE))

# Helper function to detect presence of specific words like DocumentClose
def contains_specific_word(vba_code, word):
    """Return True when `word` occurs in `vba_code` (case-sensitive containment)."""
    is_present = word in vba_code
    return is_present

In [None]:
class VBAFeatureExtractor:
    """
    Class for extracting features from VBA code. It computes basic, derived, and discretized features,
    transforming them into binary form for use in machine learning models.

    The discretizer is fitted once (on the first frame passed to `extract_features`) and then reused,
    so later frames are binned with the same edges as the first one.
    """

    # Raw and derived feature columns, in the order they are added to the frame.
    BASIC_FEATURES = ['code_length', 'autoexec_count', 'api_access_count', 'macro_count',
                      'suspicious_word_count', 'avg_macro_length', 'api_to_macro_ratio']
    # Subset of BASIC_FEATURES that is additionally binned into '<name>_discretized' columns.
    CONTINUOUS_FEATURES = ['code_length', 'macro_count', 'avg_macro_length', 'api_to_macro_ratio']
    # A procedure definition: optional modifiers, then Sub/Function, at the start of a line.
    # Anchoring to the line start keeps 'End Sub', 'Exit Function' and 'Declare Function' out.
    _MACRO_DEF_RE = re.compile(
        r'^[ \t]*(?:(?:Public|Private|Friend|Static)[ \t]+)*(?:Sub|Function)\b',
        re.IGNORECASE | re.MULTILINE,
    )

    def __init__(self):
        """
        Initialize the VBAFeatureExtractor with predefined keyword sets and a discretizer for continuous features.
        """
        # Keywords related to automatic execution, API access, and suspicious behavior in VBA code
        self.autoexec_words = ['AutoExec', 'AutoOpen', 'DocumentOpen', 'AutoClose', 'DocumentClose', 'AutoExit', 'AutoNew', 'AutoSave']
        self.api_keywords = ['kernel32', 'user32', 'CreateFile', 'WriteFile', 'ReadFile', 'CloseHandle', 'ShellExecute', 'WinExec', 'CreateProcess', 'VirtualAlloc', 'RtlMoveMemory']
        self.suspicious_words = ['Shell', 'Environ', 'Chr', 'Base64', 'Hex', 'Xor', 'Encryption', 'DllCall']
        # Discretizer for converting continuous features into categorical ones
        self.discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
        # Tracks whether `self.discretizer` has already learned its bin edges.
        self._discretizer_fitted = False

    @staticmethod
    def _count_keywords(vba_code, keywords):
        """Count how many of `keywords` occur in `vba_code`, ignoring case (substring match)."""
        lowered_code = vba_code.lower()  # lower once instead of once per keyword
        return sum(keyword.lower() in lowered_code for keyword in keywords)

    def extract_code_length(self, vba_code):
        """
        Extract the length of the VBA code.

        Input:
        - vba_code (str): The VBA code from which to calculate the length.

        Output:
        - (int): The length of the VBA code.
        """
        return len(vba_code)

    def contains_autoexec_words(self, vba_code):
        """
        Check for the presence of auto-execution words in the VBA code.

        Input:
        - vba_code (str): The VBA code to check.

        Output:
        - (int): The count of autoexec words present in the code.
        """
        return self._count_keywords(vba_code, self.autoexec_words)

    def contains_api_access(self, vba_code):
        """
        Check for the presence of API access keywords in the VBA code.

        Input:
        - vba_code (str): The VBA code to check.

        Output:
        - (int): The count of API keywords present in the code.
        """
        return self._count_keywords(vba_code, self.api_keywords)

    def count_macros(self, vba_code):
        """
        Count the number of macros (Sub or Function definitions) in the VBA code.

        Only lines that open a procedure are counted; closing or exiting statements
        ('End Sub', 'Exit Function') and API declarations ('Declare Function') are not.

        Input:
        - vba_code (str): The VBA code to check.

        Output:
        - (int): The count of macros in the code.
        """
        return len(self._MACRO_DEF_RE.findall(vba_code))

    def count_suspicious_words(self, vba_code):
        """
        Count the number of suspicious words in the VBA code.

        Input:
        - vba_code (str): The VBA code to check.

        Output:
        - (int): The count of suspicious words in the code.
        """
        return self._count_keywords(vba_code, self.suspicious_words)

    def extract_features(self, data, fit=None, binarize=True):
        """
        Extract features from the VBA code and generate derived features.

        The feature columns are written into `data` itself (the frame is modified in place)
        and the same frame is returned. Columns that are not features, such as a label
        column, are left untouched.

        Input:
        - data (pandas DataFrame): DataFrame containing a 'vba_code' column with VBA code strings.
        - fit (bool or None): Whether to (re)fit the discretizer on `data`. With the default
          None it is fitted on the first call only and reused afterwards, so a
          train -> validation -> test call sequence bins all splits with the training edges.
        - binarize (bool): If True (default) every feature column is reduced to 0/1
          (1 when the value is greater than zero). If False the raw values are kept.

        Output:
        - (pandas DataFrame): DataFrame with extracted and transformed features.
        """
        # Extract basic features
        data['code_length'] = data['vba_code'].apply(self.extract_code_length)
        data['autoexec_count'] = data['vba_code'].apply(self.contains_autoexec_words)
        data['api_access_count'] = data['vba_code'].apply(self.contains_api_access)
        data['macro_count'] = data['vba_code'].apply(self.count_macros)
        data['suspicious_word_count'] = data['vba_code'].apply(self.count_suspicious_words)

        # Compute derived features
        data['avg_macro_length'] = data['code_length'] / (data['macro_count'] + 1)  # Avoid division by zero
        data['api_to_macro_ratio'] = data['api_access_count'] / (data['macro_count'] + 1)

        # Discretize continuous features, learning the bin edges only when requested
        # (or, by default, only the first time this extractor sees data).
        continuous = data[self.CONTINUOUS_FEATURES]
        already_fitted = getattr(self, '_discretizer_fitted', False)
        should_fit = (not already_fitted) if fit is None else fit
        if should_fit:
            discretized_features = self.discretizer.fit_transform(continuous)
            self._discretizer_fitted = True
        else:
            discretized_features = self.discretizer.transform(continuous)

        for i, feature in enumerate(self.CONTINUOUS_FEATURES):
            data[f'{feature}_discretized'] = discretized_features[:, i]

        # Convert features to binary form. Restricted to the feature columns so that
        # unrelated columns (e.g. the label) are never rewritten.
        if binarize:
            for feature in self.get_feature_names():
                data[feature] = (data[feature] > 0).astype(int)

        return data

    def get_feature_names(self):
        """
        Retrieve the names of all extracted features, both basic and discretized.

        Output:
        - (list): List of feature names.
        """
        discretized_features = [f'{feature}_discretized' for feature in self.CONTINUOUS_FEATURES]
        return list(self.BASIC_FEATURES) + discretized_features


### Preprocessing

In [None]:
# A single extractor instance handles all three splits.
extractor = VBAFeatureExtractor()

# Featurize in train -> validation -> test order. extract_features adds the
# feature columns to the frame it receives and returns that same frame.
full_train_set, full_val_set, full_test_set = [
    extractor.extract_features(split)
    for split in (train_set, val_set, test_set)
]



In [None]:
# Preview the first rows of the featurized training frame.
full_train_set.head()

Unnamed: 0,label,vba_code,code_length,autoexec_count,api_access_count,macro_count,suspicious_word_count,avg_macro_length,api_to_macro_ratio,code_length_discretized,macro_count_discretized,avg_macro_length_discretized,api_to_macro_ratio_discretized
0,1,Private Sub Workbook_BeforeClose(Cancel As Boo...,1,0,0,1,0,1,0,1,1,0,0
1,1,Option Explicit\nOption Explicit\nOption Expli...,1,0,0,1,0,1,0,1,1,1,0
2,0,'Option Explicit\nPrivate Sub Workbook_BeforeC...,1,0,0,1,1,1,0,1,1,1,0
3,0,'Option Explicit\nPrivate Sub Workbook_BeforeC...,1,0,0,1,1,1,0,1,1,1,0
4,0,'Option Explicit\nPrivate Sub Workbook_BeforeC...,1,0,0,1,1,1,0,1,1,1,0


### Training and Validation

In [None]:
class VBAClassifier:
    """
    Thin wrapper around a scikit-learn style estimator that trains it and reports
    the same set of metrics on a validation or test split.
    """

    def __init__(self, model):
        """
        Input:
        - model: Estimator exposing `fit`, `predict` and either `predict_proba`
          or `decision_function`.
        """
        self.model = model

    def train(self, X_train, y_train):
        """Train the model"""
        self.model.fit(X_train, y_train)

    def _positive_class_scores(self, X):
        """Return a ranking score for class 1: its probability when available, otherwise the decision value."""
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(X)[:, 1]  # Probability of positive class
        # Estimators without probability output (e.g. an SVC built without
        # probability=True) still provide scores that roc_auc_score can rank.
        return self.model.decision_function(X)

    def _evaluate(self, X, y, split_name, show_report):
        """
        Score the model on one split and print a short summary.

        Input:
        - X, y: Features and true labels of the split.
        - split_name (str): Prefix used in the printed lines ("Validation" or "Test").
        - show_report (bool): Whether to also print the classification report.

        Output:
        - (tuple): accuracy, precision, recall, f1, auc, conf_matrix, report
        """
        y_pred = self.model.predict(X)
        y_score = self._positive_class_scores(X)

        accuracy = accuracy_score(y, y_pred)
        conf_matrix = confusion_matrix(y, y_pred)
        precision = precision_score(y, y_pred)
        recall = recall_score(y, y_pred)
        f1 = f1_score(y, y_pred)
        auc = roc_auc_score(y, y_score)
        report = classification_report(y, y_pred)

        print(f"{split_name} Accuracy: {accuracy:.4f}")
        print(f"{split_name} AUC: {auc:.4f}")
        if show_report:
            print(f"Classification Report:\n{report}")
        return accuracy, precision, recall, f1, auc, conf_matrix, report

    def validate(self, X_val, y_val):
        """Validate the model on the validation dataset (prints accuracy and AUC)."""
        return self._evaluate(X_val, y_val, "Validation", show_report=False)

    def test(self, X_test, y_test):
        """Test the model on the test dataset (prints accuracy, AUC and the classification report)."""
        return self._evaluate(X_test, y_test, "Test", show_report=True)

In [None]:
# Split each featurized frame into the model inputs and the target column.
X_train = full_train_set.drop(['vba_code','label'], axis=1)
y_train = full_train_set['label']

X_val = full_val_set.drop(['vba_code','label'], axis=1)
y_val = full_val_set['label']

X_test = full_test_set.drop(['vba_code','label'], axis=1)
y_test = full_test_set['label']


# Fixed seed so that the stochastic estimators give the same numbers on every run.
RANDOM_STATE = 42

models = {
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Neural Network": MLPClassifier(random_state=RANDOM_STATE)
}

# Metrics of every model on both splits, keyed by model name. Each entry holds
# (accuracy, precision, recall, f1, auc, conf_matrix, report) per split.
results = {}

# Train, validate, and test each model with its default hyperparameters
# (no hyperparameter search is performed here).
for model_name, model in models.items():
    print(f"\n--- {model_name} ---")
    classifier = VBAClassifier(model)

    # Fit the model on the training split
    classifier.train(X_train, y_train)

    # Validate the model on the validation dataset
    validation_metrics = classifier.validate(X_val, y_val)

    # Test the model on the test dataset
    accuracy, precision, recall, f1, auc, conf_matrix, report = classifier.test(X_test, y_test)

    # Keep both sets of metrics instead of letting the test values overwrite
    # the validation ones.
    results[model_name] = {
        'validation': validation_metrics,
        'test': (accuracy, precision, recall, f1, auc, conf_matrix, report),
    }


--- SVM ---
Validation Accuracy: 0.8536
Validation AUC: 0.8549
Test Accuracy: 0.8637
Test AUC: 0.8627
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      5310
           1       0.85      0.89      0.87      5320

    accuracy                           0.86     10630
   macro avg       0.86      0.86      0.86     10630
weighted avg       0.86      0.86      0.86     10630


--- Decision Tree ---
Validation Accuracy: 0.8538
Validation AUC: 0.8908
Test Accuracy: 0.8643
Test AUC: 0.8984
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      5310
           1       0.85      0.89      0.87      5320

    accuracy                           0.86     10630
   macro avg       0.87      0.86      0.86     10630
weighted avg       0.87      0.86      0.86     10630


--- Random Forest ---
Validation Accuracy: 0.8536
Validation AUC: 0.8907
Test Accur