In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder the working directory; later cells open the CSV files by relative name.
%cd /content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection

/content/drive/MyDrive/Colab Notebooks/Malicious Macro Detection


In [None]:
# List the working directory to check which artefacts and datasets are present.
%ls

AdaBoostClassifier.joblib          RobertaClassifier.joblib  vba_pipeline.py
classifiers_recall_scores.joblib   svmClassifier.joblib      word2vec_model.joblib
CNNClassifier.joblib               test_dataset.csv          word2vec_model.pkl
DecisionTreeClassifier.joblib      test_loader.joblib        x_test_1000.joblib
EDA.ipynb                          test_loader.pkl           x_test_100.joblib
features_k_1000.joblib             tfidf_1000.joblib         x_test_10.joblib
features_k_100.joblib              tfidf_100.joblib          x_test_1200.joblib
features_k_10.joblib               tfidf_10.joblib           x_test_1500.joblib
features_k_1200.joblib             tfidf_1200.joblib         x_test_2000.joblib
features_k_1500.joblib             tfidf_1500.joblib         x_test_2500.joblib
features_k_2000.joblib             tfidf_2000.joblib         x_test_3000.joblib
features_k_2500.joblib             tfidf_2500.joblib         x_test_500.joblib
features_k_3000.joblib             tfidf_300

In [None]:
from joblib import load
import re
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
# Load the three pre-split datasets; every file is read as UTF-16LE text.
train_set, val_set, test_set = (
    pd.read_csv(csv_name, encoding='utf-16le')
    for csv_name in ('train_dataset.csv', 'validation_dataset.csv', 'test_dataset.csv')
)

In [None]:
# Encode the string class labels as integers: 'white' -> 1, 'mal' -> 0.
mapper = {
    'white': 1,
    'mal': 0,
}

# Series.map() turns every value that is not a key of `mapper` into NaN, so
# re-running this cell on frames whose labels are already 0/1 would wipe them.
# Skipping columns that are already numeric makes the cell safe to re-run.
for _split_df in (train_set, val_set, test_set):
    if not pd.api.types.is_numeric_dtype(_split_df['label']):
        _split_df['label'] = _split_df['label'].map(mapper)

In [None]:
class MacroFeatureExtractor:
    """Compute 15 numeric features (F1..F15) for each VBA macro in a dataset.

    Features, in output order:
        F1   longest line length
        F2   max per-line count of '&' or '+'
        F3   max per-line count of '+', '-', '*', '/'
        F4   max per-line count of '('
        F5   max per-line count of double-quoted strings
        F6   max per-chunk count of '&' or '+'
        F7   max per-chunk count of '+', '-', '*', '/'
        F8   max per-chunk count of '('
        F9   max per-line count of '='
        F10  max per-chunk count of double-quoted strings
        F11  max per-chunk count of '='
        F12  number of chunks
        F13  number of lines
        F14  number of lines containing 'CallByName'
        F15  sum over lines of the suspicious keywords present on that line

    A "chunk" is a non-blank piece of the macro text obtained by splitting on
    the substrings 'Sub' / 'Function' (a heuristic stand-in for procedures).
    """

    FEATURE_NAMES = [f'F{i}' for i in range(1, 16)]

    def __init__(self, data, suspicious_keywords):
        """
        data: pandas DataFrame with a 'vba_code' column (macro source text)
              and a 'label' column; both are read by process_dataset().
        suspicious_keywords: iterable of keyword strings used for F15.
        """
        self.data = data
        self.suspicious_keywords = suspicious_keywords

    def count_concatenation(self, line):
        """Number of '&' and '+' characters in `line`."""
        return len(re.findall(r'(&|\+)', line))

    def count_arithmetic(self, line):
        """Number of '+', '-', '*' and '/' characters in `line`."""
        return len(re.findall(r'(\+|-|\*|/)', line))

    def count_parentheses(self, line):
        """Number of opening parentheses in `line`."""
        return len(re.findall(r'\(', line))

    def count_assignment(self, line):
        """Number of '=' characters in `line` (comparisons are counted too)."""
        return line.count('=')

    def count_strings(self, line):
        """Number of double-quoted substrings in `line`."""
        return len(re.findall(r'\"[^\"]*\"', line))

    def count_suspicious_keywords(self, line):
        """Number of distinct suspicious keywords present in `line`.

        Each keyword is matched case-insensitively between word boundaries and
        contributes at most 1, however many times it occurs.
        """
        return sum(
            1
            for keyword in self.suspicious_keywords
            if re.search(r'\b' + re.escape(keyword) + r'\b', line, re.IGNORECASE)
        )

    def length(self, line):
        """Number of characters in `line`."""
        return len(line)

    def extract_features_from_macro(self, macro):
        """
        Extract features for a single macro.

        macro: str holding the macro source.
        Returns a list of 15 ints ordered F1..F15. An empty or whitespace-only
        macro yields zeros instead of raising.
        Raises AttributeError/TypeError when `macro` is not a string (e.g. NaN).
        """
        lines = macro.splitlines()

        # Non-capturing split: with a capturing group re.split() also returns
        # the delimiters, which were then counted as procedures in F12.
        chunks = [piece for piece in re.split(r'Sub|Function', macro) if piece.strip()]

        def peak(counter, pieces):
            # default=0 keeps max() from raising on a macro without lines/chunks.
            return max((counter(piece) for piece in pieces), default=0)

        F1 = peak(self.length, lines)
        F2 = peak(self.count_concatenation, lines)
        F3 = peak(self.count_arithmetic, lines)
        F4 = peak(self.count_parentheses, lines)
        F5 = peak(self.count_strings, lines)
        F9 = peak(self.count_assignment, lines)

        F6 = peak(self.count_concatenation, chunks)
        F7 = peak(self.count_arithmetic, chunks)
        F8 = peak(self.count_parentheses, chunks)
        F10 = peak(self.count_strings, chunks)
        F11 = peak(self.count_assignment, chunks)

        F12 = len(chunks)
        F13 = len(lines)

        F14 = sum(1 for line in lines if 'CallByName' in line)
        F15 = sum(self.count_suspicious_keywords(line) for line in lines)

        return [F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15]

    def process_dataset(self):
        """
        Process all macros in the dataset and return a pandas DataFrame.

        Returns a frame with columns F1..F15 plus 'label', one row per macro
        that was processed successfully. Rows whose extraction raises are
        reported on stdout and left out; the surviving rows keep their own
        index value and their own label.
        """
        rows = []
        kept_positions = []
        for position, code in enumerate(self.data['vba_code']):
            try:
                rows.append(self.extract_features_from_macro(code))
            except Exception as e:
                print(f"Error processing macro {position}: {str(e)}")
                continue
            kept_positions.append(position)
            if position % 1000 == 0:  # Print progress every 1000 macros
                print(f"Processed {position+1} macros")

        # Labels are taken positionally for exactly the rows that were kept.
        # Index-aligned assignment against a fresh 0..k-1 index shifted every
        # label after the first skipped row.
        feature_df = pd.DataFrame(
            rows,
            columns=self.FEATURE_NAMES,
            index=self.data.index[kept_positions],
        )
        feature_df['label'] = self.data['label'].iloc[kept_positions].to_numpy()

        return feature_df

In [None]:
class MacroClassifier:
    """Holds the extracted feature tables for the three splits and the models to fit on them."""

    def __init__(self, train_set, val_set, test_set, suspicious_keywords):
        """
        Initialize with pre-split datasets.

        Each split is wrapped in a MacroFeatureExtractor and processed right
        away, so constructing this object runs feature extraction on all three
        datasets.
        """
        self.train_extractor = MacroFeatureExtractor(train_set, suspicious_keywords)
        self.val_extractor = MacroFeatureExtractor(val_set, suspicious_keywords)
        self.test_extractor = MacroFeatureExtractor(test_set, suspicious_keywords)

        self.model_dict = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'MLP': MLPClassifier(random_state=42, max_iter=500),
            'SVM': SVC(random_state=42),
            'KNN': KNeighborsClassifier()
        }

        # Process datasets for feature extraction
        self.train_data = self.train_extractor.process_dataset()
        self.val_data = self.val_extractor.process_dataset()
        self.test_data = self.test_extractor.process_dataset()

    def get_data(self, dataset, feature_selection):
        """
        Split a processed dataset into features X and target y.

        dataset: DataFrame with a 'label' column plus feature columns.
        feature_selection:
            'F1–F14' -> every feature column except 'F15'
            'F15'    -> only the 'F15' column
            anything else (e.g. 'F1–F15') -> all feature columns
        The range may be spelled with an ASCII hyphen, en dash or em dash;
        previously only the en dash was recognised and 'F1-F14' silently fell
        through to "all features".

        Returns (X, y).
        """
        X = dataset.drop('label', axis=1)
        y = dataset['label']

        selection = feature_selection
        if isinstance(selection, str):
            # Fold dash variants onto the en dash used by the canonical keys.
            selection = selection.strip().replace('-', '–').replace('—', '–')

        if selection == 'F1–F14':
            X = X.drop(columns=['F15'])  # Exclude F15
        elif selection == 'F15':
            X = X[['F15']]  # Use only F15

        return X, y

    def train_models(self, X_train, y_train):
        """
        Train all models in the model_dict using the training data.

        Returns a dict name -> fitted model. The values are the very same
        objects stored in self.model_dict (fit in place), so a later call
        refits them and earlier return values change with them.
        """
        trained_models = {}
        for name, model in self.model_dict.items():
            model.fit(X_train, y_train)
            trained_models[name] = model
        return trained_models

In [None]:
class ModelEvaluator:
    """Fits the classifier's models and reports validation metrics per feature subset."""

    def __init__(self, classifier):
        """
        classifier: object providing train_models(), get_data(), train_data
                    and val_data (a MacroClassifier in this notebook).
        """
        self.classifier = classifier

    def evaluate_model(self, X_train, X_val, y_train, y_val):
        """
        Fit every model on (X_train, y_train) and score it on (X_val, y_val).

        Returns a dict: model name -> {'FAR', 'Precision', 'Recall',
        'Accuracy', 'F1-Score'}.
        """
        results = {}
        for name, model in self.classifier.train_models(X_train, y_train).items():
            y_pred = model.predict(X_val)

            results[name] = {
                'FAR': self.calculate_far(y_val, y_pred),
                'Precision': precision_score(y_val, y_pred),
                'Recall': recall_score(y_val, y_pred),
                'Accuracy': accuracy_score(y_val, y_pred),
                'F1-Score': f1_score(y_val, y_pred)
            }
        return results

    def calculate_far(self, y_test, y_pred):
        """
        False Alarm Rate = FP / (FP + TN), with label 1 as the positive class.

        Returns 0 when there are no true-negative-class samples.

        NOTE(review): this notebook encodes 'white' as 1 and 'mal' as 0, so FP
        here counts malicious macros predicted as benign. Confirm that this is
        the intended meaning of "false alarm".
        """
        # Pinning labels=[0, 1] guarantees a 2x2 matrix; without it a batch
        # that contains a single class yields a 1x1 matrix and the 4-way
        # unpacking below raises ValueError.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, _fn, _tp = cm.ravel()
        negatives = fp + tn
        return fp / negatives if negatives > 0 else 0

    def evaluate_with_feature_selection(self):
        """
        Evaluate all models on three feature subsets: F1–F14, F15 and F1–F15.

        For each subset the features are standardized (scaler fit on the
        training split only), the models are trained and scored on the
        validation split, and the metrics are printed.

        Returns a dict: subset name -> result of evaluate_model().
        """
        feature_selections = ['F1–F14', 'F15', 'F1–F15']
        results = {}

        for selection in feature_selections:
            X_train, y_train = self.classifier.get_data(self.classifier.train_data, selection)
            X_val, y_val = self.classifier.get_data(self.classifier.val_data, selection)

            # Fit the scaler on training data only and reuse it for validation.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)

            print(f"\nFeature Selection: {selection}")
            selection_results = self.evaluate_model(X_train_scaled, X_val_scaled, y_train, y_val)
            results[selection] = selection_results
            self.print_results(selection, selection_results)

        return results

    def print_results(self, feature_selection, results):
        """Print the metrics of every model for one feature subset, three decimals each."""
        print(f"Results for feature selection: {feature_selection}")
        for model, metrics in results.items():
            print(f"Model: {model}")
            print(f"  FAR: {metrics['FAR']:.3f}")
            print(f"  Precision: {metrics['Precision']:.3f}")
            print(f"  Recall: {metrics['Recall']:.3f}")
            print(f"  Accuracy: {metrics['Accuracy']:.3f}")
            print(f"  F1-Score: {metrics['F1-Score']:.3f}")
            print("-" * 30)


In [None]:
# Keywords counted for feature F15. MacroFeatureExtractor matches each entry
# case-insensitively between word boundaries, so order does not affect the count.
suspicious_keywords = [
    "Auto_Open", "AutoOpen", "Document_Open", "Workbook_Open",
    "Document_Close", "CreateObject", "GetObject", "Wscript.Shell",
    "Shell.Application", "Shell", "Run", "Exec",
    "Create", "ShellExecute", "CreateProcessA", "CreateThread",
    "CreateUserThread", "VirtualAlloc", "VirtualAllocEx", "RtlMoveMemory",
    "WriteProcessMemory", "VirtualProtect", "SetContextThread", "QueueApcThread",
    "WriteVirtualMemory", "Print", "FileCopy", "Open",
    "Write", "Output", "SaveToFile", "CreateTextFile",
    "Kill", "Binary", "cmd.exe", "powershell.exe",
    "vbhide", "StartupPath", "Environ", "Windows",
    "ShowWindow", "dde", "Lib", "ExecuteExcel4Macro",
    "System", "Virtual",
]

In [None]:
# Build the classifier; its constructor extracts features for all three splits.
classifier = MacroClassifier(
    train_set=train_set,
    val_set=val_set,
    test_set=test_set,
    suspicious_keywords=suspicious_keywords,
)

# Wrap it in an evaluator and score every model on each feature subset.
evaluator = ModelEvaluator(classifier)
results = evaluator.evaluate_with_feature_selection()

       label                                           vba_code
0          1  Private Sub Workbook_BeforeClose(Cancel As Boo...
1          1  Option Explicit\nOption Explicit\nOption Expli...
2          0  'Option Explicit\nPrivate Sub Workbook_BeforeC...
3          0  'Option Explicit\nPrivate Sub Workbook_BeforeC...
4          0  'Option Explicit\nPrivate Sub Workbook_BeforeC...
...      ...                                                ...
31883      0  'Option Explicit\nPrivate Sub Workbook_BeforeC...
31884      1  Option Explicit\n\nPrivate Sub Workbook_Open()...
31885      1  Option Explicit\n\nPrivate Sub CFixPicture_Act...
31886      0  'Option Explicit\nPrivate Sub Workbook_BeforeC...
31887      0  'Option Explicit\nPrivate Sub Workbook_BeforeC...

[31888 rows x 2 columns]
Processed 1 macros
Processed 1001 macros
Processed 2001 macros
Processed 3001 macros
Processed 4001 macros
Processed 5001 macros
Processed 6001 macros
Processed 7001 macros
Processed 8001 macros
Processed 9



Results for feature selection: F1–F14
Model: RandomForest
  FAR: 0.014
  Precision: 0.986
  Recall: 0.998
  Accuracy: 0.992
  F1-Score: 0.992
------------------------------
Model: MLP
  FAR: 0.052
  Precision: 0.950
  Recall: 0.974
  Accuracy: 0.961
  F1-Score: 0.962
------------------------------
Model: SVM
  FAR: 0.129
  Precision: 0.881
  Recall: 0.957
  Accuracy: 0.914
  F1-Score: 0.918
------------------------------
Model: KNN
  FAR: 0.039
  Precision: 0.962
  Recall: 0.976
  Accuracy: 0.968
  F1-Score: 0.969
------------------------------

Feature Selection: F15
Results for feature selection: F15
Model: RandomForest
  FAR: 0.152
  Precision: 0.842
  Recall: 0.812
  Accuracy: 0.830
  F1-Score: 0.827
------------------------------
Model: MLP
  FAR: 0.133
  Precision: 0.855
  Recall: 0.792
  Accuracy: 0.829
  F1-Score: 0.822
------------------------------
Model: SVM
  FAR: 0.152
  Precision: 0.841
  Recall: 0.805
  Accuracy: 0.827
  F1-Score: 0.823
------------------------------
Mod

In [None]:
# Display the nested metrics dict returned by evaluate_with_feature_selection():
# feature subset -> model name -> metric name -> value.
results

{'F1–F14': {'RandomForest': {'FAR': 0.014285714285714285,
   'Precision': 0.9858578340156308,
   'Recall': 0.9979280467131286,
   'Accuracy': 0.9918148461755575,
   'F1-Score': 0.9918562201628756},
  'MLP': {'FAR': 0.05169172932330827,
   'Precision': 0.9495134936662383,
   'Recall': 0.974194763608966,
   'Accuracy': 0.9612381221187317,
   'F1-Score': 0.9616957976943102},
  'SVM': {'FAR': 0.12857142857142856,
   'Precision': 0.881394139067106,
   'Recall': 0.9574307779242796,
   'Accuracy': 0.9143851726408881,
   'F1-Score': 0.9178403755868545},
  'KNN': {'FAR': 0.0387218045112782,
   'Precision': 0.9617455896007429,
   'Recall': 0.975513279336975,
   'Accuracy': 0.9683883714366356,
   'F1-Score': 0.9685805124368806}},
 'F15': {'RandomForest': {'FAR': 0.15150375939849625,
   'Precision': 0.8424858315419191,
   'Recall': 0.8120173290638538,
   'Accuracy': 0.8302756609276508,
   'F1-Score': 0.8269710339535776},
  'MLP': {'FAR': 0.13345864661654136,
   'Precision': 0.8554854467738653,
   