From bb2fdebe99fa3fd540a46fdee88b2a97cb828d4f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Oct 2025 21:20:15 +0000
Subject: [PATCH 1/3] Initial plan


From 89f921ddbb4df94c57a76bc74ce01f09219cc521 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Oct 2025 21:28:32 +0000
Subject: [PATCH 2/3] Add security hardening to build_database function

- Added comprehensive docstring with security expectations
- Validate language parameter against whitelist of supported CodeQL languages
- Validate extractor_options type and content (must be iterable of strings)
- Explicitly set shell=False in all subprocess calls
- Added detailed security comments referencing OWASP and Python docs
- All changes formatted with Black

Co-authored-by: dmsmi <6402178+dmsmi@users.noreply.github.com>
---
 .../models-as-data/bulk_generate_mad.py       | 91 +++++++++++++++++--
 1 file changed, 84 insertions(+), 7 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index e2de5038206e..cb630e6950e5 100755
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -36,8 +36,13 @@ def missing_module(module_name: str) -> None:
 
 import generate_mad as mad
 
+# SECURITY: subprocess.check_output with shell=False (explicit) and list arguments.
+# - shell=False: Prevents shell interpretation
+# - List arguments: Passed directly to git, preventing command injection
+# - No user-controlled inputs; only hardcoded git command
+# Reference: Python subprocess documentation on security considerations
 gitroot = (
-    subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
+    subprocess.check_output(["git", "rev-parse", "--show-toplevel"], shell=False)
     .decode("utf-8")
     .strip()
 )
@@ -95,6 +100,12 @@ def clone_project(project: Project) -> str:
         else:
             print(f"Cloning {name} from {repo_url}")
 
+        # SECURITY: subprocess.check_call with shell=False (explicit) and list arguments.
+        # - shell=False: Prevents shell interpretation of special characters
+        # - List arguments: Each argument is passed directly to git, preventing injection
+        # - repo_url and git_tag come from config files which should be trusted sources
+        # Reference: Python subprocess documentation on security considerations
+        # (https://docs.python.org/3/library/subprocess.html#security-considerations)
         subprocess.check_call(
             [
                 "git",
@@ -107,7 +118,8 @@ def clone_project(project: Project) -> str:
                 ),  # Add branch if tag is provided
                 repo_url,
                 target_dir,
-            ]
+            ],
+            shell=False,  # Explicit: Prevents shell injection attacks
         )
         print(f"Completed cloning {name}")
     else:
@@ -179,15 +191,68 @@ def build_database(
     """
     Build a CodeQL database for a project.
 
+    Security Expectations:
+    - This function executes subprocess commands with user-controlled inputs.
+    - Language parameter is validated against a whitelist of known CodeQL languages.
+    - Extractor options are validated to ensure they are a list or iterable.
+    - All subprocess calls use shell=False (explicit) to prevent shell injection.
+    - All command arguments are passed as a list to prevent command injection.
+    - Input sources: language and extractor_options come from config files which
+      should be trusted, but validation is still performed as defense-in-depth.
+
     Args:
         language: The language for which to build the database (e.g., "rust").
-        extractor_options: Additional options for the extractor.
+                  Must be one of the supported CodeQL languages.
+        extractor_options: Additional options for the extractor. Must be an iterable
+                          (list, tuple, etc.) of strings.
         project: A dictionary containing project information with 'name' and 'git-repo' keys.
         project_dir: Path to the CodeQL database.
 
     Returns:
-        The path to the created database directory.
+        The path to the created database directory, or None if the build fails.
+
+    Raises:
+        ValueError: If language is not in the supported list or extractor_options
+                   is not a valid iterable.
     """
+    # SECURITY: Validate language parameter against known CodeQL languages.
+    # This prevents potential command injection through arbitrary language values.
+    # Reference: OWASP Command Injection Prevention Cheat Sheet
+    # (https://cheatsheetseries.owasp.org/cheatsheets/OS_Command_Injection_Defense_Cheat_Sheet.html)
+    SUPPORTED_LANGUAGES = {
+        "cpp",
+        "csharp",
+        "go",
+        "java",
+        "javascript",
+        "python",
+        "ruby",
+        "rust",
+        "swift",
+    }
+    if language not in SUPPORTED_LANGUAGES:
+        raise ValueError(
+            f"Unsupported language '{language}'. Must be one of: {', '.join(sorted(SUPPORTED_LANGUAGES))}"
+        )
+
+    # SECURITY: Validate extractor_options is an iterable and contains safe values.
+    # This prevents arbitrary data from being passed to subprocess commands.
+    # We ensure it's a list/tuple/iterable and convert to list for safe iteration.
+    try:
+        extractor_options_list = list(extractor_options)
+    except (TypeError, ValueError) as e:
+        raise ValueError(
+            f"extractor_options must be an iterable (list, tuple, etc.), got {type(extractor_options).__name__}: {e}"
+        )
+
+    # SECURITY: Ensure all extractor options are strings to prevent injection
+    # of complex objects or other unsafe data types.
+    for idx, opt in enumerate(extractor_options_list):
+        if not isinstance(opt, str):
+            raise ValueError(
+                f"All extractor options must be strings. Option at index {idx} is {type(opt).__name__}: {opt}"
+            )
+
     name = project["name"]
 
     # Create database directory path
@@ -196,8 +261,19 @@ def build_database(
     # Only build the database if it doesn't already exist
     if not database_dir.exists():
         print(f"Building CodeQL database for {name}...")
-        extractor_options = [option for x in extractor_options for option in ("-O", x)]
+        # Convert extractor options to CodeQL format: each option becomes "-O", "value"
+        formatted_options = [
+            option for x in extractor_options_list for option in ("-O", x)
+        ]
         try:
+            # SECURITY: subprocess.check_call with shell=False (explicit) and list arguments.
+            # - shell=False: Prevents shell interpretation of special characters (e.g., `;`, `|`, `&`)
+            # - List arguments: Each argument is passed directly to the executable, preventing
+            #   argument injection attacks.
+            # - All inputs are validated above to ensure they contain safe values.
+            # Reference: Python subprocess documentation on security considerations
+            # (https://docs.python.org/3/library/subprocess.html#security-considerations)
+            # Reference: OWASP Command Injection Prevention - Use of parameterized APIs
             subprocess.check_call(
                 [
                     "codeql",
@@ -206,10 +282,11 @@ def build_database(
                     f"--language={language}",
                     "--source-root=" + project_dir,
                     "--overwrite",
-                    *extractor_options,
+                    *formatted_options,
                     "--",
                     database_dir,
-                ]
+                ],
+                shell=False,  # Explicit: Prevents shell injection attacks
             )
             print(f"Successfully created database at {database_dir}")
         except subprocess.CalledProcessError as e:

From 83c4d7a618ad25647d88d6d1f5db528cb0ae0b49 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Oct 2025 21:33:44 +0000
Subject: [PATCH 3/3] Address code review feedback

- Remove exception details from error messages to avoid exposing internal details
- Remove option values from error messages to prevent potential log exposure
- Use descriptive variable name 'extractor_option' instead of 'x' in comprehension

Co-authored-by: dmsmi <6402178+dmsmi@users.noreply.github.com>
---
 misc/scripts/models-as-data/bulk_generate_mad.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index cb630e6950e5..29bb66ee01a4 100755
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -240,17 +240,17 @@ def build_database(
     # We ensure it's a list/tuple/iterable and convert to list for safe iteration.
     try:
         extractor_options_list = list(extractor_options)
-    except (TypeError, ValueError) as e:
+    except (TypeError, ValueError):
         raise ValueError(
-            f"extractor_options must be an iterable (list, tuple, etc.), got {type(extractor_options).__name__}: {e}"
+            f"extractor_options must be an iterable (list, tuple, etc.), got {type(extractor_options).__name__}"
         )
 
     # SECURITY: Ensure all extractor options are strings to prevent injection
     # of complex objects or other unsafe data types.
-    for idx, opt in enumerate(extractor_options_list):
-        if not isinstance(opt, str):
+    for idx, extractor_option in enumerate(extractor_options_list):
+        if not isinstance(extractor_option, str):
             raise ValueError(
-                f"All extractor options must be strings. Option at index {idx} is {type(opt).__name__}: {opt}"
+                f"All extractor options must be strings. Option at index {idx} is {type(extractor_option).__name__}"
             )
 
     name = project["name"]
@@ -263,7 +263,9 @@ def build_database(
         print(f"Building CodeQL database for {name}...")
         # Convert extractor options to CodeQL format: each option becomes "-O", "value"
         formatted_options = [
-            option for x in extractor_options_list for option in ("-O", x)
+            option
+            for extractor_option in extractor_options_list
+            for option in ("-O", extractor_option)
         ]
         try:
             # SECURITY: subprocess.check_call with shell=False (explicit) and list arguments.