knowledge-bases-testing · Copilot · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -6,6 +6,7 @@
 """
 
 import pathlib
+import re
 import subprocess
 import sys
 from typing import Required, TypedDict, List, Callable, Optional
@@ -44,6 +45,88 @@ def missing_module(module_name: str) -> None:
 build_dir = pathlib.Path(gitroot, "mad-generation-build")
 
 
+# Security: Define allowed CodeQL languages to prevent command injection
+# Reference: https://docs.python.org/3/library/subprocess.html#security-considerations
+# Reference: https://owasp.org/www-community/attacks/Command_Injection
+ALLOWED_LANGUAGES = frozenset(
+    ["cpp", "csharp", "go", "java", "javascript", "python", "ruby", "rust", "swift"]
+)
+
+
+def validate_language(language: str) -> None:
+    """
+    Validate that the language parameter is a known safe CodeQL language.
+
+    This validation prevents command injection by ensuring only whitelisted
+    language names are passed to subprocess calls.
+
+    Security considerations:
+    - Only allows alphanumeric characters (preventing shell metacharacters)
+    - Validates against a known set of CodeQL languages
+    - Raises ValueError for any invalid input
+
+    References:
+    - https://docs.python.org/3/library/subprocess.html#security-considerations
+    - https://owasp.org/www-community/attacks/Command_Injection
+
+    Args:
+        language: The language string to validate
+
+    Raises:
+        ValueError: If the language is not in the allowed set
+    """
+    if not language or language not in ALLOWED_LANGUAGES:
+        raise ValueError(
+            f"Invalid language: '{language}'. Must be one of: {', '.join(sorted(ALLOWED_LANGUAGES))}"
+        )
+
+
+def validate_extractor_options(extractor_options) -> None:
+    """
+    Validate that extractor_options contains only safe values.
+
+    This validation prevents command injection by ensuring extractor options
+    contain only simple key-value pairs without shell metacharacters.
+
+    Security considerations:
+    - Must be a list of strings
+    - Each string must contain only alphanumeric characters, underscores,
+      dashes, dots, forward slashes, colons, and equals signs
+    - Rejects any shell metacharacters (;, &, |, `, $, etc.)
+
+    References:
+    - https://docs.python.org/3/library/subprocess.html#security-considerations
+    - https://owasp.org/www-community/attacks/Command_Injection
+
+    Args:
+        extractor_options: The options to validate (expected to be a list of strings)
+
+    Raises:
+        ValueError: If extractor_options is not a list or contains invalid characters
+    """
+    if not isinstance(extractor_options, list):
+        raise ValueError(
+            f"extractor_options must be a list, got {type(extractor_options).__name__}"
+        )
+
+    # Pattern allows: alphanumeric, underscore, dash, dot, forward slash, colon, equals
+    # This is safe for key=value pairs and paths, but rejects shell metacharacters
+    safe_pattern = re.compile(r"^[a-zA-Z0-9_\-./=:]+$")
+
+    for option in extractor_options:
+        if not isinstance(option, str):
+            raise ValueError(
+                f"extractor_options must contain only strings, got {type(option).__name__} for value: {option}"
+            )
+        # Reject empty strings or strings with unsafe characters
+        if not option or not safe_pattern.match(option):
+            raise ValueError(
+                f"Invalid extractor option: '{option}'. "
+                f"Options must contain only alphanumeric characters, underscores, "
+                f"dashes, dots, forward slashes, colons, and equals signs."
+            )
+
+
 # A project to generate models for
 Project = TypedDict(
     "Project",
@@ -107,7 +190,8 @@ def clone_project(project: Project) -> str:
                 ),  # Add branch if tag is provided
                 repo_url,
                 target_dir,
-            ]
+            ],
+            shell=False,  # Explicitly set to prevent shell injection
         )
         print(f"Completed cloning {name}")
     else:
@@ -179,6 +263,9 @@ def build_database(
     """
     Build a CodeQL database for a project.
 
+    Security: This function validates all user-controlled inputs before passing
+    them to subprocess.check_call to prevent command injection attacks.
+
     Args:
         language: The language for which to build the database (e.g., "rust").
         extractor_options: Additional options for the extractor.
@@ -187,7 +274,17 @@ def build_database(
 
     Returns:
         The path to the created database directory.
+
+    Raises:
+        ValueError: If language or extractor_options contain invalid values
     """
+    # Security: Validate inputs to prevent command injection
+    # References:
+    # - https://docs.python.org/3/library/subprocess.html#security-considerations
+    # - https://owasp.org/www-community/attacks/Command_Injection
+    validate_language(language)
+    validate_extractor_options(extractor_options)
+
     name = project["name"]
 
     # Create database directory path
@@ -198,6 +295,9 @@ def build_database(
         print(f"Building CodeQL database for {name}...")
         extractor_options = [option for x in extractor_options for option in ("-O", x)]
         try:
+            # Security: Using shell=False (default) and passing command as a list
+            # prevents shell injection attacks. All inputs are validated above.
+            # Reference: https://docs.python.org/3/library/subprocess.html#security-considerations
             subprocess.check_call(
                 [
                     "codeql",
@@ -209,7 +309,8 @@ def build_database(
                     *extractor_options,
                     "--",
                     database_dir,
-                ]
+                ],
+                shell=False,  # Explicitly set to prevent shell injection
             )
             print(f"Successfully created database at {database_dir}")
         except subprocess.CalledProcessError as e:

diff --git a/misc/scripts/models-as-data/test_bulk_generate_mad.py b/misc/scripts/models-as-data/test_bulk_generate_mad.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Tests for bulk_generate_mad.py security enhancements.
+"""
+
+import pytest
+from bulk_generate_mad import validate_language, validate_extractor_options
+
+
+class TestLanguageValidation:
+    """Test cases for language parameter validation."""
+
+    def test_valid_languages(self):
+        """Test that valid CodeQL languages are accepted."""
+        valid_languages = [
+            "cpp",
+            "csharp",
+            "go",
+            "java",
+            "javascript",
+            "python",
+            "ruby",
+            "rust",
+            "swift",
+        ]
+        for lang in valid_languages:
+            # Should not raise ValueError
+            validate_language(lang)
+
+    def test_invalid_language_with_special_chars(self):
+        """Test that languages with special characters are rejected."""
+        invalid_languages = [
+            "python; rm -rf /",
+            "cpp && echo hacked",
+            "java | cat /etc/passwd",
+            "rust`whoami`",
+            "go$(id)",
+        ]
+        for lang in invalid_languages:
+            with pytest.raises(ValueError, match="Invalid language"):
+                validate_language(lang)
+
+    def test_invalid_language_empty(self):
+        """Test that empty language is rejected."""
+        with pytest.raises(ValueError, match="Invalid language"):
+            validate_language("")
+
+    def test_invalid_language_unknown(self):
+        """Test that unknown languages are rejected."""
+        with pytest.raises(ValueError, match="Invalid language"):
+            validate_language("cobol")
+
+
+class TestExtractorOptionsValidation:
+    """Test cases for extractor_options validation."""
+
+    def test_valid_extractor_options(self):
+        """Test that valid extractor options are accepted."""
+        valid_options = [
+            ["key=value"],
+            ["option1=value1", "option2=value2"],
+            ["flag-name=true"],
+            ["config_option=123"],
+            ["MixedCase=Value"],
+            [],  # Empty list is valid
+        ]
+        for options in valid_options:
+            # Should not raise ValueError
+            validate_extractor_options(options)
+
+    def test_invalid_extractor_options_with_special_chars(self):
+        """Test that extractor options with special characters are rejected."""
+        invalid_options = [
+            ["key; rm -rf /"],
+            ["option && echo hacked"],
+            ["flag|cat /etc/passwd"],
+            ["config`whoami`"],
+            ["value$(id)"],
+            ["key=value; echo hacked"],
+            ["key=value\nmalicious"],
+        ]
+        for options in invalid_options:
+            with pytest.raises(ValueError, match="Invalid extractor option"):
+                validate_extractor_options(options)
+
+    def test_invalid_extractor_options_not_list(self):
+        """Test that non-list extractor options are rejected."""
+        with pytest.raises(ValueError, match="must be a list"):
+            validate_extractor_options("not-a-list")
+
+    def test_invalid_extractor_options_non_string_elements(self):
+        """Test that extractor options with non-string elements are rejected."""
+        with pytest.raises(ValueError, match="must contain only strings"):
+            validate_extractor_options(["valid", 123, "also-valid"])
+
+    def test_invalid_extractor_options_empty_string(self):
+        """Test that extractor options with empty strings are rejected."""
+        with pytest.raises(ValueError, match="Invalid extractor option"):
+            validate_extractor_options([""])
+        with pytest.raises(ValueError, match="Invalid extractor option"):
+            validate_extractor_options(["valid", "", "also-valid"])
+
+    def test_valid_extractor_options_with_dots_and_slashes(self):
+        """Test that extractor options with dots and forward slashes are accepted."""
+        valid_options = [
+            ["path=/some/path"],
+            ["version=1.2.3"],
+            ["url=https://example.com"],
+            ["file.name=value"],
+        ]
+        for options in valid_options:
+            # Should not raise ValueError
+            validate_extractor_options(options)