diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index e2de5038206e..145424a5ea66 100755 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -3,6 +3,16 @@ Experimental script for bulk generation of MaD models based on a list of projects. Note: This file must be formatted using the Black Python formatter. + +Security Requirements: +- All subprocess.check_call invocations MUST use shell=False to prevent shell injection +- Language parameter MUST be validated against VALID_LANGUAGES before use +- Extractor options MUST be validated to reject shell metacharacters +- These requirements are enforced by validate_language() and validate_extractor_options() + +References: +- https://docs.python.org/3/library/subprocess.html#security-considerations +- https://owasp.org/www-community/attacks/Command_Injection """ import pathlib @@ -43,6 +53,64 @@ def missing_module(module_name: str) -> None: ) build_dir = pathlib.Path(gitroot, "mad-generation-build") +# List of valid CodeQL languages to prevent command injection +# Reference: https://docs.python.org/3/library/subprocess.html#security-considerations +VALID_LANGUAGES = { + "cpp", + "csharp", + "go", + "java", + "javascript", + "python", + "ruby", + "rust", + "swift", +} + + +def validate_language(language: str) -> None: + """ + Validate that the language is from a known safe list. + + This is a security measure to prevent command injection attacks. + Reference: https://owasp.org/www-community/attacks/Command_Injection + + Args: + language: The language identifier to validate + + Raises: + ValueError: If the language is not in the valid list + """ + if language not in VALID_LANGUAGES: + raise ValueError( + f"Invalid language '{language}'. Must be one of: {', '.join(sorted(VALID_LANGUAGES))}" + ) + + +def validate_extractor_options(extractor_options: list) -> None: + """ + Validate that extractor options are safe to use in subprocess calls. + + This is a security measure to prevent command injection attacks. + Reference: https://owasp.org/www-community/attacks/Command_Injection + + Args: + extractor_options: List of extractor options to validate + + Raises: + ValueError: If any option contains potentially dangerous characters + """ + dangerous_chars = set(";&|<>`$(){}[]\\'\"\n*") + for option in extractor_options: + if not isinstance(option, str): + raise ValueError( + f"Extractor option must be a string, got {type(option).__name__}" + ) + if any(char in option for char in dangerous_chars): + raise ValueError( + f"Extractor option '{option}' contains potentially dangerous characters" + ) + # A project to generate models for Project = TypedDict( @@ -95,6 +163,13 @@ def clone_project(project: Project) -> str: else: print(f"Cloning {name} from {repo_url}") + # Security: Always use shell=False (explicit) and pass arguments as a list + # to prevent shell injection attacks. This is critical even though we're + # already passing arguments as a list, as it makes the security posture explicit. + # References: + # - https://docs.python.org/3/library/subprocess.html#security-considerations + # - https://owasp.org/www-community/attacks/Command_Injection + # IMPORTANT: Keep shell=False for all subprocess calls to maintain security. subprocess.check_call( [ "git", @@ -107,7 +182,8 @@ def clone_project(project: Project) -> str: ), # Add branch if tag is provided repo_url, target_dir, - ] + ], + shell=False, ) print(f"Completed cloning {name}") else: @@ -190,6 +266,14 @@ def build_database( """ name = project["name"] + # Security: Validate language and extractor_options before using in subprocess + # to prevent command injection attacks. + # References: + # - https://docs.python.org/3/library/subprocess.html#security-considerations + # - https://owasp.org/www-community/attacks/Command_Injection + validate_language(language) + validate_extractor_options(extractor_options) + # Create database directory path database_dir = build_dir / f"{name}-db" @@ -198,18 +282,28 @@ def build_database( print(f"Building CodeQL database for {name}...") extractor_options = [option for x in extractor_options for option in ("-O", x)] try: + # Security: Always use shell=False (explicit) and pass arguments as a list + # to prevent shell injection attacks. This is critical even though we're + # already passing arguments as a list, as it makes the security posture explicit. + # References: + # - https://docs.python.org/3/library/subprocess.html#security-considerations + # - https://owasp.org/www-community/attacks/Command_Injection + # IMPORTANT: Keep shell=False for all subprocess calls to maintain security. subprocess.check_call( [ "codeql", "database", "create", - f"--language={language}", - "--source-root=" + project_dir, + "--language", + language, + "--source-root", + project_dir, "--overwrite", *extractor_options, "--", database_dir, - ] + ], + shell=False, ) print(f"Successfully created database at {database_dir}") except subprocess.CalledProcessError as e: @@ -457,6 +551,13 @@ def main(config, args) -> None: sys.exit(1) language = config["language"] + # Security: Validate language early to prevent command injection + try: + validate_language(language) + except ValueError as e: + print(f"ERROR: {e}") + sys.exit(1) + # Create build directory if it doesn't exist build_dir.mkdir(parents=True, exist_ok=True) @@ -464,6 +565,12 @@ def main(config, args) -> None: match get_strategy(config): case "repo": extractor_options = config.get("extractor_options", []) + # Security: Validate extractor_options early to prevent command injection + try: + validate_extractor_options(extractor_options) + except ValueError as e: + print(f"ERROR: {e}") + sys.exit(1) database_results = build_databases_from_projects( language, extractor_options, diff --git a/misc/scripts/models-as-data/test_bulk_generate_mad_security.py b/misc/scripts/models-as-data/test_bulk_generate_mad_security.py new file mode 100644 index 000000000000..462ff170ff77 --- /dev/null +++ b/misc/scripts/models-as-data/test_bulk_generate_mad_security.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Security tests for bulk_generate_mad.py to validate subprocess security improvements. + +These tests ensure that: +1. Language validation works correctly +2. Extractor options validation works correctly +3. Invalid inputs are rejected to prevent command injection +""" + +import unittest +import sys +import pathlib + +# Add the directory to the path so we can import the module +sys.path.insert(0, str(pathlib.Path(__file__).parent)) + +import bulk_generate_mad + + +class TestLanguageValidation(unittest.TestCase): + """Test cases for language validation.""" + + def test_valid_languages(self): + """Test that all valid languages are accepted.""" + valid_languages = [ + "cpp", + "csharp", + "go", + "java", + "javascript", + "python", + "ruby", + "rust", + "swift", + ] + for language in valid_languages: + try: + bulk_generate_mad.validate_language(language) + except ValueError: + self.fail(f"Valid language '{language}' was rejected") + + def test_invalid_language(self): + """Test that invalid languages are rejected.""" + invalid_languages = [ + "invalid", + "shell", + "bash", + "perl", + '"; rm -rf /', + ] + for language in invalid_languages: + with self.assertRaises( + ValueError, msg=f"Invalid language '{language}' was accepted" + ): + bulk_generate_mad.validate_language(language) + + +class TestExtractorOptionsValidation(unittest.TestCase): + """Test cases for extractor options validation.""" + + def test_valid_extractor_options(self): + """Test that valid extractor options are accepted.""" + valid_options = [ + ["--option1", "--option2"], + ["--verbose"], + ["--timeout=100"], + [], # Empty list should be valid + ] + for options in valid_options: + try: + bulk_generate_mad.validate_extractor_options(options) + except ValueError: + self.fail(f"Valid extractor options {options} were rejected") + + def test_dangerous_characters_rejected(self): + """Test that extractor options with dangerous characters are rejected.""" + dangerous_options = [ + ["; rm -rf /"], + ["| cat /etc/passwd"], + ["&& malicious_command"], + ["`whoami`"], + ["$(malicious)"], + ["test;malicious"], + ["test|other"], + ["test&background"], + ["testoutput"], + ["test\\escape"], + ["test'quote"], + ['test"doublequote'], + ["test\nmalicious"], + ["test{brace}"], + ["test[bracket]"], + ["test(paren)"], + ["test*glob"], + ] + for options in dangerous_options: + with self.assertRaises( + ValueError, + msg=f"Dangerous extractor option {options} was accepted", + ): + bulk_generate_mad.validate_extractor_options(options) + + def test_non_string_options_rejected(self): + """Test that non-string options are rejected.""" + invalid_options = [ + [123], + [None], + [{"key": "value"}], + [["nested", "list"]], + ] + for options in invalid_options: + with self.assertRaises( + ValueError, msg=f"Non-string option {options} was accepted" + ): + bulk_generate_mad.validate_extractor_options(options) + + +class TestSubprocessSecurity(unittest.TestCase): + """Test cases to ensure subprocess calls are secure.""" + + def test_valid_languages_constant_exists(self): + """Test that VALID_LANGUAGES constant is defined and contains expected languages.""" + self.assertIn("VALID_LANGUAGES", dir(bulk_generate_mad)) + self.assertIsInstance(bulk_generate_mad.VALID_LANGUAGES, set) + # Check for a few expected languages + expected = {"python", "java", "javascript", "go", "rust"} + self.assertTrue( + expected.issubset(bulk_generate_mad.VALID_LANGUAGES), + f"Expected languages {expected} not found in VALID_LANGUAGES", + ) + + +if __name__ == "__main__": + unittest.main()