diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index e2de5038206e..01a469877b08 100755 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -6,6 +6,7 @@ """ import pathlib +import re import subprocess import sys from typing import Required, TypedDict, List, Callable, Optional @@ -44,6 +45,88 @@ def missing_module(module_name: str) -> None: build_dir = pathlib.Path(gitroot, "mad-generation-build") +# Security: Define allowed CodeQL languages to prevent command injection +# Reference: https://docs.python.org/3/library/subprocess.html#security-considerations +# Reference: https://owasp.org/www-community/attacks/Command_Injection +ALLOWED_LANGUAGES = frozenset( + ["cpp", "csharp", "go", "java", "javascript", "python", "ruby", "rust", "swift"] +) + + +def validate_language(language: str) -> None: + """ + Validate that the language parameter is a known safe CodeQL language. + + This validation prevents command injection by ensuring only whitelisted + language names are passed to subprocess calls. + + Security considerations: + - Only allows alphanumeric characters (preventing shell metacharacters) + - Validates against a known set of CodeQL languages + - Raises ValueError for any invalid input + + References: + - https://docs.python.org/3/library/subprocess.html#security-considerations + - https://owasp.org/www-community/attacks/Command_Injection + + Args: + language: The language string to validate + + Raises: + ValueError: If the language is not in the allowed set + """ + if not language or language not in ALLOWED_LANGUAGES: + raise ValueError( + f"Invalid language: '{language}'. Must be one of: {', '.join(sorted(ALLOWED_LANGUAGES))}" + ) + + +def validate_extractor_options(extractor_options) -> None: + """ + Validate that extractor_options contains only safe values. + + This validation prevents command injection by ensuring extractor options + contain only simple key-value pairs without shell metacharacters. + + Security considerations: + - Must be a list of strings + - Each string must contain only alphanumeric characters, underscores, + dashes, dots, forward slashes, colons, and equals signs + - Rejects any shell metacharacters (;, &, |, `, $, etc.) + + References: + - https://docs.python.org/3/library/subprocess.html#security-considerations + - https://owasp.org/www-community/attacks/Command_Injection + + Args: + extractor_options: The options to validate (expected to be a list of strings) + + Raises: + ValueError: If extractor_options is not a list or contains invalid characters + """ + if not isinstance(extractor_options, list): + raise ValueError( + f"extractor_options must be a list, got {type(extractor_options).__name__}" + ) + + # Pattern allows: alphanumeric, underscore, dash, dot, forward slash, colon, equals + # This is safe for key=value pairs and paths, but rejects shell metacharacters + safe_pattern = re.compile(r"^[a-zA-Z0-9_\-./=:]+$") + + for option in extractor_options: + if not isinstance(option, str): + raise ValueError( + f"extractor_options must contain only strings, got {type(option).__name__} for value: {option}" + ) + # Reject empty strings or strings with unsafe characters + if not option or not safe_pattern.match(option): + raise ValueError( + f"Invalid extractor option: '{option}'. " + f"Options must contain only alphanumeric characters, underscores, " + f"dashes, dots, forward slashes, colons, and equals signs." + ) + + # A project to generate models for Project = TypedDict( "Project", @@ -107,7 +190,8 @@ def clone_project(project: Project) -> str: ), # Add branch if tag is provided repo_url, target_dir, - ] + ], + shell=False, # Explicitly set to prevent shell injection ) print(f"Completed cloning {name}") else: @@ -179,6 +263,9 @@ def build_database( """ Build a CodeQL database for a project. + Security: This function validates all user-controlled inputs before passing + them to subprocess.check_call to prevent command injection attacks. + Args: language: The language for which to build the database (e.g., "rust"). extractor_options: Additional options for the extractor. @@ -187,7 +274,17 @@ def build_database( Returns: The path to the created database directory. + + Raises: + ValueError: If language or extractor_options contain invalid values """ + # Security: Validate inputs to prevent command injection + # References: + # - https://docs.python.org/3/library/subprocess.html#security-considerations + # - https://owasp.org/www-community/attacks/Command_Injection + validate_language(language) + validate_extractor_options(extractor_options) + name = project["name"] # Create database directory path @@ -198,6 +295,9 @@ def build_database( print(f"Building CodeQL database for {name}...") extractor_options = [option for x in extractor_options for option in ("-O", x)] try: + # Security: Using shell=False (default) and passing command as a list + # prevents shell injection attacks. All inputs are validated above. + # Reference: https://docs.python.org/3/library/subprocess.html#security-considerations subprocess.check_call( [ "codeql", @@ -209,7 +309,8 @@ def build_database( *extractor_options, "--", database_dir, - ] + ], + shell=False, # Explicitly set to prevent shell injection ) print(f"Successfully created database at {database_dir}") except subprocess.CalledProcessError as e: diff --git a/misc/scripts/models-as-data/test_bulk_generate_mad.py b/misc/scripts/models-as-data/test_bulk_generate_mad.py new file mode 100644 index 000000000000..4cf2de88bf86 --- /dev/null +++ b/misc/scripts/models-as-data/test_bulk_generate_mad.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Tests for bulk_generate_mad.py security enhancements. +""" + +import pytest +from bulk_generate_mad import validate_language, validate_extractor_options + + +class TestLanguageValidation: + """Test cases for language parameter validation.""" + + def test_valid_languages(self): + """Test that valid CodeQL languages are accepted.""" + valid_languages = [ + "cpp", + "csharp", + "go", + "java", + "javascript", + "python", + "ruby", + "rust", + "swift", + ] + for lang in valid_languages: + # Should not raise ValueError + validate_language(lang) + + def test_invalid_language_with_special_chars(self): + """Test that languages with special characters are rejected.""" + invalid_languages = [ + "python; rm -rf /", + "cpp && echo hacked", + "java | cat /etc/passwd", + "rust`whoami`", + "go$(id)", + ] + for lang in invalid_languages: + with pytest.raises(ValueError, match="Invalid language"): + validate_language(lang) + + def test_invalid_language_empty(self): + """Test that empty language is rejected.""" + with pytest.raises(ValueError, match="Invalid language"): + validate_language("") + + def test_invalid_language_unknown(self): + """Test that unknown languages are rejected.""" + with pytest.raises(ValueError, match="Invalid language"): + validate_language("cobol") + + +class TestExtractorOptionsValidation: + """Test cases for extractor_options validation.""" + + def test_valid_extractor_options(self): + """Test that valid extractor options are accepted.""" + valid_options = [ + ["key=value"], + ["option1=value1", "option2=value2"], + ["flag-name=true"], + ["config_option=123"], + ["MixedCase=Value"], + [], # Empty list is valid + ] + for options in valid_options: + # Should not raise ValueError + validate_extractor_options(options) + + def test_invalid_extractor_options_with_special_chars(self): + """Test that extractor options with special characters are rejected.""" + invalid_options = [ + ["key; rm -rf /"], + ["option && echo hacked"], + ["flag|cat /etc/passwd"], + ["config`whoami`"], + ["value$(id)"], + ["key=value; echo hacked"], + ["key=value\nmalicious"], + ] + for options in invalid_options: + with pytest.raises(ValueError, match="Invalid extractor option"): + validate_extractor_options(options) + + def test_invalid_extractor_options_not_list(self): + """Test that non-list extractor options are rejected.""" + with pytest.raises(ValueError, match="must be a list"): + validate_extractor_options("not-a-list") + + def test_invalid_extractor_options_non_string_elements(self): + """Test that extractor options with non-string elements are rejected.""" + with pytest.raises(ValueError, match="must contain only strings"): + validate_extractor_options(["valid", 123, "also-valid"]) + + def test_invalid_extractor_options_empty_string(self): + """Test that extractor options with empty strings are rejected.""" + with pytest.raises(ValueError, match="Invalid extractor option"): + validate_extractor_options([""]) + with pytest.raises(ValueError, match="Invalid extractor option"): + validate_extractor_options(["valid", "", "also-valid"]) + + def test_valid_extractor_options_with_dots_and_slashes(self): + """Test that extractor options with dots and forward slashes are accepted.""" + valid_options = [ + ["path=/some/path"], + ["version=1.2.3"], + ["url=https://example.com"], + ["file.name=value"], + ] + for options in valid_options: + # Should not raise ValueError + validate_extractor_options(options)