Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 103 additions & 2 deletions misc/scripts/models-as-data/bulk_generate_mad.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import pathlib
import re
import subprocess
import sys
from typing import Required, TypedDict, List, Callable, Optional
Expand Down Expand Up @@ -44,6 +45,88 @@ def missing_module(module_name: str) -> None:
build_dir = pathlib.Path(gitroot, "mad-generation-build")


# Allowlist of CodeQL language identifiers. Language values are checked
# against this fixed set before they are placed on a subprocess command line.
# References:
#   https://docs.python.org/3/library/subprocess.html#security-considerations
#   https://owasp.org/www-community/attacks/Command_Injection
ALLOWED_LANGUAGES = frozenset(
    {
        "cpp",
        "csharp",
        "go",
        "java",
        "javascript",
        "python",
        "ruby",
        "rust",
        "swift",
    }
)


def validate_language(language: str) -> None:
    """
    Validate that the language parameter is a known CodeQL language.

    The value must be a string that is a member of ALLOWED_LANGUAGES. Any
    other value (empty string, None, unknown name, a string carrying shell
    metacharacters, or a non-string object) is rejected.

    References:
    - https://docs.python.org/3/library/subprocess.html#security-considerations
    - https://owasp.org/www-community/attacks/Command_Injection

    Args:
        language: The language string to validate

    Raises:
        ValueError: If the language is not a string in the allowed set
    """
    # Check the type before the membership test: a non-empty unhashable value
    # (for example a list) would otherwise make the frozenset lookup raise
    # TypeError instead of the documented ValueError. The empty string needs
    # no special case because it is simply not a member of the set.
    if not isinstance(language, str) or language not in ALLOWED_LANGUAGES:
        raise ValueError(
            f"Invalid language: '{language}'. Must be one of: {', '.join(sorted(ALLOWED_LANGUAGES))}"
        )


def validate_extractor_options(extractor_options) -> None:
    """
    Validate that extractor_options contains only safe values.

    Each option is expected to be a simple key=value style string. An option
    is accepted only when the *entire* string consists of characters from the
    allowed set below.

    Security considerations:
    - Must be a list of strings
    - Each string must be non-empty and contain only alphanumeric characters,
      underscores, dashes, dots, forward slashes, colons, and equals signs
    - Anything else is rejected, including whitespace, newlines (also a
      single trailing newline) and shell metacharacters (;, &, |, `, $, etc.)

    References:
    - https://docs.python.org/3/library/subprocess.html#security-considerations
    - https://owasp.org/www-community/attacks/Command_Injection

    Args:
        extractor_options: The options to validate (expected to be a list of strings)

    Raises:
        ValueError: If extractor_options is not a list, contains a non-string
            element, or contains a string with disallowed characters
    """
    if not isinstance(extractor_options, list):
        raise ValueError(
            f"extractor_options must be a list, got {type(extractor_options).__name__}"
        )

    # Pattern allows: alphanumeric, underscore, dash, dot, forward slash, colon, equals.
    # It is applied with fullmatch() rather than "^...$" with match(): "$" also
    # matches just before a trailing newline, so "key=value\n" would slip through.
    safe_pattern = re.compile(r"[a-zA-Z0-9_\-./=:]+")

    for option in extractor_options:
        if not isinstance(option, str):
            raise ValueError(
                f"extractor_options must contain only strings, got {type(option).__name__} for value: {option}"
            )
        # fullmatch() with "+" also rejects the empty string.
        if safe_pattern.fullmatch(option) is None:
            raise ValueError(
                f"Invalid extractor option: '{option}'. "
                f"Options must contain only alphanumeric characters, underscores, "
                f"dashes, dots, forward slashes, colons, and equals signs."
            )


# A project to generate models for
Project = TypedDict(
"Project",
Expand Down Expand Up @@ -107,7 +190,8 @@ def clone_project(project: Project) -> str:
), # Add branch if tag is provided
repo_url,
target_dir,
]
],
shell=False, # Explicitly set to prevent shell injection
)
print(f"Completed cloning {name}")
else:
Expand Down Expand Up @@ -179,6 +263,9 @@ def build_database(
"""
Build a CodeQL database for a project.

Security: This function validates all user-controlled inputs before passing
them to subprocess.check_call to prevent command injection attacks.

Args:
language: The language for which to build the database (e.g., "rust").
extractor_options: Additional options for the extractor.
Expand All @@ -187,7 +274,17 @@ def build_database(

Returns:
The path to the created database directory.

Raises:
ValueError: If language or extractor_options contain invalid values
"""
# Security: Validate inputs to prevent command injection
# References:
# - https://docs.python.org/3/library/subprocess.html#security-considerations
# - https://owasp.org/www-community/attacks/Command_Injection
validate_language(language)
validate_extractor_options(extractor_options)

name = project["name"]

# Create database directory path
Expand All @@ -198,6 +295,9 @@ def build_database(
print(f"Building CodeQL database for {name}...")
extractor_options = [option for x in extractor_options for option in ("-O", x)]
try:
# Security: Using shell=False (default) and passing command as a list
# prevents shell injection attacks. All inputs are validated above.
# Reference: https://docs.python.org/3/library/subprocess.html#security-considerations
subprocess.check_call(
[
"codeql",
Expand All @@ -209,7 +309,8 @@ def build_database(
*extractor_options,
"--",
database_dir,
]
],
shell=False, # Explicitly set to prevent shell injection
)
print(f"Successfully created database at {database_dir}")
except subprocess.CalledProcessError as e:
Expand Down
113 changes: 113 additions & 0 deletions misc/scripts/models-as-data/test_bulk_generate_mad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
Tests for bulk_generate_mad.py security enhancements.
"""

import pytest
from bulk_generate_mad import validate_language, validate_extractor_options


class TestLanguageValidation:
    """Checks for validate_language()."""

    def test_valid_languages(self):
        """Each supported CodeQL language name passes validation."""
        supported = (
            "cpp",
            "csharp",
            "go",
            "java",
            "javascript",
            "python",
            "ruby",
            "rust",
            "swift",
        )
        for name in supported:
            # A ValueError here would fail the test.
            validate_language(name)

    def test_invalid_language_with_special_chars(self):
        """Language strings carrying shell metacharacters are refused."""
        payloads = (
            "python; rm -rf /",
            "cpp && echo hacked",
            "java | cat /etc/passwd",
            "rust`whoami`",
            "go$(id)",
        )
        for payload in payloads:
            with pytest.raises(ValueError, match="Invalid language"):
                validate_language(payload)

    def test_invalid_language_empty(self):
        """The empty string is refused."""
        with pytest.raises(ValueError, match="Invalid language"):
            validate_language("")

    def test_invalid_language_unknown(self):
        """A well-formed name outside the allowed set is refused."""
        with pytest.raises(ValueError, match="Invalid language"):
            validate_language("cobol")


class TestExtractorOptionsValidation:
    """Checks for validate_extractor_options()."""

    def test_valid_extractor_options(self):
        """Plain key=value option lists, and the empty list, pass validation."""
        accepted = (
            ["key=value"],
            ["option1=value1", "option2=value2"],
            ["flag-name=true"],
            ["config_option=123"],
            ["MixedCase=Value"],
            [],  # no options at all is fine
        )
        for option_list in accepted:
            # A ValueError here would fail the test.
            validate_extractor_options(option_list)

    def test_invalid_extractor_options_with_special_chars(self):
        """Options carrying shell metacharacters or newlines are refused."""
        rejected = (
            ["key; rm -rf /"],
            ["option && echo hacked"],
            ["flag|cat /etc/passwd"],
            ["config`whoami`"],
            ["value$(id)"],
            ["key=value; echo hacked"],
            ["key=value\nmalicious"],
        )
        for option_list in rejected:
            with pytest.raises(ValueError, match="Invalid extractor option"):
                validate_extractor_options(option_list)

    def test_invalid_extractor_options_not_list(self):
        """A bare string in place of a list is refused."""
        with pytest.raises(ValueError, match="must be a list"):
            validate_extractor_options("not-a-list")

    def test_invalid_extractor_options_non_string_elements(self):
        """A list holding a non-string element is refused."""
        with pytest.raises(ValueError, match="must contain only strings"):
            validate_extractor_options(["valid", 123, "also-valid"])

    def test_invalid_extractor_options_empty_string(self):
        """An empty-string option is refused wherever it sits in the list."""
        for option_list in ([""], ["valid", "", "also-valid"]):
            with pytest.raises(ValueError, match="Invalid extractor option"):
                validate_extractor_options(option_list)

    def test_valid_extractor_options_with_dots_and_slashes(self):
        """Dots, forward slashes and colons are permitted inside options."""
        accepted = (
            ["path=/some/path"],
            ["version=1.2.3"],
            ["url=https://example.com"],
            ["file.name=value"],
        )
        for option_list in accepted:
            # A ValueError here would fail the test.
            validate_extractor_options(option_list)