diff --git a/deploy/aws/cloudformation/template.yaml b/deploy/aws/cloudformation/template.yaml index 03d09dda..b1bb217b 100644 --- a/deploy/aws/cloudformation/template.yaml +++ b/deploy/aws/cloudformation/template.yaml @@ -17,6 +17,8 @@ Metadata: - InstanceType - RootVolumeSize - DataVolumeSize + - DataVolumeIops + - DataVolumeThroughput - AmiSsmParameter - Label: default: Access @@ -51,6 +53,10 @@ Metadata: default: Root volume size DataVolumeSize: default: Hypeman data volume size + DataVolumeIops: + default: Hypeman data volume IOPS + DataVolumeThroughput: + default: Hypeman data volume throughput HypemanVersion: default: Hypeman release HypemanBranch: @@ -109,6 +115,16 @@ Parameters: MinValue: 50 MaxValue: 16384 Description: Hypeman data EBS volume size in GiB. This volume is formatted as XFS and mounted at /var/lib/hypeman. + DataVolumeIops: + Type: String + Default: "" + AllowedPattern: "^$|^[0-9]+$" + Description: Optional provisioned IOPS for the Hypeman data gp3 EBS volume. Leave empty for the EC2 default. + DataVolumeThroughput: + Type: String + Default: "" + AllowedPattern: "^$|^[0-9]+$" + Description: Optional provisioned throughput in MiB/s for the Hypeman data gp3 EBS volume. When set, Hypeman disk I/O capacity is configured to the same value. HypemanVersion: Type: String Default: latest @@ -285,18 +301,34 @@ Resources: stack_uuid = event["StackId"].rsplit("/", 1)[-1] return f"{event['ResourceProperties']['NamePrefix']}-{stack_uuid}" - def create_launch_template(name): + def put_if_set(payload, key, value): + if value: + payload[key] = value + + def create_launch_template(name, props): payload = { "Action": "CreateLaunchTemplate", "Version": "2016-11-15", "LaunchTemplateName": name, "LaunchTemplateData.CpuOptions.NestedVirtualization": "enabled", + "LaunchTemplateData.BlockDeviceMapping.1.DeviceName": "/dev/sda1", + "LaunchTemplateData.BlockDeviceMapping.1.Ebs.VolumeSize": props["RootVolumeSize"], + "LaunchTemplateData.BlockDeviceMapping.1.Ebs.VolumeType": "gp3", + "LaunchTemplateData.BlockDeviceMapping.1.Ebs.Encrypted": "true", + "LaunchTemplateData.BlockDeviceMapping.1.Ebs.DeleteOnTermination": "true", + "LaunchTemplateData.BlockDeviceMapping.2.DeviceName": "/dev/sdf", + "LaunchTemplateData.BlockDeviceMapping.2.Ebs.VolumeSize": props["DataVolumeSize"], + "LaunchTemplateData.BlockDeviceMapping.2.Ebs.VolumeType": "gp3", + "LaunchTemplateData.BlockDeviceMapping.2.Ebs.Encrypted": "true", + "LaunchTemplateData.BlockDeviceMapping.2.Ebs.DeleteOnTermination": "true", "TagSpecification.1.ResourceType": "launch-template", "TagSpecification.1.Tag.1.Key": "Name", "TagSpecification.1.Tag.1.Value": name, "TagSpecification.1.Tag.2.Key": "hypeman:deployment", "TagSpecification.1.Tag.2.Value": "aws", } + put_if_set(payload, "LaunchTemplateData.BlockDeviceMapping.2.Ebs.Iops", props.get("DataVolumeIops", "")) + put_if_set(payload, "LaunchTemplateData.BlockDeviceMapping.2.Ebs.Throughput", props.get("DataVolumeThroughput", "")) xml = ec2_query(payload) root = ET.fromstring(xml) launch_template_id = root.find(".//{*}launchTemplateId") @@ -326,7 +358,7 @@ Resources: return if request_type == "Update": delete_launch_template(physical_id) - data = create_launch_template(launch_template_name(event)) + data = create_launch_template(launch_template_name(event), event["ResourceProperties"]) send(event, context, "SUCCESS", data, physical_id=data["LaunchTemplateId"]) except Exception as exc: traceback.print_exc() @@ -337,6 +369,10 @@ Resources: Properties: ServiceToken: !GetAtt NestedVirtualizationLaunchTemplateFunction.Arn NamePrefix: hypeman + RootVolumeSize: !Ref RootVolumeSize + DataVolumeSize: !Ref DataVolumeSize + DataVolumeIops: !Ref DataVolumeIops + DataVolumeThroughput: !Ref DataVolumeThroughput HypemanHost: Type: AWS::EC2::Instance @@ -351,19 +387,6 @@ Resources: - !Ref HypemanSecurityGroup IamInstanceProfile: !Ref HypemanInstanceProfile KeyName: !If [UseSSH, !Ref KeyName, !Ref AWS::NoValue] - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: !Ref RootVolumeSize - VolumeType: gp3 - Encrypted: true - DeleteOnTermination: true - - DeviceName: /dev/sdf - Ebs: - VolumeSize: !Ref DataVolumeSize - VolumeType: gp3 - Encrypted: true - DeleteOnTermination: true Tags: - Key: Name Value: !Sub ${AWS::StackName}-hypeman @@ -445,6 +468,16 @@ Resources: fi curl -fsSL https://raw.githubusercontent.com/kernel/hypeman/main/scripts/install.sh | bash + if [ -n "${DataVolumeThroughput}" ]; then + install -d -m 755 /etc/systemd/system/hypeman.service.d + cat >/etc/systemd/system/hypeman.service.d/disk-io-capacity.conf </usr/local/bin/hypeman-create-token <<'SCRIPT' #!/usr/bin/env bash diff --git a/deploy/aws/cloudformation/template_test.go b/deploy/aws/cloudformation/template_test.go index b9473bb6..725de633 100644 --- a/deploy/aws/cloudformation/template_test.go +++ b/deploy/aws/cloudformation/template_test.go @@ -20,6 +20,8 @@ func TestQuickstartParameters(t *testing.T) { assertDefault(t, parameters, "AllowedSshCidr", "127.0.0.1/32") assertDefault(t, parameters, "RootVolumeSize", "30") assertDefault(t, parameters, "DataVolumeSize", "100") + assertDefault(t, parameters, "DataVolumeIops", "") + assertDefault(t, parameters, "DataVolumeThroughput", "") assertDefault(t, parameters, "HypemanVersion", "latest") assertDefault(t, parameters, "HypemanCliVersion", "latest") @@ -87,6 +89,16 @@ func TestCloudFormationLaunchContract(t *testing.T) { zipFile := scalar(t, requireField(t, code, "ZipFile")) assertContains(t, zipFile, `"Action": "CreateLaunchTemplate"`) assertContains(t, zipFile, `"LaunchTemplateData.CpuOptions.NestedVirtualization": "enabled"`) + assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.1.Ebs.VolumeSize": props["RootVolumeSize"]`) + assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.2.Ebs.VolumeSize": props["DataVolumeSize"]`) + assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.2.Ebs.Iops"`) + assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.2.Ebs.Throughput"`) + + launchTemplateProperties := requireMapping(t, requireField(t, launchTemplate, "Properties")) + assertRef(t, requireField(t, launchTemplateProperties, "RootVolumeSize"), "RootVolumeSize") + assertRef(t, requireField(t, launchTemplateProperties, "DataVolumeSize"), "DataVolumeSize") + assertRef(t, requireField(t, launchTemplateProperties, "DataVolumeIops"), "DataVolumeIops") + assertRef(t, requireField(t, launchTemplateProperties, "DataVolumeThroughput"), "DataVolumeThroughput") host := requireMapping(t, requireField(t, resources, "HypemanHost")) if got := scalar(t, requireField(t, host, "Type")); got != "AWS::EC2::Instance" { @@ -97,19 +109,10 @@ func TestCloudFormationLaunchContract(t *testing.T) { assertGetAtt(t, requireField(t, hostLaunchTemplate, "LaunchTemplateId"), "NestedVirtualizationLaunchTemplate.LaunchTemplateId") assertGetAtt(t, requireField(t, hostLaunchTemplate, "Version"), "NestedVirtualizationLaunchTemplate.VersionNumber") - blockDeviceMappings := requireSequence(t, requireField(t, hostProperties, "BlockDeviceMappings")) - if len(blockDeviceMappings.Content) != 2 { - t.Fatalf("expected root and Hypeman data block device mappings, got %d", len(blockDeviceMappings.Content)) - } - dataDevice := requireMapping(t, blockDeviceMappings.Content[1]) - if got := scalar(t, requireField(t, dataDevice, "DeviceName")); got != "/dev/sdf" { - t.Fatalf("data device name = %q, want /dev/sdf", got) - } - dataEBS := requireMapping(t, requireField(t, dataDevice, "Ebs")) - assertRef(t, requireField(t, dataEBS, "VolumeSize"), "DataVolumeSize") - userData := nodeText(requireField(t, hostProperties, "UserData")) assertContains(t, userData, "curl -fsSL https://raw.githubusercontent.com/kernel/hypeman/main/scripts/install.sh | bash") + assertContains(t, userData, `if [ -n "${DataVolumeThroughput}" ]; then`) + assertContains(t, userData, `Environment="CAPACITY__DISK_IO=${DataVolumeThroughput}MB/s"`) assertContains(t, userData, "xfsprogs") assertContains(t, userData, "mkfs.xfs -f") assertContains(t, userData, "/var/lib/hypeman") diff --git a/lib/resources/README.md b/lib/resources/README.md index af71f6fd..1ca94319 100644 --- a/lib/resources/README.md +++ b/lib/resources/README.md @@ -73,7 +73,7 @@ Per-VM disk I/O rate limiting with burst support: - **Cloud Hypervisor**: Uses native `RateLimiterConfig` with token bucket - **QEMU**: Uses drive `throttling.bps-total` options -- **Default**: Proportional to CPU: `(vcpus / cpu_capacity) * disk_io_capacity * 2.0` +- **Default**: Proportional to CPU: `(vcpus / cpu_capacity) * disk_io_capacity` - **Burst**: 4x sustained rate (allows fast cold starts) ## Example: Default Limits @@ -84,9 +84,9 @@ Per-VM disk I/O rate limiting with burst support: | Resource | Calculation | Default Limit | |----------|-------------|---------------| -| Network (down/up) | 10Gbps × 2.0 × 12.5% | 2.5 Gbps (312 MB/s) | -| Disk I/O (sustained) | 1GB/s × 2.0 × 12.5% | 250 MB/s | -| Disk I/O (burst) | 250 MB/s × 4 | 1 GB/s | +| Network (down/up) | 10Gbps × 12.5% | 1.25 Gbps (156 MB/s) | +| Disk I/O (sustained) | 1GB/s × 12.5% | 125 MB/s | +| Disk I/O (burst) | 125 MB/s × 4 | 500 MB/s | ## Effective Limits diff --git a/lib/resources/resource.go b/lib/resources/resource.go index 79f1c974..caaf5ba5 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -787,7 +787,7 @@ func (m *Manager) DiskIOCapacity() int64 { // DefaultNetworkBandwidth calculates the default network bandwidth for an instance // based on its CPU allocation proportional to host CPU capacity. -// Formula: (instanceVcpus / hostCpuCapacity) * networkCapacity * oversubRatio +// Formula: (instanceVcpus / hostCpuCapacity) * networkCapacity // Returns symmetric download/upload limits. func (m *Manager) DefaultNetworkBandwidth(vcpus int) (downloadBps, uploadBps int64) { cpuCapacity := m.CPUCapacity() @@ -800,11 +800,7 @@ func (m *Manager) DefaultNetworkBandwidth(vcpus int) (downloadBps, uploadBps int return 0, 0 } - ratio := m.GetOversubRatio(ResourceNetwork) - effectiveNet := int64(float64(netCapacity) * ratio) - - // Proportional to CPU: (vcpus / cpuCapacity) * effectiveNet - bandwidth := (int64(vcpus) * effectiveNet) / cpuCapacity + bandwidth := (int64(vcpus) * netCapacity) / cpuCapacity // Symmetric limits by default return bandwidth, bandwidth @@ -812,7 +808,7 @@ func (m *Manager) DefaultNetworkBandwidth(vcpus int) (downloadBps, uploadBps int // DefaultDiskIOBandwidth calculates the default disk I/O bandwidth for an instance // based on its CPU allocation proportional to host CPU capacity. -// Formula: (instanceVcpus / hostCpuCapacity) * diskIOCapacity * oversubRatio +// Formula: (instanceVcpus / hostCpuCapacity) * diskIOCapacity // Returns sustained rate and burst rate (4x sustained). func (m *Manager) DefaultDiskIOBandwidth(vcpus int) (ioBps, burstBps int64) { cpuCapacity := m.CPUCapacity() @@ -825,14 +821,7 @@ func (m *Manager) DefaultDiskIOBandwidth(vcpus int) (ioBps, burstBps int64) { return 0, 0 } - ratio := m.cfg.Oversubscription.DiskIO - if ratio <= 0 { - ratio = 2.0 // Default 2x oversubscription for disk I/O - } - effectiveIO := int64(float64(ioCapacity) * ratio) - - // Proportional to CPU: (vcpus / cpuCapacity) * effectiveIO - sustained := (int64(vcpus) * effectiveIO) / cpuCapacity + sustained := (int64(vcpus) * ioCapacity) / cpuCapacity // Burst is 4x sustained (allows fast cold starts) burst := sustained * 4 diff --git a/lib/resources/resource_test.go b/lib/resources/resource_test.go index 852f8a7b..0ba2504a 100644 --- a/lib/resources/resource_test.go +++ b/lib/resources/resource_test.go @@ -78,7 +78,7 @@ func TestDefaultNetworkBandwidth(t *testing.T) { cfg := &config.Config{ DataDir: t.TempDir(), Oversubscription: config.OversubscriptionConfig{ - CPU: 1.0, Memory: 1.0, Disk: 1.0, Network: 1.0, + CPU: 1.0, Memory: 1.0, Disk: 1.0, Network: 4.0, }, Capacity: config.CapacityConfig{Network: "10Gbps"}, // 1.25 GB/s = 1,250,000,000 bytes/sec } @@ -123,6 +123,35 @@ func TestDefaultNetworkBandwidth_ZeroCPU(t *testing.T) { assert.Equal(t, int64(0), uploadBw, "Should return 0 when CPU capacity is 0") } +func TestDefaultDiskIOBandwidthIgnoresAdmissionOversubscription(t *testing.T) { + cfg := &config.Config{ + DataDir: t.TempDir(), + Oversubscription: config.OversubscriptionConfig{ + CPU: 1.0, Memory: 1.0, Disk: 1.0, Network: 1.0, DiskIO: 4.0, + }, + Capacity: config.CapacityConfig{DiskIO: "1GB/s"}, + } + p := paths.New(cfg.DataDir) + + mgr := NewManager(cfg, p) + mgr.SetInstanceLister(&mockInstanceLister{}) + mgr.SetImageLister(&mockImageLister{}) + mgr.SetVolumeLister(&mockVolumeLister{}) + + err := mgr.Initialize(context.Background()) + require.NoError(t, err) + + cpuCapacity := mgr.CPUCapacity() + ioCapacity := mgr.DiskIOCapacity() + + if cpuCapacity > 0 && ioCapacity > 0 { + ioBps, burstBps := mgr.DefaultDiskIOBandwidth(2) + expected := (int64(2) * ioCapacity) / cpuCapacity + assert.Equal(t, expected, ioBps) + assert.Equal(t, expected*4, burstBps) + } +} + func TestParseBandwidth(t *testing.T) { tests := []struct { input string