In [1]:
import json
import random
import uuid


def generate_node(node_type):
    """Build the configuration dict for one simulated node.

    Args:
        node_type: Tier of the node; one of "cloud", "edge" or "terminal".

    Returns:
        dict with the keys ``node_id``, ``cpu_cores``, ``memory`` (GB),
        ``gpu_count``, ``gpu_model``, ``gpus`` (one ``{"gpu_id", "gpu_type"}``
        entry per GPU), ``gpu_memory`` (GB per GPU) and ``ip_address``.

    Raises:
        KeyError: if ``node_type`` is not one of the known tiers.
    """
    # Possible GPU counts per tier (always a multiple of 2).
    gpu_configs = {
        "cloud": [4, 8],      # cloud nodes: 4 or 8 GPUs
        "edge": [2, 4],       # edge nodes: 2 or 4 GPUs
        "terminal": [2, 2]    # terminal nodes: both choices are 2, i.e. always 2 GPUs
    }

    # Possible CPU core counts per tier (multiples of 8).
    cpu_configs = {
        "cloud": [32, 48, 64],
        "edge": [16, 24, 32],
        "terminal": [8, 16, 24]
    }

    # Possible memory sizes per tier, in GB (multiples of 16).
    memory_configs = {
        "cloud": [128, 256, 512],
        "edge": [64, 128, 256],
        "terminal": [32, 64, 128]
    }

    # GPU model and its per-GPU memory (GB) for each tier.
    gpu_specs = {
        "cloud": {"model": "V100", "memory": 32},
        "edge": {"model": "P100", "memory": 16},
        "terminal": {"model": "T4", "memory": 16}
    }

    if node_type not in gpu_configs:
        # Same exception type a failed dict lookup would raise, but with a useful message.
        raise KeyError(
            f"unknown node_type {node_type!r}; expected one of {sorted(gpu_configs)}"
        )

    gpu_count = random.choice(gpu_configs[node_type])
    gpu_spec = gpu_specs[node_type]
    gpu_model = gpu_spec["model"]

    gpu_list = [
        {"gpu_id": str(uuid.uuid4()), "gpu_type": gpu_model}
        for _ in range(gpu_count)
    ]

    return {
        # NOTE: only 9000 distinct suffixes exist, so IDs are not guaranteed
        # unique across calls; callers needing uniqueness must check for it.
        "node_id": f"{node_type}_node_{random.randint(1000, 9999)}",
        "cpu_cores": random.choice(cpu_configs[node_type]),
        "memory": random.choice(memory_configs[node_type]),
        "gpu_count": gpu_count,
        "gpu_model": gpu_model,
        "gpus": gpu_list,
        "gpu_memory": gpu_spec["memory"],
        # Last octet stops at 254: x.x.x.255 is a broadcast address, not a host.
        "ip_address": f"192.168.{random.randint(1, 255)}.{random.randint(1, 254)}"
    }


In [2]:
def _generate_unique_node(node_type, used_node_ids, max_attempts=1000):
    """Return a freshly generated node whose ``node_id`` is not in ``used_node_ids``.

    The accepted ID is added to ``used_node_ids``. Raises RuntimeError if no
    unused ID is found within ``max_attempts`` draws.
    """
    for _ in range(max_attempts):
        node = generate_node(node_type)
        if node["node_id"] not in used_node_ids:
            used_node_ids.add(node["node_id"])
            return node
    raise RuntimeError(
        f"could not generate a unique node_id for {node_type!r} "
        f"after {max_attempts} attempts"
    )


def generate_cluster_config():
    """Generate the full cloud / edge / terminal cluster configuration.

    Returns:
        dict of the form ``{"clusters": {"cloud_clusters": [...],
        "edge_clusters": [...], "terminal_clusters": [...]}}``. Each cluster
        entry holds ``cluster_id``, ``cluster_name``, ``cluster_type``,
        ``nodes`` (node dicts from ``generate_node``, with ``node_id`` values
        unique across the whole configuration), ``intra_domain_bandwidth`` and
        ``inter_domain_bandwidth`` (both in Mbps).

    Raises:
        RuntimeError: if a unique node_id cannot be drawn for some node.
    """
    # Number of clusters per tier (inclusive range).
    cluster_counts = {
        "cloud": {"min": 1, "max": 1},
        "edge": {"min": 2, "max": 2},
        "terminal": {"min": 4, "max": 4},
    }

    # Number of nodes in each cluster of a tier (inclusive range).
    nodes_per_cluster = {
        "cloud": {"min": 10, "max": 10},
        "edge": {"min": 4, "max": 4},
        "terminal": {"min": 4, "max": 4},
    }

    # Intra-cluster bandwidth ranges, in Mbps.
    intra_domain_bandwidth = {
        "cloud": {"min": 40000, "max": 100000},   # 40-100 Gbps
        "edge": {"min": 10000, "max": 40000},     # 10-40 Gbps
        "terminal": {"min": 1000, "max": 10000},  # 1-10 Gbps
    }

    # Inter-domain bandwidth range, in Mbps (shared by all tiers).
    inter_domain_bandwidth = {"min": 200, "max": 300}

    tiers = ("cloud", "edge", "terminal")
    cluster_config = {"clusters": {f"{tier}_clusters": [] for tier in tiers}}

    # generate_node draws its ID suffix from a small range, so collisions are
    # possible; track every ID handed out and redraw on a clash.
    used_node_ids = set()

    for tier in tiers:
        cluster_total = random.randint(cluster_counts[tier]["min"], cluster_counts[tier]["max"])
        for index in range(1, cluster_total + 1):
            node_total = random.randint(
                nodes_per_cluster[tier]["min"], nodes_per_cluster[tier]["max"]
            )
            cluster = {
                "cluster_id": f"{tier}_cluster_{index}",
                "cluster_name": f"{tier.capitalize()}Cluster-{index}",
                "cluster_type": tier,
                "nodes": [
                    _generate_unique_node(tier, used_node_ids) for _ in range(node_total)
                ],
                "intra_domain_bandwidth": random.randint(
                    intra_domain_bandwidth[tier]["min"], intra_domain_bandwidth[tier]["max"]
                ),
                "inter_domain_bandwidth": random.randint(
                    inter_domain_bandwidth["min"], inter_domain_bandwidth["max"]
                ),
            }
            cluster_config["clusters"][f"{tier}_clusters"].append(cluster)

    return cluster_config

In [3]:
def print_config_summary(cluster_config):
    """Print a human-readable summary of a cluster configuration.

    Prints, per tier (cloud / edge / terminal): the number of clusters, the
    number of nodes, the number of GPUs and the mean intra-domain bandwidth,
    followed by the total GPU count and the mean inter-domain bandwidth over
    all clusters of every tier. A mean over zero clusters is printed as
    ``N/A`` instead of raising ZeroDivisionError.

    Args:
        cluster_config: dict shaped like the return value of
            ``generate_cluster_config`` (``cluster_config["clusters"]`` must
            contain ``cloud_clusters``, ``edge_clusters`` and
            ``terminal_clusters``).
    """

    def _mean(values):
        # Average of an iterable, or "N/A" when it is empty.
        values = list(values)
        return sum(values) / len(values) if values else "N/A"

    groups = cluster_config["clusters"]
    # (label used in count lines, label used in bandwidth lines, clusters)
    tiers = [
        ("云端", "云", groups["cloud_clusters"]),
        ("边缘", "边缘", groups["edge_clusters"]),
        ("终端", "终端", groups["terminal_clusters"]),
    ]

    print("集群配置摘要:")
    for label, _, clusters in tiers:
        print(f"{label}集群数量: {len(clusters)}")

    for label, _, clusters in tiers:
        print(f"{label}节点数量: {sum(len(cluster['nodes']) for cluster in clusters)}")

    # Compute per-tier GPU totals once and reuse them for the grand total.
    gpu_totals = [
        sum(node["gpu_count"] for cluster in clusters for node in cluster["nodes"])
        for _, _, clusters in tiers
    ]
    for (label, _, _), gpu_total in zip(tiers, gpu_totals):
        print(f"{label}节点GPU数量: {gpu_total}")
    print(f"总GPU数量: {sum(gpu_totals)}")

    for _, bw_label, clusters in tiers:
        print(f"{bw_label}集群平均带宽: {_mean(cluster['intra_domain_bandwidth'] for cluster in clusters)}")

    # Every cluster carries an inter-domain bandwidth, so average over all
    # tiers rather than over the cloud clusters only.
    every_cluster = [cluster for _, _, clusters in tiers for cluster in clusters]
    print(f"平均域间带宽: {_mean(cluster['inter_domain_bandwidth'] for cluster in every_cluster)}")



In [4]:
def generate_fs_config(cluster_config, replica_count=2):
    """Generate the data-placement config for a fixed set of training tasks.

    For every task in the built-in table, the dataset and the model are each
    assigned to randomly chosen, distinct storage nodes of the cluster.

    Args:
        cluster_config: dict shaped like the return value of
            ``generate_cluster_config``; only
            ``cluster_config["clusters"][*][*]["nodes"][*]["node_id"]`` is read.
        replica_count: number of storage nodes to pick for each dataset and
            each model (default 2). If the cluster has fewer distinct nodes,
            all of them are used.

    Returns:
        ``{"tasks": [...]}`` with one entry per task containing ``model_name``,
        ``task_type``, ``batch_sizes`` and ``dataset`` / ``model`` sub-dicts
        (``size_mb`` and ``storage_nodes``). The table's ``dataset_name`` and
        inner ``model_name`` fields are not emitted.

    Raises:
        ValueError: if the cluster config contains no nodes, or if
            ``replica_count`` is negative.
    """
    # Sizes are in MB; figures are the approximate values chosen by the author.
    training_tasks = {
        "ResNet50": {
            "task_type": "Image Classification",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "ImageNet",
            "dataset_size": 150000,  # ImageNet, ~150 GB
            "model_size": 98,  # ~98 MB
            "model_name": "ResNet50"
        },
        "MobileNetV3": {
            "task_type": "Image Classification",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "ImageNet",
            "dataset_size": 150000,  # ImageNet, ~150 GB
            "model_size": 22,  # ~22 MB
            "model_name": "MobileNetV3"
        },
        "ResNet18": {
            "task_type": "Image Classification",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "CIFAR-10",
            "dataset_size": 170,  # CIFAR-10 archive, ~170 MB
            "model_size": 45,  # ~45 MB
            "model_name": "ResNet18"
        },
        "MobileNetV2": {
            "task_type": "Image Classification",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "CIFAR-10",
            "dataset_size": 170,  # CIFAR-10 archive, ~170 MB
            "model_size": 14,  # ~14 MB
            "model_name": "MobileNetV2"
        },
        "EfficientNet": {
            "task_type": "Image Classification",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "CIFAR-10",
            "dataset_size": 170,  # CIFAR-10 archive, ~170 MB
            "model_size": 29,  # EfficientNet-B0 baseline, ~29 MB
            "model_name": "EfficientNet"
        },
        "VGG11": {
            "task_type": "Image Classification",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "CIFAR-10",
            "dataset_size": 170,  # CIFAR-10 archive, ~170 MB
            "model_size": 507,  # ~507 MB
            "model_name": "VGG11"
        },
        "DCGAN": {
            "task_type": "Image Generation",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "LSUN",
            "dataset_size": 42000,  # LSUN, ~42 GB
            "model_size": 45,  # ~45 MB
            "model_name": "DCGAN"
        },
        "PointNet": {
            "task_type": "3D Point Cloud Processing",
            "batch_sizes": [32, 64, 128],
            "dataset_name": "ShapeNet",
            "dataset_size": 30000,  # ShapeNet Core55, ~30 GB
            "model_size": 40,  # ~40 MB
            "model_name": "PointNet"
        },
        "BERT": {
            "task_type": "Question Answering",
            "batch_sizes": [32],
            "dataset_name": "SQuAD",
            "dataset_size": 35000,  # SQuAD v2.0 after preprocessing, ~35 GB
            "model_size": 1200,  # ~1.2 GB
            "model_name": "BERT"
        },
        "LSTM": {
            "task_type": "Language Modeling",
            "batch_sizes": [64, 128],
            "dataset_name": "Wikitext2",
            "dataset_size": 5000,  # Wikitext2, ~5 GB
            "model_size": 35,  # single-layer LSTM, ~35 MB
            "model_name": "LSTM"
        },
        "Transformer": {
            "task_type": "Machine Translation",
            "batch_sizes": [32, 64],
            "dataset_name": "Multi30k",
            "dataset_size": 3000,  # Multi30k, ~3 GB
            "model_size": 85,  # base Transformer, ~85 MB
            "model_name": "Transformer"
        }
    }

    # Collect every node ID, dropping duplicates while keeping first-seen
    # order, so a node can never be picked twice as its own replica.
    all_nodes = list(dict.fromkeys(
        node["node_id"]
        for clusters in cluster_config["clusters"].values()
        for cluster in clusters
        for node in cluster["nodes"]
    ))
    if not all_nodes:
        raise ValueError("cluster_config contains no nodes to place datasets and models on")

    # random.sample raises when asked for more items than exist; cap instead.
    replicas = min(replica_count, len(all_nodes))

    training_distribution = {
        "tasks": []
    }

    for model_name, task_info in training_tasks.items():
        # Dataset and model placements are drawn independently of each other.
        data_nodes = random.sample(all_nodes, replicas)
        model_nodes = random.sample(all_nodes, replicas)

        training_distribution["tasks"].append({
            "model_name": model_name,
            "task_type": task_info["task_type"],
            "batch_sizes": task_info["batch_sizes"],
            "dataset": {
                "size_mb": task_info["dataset_size"],
                "storage_nodes": data_nodes
            },
            "model": {
                "size_mb": task_info["model_size"],
                "storage_nodes": model_nodes
            }
        })

    return training_distribution

In [5]:
if __name__ == "__main__":
    # Build the cluster resources first; the data layout is derived from them.
    cluster_config = generate_cluster_config()

    # Show a short overview of what was generated.
    print_config_summary(cluster_config)

    # Place each task's dataset and model on nodes of that cluster.
    fs_config = generate_fs_config(cluster_config)

    # Persist both configurations as pretty-printed UTF-8 JSON.
    with open("cluster_config.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(cluster_config, ensure_ascii=False, indent=2))

    with open("fs_config.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(fs_config, ensure_ascii=False, indent=2))

集群配置摘要:
云端集群数量: 1
边缘集群数量: 2
终端集群数量: 4
云端节点数量: 10
边缘节点数量: 8
终端节点数量: 16
云端节点GPU数量: 60
边缘节点GPU数量: 24
终端节点GPU数量: 32
总GPU数量: 116
云集群平均带宽: 80967.0
边缘集群平均带宽: 18573.0
终端集群平均带宽: 4595.75
平均域间带宽: 223.0
