In [None]:
%%capture
!pip install google-cloud-container google-auth google-auth-oauthlib google-auth-httplib2 google-cloud-compute


In [None]:
!gcloud auth application-default login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=6wMqMd5jbEvCCivLHy3EIZ6q4t69hm&prompt=consent&token_usage=remote&access_type=offline&code_challenge=Q8qziRPlhfWwVF6YPXg69AP4nJuFfA9BakAsEmHBaac&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 

Command killed by keyboard interrupt

^C


In [None]:
import time
from google.cloud import container_v1, compute_v1
from google.auth import exceptions
from google.auth.transport.requests import Request
import concurrent.futures

In [None]:
# Target GCP project and the name given to the cluster that gets created.
project_id = "YOUR_ID"
cluster_name = "YOUR_CLUSTER_NAME"

# Node shape: machine type, initial node count, and the accelerator attached to the nodes.
machine_type = "n1-standard-4"
num_nodes = 1
accelerator_type = "nvidia-tesla-t4"
accelerator_count = 1

In [None]:
# GKE cluster-manager client shared by the functions in this notebook: it creates
# clusters and polls, lists and cancels the resulting operations.
# NOTE(review): presumably authenticates via the application-default credentials set up
# by the `gcloud auth application-default login` cell — confirm.
client = container_v1.ClusterManagerClient()

# Compute Engine zones client, used to enumerate the zones of the project.
compute_client = compute_v1.ZonesClient()

def get_available_zones():
    """Return the names of every zone listed for ``project_id`` whose status is "UP".

    Returns:
        list[str]: zone names, in the order the zones client yields them.
    """
    return [
        candidate.name
        for candidate in compute_client.list(project=project_id)
        if candidate.status == "UP"
    ]

def create_cluster_in_zone(zone, poll_interval=10, timeout=None):
    """Request the configured cluster in ``zone`` and block until the operation ends.

    The cluster spec is built from the notebook-level settings (``cluster_name``,
    ``num_nodes``, ``machine_type``, ``accelerator_type``, ``accelerator_count``).
    After submitting the create request the function polls the returned operation
    until its status is DONE, printing progress along the way.

    Args:
        zone: Zone name used for both the create request and the status polls.
        poll_interval: Seconds to sleep between polls. Defaults to 10, the value
            that used to be hard-coded.
        timeout: Optional upper bound, in seconds, on how long to wait for the
            operation to reach DONE. ``None`` (the default) waits indefinitely,
            which matches the previous behaviour. Giving up only stops the local
            polling; it does not cancel the operation.

    Returns:
        tuple: ``(zone, success, message)``. ``success`` is True only when the
        operation finished without an error; otherwise ``message`` describes the
        failure. Exceptions are caught and reported through the tuple rather than
        raised.
    """
    cluster = container_v1.Cluster(
        name=cluster_name,
        initial_node_count=num_nodes,
        node_config=container_v1.NodeConfig(
            machine_type=machine_type,
            oauth_scopes=[
                "https://www.googleapis.com/auth/cloud-platform"
            ],
            accelerators=[container_v1.AcceleratorConfig(
                accelerator_type=accelerator_type,
                accelerator_count=accelerator_count
            )],
        )
    )

    create_request = container_v1.CreateClusterRequest(
        project_id=project_id,
        zone=zone,
        cluster=cluster
    )

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout

    try:
        operation = client.create_cluster(create_request)
        print(f"Cluster creation started in zone {zone}, operation name: {operation.name}")

        while True:
            op_result = client.get_operation(
                project_id=project_id,
                zone=zone,
                operation_id=operation.name
            )
            if op_result.status == container_v1.Operation.Status.DONE:
                # DONE only means the operation finished; the error field says how.
                if op_result.error:
                    print(f"Error occurred during cluster creation in zone {zone}: {op_result.error}")
                    return (zone, False, f"Error: {op_result.error}")
                print(f"Cluster {cluster_name} created successfully in zone {zone}!")
                return (zone, True, "Success")

            # Without this check a stuck operation would pin the calling thread forever.
            if deadline is not None and time.monotonic() >= deadline:
                print(f"Timed out waiting for cluster creation in zone {zone}")
                return (zone, False, f"Error: timed out after {timeout} seconds")

            print(f"Cluster creation in {zone} still in progress...")
            time.sleep(poll_interval)

    except exceptions.GoogleAuthError as auth_error:
        print(f"Authentication error in {zone}: {auth_error}")
        return (zone, False, f"Auth Error: {auth_error}")
    except Exception as e:
        print(f"Failed to create cluster in zone {zone}: {str(e)}")
        return (zone, False, f"Error: {str(e)}")

def cleanup_other_attempts(successful_zone, all_zones):
    """Cancel unfinished operations for ``cluster_name`` in every zone but the winner.

    For each zone other than ``successful_zone`` the zone's operations are listed, and
    every operation that is not DONE and whose target link points at ``cluster_name``
    (or at a resource nested under it) gets a cancel request. This is best-effort:
    any exception raised while handling a zone is printed and the loop moves on to
    the next zone.

    Args:
        successful_zone: Zone to skip; its operations are left untouched.
        all_zones: Iterable of zone names in which creation was attempted.
    """
    for zone in all_zones:
        if zone == successful_zone:
            continue

        try:
            operations = client.list_operations(
                project_id=project_id,
                zone=zone
            )

            for operation in operations.operations:
                if operation.status == container_v1.Operation.Status.DONE:
                    continue
                # Match the exact cluster path segment: a plain substring test would
                # also hit unrelated clusters whose names merely contain cluster_name.
                if not _targets_cluster(operation.target_link, cluster_name):
                    continue

                print(f"Canceling cluster creation in zone {zone}")
                client.cancel_operation(
                    project_id=project_id,
                    zone=zone,
                    operation_id=operation.name
                )
        except Exception as e:
            print(f"Error while cleaning up operations in zone {zone}: {str(e)}")


def _targets_cluster(target_link, name):
    """Return True when ``target_link`` contains the path segment pair ``clusters/<name>``."""
    segments = (target_link or "").split("/")
    return any(
        segment == "clusters" and following == name
        for segment, following in zip(segments, segments[1:])
    )

def try_create_cluster_parallel(max_workers=50):
    """Race cluster creation across zones and return the first zone that succeeds.

    At most ``max_workers`` of the zones returned by ``get_available_zones()`` are
    tried, one worker thread per zone. As soon as one attempt reports success,
    ``cleanup_other_attempts`` is invoked for the remaining zones and the winning
    zone is returned without waiting for the other workers.

    Args:
        max_workers: Size of the thread pool and cap on the number of zones tried.

    Returns:
        The name of the zone in which the cluster was created, or ``None`` when no
        zones are available or every attempt failed.

    Note:
        Workers still running when a winner is found are not interrupted; they keep
        polling in the background until their own operation finishes.
    """
    zones = get_available_zones()
    if not zones:
        print("No available zones found.")
        return None

    zones_to_try = zones[:max_workers]

    # Report the number of zones actually attempted, not the number discovered.
    print(f"Attempting to create cluster in parallel across {len(zones_to_try)} zones")

    # The executor is managed by hand instead of through `with`: leaving a `with`
    # block calls shutdown(wait=True), which would hold back the early return below
    # until every other zone's worker had finished.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    try:
        future_to_zone = {executor.submit(create_cluster_in_zone, zone): zone for zone in zones_to_try}

        for future in concurrent.futures.as_completed(future_to_zone):
            zone = future_to_zone[future]
            try:
                zone_result, success, message = future.result()
                if success:
                    print(f"Successfully created cluster in {zone_result}. Canceling other attempts...")
                    cleanup_other_attempts(zone_result, zones_to_try)
                    return zone_result
            except Exception as e:
                print(f"Exception in zone {zone}: {str(e)}")
    finally:
        # On the all-failed path every future is already done, so this is a no-op wait.
        executor.shutdown(wait=False)

    print("All parallel attempts to create the cluster have failed.")
    return None

# Entry point: run the parallel attempt and report the overall outcome.
if __name__ == "__main__":
    successful_zone = try_create_cluster_parallel()
    if not successful_zone:
        print("Cluster creation failed in all attempted zones")
    else:
        print(f"Cluster creation completed successfully in zone {successful_zone}")