In [55]:
import os
import json

DEBUG = False

In [57]:
def check_k8sgpt(k8sgpt_file):
    with open(k8sgpt_file, "r") as f:
        try:
            k8sgpt_data = json.load(f)
            if k8sgpt_data["results"]:
                for result in k8sgpt_data["results"]:
                    if result.get("kind") == "Service":
                        continue # Skip Services
                    if result.get("kind") == "PersistentVolumeClaim":
                        for error in result.get("error", []):
                            DEBUG and print(f"PVC error: {error['Text']} in {k8sgpt_file}")
                            return "PVC_failed"
                    if result.get("kind") == "Pod":
                        if "mysql" in result.get("parentObject"):
                            for error in result.get("error", []):
                                if "Back-off pulling image" in error["Text"]:
                                    # raise Exception
                                    raise Exception(f"Back-off pulling image in {k8sgpt_file}")
                                elif "PersistentVolumeClaims" in error["Text"]:
                                    DEBUG and print(f"MySQL pod has unbound immediate PersistentVolumeClaims in {k8sgpt_file}")
                                    return "MySQL_PVC_unbound"
                                elif "Readiness probe failed" in error["Text"]:
                                    DEBUG and print(f"MySQL Readiness probe failed in {k8sgpt_file}")
                                    return "MySQL_not_ready"
                                else:
                                    DEBUG and print(f"MySQL unknown error in {k8sgpt_file}")
                                    return "Unknown_error"
                        else:
                            for error in result.get("error", []):
                                if "Back-off pulling image" in error["Text"]:
                                    # raise Exception
                                    raise Exception(f"Back-off pulling image in {k8sgpt_file}")
                                elif "PersistentVolumeClaims" in error["Text"]:
                                    DEBUG and print(f"Wordpress pod has unbound immediate PersistentVolumeClaims in {k8sgpt_file}")
                                    return "Wordpress_PVC_unbound"
                                elif "Readiness probe failed" in error["Text"]:
                                    DEBUG and print(f"Wordpress Readiness probe failed in {k8sgpt_file}")
                                    return "Wordpress_not_ready"
                                else:
                                    DEBUG and print(f"Wordpress unknown error in {k8sgpt_file}")
                                    return "Unknown_error"
                    else:
                        kind = result.get("kind")
                        for error in result.get("error", []):
                            #raise Exception(f"{kind} error: {error['Text']} in {k8sgpt_file}")
                            DEBUG and print(f"{kind} error: {error['Text']} in {k8sgpt_file}")
                            return "Unknown_error"
        except json.JSONDecodeError:
            raise Exception(f"Error: Failed to parse JSON in {k8sgpt_file}")

In [121]:
import os  
import json
from collections import defaultdict

def aggregate_test_results(base_directory):

    aggregated_results = {
        "total_responses": 0,
        "secrets_used": 0,
        "base64_encoding_needed": 0,
        "yaml_not_wrapped": 0,
        "invalid_yaml": 0,
        "kubeconform_failed": 0,
        "no_mysql_or_wordpress": 0,
        "deployment_failed": 0,
        "duplicate_resources": 0,
        "healthy": 0,
        "unhealthy": 0,
        "PVC_failed": 0,
        "MySQL_PVC_unbound": 0,
        "MySQL_not_ready": 0,
        "Wordpress_PVC_unbound": 0,
        "Wordpress_not_ready": 0,
        "Unknown_error": 0
    }
    aggregate_test_results
    scores_healthy = []
    
    # Iterate over all directories in the base directory
    for response_dir in os.listdir(base_directory):
        response_path = os.path.join(base_directory, response_dir)
        
        # Check if it's a directory and matches the response-* pattern
        if os.path.isdir(response_path) and response_dir.startswith("response-"):
            testing_file = os.path.join(response_path, "testing.json")
            
            # Check if the testing.json file exists
            if os.path.exists(testing_file):
                with open(testing_file, "r") as f:
                    try:
                        testing_data = json.load(f)

                        # Aggregate results  
                        aggregated_results["total_responses"] += 1

                        # Get extra information
                        if testing_data.get("secrets_found"):
                            aggregated_results["secrets_used"] += 1
                            DEBUG and print(f"secrets used in {response_path}")

                        if testing_data.get("base64_needed"):
                            aggregated_results["base64_encoding_needed"] += 1

                        if testing_data.get("yaml_not_wrapped"):
                            aggregated_results["yaml_not_wrapped"] += 1
                            print(f"YAML not wrapped in {response_path}")

                        if not testing_data.get("valid_yaml"):
                            aggregated_results["invalid_yaml"] += 1
                        
                        elif not testing_data.get("kubeconform"):
                            aggregated_results["kubeconform_failed"] += 1
                            kubeconform_file = os.path.join(response_path, "conform.json")
                            if not os.path.exists(kubeconform_file):
                                raise Exception(f"conform.json not found in {response_path}")

                        elif not testing_data.get("mysql_found") or not testing_data.get("wordpress_found"):
                            aggregated_results["no_mysql_or_wordpress"] += 1

                        elif not testing_data.get("deployed_successful"):
                            aggregated_results["deployment_failed"] += 1

                            for deployment_error in testing_data.get("deploy_errors"):
                                DEBUG and print(f"Deployment failed for {response_path}, error: {deployment_error}")
                                if "AlreadyExists" in deployment_error:
                                    aggregated_results["duplicate_resources"] += 1
                                    break

                        elif testing_data.get("healthy"):
                            aggregated_results["healthy"] += 1
                            scores_healthy.append(int(testing_data.get("polaris_score")))

                        else:
                            # why did it fail if it was not healthy
                            aggregated_results["unhealthy"] += 1

                            # Check why it failed in the k8sgpt.json file
                            k8sgpt_file = os.path.join(response_path, "k8sgpt.json")
                            if os.path.exists(k8sgpt_file):
                                cause = check_k8sgpt(k8sgpt_file)
                                if cause == "PVC_failed":
                                    aggregated_results["PVC_failed"] += 1
                                elif cause == "MySQL_PVC_unbound":
                                    aggregated_results["MySQL_PVC_unbound"] += 1
                                elif cause == "MySQL_not_ready":
                                    aggregated_results["MySQL_not_ready"] += 1
                                elif cause == "Wordpress_PVC_unbound":
                                    aggregated_results["Wordpress_PVC_unbound"] += 1
                                elif cause == "Wordpress_not_ready":
                                    aggregated_results["Wordpress_not_ready"] += 1
                                elif cause == "Unknown_error":
                                    aggregated_results["Unknown_error"] += 1
                                
                            else:
                                raise Exception(f"k8sgpt.json not found in {response_path}")
                        
                    except json.JSONDecodeError:
                        raise Exception(f"Error: Failed to parse JSON in {testing_file}")
            else:
                raise Exception(f"testing.json not found in {response_path}")
    return aggregated_results, scores_healthy

# GPT-4o: Zero-Shot

In [61]:
base_directory = "./gpt4o/zero_shot/baseline_system_prompt"
gpt4o_zs_baseline_system_prompt_results, gpt4o_zs_baseline_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")  
print(json.dumps(gpt4o_zs_baseline_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_baseline_system_prompt_scores}")

Aggregated Results: ./gpt4o/zero_shot/baseline_system_prompt
{
    "total_responses": 50,
    "secrets_used": 31,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 25,
    "unhealthy": 25,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 24,
    "Unknown_error": 0
}
Polaris Scores: [40, 40, 37, 37, 35, 36, 40, 40, 40, 40, 37, 35, 37, 40, 35, 35, 40, 37, 35, 35, 43, 35, 40, 40, 40]


In [62]:
base_directory = "./gpt4o/zero_shot/baseline_system_prompt_detailed"
gpt4o_zs_baseline_system_prompt_detailed_results, gpt4o_zs_baseline_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")  
print(json.dumps(gpt4o_zs_baseline_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_baseline_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/zero_shot/baseline_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 49,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 43,
    "unhealthy": 7,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 7,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 44, 42, 42, 39, 44, 42, 42, 42, 42, 42, 39, 44, 42, 44, 42, 42, 42, 42, 42, 44, 44, 39, 42, 42, 42, 42]


In [63]:
base_directory = "./gpt4o/zero_shot/role_system_prompt"
gpt4o_zs_role_system_prompt_results, gpt4o_zs_role_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_system_prompt_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_system_prompt
{
    "total_responses": 50,
    "secrets_used": 35,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 1,
    "healthy": 24,
    "unhealthy": 25,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 25,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 35, 40, 40, 40, 40, 41, 40, 35, 35, 35, 40, 38, 40, 40, 35, 40, 37, 40, 40, 36, 40, 38]


In [64]:
base_directory = "./gpt4o/zero_shot/role_system_prompt_detailed"
gpt4o_zs_role_system_prompt_detailed_results, gpt4o_zs_role_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 48,
    "base64_encoding_needed": 5,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 43,
    "unhealthy": 6,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 4,
    "Unknown_error": 1
}
Polaris Scores: [42, 44, 40, 42, 42, 39, 42, 42, 44, 44, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 42, 42, 42, 44, 44, 42, 42, 42, 44, 42, 44, 42, 42, 42, 42]


In [65]:
base_directory = "./gpt4o/zero_shot/role_best_system_prompt"
gpt4o_zs_role_best_system_prompt_results, gpt4o_zs_role_best_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_best_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_best_system_prompt_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_best_system_prompt
{
    "total_responses": 50,
    "secrets_used": 36,
    "base64_encoding_needed": 2,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 19,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 18,
    "Unknown_error": 1
}
Polaris Scores: [36, 40, 37, 35, 40, 40, 40, 40, 35, 37, 35, 40, 40, 40, 41, 40, 40, 35, 41, 40, 40, 40, 37, 43, 40, 35, 37, 40, 40, 40]


In [66]:
base_directory = "./gpt4o/zero_shot/role_best_system_prompt_detailed"
gpt4o_zs_role_best_system_prompt_detailed_results, gpt4o_zs_role_best_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_best_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_best_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_best_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 10,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 40,
    "unhealthy": 9,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 9,
    "Unknown_error": 0
}
Polaris Scores: [44, 44, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 44, 44, 42, 42, 44, 42, 42, 44, 42, 42, 44, 44, 44, 42, 42, 44, 44, 44, 42, 42, 44, 42]


# GPT-3.5: Zero-Shot

In [67]:
base_directory = "./gpt3_5/zero_shot/baseline_system_prompt"
gpt3_5_zs_baseline_system_prompt_results, gpt3_5_zs_baseline_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_baseline_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_baseline_system_prompt_scores}")

Aggregated Results: ./gpt3_5/zero_shot/baseline_system_prompt
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 6,
    "duplicate_resources": 0,
    "healthy": 3,
    "unhealthy": 37,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 36,
    "Unknown_error": 0
}
Polaris Scores: [35, 35, 32]


In [68]:
base_directory = "./gpt3_5/zero_shot/baseline_system_prompt_detailed"
gpt3_5_zs_baseline_system_prompt_detailed_results, gpt3_5_zs_baseline_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_baseline_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_baseline_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/zero_shot/baseline_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 48,
    "base64_encoding_needed": 24,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 6,
    "duplicate_resources": 2,
    "healthy": 0,
    "unhealthy": 43,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 34,
    "Unknown_error": 9
}
Polaris Scores: []


In [69]:
base_directory = "./gpt3_5/zero_shot/role_system_prompt"
gpt3_5_zs_role_system_prompt_results, gpt3_5_zs_role_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_system_prompt_scores}")

Aggregated Results: ./gpt3_5/zero_shot/role_system_prompt
{
    "total_responses": 50,
    "secrets_used": 0,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 13,
    "duplicate_resources": 0,
    "healthy": 1,
    "unhealthy": 36,
    "PVC_failed": 3,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 31,
    "Unknown_error": 2
}
Polaris Scores: [35]


In [70]:
base_directory = "./gpt3_5/zero_shot/role_system_prompt_detailed"
gpt3_5_zs_role_system_prompt_detailed_results, gpt3_5_zs_role_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/zero_shot/role_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 44,
    "base64_encoding_needed": 20,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 3,
    "unhealthy": 45,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 31,
    "Unknown_error": 14
}
Polaris Scores: [39, 38, 38]


In [71]:
base_directory = "./gpt3_5/zero_shot/role_best_system_prompt"
gpt3_5_zs_role_best_system_prompt_results, gpt3_5_zs_role_best_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_best_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_best_system_prompt_scores}")

Aggregated Results: ./gpt3_5/zero_shot/role_best_system_prompt
{
    "total_responses": 50,
    "secrets_used": 0,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 4,
    "duplicate_resources": 0,
    "healthy": 0,
    "unhealthy": 45,
    "PVC_failed": 4,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 40,
    "Unknown_error": 1
}
Polaris Scores: []


In [72]:
base_directory = "./gpt3_5/zero_shot/role_best_system_prompt_detailed"
gpt3_5_zs_role_best_system_prompt_detailed_results, gpt3_5_zs_role_best_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_best_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_best_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/zero_shot/role_best_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 41,
    "base64_encoding_needed": 21,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 2,
    "duplicate_resources": 1,
    "healthy": 1,
    "unhealthy": 45,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 36,
    "Unknown_error": 9
}
Polaris Scores: [41]


# GPT-4: Zero-Shot

In [73]:
base_directory = "./gpt4/zero_shot/baseline_system_prompt"
gpt4_zs_baseline_system_prompt_results, gpt4_zs_baseline_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_baseline_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_baseline_system_prompt_scores}")

Aggregated Results: ./gpt4/zero_shot/baseline_system_prompt
{
    "total_responses": 50,
    "secrets_used": 24,
    "base64_encoding_needed": 11,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 10,
    "unhealthy": 39,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 34,
    "Unknown_error": 5
}
Polaris Scores: [40, 35, 35, 35, 35, 35, 35, 35, 35, 35]


In [74]:
base_directory = "./gpt4/zero_shot/baseline_system_prompt_detailed"
gpt4_zs_baseline_system_prompt_detailed_results, gpt4_zs_baseline_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_baseline_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_baseline_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/zero_shot/baseline_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 12,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 29,
    "unhealthy": 15,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Unknown_error": 2
}
Polaris Scores: [39, 42, 42, 44, 39, 42, 42, 44, 43, 42, 42, 42, 44, 42, 39, 42, 42, 42, 42, 43, 42, 42, 44, 42, 42, 43, 42, 42, 42]


In [75]:
base_directory = "./gpt4/zero_shot/role_system_prompt"
gpt4_zs_role_system_prompt_results, gpt4_zs_role_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_system_prompt_scores}")

Aggregated Results: ./gpt4/zero_shot/role_system_prompt
{
    "total_responses": 50,
    "secrets_used": 27,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 6,
    "unhealthy": 42,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 38,
    "Unknown_error": 4
}
Polaris Scores: [35, 40, 35, 37, 35, 40]


In [76]:
base_directory = "./gpt4/zero_shot/role_system_prompt_detailed"
gpt4_zs_role_system_prompt_detailed_results, gpt4_zs_role_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/zero_shot/role_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 50,
    "base64_encoding_needed": 4,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 35,
    "unhealthy": 15,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Unknown_error": 2
}
Polaris Scores: [42, 42, 44, 39, 41, 42, 44, 42, 42, 42, 39, 43, 42, 39, 42, 39, 42, 42, 42, 43, 42, 42, 46, 43, 43, 42, 42, 43, 42, 42, 43, 42, 43, 39, 39]


In [77]:
base_directory = "./gpt4/zero_shot/role_best_system_prompt"
gpt4_zs_role_best_system_prompt_results, gpt4_zs_role_best_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_best_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_best_system_prompt_scores}")

Aggregated Results: ./gpt4/zero_shot/role_best_system_prompt
{
    "total_responses": 50,
    "secrets_used": 42,
    "base64_encoding_needed": 11,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 5,
    "unhealthy": 44,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 42,
    "Unknown_error": 2
}
Polaris Scores: [37, 40, 37, 35, 40]


In [78]:
base_directory = "./gpt4/zero_shot/role_best_system_prompt_detailed"
gpt4_zs_role_best_system_prompt_detailed_results, gpt4_zs_role_best_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_best_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_best_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/zero_shot/role_best_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 49,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 36,
    "unhealthy": 13,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Unknown_error": 3
}
Polaris Scores: [41, 44, 42, 42, 42, 41, 46, 42, 44, 42, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 39, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42]


# GPT-4o: CoT

In [122]:
base_directory = "./gpt4o/cot/human_prompt"
gpt4o_cot_human_prompt_results, gpt4o_cot_human_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_human_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_human_prompt_scores}")

YAML not wrapped in ./gpt4o/cot/human_prompt/response-0
Aggregated Results: ./gpt4o/cot/human_prompt
{
    "total_responses": 50,
    "secrets_used": 21,
    "base64_encoding_needed": 12,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 3,
    "kubeconform_failed": 20,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 20,
    "unhealthy": 7,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 5,
    "Unknown_error": 0
}
Polaris Scores: [35, 35, 35, 40, 39, 35, 40, 40, 40, 43, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40]


In [123]:
base_directory = "./gpt4o/cot/human_prompt_detailed"
gpt4o_cot_human_prompt_detailed_results, gpt4o_cot_human_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_human_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_human_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/cot/human_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 25,
    "base64_encoding_needed": 10,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 25,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 18,
    "unhealthy": 7,
    "PVC_failed": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 5,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 42, 42, 42, 42, 42, 42, 40, 42, 42, 42, 44, 44, 42, 42, 42, 40]


In [124]:
base_directory = "./gpt4o/cot/ape_prompt"
gpt4o_cot_ape_prompt_results, gpt4o_cot_ape_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_ape_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_ape_prompt_scores}")

Aggregated Results: ./gpt4o/cot/ape_prompt
{
    "total_responses": 50,
    "secrets_used": 30,
    "base64_encoding_needed": 11,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 12,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 26,
    "unhealthy": 12,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 12,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 40, 40, 40, 40, 40, 35, 40, 40, 40, 40, 40, 40, 40, 40, 37, 40, 40, 35, 40, 35, 43, 40, 40, 40]


In [125]:
base_directory = "./gpt4o/cot/ape_prompt_detailed"
gpt4o_cot_ape_prompt_detailed_results, gpt4o_cot_ape_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_ape_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_ape_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/cot/ape_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 32,
    "base64_encoding_needed": 20,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 18,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 21,
    "unhealthy": 11,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 9,
    "Unknown_error": 1
}
Polaris Scores: [42, 42, 42, 44, 44, 40, 42, 42, 42, 42, 40, 42, 42, 44, 42, 42, 44, 44, 42, 44, 42]


# GPT-4: CoT

In [83]:
base_directory = "./gpt4/cot/human_prompt"
gpt4_cot_human_prompt_results, gpt4_cot_human_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_human_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_human_prompt_scores}")

Aggregated Results: ./gpt4/cot/human_prompt
{
    "total_responses": 50,
    "secrets_used": 33,
    "base64_encoding_needed": 23,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 3,
    "no_mysql_or_wordpress": 2,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 15,
    "unhealthy": 30,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 30,
    "Unknown_error": 0
}
Polaris Scores: [40, 40, 35, 40, 35, 40, 40, 35, 42, 35, 35, 35, 35, 35, 35]


In [84]:
base_directory = "./gpt4/cot/human_prompt_detailed"
gpt4_cot_human_prompt_detailed_results, gpt4_cot_human_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_human_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_human_prompt_detailed_scores}")

Aggregated Results: ./gpt4/cot/human_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 23,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 3,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 31,
    "unhealthy": 16,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 16,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 42, 42, 42, 42, 42, 43, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 43, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42]


In [85]:
base_directory = "./gpt4/cot/ape_prompt"
gpt4_cot_ape_prompt_results, gpt4_cot_ape_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_ape_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_ape_prompt_scores}")

Aggregated Results: ./gpt4/cot/ape_prompt
{
    "total_responses": 50,
    "secrets_used": 34,
    "base64_encoding_needed": 25,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 18,
    "unhealthy": 28,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 26,
    "Unknown_error": 2
}
Polaris Scores: [35, 35, 35, 35, 35, 40, 40, 40, 35, 35, 40, 40, 40, 40, 35, 40, 40, 35]


In [86]:
base_directory = "./gpt4/cot/ape_prompt_detailed"
gpt4_cot_ape_prompt_detailed_results, gpt4_cot_ape_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_ape_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_ape_prompt_detailed_scores}")

Aggregated Results: ./gpt4/cot/ape_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 46,
    "base64_encoding_needed": 25,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 25,
    "unhealthy": 22,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 15,
    "Unknown_error": 7
}
Polaris Scores: [42, 42, 44, 42, 42, 42, 42, 36, 42, 42, 42, 39, 42, 39, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 42]


# GPT-3.5: CoT

In [87]:
base_directory = "./gpt3_5/cot/human_prompt"
gpt3_5_cot_human_prompt_results, gpt3_5_cot_human_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_human_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_human_prompt_scores}")

Aggregated Results: ./gpt3_5/cot/human_prompt
{
    "total_responses": 50,
    "secrets_used": 10,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 9,
    "unhealthy": 38,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 35,
    "Unknown_error": 3
}
Polaris Scores: [35, 32, 35, 32, 36, 32, 35, 35, 36]


In [88]:
base_directory = "./gpt3_5/cot/human_prompt_detailed"
gpt3_5_cot_human_prompt_detailed_results, gpt3_5_cot_human_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_human_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_human_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/cot/human_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 41,
    "base64_encoding_needed": 22,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 4,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 2,
    "unhealthy": 40,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 36,
    "Unknown_error": 4
}
Polaris Scores: [36, 39]


In [89]:
base_directory = "./gpt3_5/cot/ape_prompt"
gpt3_5_cot_ape_prompt_results, gpt3_5_cot_ape_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_ape_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_ape_prompt_scores}")

Aggregated Results: ./gpt3_5/cot/ape_prompt
{
    "total_responses": 50,
    "secrets_used": 10,
    "base64_encoding_needed": 9,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 10,
    "unhealthy": 37,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 37,
    "Unknown_error": 0
}
Polaris Scores: [32, 32, 32, 32, 32, 32, 32, 32, 35, 32]


In [90]:
base_directory = "./gpt3_5/cot/ape_prompt_detailed"
gpt3_5_cot_ape_prompt_detailed_results, gpt3_5_cot_ape_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_ape_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_ape_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/cot/ape_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 24,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 3,
    "duplicate_resources": 0,
    "healthy": 7,
    "unhealthy": 39,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 32,
    "Unknown_error": 7
}
Polaris Scores: [36, 36, 36, 36, 36, 36, 39]


# GPT-4o: ToT

In [91]:
base_directory = "./gpt4o/tot/tot_prompt_1"
gpt4o_tot_prompt_1_results, gpt4o_tot_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_prompt_1_scores}")


Aggregated Results: ./gpt4o/tot/tot_prompt_1
{
    "total_responses": 50,
    "secrets_used": 29,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 19,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 17,
    "Unknown_error": 1
}
Polaris Scores: [40, 43, 40, 35, 43, 35, 37, 35, 35, 37, 37, 35, 40, 40, 35, 35, 43, 37, 37, 40, 35, 40, 40, 40, 40, 35, 40, 40, 37, 40]


In [92]:
base_directory = "./gpt4o/tot/tot_prompt_2"
gpt4o_tot_prompt_2_results, gpt4o_tot_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_prompt_2_scores}")

Aggregated Results: ./gpt4o/tot/tot_prompt_2
{
    "total_responses": 50,
    "secrets_used": 28,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 2,
    "duplicate_resources": 2,
    "healthy": 21,
    "unhealthy": 23,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 19,
    "Unknown_error": 3
}
Polaris Scores: [40, 40, 40, 35, 40, 35, 35, 40, 37, 35, 35, 35, 43, 40, 35, 35, 37, 43, 40, 35, 40]


In [155]:
base_directory = "./gpt4o/tot/tot_prompt_3"
gpt4o_tot_prompt_3_results, gpt4o_tot_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_prompt_3_scores}")

Aggregated Results: ./gpt4o/tot/tot_prompt_3
{
    "total_responses": 50,
    "secrets_used": 20,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 3,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 1,
    "healthy": 26,
    "unhealthy": 18,
    "PVC_failed": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 16,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 37, 37, 37, 43, 40, 40, 40, 40, 40, 37, 35, 40, 35, 37, 37, 37, 40, 40, 37, 37, 35, 40, 40, 37]


In [127]:
base_directory = "./gpt4o/tot/tot_detailed_prompt_1"
gpt4o_tot_detailed_prompt_1_results, gpt4o_tot_detailed_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_detailed_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_detailed_prompt_1_scores}")

Aggregated Results: ./gpt4o/tot/tot_detailed_prompt_1
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 36,
    "unhealthy": 14,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 11,
    "Unknown_error": 2
}
Polaris Scores: [42, 42, 44, 44, 44, 44, 42, 42, 44, 42, 42, 42, 42, 42, 42, 44, 44, 42, 44, 42, 42, 42, 42, 42, 42, 43, 42, 44, 42, 42, 44, 44, 42, 44, 39, 42]


In [128]:
base_directory = "./gpt4o/tot/tot_detailed_prompt_2"
gpt4o_tot_detailed_prompt_2_results, gpt4o_tot_detailed_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_detailed_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_detailed_prompt_2_scores}")

Aggregated Results: ./gpt4o/tot/tot_detailed_prompt_2
{
    "total_responses": 50,
    "secrets_used": 50,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 38,
    "unhealthy": 12,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 9,
    "Unknown_error": 2
}
Polaris Scores: [42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 40, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 42, 42, 42, 42, 39, 42, 42, 42, 42]


In [129]:
base_directory = "./gpt4o/tot/tot_detailed_prompt_3"
gpt4o_tot_detailed_prompt_3_results, gpt4o_tot_detailed_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_detailed_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_detailed_prompt_3_scores}")

Aggregated Results: ./gpt4o/tot/tot_detailed_prompt_3
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 18,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 15,
    "Unknown_error": 3
}
Polaris Scores: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42]


# GPT-4: ToT

In [130]:
base_directory = "./gpt4/tot/tot_prompt_1"
gpt4_tot_prompt_1_results, gpt4_tot_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_prompt_1_scores}")

Aggregated Results: ./gpt4/tot/tot_prompt_1
{
    "total_responses": 50,
    "secrets_used": 9,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 4,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 13,
    "unhealthy": 31,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 24,
    "Unknown_error": 7
}
Polaris Scores: [37, 35, 37, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]


In [131]:
base_directory = "./gpt4/tot/tot_prompt_2"
gpt4_tot_prompt_2_results, gpt4_tot_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_prompt_2_scores}")

Aggregated Results: ./gpt4/tot/tot_prompt_2
{
    "total_responses": 50,
    "secrets_used": 20,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 9,
    "no_mysql_or_wordpress": 2,
    "deployment_failed": 24,
    "duplicate_resources": 23,
    "healthy": 3,
    "unhealthy": 10,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 4,
    "Unknown_error": 6
}
Polaris Scores: [35, 40, 35]


In [132]:
base_directory = "./gpt4/tot/tot_prompt_3"
gpt4_tot_prompt_3_results, gpt4_tot_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_prompt_3_scores}")

YAML not wrapped in ./gpt4/tot/tot_prompt_3/response-25
YAML not wrapped in ./gpt4/tot/tot_prompt_3/response-48
Aggregated Results: ./gpt4/tot/tot_prompt_3
{
    "total_responses": 50,
    "secrets_used": 11,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 2,
    "invalid_yaml": 7,
    "kubeconform_failed": 6,
    "no_mysql_or_wordpress": 9,
    "deployment_failed": 4,
    "duplicate_resources": 4,
    "healthy": 1,
    "unhealthy": 23,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Unknown_error": 10
}
Polaris Scores: [35]


In [136]:
base_directory = "./gpt4/tot/tot_detailed_prompt_1"
gpt4_tot_detailed_prompt_1_results, gpt4_tot_detailed_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_detailed_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_detailed_prompt_1_scores}")

Aggregated Results: ./gpt4/tot/tot_detailed_prompt_1
{
    "total_responses": 50,
    "secrets_used": 32,
    "base64_encoding_needed": 14,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 3,
    "no_mysql_or_wordpress": 14,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 18,
    "unhealthy": 15,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Unknown_error": 2
}
Polaris Scores: [42, 44, 42, 42, 42, 44, 42, 42, 42, 39, 42, 44, 42, 42, 42, 39, 43, 42]


In [134]:
base_directory = "./gpt4/tot/tot_detailed_prompt_2"
gpt4_tot_detailed_prompt_2_results, gpt4_tot_detailed_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_detailed_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_detailed_prompt_2_scores}")

Aggregated Results: ./gpt4/tot/tot_detailed_prompt_2
{
    "total_responses": 50,
    "secrets_used": 30,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 5,
    "kubeconform_failed": 6,
    "no_mysql_or_wordpress": 8,
    "deployment_failed": 9,
    "duplicate_resources": 9,
    "healthy": 8,
    "unhealthy": 14,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 7,
    "Unknown_error": 7
}
Polaris Scores: [44, 42, 42, 42, 42, 40, 44, 42]


In [135]:
base_directory = "./gpt4/tot/tot_detailed_prompt_3"
gpt4_tot_detailed_prompt_3_results, gpt4_tot_detailed_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_detailed_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_detailed_prompt_3_scores}")

YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-22
YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-45
YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-16
YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-37
YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-39
YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-30
YAML not wrapped in ./gpt4/tot/tot_detailed_prompt_3/response-35
Aggregated Results: ./gpt4/tot/tot_detailed_prompt_3
{
    "total_responses": 50,
    "secrets_used": 19,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 7,
    "invalid_yaml": 12,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 15,
    "deployment_failed": 3,
    "duplicate_resources": 2,
    "healthy": 1,
    "unhealthy": 15,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 8,
    "Unknown_error": 7
}
Polaris Scores: [44]


# GPT-3.5: ToT

In [137]:
base_directory = "./gpt3_5/tot/tot_prompt_1"
gpt3_5_tot_prompt_1_results, gpt3_5_tot_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_prompt_1_scores}")

Aggregated Results: ./gpt3_5/tot/tot_prompt_1
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 5,
    "no_mysql_or_wordpress": 20,
    "deployment_failed": 5,
    "duplicate_resources": 4,
    "healthy": 0,
    "unhealthy": 18,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 1,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Unknown_error": 7
}
Polaris Scores: []


In [138]:
base_directory = "./gpt3_5/tot/tot_prompt_2"
gpt3_5_tot_prompt_2_results, gpt3_5_tot_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_prompt_2_scores}")

Aggregated Results: ./gpt3_5/tot/tot_prompt_2
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 21,
    "deployment_failed": 9,
    "duplicate_resources": 8,
    "healthy": 0,
    "unhealthy": 18,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Unknown_error": 7
}
Polaris Scores: []


In [139]:
base_directory = "./gpt3_5/tot/tot_prompt_3"
gpt3_5_tot_prompt_3_results, gpt3_5_tot_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_prompt_3_scores}")

Aggregated Results: ./gpt3_5/tot/tot_prompt_3
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 3,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 14,
    "deployment_failed": 17,
    "duplicate_resources": 16,
    "healthy": 0,
    "unhealthy": 12,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 7,
    "Unknown_error": 5
}
Polaris Scores: []


In [140]:
base_directory = "./gpt3_5/tot/tot_detailed_prompt_1"
gpt3_5_tot_detailed_prompt_1_results, gpt3_5_tot_detailed_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_detailed_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_detailed_prompt_1_scores}")

Aggregated Results: ./gpt3_5/tot/tot_detailed_prompt_1
{
    "total_responses": 50,
    "secrets_used": 39,
    "base64_encoding_needed": 22,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 21,
    "deployment_failed": 8,
    "duplicate_resources": 5,
    "healthy": 0,
    "unhealthy": 16,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Unknown_error": 6
}
Polaris Scores: []


In [141]:
base_directory = "./gpt3_5/tot/tot_detailed_prompt_2"
gpt3_5_tot_detailed_prompt_2_results, gpt3_5_tot_detailed_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_detailed_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_detailed_prompt_2_scores}")

Aggregated Results: ./gpt3_5/tot/tot_detailed_prompt_2
{
    "total_responses": 50,
    "secrets_used": 42,
    "base64_encoding_needed": 23,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 6,
    "no_mysql_or_wordpress": 2,
    "deployment_failed": 14,
    "duplicate_resources": 10,
    "healthy": 1,
    "unhealthy": 25,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 16,
    "Unknown_error": 9
}
Polaris Scores: [36]


In [142]:
base_directory = "./gpt3_5/tot/tot_detailed_prompt_3"
gpt3_5_tot_detailed_prompt_3_results, gpt3_5_tot_detailed_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_detailed_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_detailed_prompt_3_scores}")

YAML not wrapped in ./gpt3_5/tot/tot_detailed_prompt_3/response-18
Aggregated Results: ./gpt3_5/tot/tot_detailed_prompt_3
{
    "total_responses": 50,
    "secrets_used": 18,
    "base64_encoding_needed": 12,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 4,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 10,
    "deployment_failed": 5,
    "duplicate_resources": 5,
    "healthy": 0,
    "unhealthy": 27,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 5,
    "Unknown_error": 22
}
Polaris Scores: []


# GPT-4o: Meta

In [143]:
base_directory = "./gpt4o/meta/meta_system_prompt"
gpt4o_meta_system_prompt_results, gpt4o_meta_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_system_prompt_scores}")

Aggregated Results: ./gpt4o/meta/meta_system_prompt
{
    "total_responses": 50,
    "secrets_used": 40,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 28,
    "unhealthy": 22,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 21,
    "Unknown_error": 1
}
Polaris Scores: [40, 37, 40, 40, 40, 40, 43, 40, 40, 43, 35, 40, 40, 40, 43, 35, 37, 35, 43, 40, 40, 43, 43, 40, 35, 40, 40, 40]


In [144]:
base_directory = "./gpt4o/meta/meta_system_prompt_detailed"
gpt4o_meta_system_prompt_detailed_results, gpt4o_meta_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/meta/meta_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 49,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 44,
    "unhealthy": 5,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 3,
    "Unknown_error": 1
}
Polaris Scores: [44, 44, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 44, 44, 42, 42, 42, 42, 42, 42, 44, 44, 42, 42, 44, 39, 42, 39, 44, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 44]


In [145]:
base_directory = "./gpt4o/meta/meta_meta_prompt"
gpt4o_meta_meta_prompt_results, gpt4o_meta_meta_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_meta_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_meta_prompt_scores}")

Aggregated Results: ./gpt4o/meta/meta_meta_prompt
{
    "total_responses": 50,
    "secrets_used": 22,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 24,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 4,
    "unhealthy": 19,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 8,
    "Unknown_error": 10
}
Polaris Scores: [55, 62, 58, 57]


In [146]:
base_directory = "./gpt4o/meta/meta_meta_prompt_detailed"
gpt4o_meta_meta_prompt_detailed_results, gpt4o_meta_meta_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_meta_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_meta_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/meta/meta_meta_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 37,
    "base64_encoding_needed": 14,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 11,
    "no_mysql_or_wordpress": 6,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 14,
    "unhealthy": 19,
    "PVC_failed": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 7,
    "Unknown_error": 10
}
Polaris Scores: [59, 60, 44, 51, 46, 59, 45, 46, 54, 46, 51, 61, 56, 41]


# GPT-4: Meta

In [147]:
base_directory = "./gpt4/meta/meta_system_prompt"
gpt4_meta_system_prompt_results, gpt4_meta_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_system_prompt_scores}")

Aggregated Results: ./gpt4/meta/meta_system_prompt
{
    "total_responses": 50,
    "secrets_used": 43,
    "base64_encoding_needed": 9,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 8,
    "unhealthy": 39,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 38,
    "Unknown_error": 1
}
Polaris Scores: [40, 37, 40, 35, 40, 43, 40, 40]


In [148]:
base_directory = "./gpt4/meta/meta_system_prompt_detailed"
gpt4_meta_system_prompt_detailed_results, gpt4_meta_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/meta/meta_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 50,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 35,
    "unhealthy": 15,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 14,
    "Unknown_error": 1
}
Polaris Scores: [42, 42, 43, 39, 39, 39, 39, 42, 42, 38, 42, 39, 43, 42, 42, 42, 42, 42, 39, 39, 38, 39, 42, 42, 42, 42, 39, 39, 43, 42, 42, 42, 39, 39, 42]


In [149]:
base_directory = "./gpt4/meta/meta_meta_prompt"
gpt4_meta_meta_prompt_results, gpt4_meta_meta_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_meta_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_meta_prompt_scores}")

Aggregated Results: ./gpt4/meta/meta_meta_prompt
{
    "total_responses": 50,
    "secrets_used": 27,
    "base64_encoding_needed": 20,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 9,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 1,
    "unhealthy": 37,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 3,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 17,
    "Unknown_error": 16
}
Polaris Scores: [43]


In [150]:
base_directory = "./gpt4/meta/meta_meta_prompt_detailed"
gpt4_meta_meta_prompt_detailed_results, gpt4_meta_meta_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_meta_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_meta_prompt_detailed_scores}")

Aggregated Results: ./gpt4/meta/meta_meta_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 21,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 8,
    "unhealthy": 35,
    "PVC_failed": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 2,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 25,
    "Unknown_error": 7
}
Polaris Scores: [44, 46, 46, 58, 58, 46, 64, 48]


# GPT-3.5: Meta

In [151]:
base_directory = "./gpt3_5/meta/meta_system_prompt"
gpt3_5_meta_system_prompt_results, gpt3_5_meta_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_system_prompt_scores}")

Aggregated Results: ./gpt3_5/meta/meta_system_prompt
{
    "total_responses": 50,
    "secrets_used": 3,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 4,
    "duplicate_resources": 0,
    "healthy": 3,
    "unhealthy": 42,
    "PVC_failed": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 40,
    "Unknown_error": 0
}
Polaris Scores: [36, 36, 36]


In [152]:
base_directory = "./gpt3_5/meta/meta_system_prompt_detailed"
gpt3_5_meta_system_prompt_detailed_results, gpt3_5_meta_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/meta/meta_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 26,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 1,
    "unhealthy": 45,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 42,
    "Unknown_error": 2
}
Polaris Scores: [36]


In [153]:
base_directory = "./gpt3_5/meta/meta_meta_prompt"
gpt3_5_meta_meta_prompt_results, gpt3_5_meta_meta_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_meta_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_meta_prompt_scores}")

Aggregated Results: ./gpt3_5/meta/meta_meta_prompt
{
    "total_responses": 50,
    "secrets_used": 4,
    "base64_encoding_needed": 4,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 9,
    "no_mysql_or_wordpress": 13,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 2,
    "unhealthy": 24,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 1,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 14,
    "Unknown_error": 9
}
Polaris Scores: [38, 47]


In [154]:
base_directory = "./gpt3_5/meta/meta_meta_prompt_detailed"
gpt3_5_meta_meta_prompt_detailed_results, gpt3_5_meta_meta_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_meta_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_meta_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/meta/meta_meta_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 4,
    "base64_encoding_needed": 2,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 28,
    "no_mysql_or_wordpress": 10,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 0,
    "unhealthy": 11,
    "PVC_failed": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 3,
    "Unknown_error": 8
}
Polaris Scores: []
