In [89]:
import os
import json

DEBUG = False

In [90]:
def test_funct(base_directory):

    
    # Iterate over all directories in the base directory
    for response_dir in os.listdir(base_directory):
        response_path = os.path.join(base_directory, response_dir)
        
        # Check if it's a directory and matches the response-* pattern
        if os.path.isdir(response_path) and response_dir.startswith("response-"):
            k8sgpt_file = os.path.join(response_path, "k8sgpt.json")
            
            # Check if the testing.json file exists
            if os.path.exists(k8sgpt_file):
                with open(k8sgpt_file, "r") as f:
                    try:
                        k8sgpt_data = json.load(f)
                        if k8sgpt_data["results"]:
                            for result in k8sgpt_data["results"]:
                                if result.get("kind") != "PersistentVolumeClaim" and result.get("kind") != "Pod" and result.get("kind"):
                                    kind = result.get("kind")
                                    for error in result.get("error", []):
                                        #raise Exception(f"{kind} error: {error['Text']} in {k8sgpt_file}")
                                        print(f"{kind} error: {error['Text']} in {k8sgpt_file}")

                        
                    except json.JSONDecodeError:
                        raise Exception(f"Error: Failed to parse JSON in {k8sgpt_file}")

In [91]:
def check_k8sgpt(k8sgpt_file):
    with open(k8sgpt_file, "r") as f:
        try:
            k8sgpt_data = json.load(f)
            if k8sgpt_data["results"]:
                for result in k8sgpt_data["results"]:
                    if result.get("kind") == "PersistentVolumeClaim":
                        for error in result.get("error", []):
                            print(f"PVC error: {error['Text']} in {k8sgpt_file}")
                            return "PVC_failed"
                for result in k8sgpt_data["results"]:
                    if result.get("kind") == "Pod":
                        for error in result.get("error", []):
                            if "Back-off pulling image" in error["Text"]:
                                # raise Exception
                                raise Exception(f"Back-off pulling image in {k8sgpt_file}")
                            elif "not found" in error["Text"] and ("persistentvolumeclaim" in error["Text"] or "secret" in error["Text"] or "configmap" in error["Text"]):
                                DEBUG and print(f"Missing resources: {error['Text']} in {k8sgpt_file}")
                                return "Missing_resources"
                            elif "couldn't find key" in error["Text"]:
                                DEBUG and print(f"Missing key in secret or configmap: {error['Text']} in {k8sgpt_file}")
                                return "Missing_key"
                for result in k8sgpt_data["results"]:
                    if result.get("kind") == "Pod":
                        if "mysql" in result.get("parentObject"):
                            for error in result.get("error", []):
                                if "PersistentVolumeClaims" in error["Text"]:
                                    DEBUG and print(f"MySQL pod has unbound immediate PersistentVolumeClaims in {k8sgpt_file}")
                                    return "MySQL_PVC_unbound"
                                elif "Readiness probe failed" in error["Text"]:
                                    DEBUG and print(f"MySQL Readiness probe failed in {k8sgpt_file}")
                                    return "MySQL_not_ready"
                for result in k8sgpt_data["results"]:
                    if result.get("kind") == "Pod":
                        if "wordpress" in result.get("parentObject"):
                            for error in result.get("error", []):
                                if "PersistentVolumeClaims" in error["Text"]:
                                    print(f"Wordpress pod has unbound immediate PersistentVolumeClaims in {k8sgpt_file}")
                                    return "Wordress_PVC_unbound"
                                elif "Readiness probe failed" in error["Text"]:
                                    DEBUG and print(f"MySQL Readiness probe failed in {k8sgpt_file}")
                                    return "Wordpress_not_ready"
                for result in k8sgpt_data["results"]:
                    if result.get("kind") != "Service":
                        for error in result.get("error", []):
                            print(f"Unknown error in: {error['Text']} in {k8sgpt_file}")
                            return "Unknown_error"
                for result in k8sgpt_data["results"]:
                    if result.get("kind") == "Service":
                        for error in result.get("error", []):
                            print(f"Service error: {error['Text']} in {k8sgpt_file}")
                            return "Service_failed"
                print(f"Unknown error in {k8sgpt_file}")
                return "Unknown_error"
        except json.JSONDecodeError:
            raise Exception(f"Error: Failed to parse JSON in {k8sgpt_file}")

In [92]:
import os  
import json
from collections import defaultdict

def aggregate_test_results(base_directory):

    aggregated_results = {
        "total_responses": 0,
        "secrets_used": 0,
        "base64_encoding_needed": 0,
        "yaml_not_wrapped": 0,
        "invalid_yaml": 0,
        "kubeconform_failed": 0,
        "no_mysql_or_wordpress": 0,
        "deployment_failed": 0,
        "duplicate_resources": 0,
        "healthy": 0,
        "unhealthy": 0,
        "PVC_failed": 0,
        "Missing_resources": 0,
        "Missing_key": 0,
        "MySQL_PVC_unbound": 0,
        "MySQL_not_ready": 0,
        "Wordpress_PVC_unbound": 0,
        "Wordpress_not_ready": 0,
        "Service_failed": 0,
        "Unknown_error": 0
    }
    scores_healthy = []
    
    # Iterate over all directories in the base directory
    for response_dir in os.listdir(base_directory):
        response_path = os.path.join(base_directory, response_dir)
        
        # Check if it's a directory and matches the response-* pattern
        if os.path.isdir(response_path) and response_dir.startswith("response-"):
            testing_file = os.path.join(response_path, "testing.json")
            
            # Check if the testing.json file exists
            if os.path.exists(testing_file):
                with open(testing_file, "r") as f:
                    try:
                        testing_data = json.load(f)

                        # Aggregate results  
                        aggregated_results["total_responses"] += 1

                        # Get extra information
                        if testing_data.get("secrets_found"):
                            aggregated_results["secrets_used"] += 1
                            DEBUG and print(f"secrets used in {response_path}")

                        if testing_data.get("base64_needed"):
                            aggregated_results["base64_encoding_needed"] += 1

                        if testing_data.get("yaml_not_wrapped"):
                            aggregated_results["yaml_not_wrapped"] += 1
                            DEBUG and print(f"YAML not wrapped in {response_path}")

                        elif not testing_data.get("valid_yaml"):
                            aggregated_results["invalid_yaml"] += 1
                        
                        elif not testing_data.get("kubeconform"):
                            aggregated_results["kubeconform_failed"] += 1
                            kubeconform_file = os.path.join(response_path, "conform.json")
                            if not os.path.exists(kubeconform_file):
                                raise Exception(f"conform.json not found in {response_path}")

                        elif not testing_data.get("mysql_found") or not testing_data.get("wordpress_found"):
                            aggregated_results["no_mysql_or_wordpress"] += 1

                        elif not testing_data.get("deployed_successful"):
                            aggregated_results["deployment_failed"] += 1

                            for deployment_error in testing_data.get("deploy_errors"):
                                DEBUG and print(f"Deployment failed for {response_path}, error: {deployment_error}")
                                if "AlreadyExists" in deployment_error:
                                    aggregated_results["duplicate_resources"] += 1
                                    break

                        elif testing_data.get("healthy"):
                            aggregated_results["healthy"] += 1
                            scores_healthy.append(int(testing_data.get("polaris_score")))

                        else:
                            # why did it fail if it was not healthy
                            aggregated_results["unhealthy"] += 1

                            # Check why it failed in the k8sgpt.json file
                            k8sgpt_file = os.path.join(response_path, "k8sgpt.json")
                            if os.path.exists(k8sgpt_file):
                                cause = check_k8sgpt(k8sgpt_file)
                                if cause == "PVC_failed":
                                    aggregated_results["PVC_failed"] += 1
                                elif cause == "Missing_resources":
                                    aggregated_results["Missing_resources"] += 1
                                elif cause == "Missing_key":
                                    aggregated_results["Missing_key"] += 1
                                elif cause == "MySQL_PVC_unbound":
                                    aggregated_results["MySQL_PVC_unbound"] += 1
                                elif cause == "MySQL_not_ready":
                                    aggregated_results["MySQL_not_ready"] += 1
                                elif cause == "Wordpress_PVC_unbound":
                                    aggregated_results["Wordpress_PVC_unbound"] += 1
                                elif cause == "Wordpress_not_ready":
                                    aggregated_results["Wordpress_not_ready"] += 1
                                elif cause == "Service_failed":
                                    aggregated_results["Service_failed"] += 1
                                elif cause == "Unknown_error":
                                    aggregated_results["Unknown_error"] += 1
                                else:
                                    raise Exception(f"Something went wrong in {response_path}")
                                
                            else:
                                raise Exception(f"k8sgpt.json not found in {response_path}")
                        
                    except json.JSONDecodeError:
                        raise Exception(f"Error: Failed to parse JSON in {testing_file}")
            else:
                raise Exception(f"testing.json not found in {response_path}")
    if (aggregated_results["unhealthy"] != aggregated_results["PVC_failed"] + aggregated_results["MySQL_PVC_unbound"] + aggregated_results["MySQL_not_ready"] + aggregated_results["Wordpress_PVC_unbound"] + aggregated_results["Wordpress_not_ready"] + aggregated_results["Unknown_error"] + aggregated_results["Service_failed"] + aggregated_results["Missing_resources"] + aggregated_results["Missing_key"]):
        raise Exception(f"Error count does not match for unhealthy {base_directory}")
    if (aggregated_results["total_responses"] != aggregated_results["healthy"] + aggregated_results["unhealthy"] + aggregated_results["deployment_failed"] + aggregated_results["no_mysql_or_wordpress"] + aggregated_results["kubeconform_failed"] + aggregated_results["invalid_yaml"] + aggregated_results["yaml_not_wrapped"]):
        raise Exception(f"Error count does not match {base_directory}")
    return aggregated_results, scores_healthy


In [93]:
def aggregate_all_test_results(results):  
    # Initialize the aggregated results  
    aggregated_results = {
        "total_responses": 0,
        "secrets_used": 0,
        "base64_encoding_needed": 0,
        "yaml_not_wrapped": 0,
        "invalid_yaml": 0,
        "kubeconform_failed": 0,
        "no_mysql_or_wordpress": 0,
        "deployment_failed": 0,
        "duplicate_resources": 0,
        "healthy": 0,
        "unhealthy": 0,
        "PVC_failed": 0,
        "Missing_resources": 0,
        "Missing_key": 0,
        "MySQL_PVC_unbound": 0,
        "MySQL_not_ready": 0,
        "Wordpress_PVC_unbound": 0,
        "Wordpress_not_ready": 0,
        "Service_failed": 0,
        "Unknown_error": 0
    }
    
    # Iterate through each result  
    for result in results: 
        for key in aggregated_results:  
            if key in result:  
                aggregated_results[key] += result[key]
    if (aggregated_results["unhealthy"] != aggregated_results["PVC_failed"] + aggregated_results["MySQL_PVC_unbound"] + aggregated_results["MySQL_not_ready"] + aggregated_results["Wordpress_PVC_unbound"] + aggregated_results["Wordpress_not_ready"] + aggregated_results["Unknown_error"] + aggregated_results["Service_failed"] + aggregated_results["Missing_resources"] + aggregated_results["Missing_key"]):
        raise Exception(f"Error count does not match for unhealthy results")
    if (aggregated_results["total_responses"] != aggregated_results["healthy"] + aggregated_results["unhealthy"] + aggregated_results["deployment_failed"] + aggregated_results["no_mysql_or_wordpress"] + aggregated_results["kubeconform_failed"] + aggregated_results["invalid_yaml"] + aggregated_results["yaml_not_wrapped"]):
        raise Exception(f"Error count does not match for all results")
    return aggregated_results

# Debug test for finding common errors

In [94]:
test_funct("./gpt4o/cot/ape_prompt")

test_funct("./gpt4o/cot/ape_prompt_detailed")

test_funct("./gpt4o/cot/human_prompt")

test_funct("./gpt4o/cot/human_prompt_detailed")

test_funct("./gpt4o/zero_shot/baseline_system_prompt")

test_funct("./gpt4o/zero_shot/baseline_system_prompt_detailed")

test_funct("./gpt4o/zero_shot/role_system_prompt")

test_funct("./gpt4o/zero_shot/role_system_prompt_detailed")

test_funct("./gpt4o/zero_shot/role_best_system_prompt")

test_funct("./gpt4o/zero_shot/role_best_system_prompt_detailed")

test_funct("./gpt4o/meta/meta_system_prompt")

test_funct("./gpt4o/meta/meta_system_prompt_detailed")

test_funct("./gpt4o/meta/meta_meta_prompt")

test_funct("./gpt4o/meta/meta_meta_prompt_detailed")

test_funct("./gpt4o/tot/tot_prompt_1")

test_funct("./gpt4o/tot/tot_prompt_2")

test_funct("./gpt4o/tot/tot_prompt_3")

test_funct("./gpt4o/tot/tot_detailed_prompt_1")

test_funct("./gpt4o/tot/tot_detailed_prompt_2")

test_funct("./gpt4o/tot/tot_detailed_prompt_3")

Service error: Service has not ready endpoints, pods: [Pod/wordpress-7cd8d99f95-7kkkt], expected 1 in ./gpt4o/cot/ape_prompt/response-24/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-65bf6b588d-2bfm9], expected 1 in ./gpt4o/cot/ape_prompt/response-18/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-6794fd6f75-bmg97], expected 1 in ./gpt4o/cot/ape_prompt/response-20/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-64b7f47697-sjsqq], expected 1 in ./gpt4o/cot/ape_prompt/response-26/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-b8c9d796b-txjjk], expected 1 in ./gpt4o/cot/ape_prompt/response-32/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-74d54c75cb-drnqd], expected 1 in ./gpt4o/cot/ape_prompt/response-11/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-6db97c4c9c-kx276], expected 1 in ./gpt

In [95]:
test_funct("./gpt4/cot/ape_prompt")

test_funct("./gpt4/cot/ape_prompt_detailed")

test_funct("./gpt4/cot/human_prompt")

test_funct("./gpt4/cot/human_prompt_detailed")

test_funct("./gpt4/zero_shot/baseline_system_prompt")

test_funct("./gpt4/zero_shot/baseline_system_prompt_detailed")

test_funct("./gpt4/zero_shot/role_system_prompt")

test_funct("./gpt4/zero_shot/role_system_prompt_detailed")

test_funct("./gpt4/zero_shot/role_best_system_prompt")

test_funct("./gpt4/zero_shot/role_best_system_prompt_detailed")

test_funct("./gpt4/meta/meta_system_prompt")

test_funct("./gpt4/meta/meta_system_prompt_detailed")

test_funct("./gpt4/meta/meta_meta_prompt")

test_funct("./gpt4/meta/meta_meta_prompt_detailed")

test_funct("./gpt4/tot/tot_prompt_1")

test_funct("./gpt4/tot/tot_prompt_2")

test_funct("./gpt4/tot/tot_prompt_3")

test_funct("./gpt4/tot/tot_detailed_prompt_1")

test_funct("./gpt4/tot/tot_detailed_prompt_2")

test_funct("./gpt4/tot/tot_detailed_prompt_3")

Service error: Service has not ready endpoints, pods: [Pod/wordpress-7d87445ddc-p49n8], expected 1 in ./gpt4/cot/ape_prompt/response-24/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-b8c9d796b-wkcf9], expected 1 in ./gpt4/cot/ape_prompt/response-18/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-978d74d76-6m549], expected 1 in ./gpt4/cot/ape_prompt/response-20/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-76cf45c866-cp27z], expected 1 in ./gpt4/cot/ape_prompt/response-26/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-79b8956457-7kcql], expected 1 in ./gpt4/cot/ape_prompt/response-42/k8sgpt.json
Service error: Service has no endpoints, expected label app=mysql in ./gpt4/cot/ape_prompt/response-46/k8sgpt.json
Service error: Service has no endpoints, expected label app=wordpress in ./gpt4/cot/ape_prompt/response-46/k8sgpt.json
Service error: Service has n

In [96]:
test_funct("./gpt3_5/cot/ape_prompt")

test_funct("./gpt3_5/cot/ape_prompt_detailed")

test_funct("./gpt3_5/cot/human_prompt")

test_funct("./gpt3_5/cot/human_prompt_detailed")

test_funct("./gpt3_5/zero_shot/baseline_system_prompt")

test_funct("./gpt3_5/zero_shot/baseline_system_prompt_detailed")

test_funct("./gpt3_5/zero_shot/role_system_prompt")

test_funct("./gpt3_5/zero_shot/role_system_prompt_detailed")

test_funct("./gpt3_5/zero_shot/role_best_system_prompt")

test_funct("./gpt3_5/zero_shot/role_best_system_prompt_detailed")

test_funct("./gpt3_5/meta/meta_system_prompt")

test_funct("./gpt3_5/meta/meta_system_prompt_detailed")

test_funct("./gpt3_5/meta/meta_meta_prompt")

test_funct("./gpt3_5/meta/meta_meta_prompt_detailed")

test_funct("./gpt3_5/tot/tot_prompt_1")

test_funct("./gpt3_5/tot/tot_prompt_2")

test_funct("./gpt3_5/tot/tot_prompt_3")

test_funct("./gpt3_5/tot/tot_detailed_prompt_1")

test_funct("./gpt3_5/tot/tot_detailed_prompt_2")

test_funct("./gpt3_5/tot/tot_detailed_prompt_3")

Service error: Service default/mysql has event Failed to create endpoint for service default/mysql: endpoints "mysql" already exists in ./gpt3_5/cot/ape_prompt/response-24/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-698ddd5b65-bk8nl], expected 1 in ./gpt3_5/cot/ape_prompt/response-24/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-567666ddc5-8mm9k], expected 1 in ./gpt3_5/cot/ape_prompt/response-18/k8sgpt.json
Service error: Service default/mysql has event Failed to create endpoint for service default/mysql: endpoints "mysql" already exists in ./gpt3_5/cot/ape_prompt/response-42/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-59594974cf-qx668], expected 1 in ./gpt3_5/cot/ape_prompt/response-42/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-768f949884-78kzs], expected 1 in ./gpt3_5/cot/ape_prompt/response-5/k8sgpt.json
Service error: Service has not re

# Aggregated test results

## GPT-4o: Zero-Shot

In [177]:
base_directory = "./gpt4o/zero_shot/baseline_system_prompt"
gpt4o_zs_baseline_system_prompt_results, gpt4o_zs_baseline_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")  
print(json.dumps(gpt4o_zs_baseline_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_baseline_system_prompt_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/zero_shot/baseline_system_prompt/response-8/k8sgpt.json
Aggregated Results: ./gpt4o/zero_shot/baseline_system_prompt
{
    "total_responses": 50,
    "secrets_used": 31,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 25,
    "unhealthy": 25,
    "PVC_failed": 1,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 24,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 40, 37, 37, 35, 36, 40, 40, 40, 40, 37, 35, 37, 40, 35, 35, 40, 37, 35, 35, 43, 35, 40, 40, 40]


In [178]:
base_directory = "./gpt4o/zero_shot/baseline_system_prompt_detailed"
gpt4o_zs_baseline_system_prompt_detailed_results, gpt4o_zs_baseline_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")  
print(json.dumps(gpt4o_zs_baseline_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_baseline_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/zero_shot/baseline_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 49,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 43,
    "unhealthy": 7,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 7,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 44, 42, 42, 39, 44, 42, 42, 42, 42, 42, 39, 44, 42, 44, 42, 42, 42, 42, 42, 44, 44, 39, 42, 42, 42, 42]


In [179]:
base_directory = "./gpt4o/zero_shot/role_system_prompt"
gpt4o_zs_role_system_prompt_results, gpt4o_zs_role_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_system_prompt_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_system_prompt
{
    "total_responses": 50,
    "secrets_used": 35,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 1,
    "healthy": 24,
    "unhealthy": 25,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 25,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 35, 40, 40, 40, 40, 41, 40, 35, 35, 35, 40, 38, 40, 40, 35, 40, 37, 40, 40, 36, 40, 38]


In [180]:
base_directory = "./gpt4o/zero_shot/role_system_prompt_detailed"
gpt4o_zs_role_system_prompt_detailed_results, gpt4o_zs_role_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 48,
    "base64_encoding_needed": 5,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 43,
    "unhealthy": 6,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 5,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 44, 40, 42, 42, 39, 42, 42, 44, 44, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 42, 42, 42, 44, 44, 42, 42, 42, 44, 42, 44, 42, 42, 42, 42]


In [181]:
base_directory = "./gpt4o/zero_shot/role_best_system_prompt"
gpt4o_zs_role_best_system_prompt_results, gpt4o_zs_role_best_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_best_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_best_system_prompt_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_best_system_prompt
{
    "total_responses": 50,
    "secrets_used": 36,
    "base64_encoding_needed": 2,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 19,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 19,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [36, 40, 37, 35, 40, 40, 40, 40, 35, 37, 35, 40, 40, 40, 41, 40, 40, 35, 41, 40, 40, 40, 37, 43, 40, 35, 37, 40, 40, 40]


In [182]:
base_directory = "./gpt4o/zero_shot/role_best_system_prompt_detailed"
gpt4o_zs_role_best_system_prompt_detailed_results, gpt4o_zs_role_best_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_zs_role_best_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_zs_role_best_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/zero_shot/role_best_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 10,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 40,
    "unhealthy": 9,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 9,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [44, 44, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 44, 44, 42, 42, 44, 42, 42, 44, 42, 42, 44, 44, 44, 42, 42, 44, 44, 44, 42, 42, 44, 42]


## GPT-3.5: Zero-Shot

In [183]:
base_directory = "./gpt3_5/zero_shot/baseline_system_prompt"
gpt3_5_zs_baseline_system_prompt_results, gpt3_5_zs_baseline_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_baseline_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_baseline_system_prompt_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/baseline_system_prompt/response-44/k8sgpt.json
Aggregated Results: ./gpt3_5/zero_shot/baseline_system_prompt
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 6,
    "duplicate_resources": 0,
    "healthy": 3,
    "unhealthy": 37,
    "PVC_failed": 1,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 36,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35, 35, 32]


In [184]:
base_directory = "./gpt3_5/zero_shot/baseline_system_prompt_detailed"
gpt3_5_zs_baseline_system_prompt_detailed_results, gpt3_5_zs_baseline_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_baseline_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_baseline_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/zero_shot/baseline_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 48,
    "base64_encoding_needed": 24,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 6,
    "duplicate_resources": 2,
    "healthy": 0,
    "unhealthy": 43,
    "PVC_failed": 0,
    "Missing_resources": 12,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 31,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: []


In [185]:
base_directory = "./gpt3_5/zero_shot/role_system_prompt"
gpt3_5_zs_role_system_prompt_results, gpt3_5_zs_role_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_system_prompt_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/role_system_prompt/response-18/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/role_system_prompt/response-5/k8sgpt.json
PVC error: storageclass.storage.k8s.io "my-local-storage" not found in ./gpt3_5/zero_shot/role_system_prompt/response-19/k8sgpt.json
Aggregated Results: ./gpt3_5/zero_shot/role_system_prompt
{
    "total_responses": 50,
    "secrets_used": 0,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 13,
    "duplicate_resources": 0,
    "healthy": 1,
    "unhealthy": 36,
    "PVC_failed": 3,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 32,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35]


In [186]:
base_directory = "./gpt3_5/zero_shot/role_system_prompt_detailed"
gpt3_5_zs_role_system_prompt_detailed_results, gpt3_5_zs_role_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/zero_shot/role_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 44,
    "base64_encoding_needed": 20,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 3,
    "unhealthy": 45,
    "PVC_failed": 0,
    "Missing_resources": 13,
    "Missing_key": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 31,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [39, 38, 38]


In [187]:
base_directory = "./gpt3_5/zero_shot/role_best_system_prompt"
gpt3_5_zs_role_best_system_prompt_results, gpt3_5_zs_role_best_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_best_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_best_system_prompt_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/role_best_system_prompt/response-21/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/role_best_system_prompt/response-40/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/role_best_system_prompt/response-25/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/zero_shot/role_best_system_prompt/response-47/k8sgpt.json
Aggregated Results: ./gpt3_5/zero_shot/role_best_system_prompt
{
    "total_responses": 50,
    "secrets_used": 0,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 4,
    "duplicate_resources": 0,
    "healthy": 0,
    "unhealthy": 45,
    "PVC_failed": 4,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wo

In [188]:
base_directory = "./gpt3_5/zero_shot/role_best_system_prompt_detailed"
gpt3_5_zs_role_best_system_prompt_detailed_results, gpt3_5_zs_role_best_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_zs_role_best_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_zs_role_best_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/zero_shot/role_best_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 41,
    "base64_encoding_needed": 21,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 2,
    "duplicate_resources": 1,
    "healthy": 1,
    "unhealthy": 45,
    "PVC_failed": 0,
    "Missing_resources": 8,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 37,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [41]


## GPT-4: Zero-Shot

In [189]:
base_directory = "./gpt4/zero_shot/baseline_system_prompt"
gpt4_zs_baseline_system_prompt_results, gpt4_zs_baseline_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_baseline_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_baseline_system_prompt_scores}")

Aggregated Results: ./gpt4/zero_shot/baseline_system_prompt
{
    "total_responses": 50,
    "secrets_used": 24,
    "base64_encoding_needed": 11,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 10,
    "unhealthy": 39,
    "PVC_failed": 0,
    "Missing_resources": 4,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 35,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 35, 35, 35, 35, 35, 35, 35, 35]


In [190]:
base_directory = "./gpt4/zero_shot/baseline_system_prompt_detailed"
gpt4_zs_baseline_system_prompt_detailed_results, gpt4_zs_baseline_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_baseline_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_baseline_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/zero_shot/baseline_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 12,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 29,
    "unhealthy": 15,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [39, 42, 42, 44, 39, 42, 42, 44, 43, 42, 42, 42, 44, 42, 39, 42, 42, 42, 42, 43, 42, 42, 44, 42, 42, 43, 42, 42, 42]


In [191]:
base_directory = "./gpt4/zero_shot/role_system_prompt"
gpt4_zs_role_system_prompt_results, gpt4_zs_role_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_system_prompt_scores}")

Aggregated Results: ./gpt4/zero_shot/role_system_prompt
{
    "total_responses": 50,
    "secrets_used": 27,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 6,
    "unhealthy": 42,
    "PVC_failed": 0,
    "Missing_resources": 3,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 39,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35, 40, 35, 37, 35, 40]


In [192]:
base_directory = "./gpt4/zero_shot/role_system_prompt_detailed"
gpt4_zs_role_system_prompt_detailed_results, gpt4_zs_role_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/zero_shot/role_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 50,
    "base64_encoding_needed": 4,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 35,
    "unhealthy": 15,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 44, 39, 41, 42, 44, 42, 42, 42, 39, 43, 42, 39, 42, 39, 42, 42, 42, 43, 42, 42, 46, 43, 43, 42, 42, 43, 42, 42, 43, 42, 43, 39, 39]


In [193]:
base_directory = "./gpt4/zero_shot/role_best_system_prompt"
gpt4_zs_role_best_system_prompt_results, gpt4_zs_role_best_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_best_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_best_system_prompt_scores}")

Aggregated Results: ./gpt4/zero_shot/role_best_system_prompt
{
    "total_responses": 50,
    "secrets_used": 42,
    "base64_encoding_needed": 11,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 5,
    "unhealthy": 44,
    "PVC_failed": 0,
    "Missing_resources": 2,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 42,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [37, 40, 37, 35, 40]


In [194]:
base_directory = "./gpt4/zero_shot/role_best_system_prompt_detailed"
gpt4_zs_role_best_system_prompt_detailed_results, gpt4_zs_role_best_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_zs_role_best_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_zs_role_best_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/zero_shot/role_best_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 49,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 36,
    "unhealthy": 13,
    "PVC_failed": 0,
    "Missing_resources": 2,
    "Missing_key": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [41, 44, 42, 42, 42, 41, 46, 42, 44, 42, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 39, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42]


## GPT-4o: CoT

In [195]:
base_directory = "./gpt4o/cot/human_prompt"
gpt4o_cot_human_prompt_results, gpt4o_cot_human_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_human_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_human_prompt_scores}")

Service error: Service has no endpoints, expected label app=mysql in ./gpt4o/cot/human_prompt/response-29/k8sgpt.json
Service error: Service has no endpoints, expected label app=mysql in ./gpt4o/cot/human_prompt/response-13/k8sgpt.json
Aggregated Results: ./gpt4o/cot/human_prompt
{
    "total_responses": 50,
    "secrets_used": 35,
    "base64_encoding_needed": 18,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 2,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 34,
    "unhealthy": 13,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 11,
    "Service_failed": 2,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 40, 35, 35, 40, 39, 40, 43, 35, 40, 40, 35, 40, 40, 40, 40, 43, 40, 35, 40, 40, 40, 40, 35, 40, 43, 40, 40, 35, 40, 40, 40, 40]


In [196]:
base_directory = "./gpt4o/cot/human_prompt_detailed"
gpt4o_cot_human_prompt_detailed_results, gpt4o_cot_human_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_human_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_human_prompt_detailed_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/cot/human_prompt_detailed/response-38/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/cot/human_prompt_detailed/response-44/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/cot/human_prompt_detailed/response-1/k8sgpt.json
Aggregated Results: ./gpt4o/cot/human_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 48,
    "base64_encoding_needed": 20,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 37,
    "unhealthy": 12,
    "PVC_failed": 3,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 9,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 43, 42, 42, 42, 44, 42, 42, 42, 

In [197]:
base_directory = "./gpt4o/cot/ape_prompt"
gpt4o_cot_ape_prompt_results, gpt4o_cot_ape_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_ape_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_ape_prompt_scores}")

Aggregated Results: ./gpt4o/cot/ape_prompt
{
    "total_responses": 50,
    "secrets_used": 38,
    "base64_encoding_needed": 14,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 19,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 19,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 40, 40, 40, 43, 40, 40, 40, 35, 40, 40, 35, 40, 40, 40, 40, 40, 40, 37, 40, 40, 40, 35, 40, 35, 43, 40, 40, 40]


In [198]:
base_directory = "./gpt4o/cot/ape_prompt_detailed"
gpt4o_cot_ape_prompt_detailed_results, gpt4o_cot_ape_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_cot_ape_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_cot_ape_prompt_detailed_scores}")

Unknown error in: Liveness probe failed: dial tcp 10.244.1.2:80: connect: connection refused in ./gpt4o/cot/ape_prompt_detailed/response-37/k8sgpt.json
Aggregated Results: ./gpt4o/cot/ape_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 25,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 1,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 34,
    "unhealthy": 13,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 11,
    "Service_failed": 0,
    "Unknown_error": 1
}
Polaris Scores: [42, 42, 42, 42, 44, 42, 44, 40, 39, 42, 42, 42, 42, 42, 40, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 44, 42, 44, 42, 40, 44, 44, 42, 42]


## GPT-4: CoT

In [199]:
base_directory = "./gpt4/cot/human_prompt"
gpt4_cot_human_prompt_results, gpt4_cot_human_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_human_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_human_prompt_scores}")

Aggregated Results: ./gpt4/cot/human_prompt
{
    "total_responses": 50,
    "secrets_used": 34,
    "base64_encoding_needed": 23,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 2,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 15,
    "unhealthy": 31,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 31,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 40, 35, 40, 35, 40, 40, 35, 42, 35, 35, 35, 35, 35, 35]


In [200]:
base_directory = "./gpt4/cot/human_prompt_detailed"
gpt4_cot_human_prompt_detailed_results, gpt4_cot_human_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_human_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_human_prompt_detailed_scores}")

Aggregated Results: ./gpt4/cot/human_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 48,
    "base64_encoding_needed": 23,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 31,
    "unhealthy": 17,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 17,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 42, 42, 42, 42, 42, 43, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 43, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42]


In [201]:
base_directory = "./gpt4/cot/ape_prompt"
gpt4_cot_ape_prompt_results, gpt4_cot_ape_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_ape_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_ape_prompt_scores}")

Aggregated Results: ./gpt4/cot/ape_prompt
{
    "total_responses": 50,
    "secrets_used": 34,
    "base64_encoding_needed": 25,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 18,
    "unhealthy": 28,
    "PVC_failed": 0,
    "Missing_resources": 2,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 26,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35, 35, 35, 35, 35, 40, 40, 40, 35, 35, 40, 40, 40, 40, 35, 40, 40, 35]


In [202]:
base_directory = "./gpt4/cot/ape_prompt_detailed"
gpt4_cot_ape_prompt_detailed_results, gpt4_cot_ape_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_cot_ape_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_cot_ape_prompt_detailed_scores}")

Aggregated Results: ./gpt4/cot/ape_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 25,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 26,
    "unhealthy": 22,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 21,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 44, 42, 42, 43, 42, 42, 36, 42, 42, 42, 39, 42, 39, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 42]


## GPT-3.5: CoT

In [203]:
base_directory = "./gpt3_5/cot/human_prompt"
gpt3_5_cot_human_prompt_results, gpt3_5_cot_human_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_human_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_human_prompt_scores}")

Aggregated Results: ./gpt3_5/cot/human_prompt
{
    "total_responses": 50,
    "secrets_used": 10,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 9,
    "unhealthy": 38,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 37,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35, 32, 35, 32, 36, 32, 35, 35, 36]


In [204]:
base_directory = "./gpt3_5/cot/human_prompt_detailed"
gpt3_5_cot_human_prompt_detailed_results, gpt3_5_cot_human_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_human_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_human_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/cot/human_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 41,
    "base64_encoding_needed": 22,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 4,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 2,
    "unhealthy": 41,
    "PVC_failed": 0,
    "Missing_resources": 2,
    "Missing_key": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 37,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [36, 39]


In [205]:
base_directory = "./gpt3_5/cot/ape_prompt"
gpt3_5_cot_ape_prompt_results, gpt3_5_cot_ape_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_ape_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_ape_prompt_scores}")

Aggregated Results: ./gpt3_5/cot/ape_prompt
{
    "total_responses": 50,
    "secrets_used": 10,
    "base64_encoding_needed": 9,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 10,
    "unhealthy": 37,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 37,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [32, 32, 32, 32, 32, 32, 32, 32, 35, 32]


In [206]:
base_directory = "./gpt3_5/cot/ape_prompt_detailed"
gpt3_5_cot_ape_prompt_detailed_results, gpt3_5_cot_ape_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_cot_ape_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_cot_ape_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/cot/ape_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 24,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 3,
    "duplicate_resources": 0,
    "healthy": 7,
    "unhealthy": 39,
    "PVC_failed": 0,
    "Missing_resources": 4,
    "Missing_key": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 33,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [36, 36, 36, 36, 36, 36, 39]


## GPT-4o: ToT

In [207]:
base_directory = "./gpt4o/tot/tot_prompt_1"
gpt4o_tot_prompt_1_results, gpt4o_tot_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_prompt_1_scores}")


PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/tot/tot_prompt_1/response-45/k8sgpt.json
Unknown error in: the last termination reason is Error container=wordpress pod=wordpress-677bb76474-959rr in ./gpt4o/tot/tot_prompt_1/response-4/k8sgpt.json
Aggregated Results: ./gpt4o/tot/tot_prompt_1
{
    "total_responses": 50,
    "secrets_used": 29,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 19,
    "PVC_failed": 1,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 17,
    "Service_failed": 0,
    "Unknown_error": 1
}
Polaris Scores: [40, 43, 40, 35, 43, 35, 37, 35, 35, 37, 37, 35, 40, 40, 35, 35, 43, 37, 37, 40, 35, 40, 40, 40, 40, 35, 40, 40, 37, 40]


In [208]:
base_directory = "./gpt4o/tot/tot_prompt_2"
gpt4o_tot_prompt_2_results, gpt4o_tot_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_prompt_2_scores}")

Unknown error in: the last termination reason is Error container=wordpress pod=wordpress-55fbb9b877-lmcg9 in ./gpt4o/tot/tot_prompt_2/response-28/k8sgpt.json
Service error: Service has not ready endpoints, pods: [Pod/wordpress-bfdf8c4cb-hd9cv], expected 1 in ./gpt4o/tot/tot_prompt_2/response-8/k8sgpt.json
Aggregated Results: ./gpt4o/tot/tot_prompt_2
{
    "total_responses": 50,
    "secrets_used": 28,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 2,
    "duplicate_resources": 2,
    "healthy": 21,
    "unhealthy": 23,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 21,
    "Service_failed": 1,
    "Unknown_error": 1
}
Polaris Scores: [40, 40, 40, 35, 40, 35, 35, 40, 37, 35, 35, 35, 43, 40, 35, 35, 37, 43, 40, 35, 40]


In [209]:
base_directory = "./gpt4o/tot/tot_prompt_3"
gpt4o_tot_prompt_3_results, gpt4o_tot_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_prompt_3_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/tot/tot_prompt_3/response-43/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/tot/tot_prompt_3/response-47/k8sgpt.json
Aggregated Results: ./gpt4o/tot/tot_prompt_3
{
    "total_responses": 50,
    "secrets_used": 20,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 3,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 1,
    "duplicate_resources": 1,
    "healthy": 26,
    "unhealthy": 18,
    "PVC_failed": 2,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 16,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 35, 37, 37, 37, 43, 40, 40, 40, 40, 40, 37, 35, 40, 35, 37, 37, 37, 40, 40, 37, 37, 35, 40, 40, 37]


In [210]:
base_directory = "./gpt4o/tot/tot_detailed_prompt_1"
gpt4o_tot_detailed_prompt_1_results, gpt4o_tot_detailed_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_detailed_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_detailed_prompt_1_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/tot/tot_detailed_prompt_1/response-27/k8sgpt.json
Aggregated Results: ./gpt4o/tot/tot_detailed_prompt_1
{
    "total_responses": 50,
    "secrets_used": 47,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 36,
    "unhealthy": 14,
    "PVC_failed": 1,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 44, 44, 44, 44, 42, 42, 44, 42, 42, 42, 42, 42, 42, 44, 44, 42, 44, 42, 42, 42, 42, 42, 42, 43, 42, 44, 42, 42, 44, 44, 42, 44, 39, 42]


In [211]:
base_directory = "./gpt4o/tot/tot_detailed_prompt_2"
gpt4o_tot_detailed_prompt_2_results, gpt4o_tot_detailed_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_detailed_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_detailed_prompt_2_scores}")

Unknown error in: Liveness probe failed: dial tcp 10.244.1.2:80: connect: connection refused in ./gpt4o/tot/tot_detailed_prompt_2/response-39/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4o/tot/tot_detailed_prompt_2/response-12/k8sgpt.json
Aggregated Results: ./gpt4o/tot/tot_detailed_prompt_2
{
    "total_responses": 50,
    "secrets_used": 50,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 38,
    "unhealthy": 12,
    "PVC_failed": 1,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Service_failed": 0,
    "Unknown_error": 1
}
Polaris Scores: [42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 40, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 42, 42,

In [212]:
base_directory = "./gpt4o/tot/tot_detailed_prompt_3"
gpt4o_tot_detailed_prompt_3_results, gpt4o_tot_detailed_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_tot_detailed_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4o_tot_detailed_prompt_3_scores}")

Aggregated Results: ./gpt4o/tot/tot_detailed_prompt_3
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 30,
    "unhealthy": 18,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 17,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 39, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42]


## GPT-4: ToT

In [213]:
base_directory = "./gpt4/tot/tot_prompt_1"
gpt4_tot_prompt_1_results, gpt4_tot_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_prompt_1_scores}")

Aggregated Results: ./gpt4/tot/tot_prompt_1
{
    "total_responses": 50,
    "secrets_used": 9,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 4,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 13,
    "unhealthy": 31,
    "PVC_failed": 0,
    "Missing_resources": 7,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 24,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [37, 35, 37, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]


In [214]:
base_directory = "./gpt4/tot/tot_prompt_2"
gpt4_tot_prompt_2_results, gpt4_tot_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_prompt_2_scores}")

Aggregated Results: ./gpt4/tot/tot_prompt_2
{
    "total_responses": 50,
    "secrets_used": 20,
    "base64_encoding_needed": 6,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 9,
    "no_mysql_or_wordpress": 2,
    "deployment_failed": 24,
    "duplicate_resources": 23,
    "healthy": 3,
    "unhealthy": 10,
    "PVC_failed": 0,
    "Missing_resources": 6,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 4,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35, 40, 35]


In [215]:
base_directory = "./gpt4/tot/tot_prompt_3"
gpt4_tot_prompt_3_results, gpt4_tot_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_prompt_3_scores}")

Aggregated Results: ./gpt4/tot/tot_prompt_3
{
    "total_responses": 50,
    "secrets_used": 12,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 2,
    "invalid_yaml": 5,
    "kubeconform_failed": 5,
    "no_mysql_or_wordpress": 9,
    "deployment_failed": 5,
    "duplicate_resources": 5,
    "healthy": 1,
    "unhealthy": 23,
    "PVC_failed": 0,
    "Missing_resources": 8,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 15,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [35]


In [216]:
base_directory = "./gpt4/tot/tot_detailed_prompt_1"
gpt4_tot_detailed_prompt_1_results, gpt4_tot_detailed_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_detailed_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_detailed_prompt_1_scores}")

Aggregated Results: ./gpt4/tot/tot_detailed_prompt_1
{
    "total_responses": 50,
    "secrets_used": 32,
    "base64_encoding_needed": 14,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 3,
    "no_mysql_or_wordpress": 14,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 18,
    "unhealthy": 15,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 14,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 44, 42, 42, 42, 44, 42, 42, 42, 39, 42, 44, 42, 42, 42, 39, 43, 42]


In [217]:
base_directory = "./gpt4/tot/tot_detailed_prompt_2"
gpt4_tot_detailed_prompt_2_results, gpt4_tot_detailed_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_detailed_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_detailed_prompt_2_scores}")

Unknown error in: 0/3 nodes are available: 1 node(s) had untolerated taint {node-role.kubernetes.io/control-plane: }, 2 node(s) had untolerated taint {node.kubernetes.io/not-ready: }. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling. in ./gpt4/tot/tot_detailed_prompt_2/response-22/k8sgpt.json
Aggregated Results: ./gpt4/tot/tot_detailed_prompt_2
{
    "total_responses": 50,
    "secrets_used": 30,
    "base64_encoding_needed": 8,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 5,
    "kubeconform_failed": 5,
    "no_mysql_or_wordpress": 8,
    "deployment_failed": 9,
    "duplicate_resources": 9,
    "healthy": 8,
    "unhealthy": 14,
    "PVC_failed": 0,
    "Missing_resources": 5,
    "Missing_key": 1,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 7,
    "Service_failed": 0,
    "Unknown_error": 1
}
Polaris Scores: [44, 42, 42, 42, 42, 40, 44, 42]


In [218]:
base_directory = "./gpt4/tot/tot_detailed_prompt_3"
gpt4_tot_detailed_prompt_3_results, gpt4_tot_detailed_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_tot_detailed_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt4_tot_detailed_prompt_3_scores}")

Unknown error in: Liveness probe failed: dial tcp 10.244.1.2:80: connect: connection refused in ./gpt4/tot/tot_detailed_prompt_3/response-40/k8sgpt.json
Aggregated Results: ./gpt4/tot/tot_detailed_prompt_3
{
    "total_responses": 50,
    "secrets_used": 20,
    "base64_encoding_needed": 9,
    "yaml_not_wrapped": 7,
    "invalid_yaml": 5,
    "kubeconform_failed": 3,
    "no_mysql_or_wordpress": 15,
    "deployment_failed": 3,
    "duplicate_resources": 2,
    "healthy": 1,
    "unhealthy": 16,
    "PVC_failed": 0,
    "Missing_resources": 6,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 9,
    "Service_failed": 0,
    "Unknown_error": 1
}
Polaris Scores: [44]


## GPT-3.5: ToT

In [219]:
base_directory = "./gpt3_5/tot/tot_prompt_1"
gpt3_5_tot_prompt_1_results, gpt3_5_tot_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_prompt_1_scores}")

Unknown error in: the last termination reason is Error container=mysql pod=mysql in ./gpt3_5/tot/tot_prompt_1/response-28/k8sgpt.json
Unknown error in: the last termination reason is Error container=mysql pod=mysql in ./gpt3_5/tot/tot_prompt_1/response-40/k8sgpt.json
Unknown error in: the last termination reason is Error container=mysql pod=mysql in ./gpt3_5/tot/tot_prompt_1/response-12/k8sgpt.json
Aggregated Results: ./gpt3_5/tot/tot_prompt_1
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 5,
    "no_mysql_or_wordpress": 20,
    "deployment_failed": 5,
    "duplicate_resources": 4,
    "healthy": 0,
    "unhealthy": 18,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 1,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 14,
    "Service_failed": 0,
    "Unknown_error": 3
}
Polaris Scores: []


In [220]:
base_directory = "./gpt3_5/tot/tot_prompt_2"
gpt3_5_tot_prompt_2_results, gpt3_5_tot_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_prompt_2_scores}")

Unknown error in: the last termination reason is Error container=mysql pod=wordpress-deployment-66d874d745-cjd4v in ./gpt3_5/tot/tot_prompt_2/response-20/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/tot/tot_prompt_2/response-23/k8sgpt.json
Unknown error in: Ingress default/wordpress-ingress does not specify an Ingress class. in ./gpt3_5/tot/tot_prompt_2/response-35/k8sgpt.json
Aggregated Results: ./gpt3_5/tot/tot_prompt_2
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 21,
    "deployment_failed": 9,
    "duplicate_resources": 8,
    "healthy": 0,
    "unhealthy": 18,
    "PVC_failed": 1,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 14,
    "Service_failed": 0,
    "Unknown_error": 2
}
Polar

In [221]:
base_directory = "./gpt3_5/tot/tot_prompt_3"
gpt3_5_tot_prompt_3_results, gpt3_5_tot_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_prompt_3_scores}")

Unknown error in: the last termination reason is Error container=mysql pod=wordpress-mysql-deployment-859994f47d-c67rj in ./gpt3_5/tot/tot_prompt_3/response-0/k8sgpt.json
Unknown error in: the last termination reason is Error container=mysql pod=wordpress-mysql-deployment-7c868f8-h7qzr in ./gpt3_5/tot/tot_prompt_3/response-49/k8sgpt.json
Aggregated Results: ./gpt3_5/tot/tot_prompt_3
{
    "total_responses": 50,
    "secrets_used": 1,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 3,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 14,
    "deployment_failed": 17,
    "duplicate_resources": 16,
    "healthy": 0,
    "unhealthy": 13,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 1,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Service_failed": 0,
    "Unknown_error": 2
}
Polaris Scores: []


In [222]:
base_directory = "./gpt3_5/tot/tot_detailed_prompt_1"
gpt3_5_tot_detailed_prompt_1_results, gpt3_5_tot_detailed_prompt_1_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_detailed_prompt_1_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_detailed_prompt_1_scores}")

Aggregated Results: ./gpt3_5/tot/tot_detailed_prompt_1
{
    "total_responses": 50,
    "secrets_used": 39,
    "base64_encoding_needed": 22,
    "yaml_not_wrapped": 1,
    "invalid_yaml": 1,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 21,
    "deployment_failed": 8,
    "duplicate_resources": 5,
    "healthy": 0,
    "unhealthy": 17,
    "PVC_failed": 0,
    "Missing_resources": 7,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 10,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: []


In [223]:
base_directory = "./gpt3_5/tot/tot_detailed_prompt_2"
gpt3_5_tot_detailed_prompt_2_results, gpt3_5_tot_detailed_prompt_2_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_detailed_prompt_2_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_detailed_prompt_2_scores}")

Aggregated Results: ./gpt3_5/tot/tot_detailed_prompt_2
{
    "total_responses": 50,
    "secrets_used": 42,
    "base64_encoding_needed": 23,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 6,
    "no_mysql_or_wordpress": 2,
    "deployment_failed": 13,
    "duplicate_resources": 10,
    "healthy": 1,
    "unhealthy": 26,
    "PVC_failed": 0,
    "Missing_resources": 9,
    "Missing_key": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 15,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [36]


In [224]:
base_directory = "./gpt3_5/tot/tot_detailed_prompt_3"
gpt3_5_tot_detailed_prompt_3_results, gpt3_5_tot_detailed_prompt_3_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_tot_detailed_prompt_3_results, indent=4))
print(f"Polaris Scores: {gpt3_5_tot_detailed_prompt_3_scores}")

Aggregated Results: ./gpt3_5/tot/tot_detailed_prompt_3
{
    "total_responses": 50,
    "secrets_used": 19,
    "base64_encoding_needed": 13,
    "yaml_not_wrapped": 2,
    "invalid_yaml": 3,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 11,
    "deployment_failed": 5,
    "duplicate_resources": 5,
    "healthy": 0,
    "unhealthy": 28,
    "PVC_failed": 0,
    "Missing_resources": 21,
    "Missing_key": 2,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 5,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: []


## GPT-4o: Meta

In [225]:
base_directory = "./gpt4o/meta/meta_system_prompt"
gpt4o_meta_system_prompt_results, gpt4o_meta_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_system_prompt_scores}")

Aggregated Results: ./gpt4o/meta/meta_system_prompt
{
    "total_responses": 50,
    "secrets_used": 40,
    "base64_encoding_needed": 0,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 28,
    "unhealthy": 22,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 22,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 37, 40, 40, 40, 40, 43, 40, 40, 43, 35, 40, 40, 40, 43, 35, 37, 35, 43, 40, 40, 43, 43, 40, 35, 40, 40, 40]


In [226]:
base_directory = "./gpt4o/meta/meta_system_prompt_detailed"
gpt4o_meta_system_prompt_detailed_results, gpt4o_meta_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4o/meta/meta_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 49,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 44,
    "unhealthy": 5,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 2,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 3,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [44, 44, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 44, 44, 42, 42, 42, 42, 42, 42, 44, 44, 42, 42, 44, 39, 42, 39, 44, 42, 42, 42, 42, 42, 42, 44, 42, 42, 42, 42, 42, 44]


In [227]:
base_directory = "./gpt4o/meta/meta_meta_prompt"
gpt4o_meta_meta_prompt_results, gpt4o_meta_meta_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_meta_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_meta_prompt_scores}")

Unknown error in: Ingress default/wordpress-ingress does not specify an Ingress class. in ./gpt4o/meta/meta_meta_prompt/response-4/k8sgpt.json
Unknown error in: the last termination reason is Error container=wordpress pod=wordpress-55f9b5b9d-whtgl in ./gpt4o/meta/meta_meta_prompt/response-3/k8sgpt.json
PVC error: storageclass.storage.k8s.io "fast" not found in ./gpt4o/meta/meta_meta_prompt/response-12/k8sgpt.json
Aggregated Results: ./gpt4o/meta/meta_meta_prompt
{
    "total_responses": 50,
    "secrets_used": 22,
    "base64_encoding_needed": 7,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 2,
    "kubeconform_failed": 24,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 4,
    "unhealthy": 19,
    "PVC_failed": 1,
    "Missing_resources": 2,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 1,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 13,
    "Service_failed": 0,
    "Unknown_error": 2
}


In [228]:
base_directory = "./gpt4o/meta/meta_meta_prompt_detailed"
gpt4o_meta_meta_prompt_detailed_results, gpt4o_meta_meta_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4o_meta_meta_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4o_meta_meta_prompt_detailed_scores}")

Unknown error in: Ingress default/wordpress-ingress does not specify an Ingress class. in ./gpt4o/meta/meta_meta_prompt_detailed/response-24/k8sgpt.json
PVC error: storageclass.storage.k8s.io "gp2" not found in ./gpt4o/meta/meta_meta_prompt_detailed/response-21/k8sgpt.json
Unknown error in: 0/3 nodes are available: 1 node(s) had untolerated taint {node-role.kubernetes.io/control-plane: }, 2 node(s) had untolerated taint {node.kubernetes.io/not-ready: }. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling. in ./gpt4o/meta/meta_meta_prompt_detailed/response-34/k8sgpt.json
Unknown error in: Liveness probe failed: dial tcp 10.244.1.4:3306: connect: connection refused in ./gpt4o/meta/meta_meta_prompt_detailed/response-15/k8sgpt.json
PVC error: storageclass.storage.k8s.io "high-performance" not found in ./gpt4o/meta/meta_meta_prompt_detailed/response-2/k8sgpt.json
Unknown error in: Ingress wordpress-app/wordpress-ingress does not specify an Ingress class. in ./gpt

## GPT-4: Meta

In [229]:
base_directory = "./gpt4/meta/meta_system_prompt"
gpt4_meta_system_prompt_results, gpt4_meta_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_system_prompt_scores}")

Aggregated Results: ./gpt4/meta/meta_system_prompt
{
    "total_responses": 50,
    "secrets_used": 43,
    "base64_encoding_needed": 9,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 2,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 1,
    "duplicate_resources": 0,
    "healthy": 8,
    "unhealthy": 39,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 38,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [40, 37, 40, 35, 40, 43, 40, 40]


In [230]:
base_directory = "./gpt4/meta/meta_system_prompt_detailed"
gpt4_meta_system_prompt_detailed_results, gpt4_meta_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_system_prompt_detailed_scores}")

Aggregated Results: ./gpt4/meta/meta_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 50,
    "base64_encoding_needed": 3,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 0,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 35,
    "unhealthy": 15,
    "PVC_failed": 0,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 15,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [42, 42, 43, 39, 39, 39, 39, 42, 42, 38, 42, 39, 43, 42, 42, 42, 42, 42, 39, 39, 38, 39, 42, 42, 42, 42, 39, 39, 43, 42, 42, 42, 39, 39, 42]


In [231]:
base_directory = "./gpt4/meta/meta_meta_prompt"
gpt4_meta_meta_prompt_results, gpt4_meta_meta_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_meta_prompt_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_meta_prompt_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt4/meta/meta_meta_prompt/response-34/k8sgpt.json
Aggregated Results: ./gpt4/meta/meta_meta_prompt
{
    "total_responses": 50,
    "secrets_used": 27,
    "base64_encoding_needed": 20,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 9,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 1,
    "unhealthy": 37,
    "PVC_failed": 1,
    "Missing_resources": 12,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 5,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 19,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [43]


In [232]:
base_directory = "./gpt4/meta/meta_meta_prompt_detailed"
gpt4_meta_meta_prompt_detailed_results, gpt4_meta_meta_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt4_meta_meta_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt4_meta_meta_prompt_detailed_scores}")

PVC error: storageclass.storage.k8s.io "high-durability" not found in ./gpt4/meta/meta_meta_prompt_detailed/response-32/k8sgpt.json
Aggregated Results: ./gpt4/meta/meta_meta_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 21,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 4,
    "no_mysql_or_wordpress": 3,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 8,
    "unhealthy": 35,
    "PVC_failed": 1,
    "Missing_resources": 4,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 4,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 26,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [44, 46, 46, 58, 58, 46, 64, 48]


## GPT-3.5: Meta

In [233]:
base_directory = "./gpt3_5/meta/meta_system_prompt"
gpt3_5_meta_system_prompt_results, gpt3_5_meta_system_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_system_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_system_prompt_scores}")

PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/meta/meta_system_prompt/response-26/k8sgpt.json
PVC error: storageclass.storage.k8s.io "manual" not found in ./gpt3_5/meta/meta_system_prompt/response-17/k8sgpt.json
Aggregated Results: ./gpt3_5/meta/meta_system_prompt
{
    "total_responses": 50,
    "secrets_used": 3,
    "base64_encoding_needed": 1,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 0,
    "deployment_failed": 4,
    "duplicate_resources": 0,
    "healthy": 3,
    "unhealthy": 42,
    "PVC_failed": 2,
    "Missing_resources": 0,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 40,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [36, 36, 36]


In [234]:
base_directory = "./gpt3_5/meta/meta_system_prompt_detailed"
gpt3_5_meta_system_prompt_detailed_results, gpt3_5_meta_system_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_system_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_system_prompt_detailed_scores}")

Aggregated Results: ./gpt3_5/meta/meta_system_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 45,
    "base64_encoding_needed": 26,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 1,
    "no_mysql_or_wordpress": 1,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 1,
    "unhealthy": 45,
    "PVC_failed": 0,
    "Missing_resources": 2,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 1,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 42,
    "Service_failed": 0,
    "Unknown_error": 0
}
Polaris Scores: [36]


In [235]:
base_directory = "./gpt3_5/meta/meta_meta_prompt"
gpt3_5_meta_meta_prompt_results, gpt3_5_meta_meta_prompt_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_meta_prompt_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_meta_prompt_scores}")

Unknown error in: the last termination reason is Error container=mysql pod=mysql-6787599ff5-lnv2q in ./gpt3_5/meta/meta_meta_prompt/response-32/k8sgpt.json
Aggregated Results: ./gpt3_5/meta/meta_meta_prompt
{
    "total_responses": 50,
    "secrets_used": 4,
    "base64_encoding_needed": 4,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 0,
    "kubeconform_failed": 8,
    "no_mysql_or_wordpress": 14,
    "deployment_failed": 2,
    "duplicate_resources": 0,
    "healthy": 2,
    "unhealthy": 24,
    "PVC_failed": 0,
    "Missing_resources": 3,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 2,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 18,
    "Service_failed": 0,
    "Unknown_error": 1
}
Polaris Scores: [38, 47]


In [236]:
base_directory = "./gpt3_5/meta/meta_meta_prompt_detailed"
gpt3_5_meta_meta_prompt_detailed_results, gpt3_5_meta_meta_prompt_detailed_scores = aggregate_test_results(base_directory)

print(f"Aggregated Results: {base_directory}")
print(json.dumps(gpt3_5_meta_meta_prompt_detailed_results, indent=4))
print(f"Polaris Scores: {gpt3_5_meta_meta_prompt_detailed_scores}")

Unknown error in: the last termination reason is Error container=mysql pod=mysql-statefulset-0 in ./gpt3_5/meta/meta_meta_prompt_detailed/response-6/k8sgpt.json
Unknown error in: the last termination reason is Error container=mysql pod=mysql-0 in ./gpt3_5/meta/meta_meta_prompt_detailed/response-17/k8sgpt.json
Unknown error in: Ingress default/wordpress-ingress does not specify an Ingress class. in ./gpt3_5/meta/meta_meta_prompt_detailed/response-1/k8sgpt.json
Aggregated Results: ./gpt3_5/meta/meta_meta_prompt_detailed
{
    "total_responses": 50,
    "secrets_used": 4,
    "base64_encoding_needed": 2,
    "yaml_not_wrapped": 0,
    "invalid_yaml": 1,
    "kubeconform_failed": 28,
    "no_mysql_or_wordpress": 10,
    "deployment_failed": 0,
    "duplicate_resources": 0,
    "healthy": 0,
    "unhealthy": 11,
    "PVC_failed": 0,
    "Missing_resources": 1,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 0,
    "MySQL_not_ready": 0,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready

# Code for succes rate + average score table

In [237]:
def calculate_average_score(scores):
    if len(scores) == 0:
        print(f"Avg score: 0")
    else:
        print(f"Avg score: {sum(scores) / len(scores)}")

In [238]:
def calculate_success_rate(input):
    if isinstance(input, list):
        results = aggregate_all_test_results(input)
    else:
        results = input
    rate = results["healthy"] / results["total_responses"]
    print(f"Succes rate: {rate}")

## Code for performance table

In [159]:
# GPT-4o
print("GPT-4o: zero-shot baseline average")
calculate_success_rate([gpt4o_zs_baseline_system_prompt_results, gpt4o_zs_baseline_system_prompt_detailed_results])
calculate_average_score(gpt4o_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_detailed_scores)

print("GPT-4o: zero-shot baseline system prompt")
calculate_success_rate(gpt4o_zs_baseline_system_prompt_results)
calculate_average_score(gpt4o_zs_baseline_system_prompt_scores)

print("GPT-4o: zero-shot baseline system prompt detailed")
calculate_success_rate(gpt4o_zs_baseline_system_prompt_detailed_results)
calculate_average_score(gpt4o_zs_baseline_system_prompt_detailed_scores)

####
print("GPT-4o: zero-shot role system average")
calculate_success_rate([gpt4o_zs_role_system_prompt_results, gpt4o_zs_role_system_prompt_detailed_results])
calculate_average_score(gpt4o_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_detailed_scores)

print("GPT-4o: zero-shot role system prompt")
calculate_success_rate(gpt4o_zs_role_system_prompt_results)
calculate_average_score(gpt4o_zs_role_system_prompt_scores)

print("GPT-4o: zero-shot role system prompt detailed")
calculate_success_rate(gpt4o_zs_role_system_prompt_detailed_results)
calculate_average_score(gpt4o_zs_role_system_prompt_detailed_scores)

####
print("GPT-4o: zero-shot role best system average")
calculate_success_rate([gpt4o_zs_role_best_system_prompt_results, gpt4o_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt4o_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_detailed_scores)

print("GPT-4o: zero-shot role best system prompt")
calculate_success_rate(gpt4o_zs_role_best_system_prompt_results)
calculate_average_score(gpt4o_zs_role_best_system_prompt_scores)

print("GPT-4o: zero-shot role best system prompt detailed")
calculate_success_rate(gpt4o_zs_role_best_system_prompt_detailed_results)
calculate_average_score(gpt4o_zs_role_best_system_prompt_detailed_scores)

####
print("GPT-4o: cot human average")
calculate_success_rate([gpt4o_cot_human_prompt_results, gpt4o_cot_human_prompt_detailed_results])
calculate_average_score(gpt4o_cot_human_prompt_scores + gpt4o_cot_human_prompt_detailed_scores)

print("GPT-4o: cot human prompt")
calculate_success_rate(gpt4o_cot_human_prompt_results)
calculate_average_score(gpt4o_cot_human_prompt_scores)

print("GPT-4o: cot human prompt detailed")
calculate_success_rate(gpt4o_cot_human_prompt_detailed_results)
calculate_average_score(gpt4o_cot_human_prompt_detailed_scores)

####
print("GPT-4o: cot ape average")
calculate_success_rate([gpt4o_cot_ape_prompt_results, gpt4o_cot_ape_prompt_detailed_results])
calculate_average_score(gpt4o_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_detailed_scores)

print("GPT-4o: cot ape prompt")
calculate_success_rate(gpt4o_cot_ape_prompt_results)
calculate_average_score(gpt4o_cot_ape_prompt_scores)

print("GPT-4o: cot ape prompt detailed")
calculate_success_rate(gpt4o_cot_ape_prompt_detailed_results)
calculate_average_score(gpt4o_cot_ape_prompt_detailed_scores)

####
print("GPT-4o: Meta system average")
calculate_success_rate([gpt4o_meta_system_prompt_results, gpt4o_meta_system_prompt_detailed_results])
calculate_average_score(gpt4o_meta_system_prompt_scores + gpt4o_meta_system_prompt_detailed_scores)

print("GPT-4o: Meta system prompt")
calculate_success_rate(gpt4o_meta_system_prompt_results)
calculate_average_score(gpt4o_meta_system_prompt_scores)

print("GPT-4o: Meta system prompt detailed")
calculate_success_rate(gpt4o_meta_system_prompt_detailed_results)
calculate_average_score(gpt4o_meta_system_prompt_detailed_scores)

####
print("GPT-4o: Meta meta average")
calculate_success_rate([gpt4o_meta_meta_prompt_results, gpt4o_meta_meta_prompt_detailed_results])
calculate_average_score(gpt4o_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_detailed_scores)

print("GPT-4o: Meta meta prompt")
calculate_success_rate(gpt4o_meta_meta_prompt_results)
calculate_average_score(gpt4o_meta_meta_prompt_scores)

print("GPT-4o: Meta meta prompt detailed")
calculate_success_rate(gpt4o_meta_meta_prompt_detailed_results)
calculate_average_score(gpt4o_meta_meta_prompt_detailed_scores)

####
print("GPT-4o: tot prompt 1 average")
calculate_success_rate([gpt4o_tot_prompt_1_results, gpt4o_tot_detailed_prompt_1_results])
calculate_average_score(gpt4o_tot_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores)

print("GPT-4o: tot prompt 1")
calculate_success_rate(gpt4o_tot_prompt_1_results)
calculate_average_score(gpt4o_tot_prompt_1_scores)

print("GPT-4o: tot detailed prompt 1")
calculate_success_rate(gpt4o_tot_detailed_prompt_1_results)
calculate_average_score(gpt4o_tot_detailed_prompt_1_scores)

####
print("GPT-4o: tot prompt 2 average")
calculate_success_rate([gpt4o_tot_prompt_2_results, gpt4o_tot_detailed_prompt_2_results])
calculate_average_score(gpt4o_tot_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores)

print("GPT-4o: tot prompt 2")
calculate_success_rate(gpt4o_tot_prompt_2_results)
calculate_average_score(gpt4o_tot_prompt_2_scores)

print("GPT-4o: tot detailed prompt 2")
calculate_success_rate(gpt4o_tot_detailed_prompt_2_results)
calculate_average_score(gpt4o_tot_detailed_prompt_2_scores)

####
print("GPT-4o: tot prompt 3 average")
calculate_success_rate([gpt4o_tot_prompt_3_results, gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt4o_tot_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)

print("GPT-4o: tot prompt 3")
calculate_success_rate(gpt4o_tot_prompt_3_results)
calculate_average_score(gpt4o_tot_prompt_3_scores)

print("GPT-4o: tot detailed prompt 3")
calculate_success_rate(gpt4o_tot_detailed_prompt_3_results)
calculate_average_score(gpt4o_tot_detailed_prompt_3_scores)

GPT-4o: zero-shot baseline average
Succes rate: 0.68
Avg score: 40.64705882352941
GPT-4o: zero-shot baseline system prompt
Succes rate: 0.5
Avg score: 37.96
GPT-4o: zero-shot baseline system prompt detailed
Succes rate: 0.86
Avg score: 42.2093023255814
GPT-4o: zero-shot role system average
Succes rate: 0.67
Avg score: 40.91044776119403
GPT-4o: zero-shot role system prompt
Succes rate: 0.48
Avg score: 38.333333333333336
GPT-4o: zero-shot role system prompt detailed
Succes rate: 0.86
Avg score: 42.348837209302324
GPT-4o: zero-shot role best system average
Succes rate: 0.7
Avg score: 41.01428571428571
GPT-4o: zero-shot role best system prompt
Succes rate: 0.6
Avg score: 38.8
GPT-4o: zero-shot role best system prompt detailed
Succes rate: 0.8
Avg score: 42.675
GPT-4o: cot human average
Succes rate: 0.71
Avg score: 40.63380281690141
GPT-4o: cot human prompt
Succes rate: 0.68
Avg score: 39.05882352941177
GPT-4o: cot human prompt detailed
Succes rate: 0.74
Avg score: 42.08108108108108
GPT-4o:

In [160]:
# GPT-4
print("GPT-4: zero-shot baseline average")
calculate_success_rate([gpt4_zs_baseline_system_prompt_results, gpt4_zs_baseline_system_prompt_detailed_results])
calculate_average_score(gpt4_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_detailed_scores)

print("GPT-4: zero-shot baseline system prompt")
calculate_success_rate(gpt4_zs_baseline_system_prompt_results)
calculate_average_score(gpt4_zs_baseline_system_prompt_scores)

print("GPT-4: zero-shot baseline system prompt detailed")
calculate_success_rate(gpt4_zs_baseline_system_prompt_detailed_results)
calculate_average_score(gpt4_zs_baseline_system_prompt_detailed_scores)

####
print("GPT-4: zero-shot role system average")
calculate_success_rate([gpt4_zs_role_system_prompt_results, gpt4_zs_role_system_prompt_detailed_results])
calculate_average_score(gpt4_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_detailed_scores)

print("GPT-4: zero-shot role system prompt")
calculate_success_rate(gpt4_zs_role_system_prompt_results)
calculate_average_score(gpt4_zs_role_system_prompt_scores)

print("GPT-4: zero-shot role system prompt detailed")
calculate_success_rate(gpt4_zs_role_system_prompt_detailed_results)
calculate_average_score(gpt4_zs_role_system_prompt_detailed_scores)

####
print("GPT-4: zero-shot role best system average")
calculate_success_rate([gpt4_zs_role_best_system_prompt_results, gpt4_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt4_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_detailed_scores)

print("GPT-4: zero-shot role best system prompt")
calculate_success_rate(gpt4_zs_role_best_system_prompt_results)
calculate_average_score(gpt4_zs_role_best_system_prompt_scores)

print("GPT-4: zero-shot role best system prompt detailed")
calculate_success_rate(gpt4_zs_role_best_system_prompt_detailed_results)
calculate_average_score(gpt4_zs_role_best_system_prompt_detailed_scores)

####
print("GPT-4: cot human average")
calculate_success_rate([gpt4_cot_human_prompt_results, gpt4_cot_human_prompt_detailed_results])
calculate_average_score(gpt4_cot_human_prompt_scores + gpt4_cot_human_prompt_detailed_scores)

print("GPT-4: cot human prompt")
calculate_success_rate(gpt4_cot_human_prompt_results)
calculate_average_score(gpt4_cot_human_prompt_scores)

print("GPT-4: cot human prompt detailed")
calculate_success_rate(gpt4_cot_human_prompt_detailed_results)
calculate_average_score(gpt4_cot_human_prompt_detailed_scores)

####
print("GPT-4: cot ape average")
calculate_success_rate([gpt4_cot_ape_prompt_results, gpt4_cot_ape_prompt_detailed_results])
calculate_average_score(gpt4_cot_ape_prompt_scores + gpt4_cot_ape_prompt_detailed_scores)

print("GPT-4: cot ape prompt")
calculate_success_rate(gpt4_cot_ape_prompt_results)
calculate_average_score(gpt4_cot_ape_prompt_scores)

print("GPT-4: cot ape prompt detailed")
calculate_success_rate(gpt4_cot_ape_prompt_detailed_results)
calculate_average_score(gpt4_cot_ape_prompt_detailed_scores)

####
print("GPT-4: Meta system average")
calculate_success_rate([gpt4_meta_system_prompt_results, gpt4_meta_system_prompt_detailed_results])
calculate_average_score(gpt4_meta_system_prompt_scores + gpt4_meta_system_prompt_detailed_scores)

print("GPT-4: Meta system prompt")
calculate_success_rate(gpt4_meta_system_prompt_results)
calculate_average_score(gpt4_meta_system_prompt_scores)

print("GPT-4: Meta system prompt detailed")
calculate_success_rate(gpt4_meta_system_prompt_detailed_results)
calculate_average_score(gpt4_meta_system_prompt_detailed_scores)

####
print("GPT-4: Meta meta average")
calculate_success_rate([gpt4_meta_meta_prompt_results, gpt4_meta_meta_prompt_detailed_results])
calculate_average_score(gpt4_meta_meta_prompt_scores + gpt4_meta_meta_prompt_detailed_scores)

print("GPT-4: Meta meta prompt")
calculate_success_rate(gpt4_meta_meta_prompt_results)
calculate_average_score(gpt4_meta_meta_prompt_scores)

print("GPT-4: Meta meta prompt detailed")
calculate_success_rate(gpt4_meta_meta_prompt_detailed_results)
calculate_average_score(gpt4_meta_meta_prompt_detailed_scores)

####
print("GPT-4: tot prompt 1 average")
calculate_success_rate([gpt4_tot_prompt_1_results, gpt4_tot_detailed_prompt_1_results])
calculate_average_score(gpt4_tot_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores)

print("GPT-4: tot prompt 1")
calculate_success_rate(gpt4_tot_prompt_1_results)
calculate_average_score(gpt4_tot_prompt_1_scores)

print("GPT-4: tot detailed prompt 1")
calculate_success_rate(gpt4_tot_detailed_prompt_1_results)
calculate_average_score(gpt4_tot_detailed_prompt_1_scores)

####
print("GPT-4: tot prompt 2 average")
calculate_success_rate([gpt4_tot_prompt_2_results, gpt4_tot_detailed_prompt_2_results])
calculate_average_score(gpt4_tot_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores)

print("GPT-4: tot prompt 2")
calculate_success_rate(gpt4_tot_prompt_2_results)
calculate_average_score(gpt4_tot_prompt_2_scores)

print("GPT-4: tot detailed prompt 2")
calculate_success_rate(gpt4_tot_detailed_prompt_2_results)
calculate_average_score(gpt4_tot_detailed_prompt_2_scores)

####
print("GPT-4: tot prompt 3 average")
calculate_success_rate([gpt4_tot_prompt_3_results, gpt4_tot_detailed_prompt_3_results])
calculate_average_score(gpt4_tot_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores)

print("GPT-4: tot prompt 3")
calculate_success_rate(gpt4_tot_prompt_3_results)
calculate_average_score(gpt4_tot_prompt_3_scores)

print("GPT-4: tot detailed prompt 3")
calculate_success_rate(gpt4_tot_detailed_prompt_3_results)
calculate_average_score(gpt4_tot_detailed_prompt_3_scores)

GPT-4: zero-shot baseline average
Succes rate: 0.39
Avg score: 40.38461538461539
GPT-4: zero-shot baseline system prompt
Succes rate: 0.2
Avg score: 35.5
GPT-4: zero-shot baseline system prompt detailed
Succes rate: 0.58
Avg score: 42.06896551724138
GPT-4: zero-shot role system average
Succes rate: 0.41
Avg score: 41.170731707317074
GPT-4: zero-shot role system prompt
Succes rate: 0.12
Avg score: 37.0
GPT-4: zero-shot role system prompt detailed
Succes rate: 0.7
Avg score: 41.885714285714286
GPT-4: zero-shot role best system average
Succes rate: 0.41
Avg score: 41.65853658536585
GPT-4: zero-shot role best system prompt
Succes rate: 0.1
Avg score: 37.8
GPT-4: zero-shot role best system prompt detailed
Succes rate: 0.72
Avg score: 42.19444444444444
GPT-4: cot human average
Succes rate: 0.46
Avg score: 40.391304347826086
GPT-4: cot human prompt
Succes rate: 0.3
Avg score: 37.13333333333333
GPT-4: cot human prompt detailed
Succes rate: 0.62
Avg score: 41.96774193548387
GPT-4: cot ape avera

In [161]:
# GPT-3.5
print("GPT-3.5: zero-shot baseline average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results, gpt3_5_zs_role_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores)

print("GPT-3.5: zero-shot baseline system prompt")
calculate_success_rate(gpt3_5_zs_baseline_system_prompt_results)
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores)

print("GPT-3.5: zero-shot baseline system prompt detailed")
calculate_success_rate(gpt3_5_zs_role_system_prompt_detailed_results)
calculate_average_score(gpt3_5_zs_role_system_prompt_detailed_scores)

####
print("GPT-3.5: zero-shot role system average")
calculate_success_rate([gpt3_5_zs_role_system_prompt_results, gpt3_5_zs_role_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores)

print("GPT-3.5: zero-shot role system prompt")
calculate_success_rate(gpt3_5_zs_role_system_prompt_results)
calculate_average_score(gpt3_5_zs_role_system_prompt_scores)

print("GPT-3.5: zero-shot role system prompt detailed")
calculate_success_rate(gpt3_5_zs_role_system_prompt_detailed_results)
calculate_average_score(gpt3_5_zs_role_system_prompt_detailed_scores)

####
print("GPT-3.5: zero-shot role best system average")
calculate_success_rate([gpt3_5_zs_role_best_system_prompt_results, gpt3_5_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores)

print("GPT-3.5: zero-shot role best system prompt")
calculate_success_rate(gpt3_5_zs_role_best_system_prompt_results)
calculate_average_score(gpt3_5_zs_role_best_system_prompt_scores)

print("GPT-3.5: zero-shot role best system prompt detailed")
calculate_success_rate(gpt3_5_zs_role_best_system_prompt_detailed_results)
calculate_average_score(gpt3_5_zs_role_best_system_prompt_detailed_scores)

####
print("GPT-3.5: cot human average")
calculate_success_rate([gpt3_5_cot_human_prompt_results, gpt3_5_cot_human_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores)

print("GPT-3.5: cot human prompt")
calculate_success_rate(gpt3_5_cot_human_prompt_results)
calculate_average_score(gpt3_5_cot_human_prompt_scores)

print("GPT-3.5: cot human prompt detailed")
calculate_success_rate(gpt3_5_cot_human_prompt_detailed_results)
calculate_average_score(gpt3_5_cot_human_prompt_detailed_scores)

####
print("GPT-3.5: cot ape average")
calculate_success_rate([gpt3_5_cot_ape_prompt_results, gpt3_5_cot_ape_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores)

print("GPT-3.5: cot ape prompt")
calculate_success_rate(gpt3_5_cot_ape_prompt_results)
calculate_average_score(gpt3_5_cot_ape_prompt_scores)

print("GPT-3.5: cot ape prompt detailed")
calculate_success_rate(gpt3_5_cot_ape_prompt_detailed_results)
calculate_average_score(gpt3_5_cot_ape_prompt_detailed_scores)

####
print("GPT-3.5: Meta system average")
calculate_success_rate([gpt3_5_meta_system_prompt_results, gpt3_5_meta_system_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores)

print("GPT-3.5: Meta system prompt")
calculate_success_rate(gpt3_5_meta_system_prompt_results)
calculate_average_score(gpt3_5_meta_system_prompt_scores)

print("GPT-3.5: Meta system prompt detailed")
calculate_success_rate(gpt3_5_meta_system_prompt_detailed_results)
calculate_average_score(gpt3_5_meta_system_prompt_detailed_scores)

####
print("GPT-3.5: Meta meta average")
calculate_success_rate([gpt3_5_meta_meta_prompt_results, gpt3_5_meta_meta_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores)

print("GPT-3.5: Meta meta prompt")
calculate_success_rate(gpt3_5_meta_meta_prompt_results)
calculate_average_score(gpt3_5_meta_meta_prompt_scores)

print("GPT-3.5: Meta meta prompt detailed")
calculate_success_rate(gpt3_5_meta_meta_prompt_detailed_results)
calculate_average_score(gpt3_5_meta_meta_prompt_detailed_scores)

####
print("GPT-3.5: tot prompt 1 average")
calculate_success_rate([gpt3_5_tot_prompt_1_results, gpt3_5_tot_detailed_prompt_1_results])
calculate_average_score(gpt3_5_tot_prompt_1_scores + gpt3_5_tot_detailed_prompt_1_scores)

print("GPT-3.5: tot prompt 1")
calculate_success_rate(gpt3_5_tot_prompt_1_results)
calculate_average_score(gpt3_5_tot_prompt_1_scores)

print("GPT-3.5: tot detailed prompt 1")
calculate_success_rate(gpt3_5_tot_detailed_prompt_1_results)
calculate_average_score(gpt3_5_tot_detailed_prompt_1_scores)

####
print("GPT-3.5: tot prompt 2 average")
calculate_success_rate([gpt3_5_tot_prompt_2_results, gpt3_5_tot_detailed_prompt_2_results])
calculate_average_score(gpt3_5_tot_prompt_2_scores + gpt3_5_tot_detailed_prompt_2_scores)

print("GPT-3.5: tot prompt 2")
calculate_success_rate(gpt3_5_tot_prompt_2_results)
calculate_average_score(gpt3_5_tot_prompt_2_scores)

print("GPT-3.5: tot detailed prompt 2")
calculate_success_rate(gpt3_5_tot_detailed_prompt_2_results)
calculate_average_score(gpt3_5_tot_detailed_prompt_2_scores)

####
print("GPT-3.5: tot prompt 3 average")
calculate_success_rate([gpt3_5_tot_prompt_3_results, gpt3_5_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_3_scores)

print("GPT-3.5: tot prompt 3")
calculate_success_rate(gpt3_5_tot_prompt_3_results)
calculate_average_score(gpt3_5_tot_prompt_3_scores)

print("GPT-3.5: tot detailed prompt 3")
calculate_success_rate(gpt3_5_tot_detailed_prompt_3_results)
calculate_average_score(gpt3_5_tot_detailed_prompt_3_scores)

GPT-3.5: zero-shot baseline average
Succes rate: 0.06
Avg score: 36.166666666666664
GPT-3.5: zero-shot baseline system prompt
Succes rate: 0.06
Avg score: 34.0
GPT-3.5: zero-shot baseline system prompt detailed
Succes rate: 0.06
Avg score: 38.333333333333336
GPT-3.5: zero-shot role system average
Succes rate: 0.04
Avg score: 37.5
GPT-3.5: zero-shot role system prompt
Succes rate: 0.02
Avg score: 35.0
GPT-3.5: zero-shot role system prompt detailed
Succes rate: 0.06
Avg score: 38.333333333333336
GPT-3.5: zero-shot role best system average
Succes rate: 0.01
Avg score: 41.0
GPT-3.5: zero-shot role best system prompt
Succes rate: 0.0
Avg score: 0
GPT-3.5: zero-shot role best system prompt detailed
Succes rate: 0.02
Avg score: 41.0
GPT-3.5: cot human average
Succes rate: 0.11
Avg score: 34.81818181818182
GPT-3.5: cot human prompt
Succes rate: 0.18
Avg score: 34.22222222222222
GPT-3.5: cot human prompt detailed
Succes rate: 0.04
Avg score: 37.5
GPT-3.5: cot ape average
Succes rate: 0.17
Avg s

In [162]:
# Totale average
print("Totale average zero-shot baseline average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results, gpt4_zs_baseline_system_prompt_results, gpt4o_zs_baseline_system_prompt_results, gpt3_5_zs_baseline_system_prompt_detailed_results, gpt4_zs_baseline_system_prompt_detailed_results, gpt4o_zs_baseline_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_scores + gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_baseline_system_prompt_detailed_scores)

print("Totale average zero-shot baseline system prompt")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results, gpt4_zs_baseline_system_prompt_results, gpt4o_zs_baseline_system_prompt_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_scores)

print("Totale average zero-shot baseline system prompt detailed")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_detailed_results, gpt4_zs_baseline_system_prompt_detailed_results, gpt4o_zs_baseline_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_baseline_system_prompt_detailed_scores)

####
print("Totale average zero-shot role system average")
calculate_success_rate([gpt3_5_zs_role_system_prompt_results, gpt4_zs_role_system_prompt_results, gpt4o_zs_role_system_prompt_results, gpt3_5_zs_role_system_prompt_detailed_results, gpt4_zs_role_system_prompt_detailed_results, gpt4o_zs_role_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_detailed_scores)

print("Totale average zero-shot role system prompt")
calculate_success_rate([gpt3_5_zs_role_system_prompt_results, gpt4_zs_role_system_prompt_results, gpt4o_zs_role_system_prompt_results])
calculate_average_score(gpt3_5_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_scores)

print("Totale average zero-shot role system prompt detailed")
calculate_success_rate([gpt3_5_zs_role_system_prompt_detailed_results, gpt4_zs_role_system_prompt_detailed_results, gpt4o_zs_role_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_role_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_detailed_scores)

####
print("Totale average zero-shot role best system average")
calculate_success_rate([gpt3_5_zs_role_best_system_prompt_results, gpt4_zs_role_best_system_prompt_results, gpt4o_zs_role_best_system_prompt_results, gpt3_5_zs_role_best_system_prompt_detailed_results, gpt4_zs_role_best_system_prompt_detailed_results, gpt4o_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_detailed_scores)

print("Totale average zero-shot role best system prompt")
calculate_success_rate([gpt3_5_zs_role_best_system_prompt_results, gpt4_zs_role_best_system_prompt_results, gpt4o_zs_role_best_system_prompt_results])
calculate_average_score(gpt3_5_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_scores)

print("Totale average zero-shot role best system prompt detailed")
calculate_success_rate([gpt3_5_zs_role_best_system_prompt_detailed_results, gpt4_zs_role_best_system_prompt_detailed_results, gpt4o_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_detailed_scores)

####
print("Totale average cot human average")
calculate_success_rate([gpt3_5_cot_human_prompt_results, gpt4_cot_human_prompt_results, gpt4o_cot_human_prompt_results, gpt3_5_cot_human_prompt_detailed_results, gpt4_cot_human_prompt_detailed_results, gpt4o_cot_human_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_human_prompt_scores + gpt4_cot_human_prompt_scores + gpt4o_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt4_cot_human_prompt_detailed_scores + gpt4o_cot_human_prompt_detailed_scores)

print("Totale average cot human prompt")
calculate_success_rate([gpt3_5_cot_human_prompt_results, gpt4_cot_human_prompt_results, gpt4o_cot_human_prompt_results])
calculate_average_score(gpt3_5_cot_human_prompt_scores + gpt4_cot_human_prompt_scores + gpt4o_cot_human_prompt_scores)

print("Totale average cot human prompt detailed")
calculate_success_rate([gpt3_5_cot_human_prompt_detailed_results, gpt4_cot_human_prompt_detailed_results, gpt4o_cot_human_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_human_prompt_detailed_scores + gpt4_cot_human_prompt_detailed_scores + gpt4o_cot_human_prompt_detailed_scores)

####
print("Totale average cot ape average")
calculate_success_rate([gpt3_5_cot_ape_prompt_results, gpt4_cot_ape_prompt_results, gpt4o_cot_ape_prompt_results, gpt3_5_cot_ape_prompt_detailed_results, gpt4_cot_ape_prompt_detailed_results, gpt4o_cot_ape_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_ape_prompt_scores + gpt4_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4o_cot_ape_prompt_detailed_scores)

print("Totale average cot ape prompt")
calculate_success_rate([gpt3_5_cot_ape_prompt_results, gpt4_cot_ape_prompt_results, gpt4o_cot_ape_prompt_results])
calculate_average_score(gpt3_5_cot_ape_prompt_scores + gpt4_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_scores)

print("Totale average cot ape prompt detailed")
calculate_success_rate([gpt3_5_cot_ape_prompt_detailed_results, gpt4_cot_ape_prompt_detailed_results, gpt4o_cot_ape_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_ape_prompt_detailed_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4o_cot_ape_prompt_detailed_scores)

####
print("Totale average Meta system average")
calculate_success_rate([gpt3_5_meta_system_prompt_results, gpt4_meta_system_prompt_results, gpt4o_meta_system_prompt_results, gpt3_5_meta_system_prompt_detailed_results, gpt4_meta_system_prompt_detailed_results, gpt4o_meta_system_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_system_prompt_scores + gpt4_meta_system_prompt_scores + gpt4o_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt4_meta_system_prompt_detailed_scores + gpt4o_meta_system_prompt_detailed_scores)

print("Totale average Meta system prompt")
calculate_success_rate([gpt3_5_meta_system_prompt_results, gpt4_meta_system_prompt_results, gpt4o_meta_system_prompt_results])
calculate_average_score(gpt3_5_meta_system_prompt_scores + gpt4_meta_system_prompt_scores + gpt4o_meta_system_prompt_scores)

print("Totale average Meta system prompt detailed")
calculate_success_rate([gpt3_5_meta_system_prompt_detailed_results, gpt4_meta_system_prompt_detailed_results, gpt4o_meta_system_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_system_prompt_detailed_scores + gpt4_meta_system_prompt_detailed_scores + gpt4o_meta_system_prompt_detailed_scores)

####
print("Totale average Meta meta average")
calculate_success_rate([gpt3_5_meta_meta_prompt_results, gpt4_meta_meta_prompt_results, gpt4o_meta_meta_prompt_results, gpt3_5_meta_meta_prompt_detailed_results, gpt4_meta_meta_prompt_detailed_results, gpt4o_meta_meta_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_meta_prompt_scores + gpt4_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4o_meta_meta_prompt_detailed_scores)

print("Totale average Meta meta prompt")
calculate_success_rate([gpt3_5_meta_meta_prompt_results, gpt4_meta_meta_prompt_results, gpt4o_meta_meta_prompt_results])
calculate_average_score(gpt3_5_meta_meta_prompt_scores + gpt4_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_scores)

print("Totale average Meta meta prompt detailed")
calculate_success_rate([gpt3_5_meta_meta_prompt_detailed_results, gpt4_meta_meta_prompt_detailed_results, gpt4o_meta_meta_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_meta_prompt_detailed_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4o_meta_meta_prompt_detailed_scores)

####
print("Totale average tot prompt 1 average")
calculate_success_rate([gpt3_5_tot_prompt_1_results, gpt4_tot_prompt_1_results, gpt4o_tot_prompt_1_results, gpt3_5_tot_detailed_prompt_1_results, gpt4_tot_detailed_prompt_1_results, gpt4o_tot_detailed_prompt_1_results])
calculate_average_score(gpt3_5_tot_prompt_1_scores + gpt4_tot_prompt_1_scores + gpt4o_tot_prompt_1_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores)

print("Totale average tot prompt 1")
calculate_success_rate([gpt3_5_tot_prompt_1_results, gpt4_tot_prompt_1_results, gpt4o_tot_prompt_1_results])
calculate_average_score(gpt3_5_tot_prompt_1_scores + gpt4_tot_prompt_1_scores + gpt4o_tot_prompt_1_scores)

print("Totale average tot detailed prompt 1")
calculate_success_rate([gpt3_5_tot_detailed_prompt_1_results, gpt4_tot_detailed_prompt_1_results, gpt4o_tot_detailed_prompt_1_results])
calculate_average_score(gpt3_5_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores)

####
print("Totale average tot prompt 2 average")
calculate_success_rate([gpt3_5_tot_prompt_2_results, gpt4_tot_prompt_2_results, gpt4o_tot_prompt_2_results, gpt3_5_tot_detailed_prompt_2_results, gpt4_tot_detailed_prompt_2_results, gpt4o_tot_detailed_prompt_2_results])
calculate_average_score(gpt3_5_tot_prompt_2_scores + gpt4_tot_prompt_2_scores + gpt4o_tot_prompt_2_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores)

print("Totale average tot prompt 2")
calculate_success_rate([gpt3_5_tot_prompt_2_results, gpt4_tot_prompt_2_results, gpt4o_tot_prompt_2_results])
calculate_average_score(gpt3_5_tot_prompt_2_scores + gpt4_tot_prompt_2_scores + gpt4o_tot_prompt_2_scores)

print("Totale average tot detailed prompt 2")
calculate_success_rate([gpt3_5_tot_detailed_prompt_2_results, gpt4_tot_detailed_prompt_2_results, gpt4o_tot_detailed_prompt_2_results])
calculate_average_score(gpt3_5_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores)

####
print("Totale average tot prompt 3 average")
calculate_success_rate([gpt3_5_tot_prompt_3_results, gpt4_tot_prompt_3_results, gpt4o_tot_prompt_3_results, gpt3_5_tot_detailed_prompt_3_results, gpt4_tot_detailed_prompt_3_results, gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_tot_prompt_3_scores + gpt4_tot_prompt_3_scores + gpt4o_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)

print("Totale average tot prompt 3")
calculate_success_rate([gpt3_5_tot_prompt_3_results, gpt4_tot_prompt_3_results, gpt4o_tot_prompt_3_results])
calculate_average_score(gpt3_5_tot_prompt_3_scores + gpt4_tot_prompt_3_scores + gpt4o_tot_prompt_3_scores)

print("Totale average tot detailed prompt 3")
calculate_success_rate([gpt3_5_tot_detailed_prompt_3_results, gpt4_tot_detailed_prompt_3_results, gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_tot_detailed_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)

Totale average zero-shot baseline average
Succes rate: 0.36666666666666664
Avg score: 40.372727272727275
Totale average zero-shot baseline system prompt
Succes rate: 0.25333333333333335
Avg score: 37.0
Totale average zero-shot baseline system prompt detailed
Succes rate: 0.48
Avg score: 42.15277777777778
Totale average zero-shot role system average
Succes rate: 0.37333333333333335
Avg score: 40.88392857142857
Totale average zero-shot role system prompt
Succes rate: 0.20666666666666667
Avg score: 37.96774193548387
Totale average zero-shot role system prompt detailed
Succes rate: 0.54
Avg score: 42.0
Totale average zero-shot role best system average
Succes rate: 0.37333333333333335
Avg score: 41.25
Totale average zero-shot role best system prompt
Succes rate: 0.23333333333333334
Avg score: 38.65714285714286
Totale average zero-shot role best system prompt detailed
Succes rate: 0.5133333333333333
Avg score: 42.42857142857143
Totale average cot human average
Succes rate: 0.4266666666666667

In [163]:
# GPT-4o zero shot
print("GPT-4o zero-shot average")
calculate_success_rate([gpt4o_zs_baseline_system_prompt_results,
                        gpt4o_zs_baseline_system_prompt_detailed_results,
                        gpt4o_zs_role_system_prompt_results,
                        gpt4o_zs_role_system_prompt_detailed_scores,
                        gpt4o_zs_role_best_system_prompt_results,
                        gpt4o_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt4o_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_detailed_scores)

print("GPT-4o cot average")
calculate_success_rate([gpt4o_cot_human_prompt_results,
                        gpt4o_cot_human_prompt_detailed_results,
                        gpt4o_cot_ape_prompt_results,
                        gpt4o_cot_ape_prompt_detailed_results])
calculate_average_score(gpt4o_cot_human_prompt_scores + gpt4o_cot_human_prompt_detailed_scores + gpt4o_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_detailed_scores)

print("GPT-4o Meta average")
calculate_success_rate([gpt4o_meta_system_prompt_results,
                        gpt4o_meta_system_prompt_detailed_results,
                        gpt4o_meta_meta_prompt_results,
                        gpt4o_meta_meta_prompt_detailed_results])
calculate_average_score(gpt4o_meta_system_prompt_scores + gpt4o_meta_system_prompt_detailed_scores + gpt4o_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_detailed_scores)

print("GPT-4o tot average")
calculate_success_rate([gpt4o_tot_prompt_1_results,
                        gpt4o_tot_detailed_prompt_1_results,
                        gpt4o_tot_prompt_2_results,
                        gpt4o_tot_detailed_prompt_2_results,
                        gpt4o_tot_prompt_3_results,
                        gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt4o_tot_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores + gpt4o_tot_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores + gpt4o_tot_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)

# GPT-4o Totale average
print("GPT-4o Totale average")
calculate_success_rate([gpt4o_zs_baseline_system_prompt_results,
                        gpt4o_zs_baseline_system_prompt_detailed_results,
                        gpt4o_zs_role_system_prompt_results,
                        gpt4o_zs_role_system_prompt_detailed_scores,
                        gpt4o_zs_role_best_system_prompt_results,
                        gpt4o_zs_role_best_system_prompt_detailed_results,
                        gpt4o_cot_human_prompt_results,
                        gpt4o_cot_human_prompt_detailed_results,
                        gpt4o_cot_ape_prompt_results,
                        gpt4o_cot_ape_prompt_detailed_results,
                        gpt4o_meta_system_prompt_results,
                        gpt4o_meta_system_prompt_detailed_results,
                        gpt4o_meta_meta_prompt_results,
                        gpt4o_meta_meta_prompt_detailed_results,
                        gpt4o_tot_prompt_1_results,
                        gpt4o_tot_detailed_prompt_1_results,
                        gpt4o_tot_prompt_2_results,
                        gpt4o_tot_detailed_prompt_2_results,
                        gpt4o_tot_prompt_3_results,
                        gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt4o_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_detailed_scores + gpt4o_cot_human_prompt_scores + gpt4o_cot_human_prompt_detailed_scores + gpt4o_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_detailed_scores + gpt4o_meta_system_prompt_scores + gpt4o_meta_system_prompt_detailed_scores + gpt4o_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_detailed_scores + gpt4o_tot_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores + gpt4o_tot_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores + gpt4o_tot_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)


GPT-4o zero-shot average
Succes rate: 0.648
Avg score: 40.858536585365854
GPT-4o cot average
Succes rate: 0.675
Avg score: 40.71111111111111
GPT-4o Meta average
Succes rate: 0.45
Avg score: 43.63333333333333
GPT-4o tot average
Succes rate: 0.6033333333333334
Avg score: 40.46408839779006
GPT-4o Totale average
Succes rate: 0.5978947368421053
Avg score: 41.11783960720131


In [164]:
# GPT-4 zero shot
print("GPT-4 zero-shot average")
calculate_success_rate([gpt4_zs_baseline_system_prompt_results,
                        gpt4_zs_baseline_system_prompt_detailed_results,
                        gpt4_zs_role_system_prompt_results,
                        gpt4_zs_role_system_prompt_detailed_results,
                        gpt4_zs_role_best_system_prompt_results,
                        gpt4_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt4_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_detailed_scores)

print("GPT-4 cot average")
calculate_success_rate([gpt4_cot_human_prompt_results,
                        gpt4_cot_human_prompt_detailed_results,
                        gpt4_cot_ape_prompt_results,
                        gpt4_cot_ape_prompt_detailed_results])
calculate_average_score(gpt4_cot_human_prompt_scores + gpt4_cot_human_prompt_detailed_scores + gpt4_cot_ape_prompt_scores + gpt4_cot_ape_prompt_detailed_scores)

print("GPT-4 Meta average")
calculate_success_rate([gpt4_meta_system_prompt_results,
                        gpt4_meta_system_prompt_detailed_results,
                        gpt4_meta_meta_prompt_results,
                        gpt4_meta_meta_prompt_detailed_results])

calculate_average_score(gpt4_meta_system_prompt_scores + gpt4_meta_system_prompt_detailed_scores + gpt4_meta_meta_prompt_scores + gpt4_meta_meta_prompt_detailed_scores)

print("GPT-4 tot average")
calculate_success_rate([gpt4_tot_prompt_1_results,
                        gpt4_tot_detailed_prompt_1_results,
                        gpt4_tot_prompt_2_results,
                        gpt4_tot_detailed_prompt_2_results,
                        gpt4_tot_prompt_3_results,
                        gpt4_tot_detailed_prompt_3_results])

calculate_average_score(gpt4_tot_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4_tot_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4_tot_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores)

# GPT-4 Totale average
print("GPT-4 Totale average")
calculate_success_rate([gpt4_zs_baseline_system_prompt_results,
                        gpt4_zs_baseline_system_prompt_detailed_results,
                        gpt4_zs_role_system_prompt_results,
                        gpt4_zs_role_system_prompt_detailed_results,
                        gpt4_zs_role_best_system_prompt_results,
                        gpt4_zs_role_best_system_prompt_detailed_results,
                        gpt4_cot_human_prompt_results,
                        gpt4_cot_human_prompt_detailed_results,
                        gpt4_cot_ape_prompt_results,
                        gpt4_cot_ape_prompt_detailed_results,
                        gpt4_meta_system_prompt_results,
                        gpt4_meta_system_prompt_detailed_results,
                        gpt4_meta_meta_prompt_results,
                        gpt4_meta_meta_prompt_detailed_results,
                        gpt4_tot_prompt_1_results,
                        gpt4_tot_detailed_prompt_1_results,
                        gpt4_tot_prompt_2_results,
                        gpt4_tot_detailed_prompt_2_results,
                        gpt4_tot_prompt_3_results,
                        gpt4_tot_detailed_prompt_3_results])
calculate_average_score(gpt4_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4_cot_human_prompt_scores + gpt4_cot_human_prompt_detailed_scores + gpt4_cot_ape_prompt_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4_meta_system_prompt_scores + gpt4_meta_system_prompt_detailed_scores + gpt4_meta_meta_prompt_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4_tot_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4_tot_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4_tot_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores)


GPT-4 zero-shot average
Succes rate: 0.4033333333333333
Avg score: 41.082644628099175
GPT-4 cot average
Succes rate: 0.45
Avg score: 40.144444444444446
GPT-4 Meta average
Succes rate: 0.26
Avg score: 42.25
GPT-4 tot average
Succes rate: 0.14666666666666667
Avg score: 39.61363636363637
GPT-4 Totale average
Succes rate: 0.307
Avg score: 40.79478827361564


In [165]:
# GPT-3.5 zero shot
print("GPT-3.5 zero-shot average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results,
                        gpt3_5_zs_baseline_system_prompt_detailed_results,
                        gpt3_5_zs_role_system_prompt_results,
                        gpt3_5_zs_role_system_prompt_detailed_results,
                        gpt3_5_zs_role_best_system_prompt_results,
                        gpt3_5_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores)

print("GPT-3.5 cot average")
calculate_success_rate([gpt3_5_cot_human_prompt_results,
                        gpt3_5_cot_human_prompt_detailed_results,
                        gpt3_5_cot_ape_prompt_results,
                        gpt3_5_cot_ape_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores)

print("GPT-3.5 Meta average")
calculate_success_rate([gpt3_5_meta_system_prompt_results,
                        gpt3_5_meta_system_prompt_detailed_results,
                        gpt3_5_meta_meta_prompt_results,
                        gpt3_5_meta_meta_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores)

print("GPT-3.5 tot average")
calculate_success_rate([gpt3_5_tot_prompt_1_results,
                        gpt3_5_tot_detailed_prompt_1_results,
                        gpt3_5_tot_prompt_2_results,
                        gpt3_5_tot_detailed_prompt_2_results,
                        gpt3_5_tot_prompt_3_results,
                        gpt3_5_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_tot_prompt_1_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt3_5_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_3_scores)

# GPT-3.5 Totale average
print("GPT-3.5 Totale average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results,
                        gpt3_5_zs_baseline_system_prompt_detailed_results,
                        gpt3_5_zs_role_system_prompt_results,
                        gpt3_5_zs_role_system_prompt_detailed_results,
                        gpt3_5_zs_role_best_system_prompt_results,
                        gpt3_5_zs_role_best_system_prompt_detailed_results,
                        gpt3_5_cot_human_prompt_results,
                        gpt3_5_cot_human_prompt_detailed_results,
                        gpt3_5_cot_ape_prompt_results,
                        gpt3_5_cot_ape_prompt_detailed_results,
                        gpt3_5_meta_system_prompt_results,
                        gpt3_5_meta_system_prompt_detailed_results,
                        gpt3_5_meta_meta_prompt_results,
                        gpt3_5_meta_meta_prompt_detailed_results,
                        gpt3_5_tot_prompt_1_results,
                        gpt3_5_tot_detailed_prompt_1_results,
                        gpt3_5_tot_prompt_2_results,
                        gpt3_5_tot_detailed_prompt_2_results,
                        gpt3_5_tot_prompt_3_results,
                        gpt3_5_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt3_5_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores + gpt3_5_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores + gpt3_5_tot_prompt_1_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt3_5_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_3_scores)


GPT-3.5 zero-shot average
Succes rate: 0.02666666666666667
Avg score: 36.625
GPT-3.5 cot average
Succes rate: 0.14
Avg score: 34.32142857142857
GPT-3.5 Meta average
Succes rate: 0.03
Avg score: 38.166666666666664
GPT-3.5 tot average
Succes rate: 0.0033333333333333335
Avg score: 36.0
GPT-3.5 Totale average
Succes rate: 0.043
Avg score: 35.325581395348834


In [166]:
# Totale average zero shot
print("Totale average zero-shot average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results,
                        gpt4_zs_baseline_system_prompt_results,
                        gpt4o_zs_baseline_system_prompt_results,
                        gpt3_5_zs_baseline_system_prompt_detailed_results,
                        gpt4_zs_baseline_system_prompt_detailed_results,
                        gpt4o_zs_baseline_system_prompt_detailed_results,
                        gpt3_5_zs_role_system_prompt_results,
                        gpt4_zs_role_system_prompt_results,
                        gpt4o_zs_role_system_prompt_results,
                        gpt3_5_zs_role_system_prompt_detailed_results,
                        gpt4_zs_role_system_prompt_detailed_results,
                        gpt4o_zs_role_system_prompt_detailed_results,
                        gpt3_5_zs_role_best_system_prompt_results,
                        gpt4_zs_role_best_system_prompt_results,
                        gpt4o_zs_role_best_system_prompt_results,
                        gpt3_5_zs_role_best_system_prompt_detailed_results,
                        gpt4_zs_role_best_system_prompt_detailed_results,
                        gpt4o_zs_role_best_system_prompt_detailed_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_scores + gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_detailed_scores)

print("Totale average cot average")
calculate_success_rate([gpt3_5_cot_human_prompt_results,
                        gpt4_cot_human_prompt_results,
                        gpt4o_cot_human_prompt_results,
                        gpt3_5_cot_human_prompt_detailed_results,
                        gpt4_cot_human_prompt_detailed_results,
                        gpt4o_cot_human_prompt_detailed_results,
                        gpt3_5_cot_ape_prompt_results,
                        gpt4_cot_ape_prompt_results,
                        gpt4o_cot_ape_prompt_results,
                        gpt3_5_cot_ape_prompt_detailed_results,
                        gpt4_cot_ape_prompt_detailed_results,
                        gpt4o_cot_ape_prompt_detailed_results])
calculate_average_score(gpt3_5_cot_human_prompt_scores + gpt4_cot_human_prompt_scores + gpt4o_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt4_cot_human_prompt_detailed_scores + gpt4o_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_scores + gpt4_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4o_cot_ape_prompt_detailed_scores)

print("Totale average Meta average")
calculate_success_rate([gpt3_5_meta_system_prompt_results,
                        gpt4_meta_system_prompt_results,
                        gpt4o_meta_system_prompt_results,
                        gpt3_5_meta_system_prompt_detailed_results,
                        gpt4_meta_system_prompt_detailed_results,
                        gpt4o_meta_system_prompt_detailed_results,
                        gpt3_5_meta_meta_prompt_results,
                        gpt4_meta_meta_prompt_results,
                        gpt4o_meta_meta_prompt_results,
                        gpt3_5_meta_meta_prompt_detailed_results,
                        gpt4_meta_meta_prompt_detailed_results,
                        gpt4o_meta_meta_prompt_detailed_results])
calculate_average_score(gpt3_5_meta_system_prompt_scores + gpt4_meta_system_prompt_scores + gpt4o_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt4_meta_system_prompt_detailed_scores + gpt4o_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_scores + gpt4_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4o_meta_meta_prompt_detailed_scores)

print("Totale average tot average")
calculate_success_rate([gpt3_5_tot_prompt_1_results,
                        gpt4_tot_prompt_1_results,
                        gpt4o_tot_prompt_1_results,
                        gpt3_5_tot_detailed_prompt_1_results,
                        gpt4_tot_detailed_prompt_1_results,
                        gpt4o_tot_detailed_prompt_1_results,
                        gpt3_5_tot_prompt_2_results,
                        gpt4_tot_prompt_2_results,
                        gpt4o_tot_prompt_2_results,
                        gpt3_5_tot_detailed_prompt_2_results,
                        gpt4_tot_detailed_prompt_2_results,
                        gpt4o_tot_detailed_prompt_2_results,
                        gpt3_5_tot_prompt_3_results,
                        gpt4_tot_prompt_3_results,
                        gpt4o_tot_prompt_3_results,
                        gpt3_5_tot_detailed_prompt_3_results,
                        gpt4_tot_detailed_prompt_3_results,
                        gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_tot_prompt_1_scores + gpt4_tot_prompt_1_scores + gpt4o_tot_prompt_1_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt4_tot_prompt_2_scores + gpt4o_tot_prompt_2_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores + gpt3_5_tot_prompt_3_scores + gpt4_tot_prompt_3_scores + gpt4o_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)

# Totale totale average
print("Totale totale average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results,
                        gpt4_zs_baseline_system_prompt_results,
                        gpt4o_zs_baseline_system_prompt_results,
                        gpt3_5_zs_baseline_system_prompt_detailed_results,
                        gpt4_zs_baseline_system_prompt_detailed_results,
                        gpt4o_zs_baseline_system_prompt_detailed_results,
                        gpt3_5_zs_role_system_prompt_results,
                        gpt4_zs_role_system_prompt_results,
                        gpt4o_zs_role_system_prompt_results,
                        gpt3_5_zs_role_system_prompt_detailed_results,
                        gpt4_zs_role_system_prompt_detailed_results,
                        gpt4o_zs_role_system_prompt_detailed_results,
                        gpt3_5_zs_role_best_system_prompt_results,
                        gpt4_zs_role_best_system_prompt_results,
                        gpt4o_zs_role_best_system_prompt_results,
                        gpt3_5_zs_role_best_system_prompt_detailed_results,
                        gpt4_zs_role_best_system_prompt_detailed_results,
                        gpt4o_zs_role_best_system_prompt_detailed_results,
                        gpt3_5_cot_human_prompt_results,
                        gpt4_cot_human_prompt_results,
                        gpt4o_cot_human_prompt_results,
                        gpt3_5_cot_human_prompt_detailed_results,
                        gpt4_cot_human_prompt_detailed_results,
                        gpt4o_cot_human_prompt_detailed_results,
                        gpt3_5_cot_ape_prompt_results,
                        gpt4_cot_ape_prompt_results,
                        gpt4o_cot_ape_prompt_results,
                        gpt3_5_cot_ape_prompt_detailed_results,
                        gpt4_cot_ape_prompt_detailed_results,
                        gpt4o_cot_ape_prompt_detailed_results,
                        gpt3_5_meta_system_prompt_results,
                        gpt4_meta_system_prompt_results,
                        gpt4o_meta_system_prompt_results,
                        gpt3_5_meta_system_prompt_detailed_results,
                        gpt4_meta_system_prompt_detailed_results,
                        gpt4o_meta_system_prompt_detailed_results,
                        gpt3_5_meta_meta_prompt_results,
                        gpt4_meta_meta_prompt_results,
                        gpt4o_meta_meta_prompt_results,
                        gpt3_5_meta_meta_prompt_detailed_results,
                        gpt4_meta_meta_prompt_detailed_results,
                        gpt4o_meta_meta_prompt_detailed_results,
                        gpt3_5_tot_prompt_1_results,
                        gpt4_tot_prompt_1_results,
                        gpt4o_tot_prompt_1_results,
                        gpt3_5_tot_detailed_prompt_1_results,
                        gpt4_tot_detailed_prompt_1_results,
                        gpt4o_tot_detailed_prompt_1_results,
                        gpt3_5_tot_prompt_2_results,
                        gpt4_tot_prompt_2_results,
                        gpt4o_tot_prompt_2_results,
                        gpt3_5_tot_detailed_prompt_2_results,
                        gpt4_tot_detailed_prompt_2_results,
                        gpt4o_tot_detailed_prompt_2_results,
                        gpt3_5_tot_prompt_3_results,
                        gpt4_tot_prompt_3_results,
                        gpt4o_tot_prompt_3_results,
                        gpt3_5_tot_detailed_prompt_3_results,
                        gpt4_tot_detailed_prompt_3_results,
                        gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_scores + gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_detailed_scores + gpt3_5_cot_human_prompt_scores + gpt4_cot_human_prompt_scores + gpt4o_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt4_cot_human_prompt_detailed_scores + gpt4o_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_scores + gpt4_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4o_cot_ape_prompt_detailed_scores + gpt3_5_meta_system_prompt_scores + gpt4_meta_system_prompt_scores + gpt4o_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt4_meta_system_prompt_detailed_scores + gpt4o_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_scores + gpt4_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4o_meta_meta_prompt_detailed_scores + gpt3_5_tot_prompt_1_scores + gpt4_tot_prompt_1_scores + gpt4o_tot_prompt_1_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt4_tot_prompt_2_scores + gpt4o_tot_prompt_2_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores + gpt3_5_tot_prompt_3_scores + gpt4_tot_prompt_3_scores + gpt4o_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)


Totale average zero-shot average
Succes rate: 0.3711111111111111
Avg score: 40.83832335329341
Totale average cot average
Succes rate: 0.4216666666666667
Avg score: 39.80237154150198
Totale average Meta average
Succes rate: 0.24666666666666667
Avg score: 42.92567567567568
Totale average tot average
Succes rate: 0.2511111111111111
Avg score: 40.2787610619469
Totale totale average
Succes rate: 0.32033333333333336
Avg score: 40.755463059313215


In [167]:
# GPT-4o not detailed average
print("GPT-4o not detailed average")
calculate_success_rate([gpt4o_zs_baseline_system_prompt_results,
                        gpt4o_zs_role_system_prompt_results,
                        gpt4o_zs_role_best_system_prompt_results,
                        gpt4o_cot_human_prompt_results,
                        gpt4o_cot_ape_prompt_results,
                        gpt4o_meta_system_prompt_results,
                        gpt4o_meta_meta_prompt_results,
                        gpt4o_tot_prompt_1_results,
                        gpt4o_tot_prompt_2_results,
                        gpt4o_tot_prompt_3_results])
calculate_average_score(gpt4o_zs_baseline_system_prompt_scores + gpt4o_zs_role_system_prompt_scores + gpt4o_zs_role_best_system_prompt_scores + gpt4o_cot_human_prompt_scores + gpt4o_cot_ape_prompt_scores + gpt4o_meta_system_prompt_scores + gpt4o_meta_meta_prompt_scores + gpt4o_tot_prompt_1_scores + gpt4o_tot_prompt_2_scores + gpt4o_tot_prompt_3_scores)


# GPT-4o detailed average
print("GPT-4o detailed average")
calculate_success_rate([gpt4o_zs_baseline_system_prompt_detailed_results,
                        gpt4o_zs_role_system_prompt_detailed_results,
                        gpt4o_zs_role_best_system_prompt_detailed_results,
                        gpt4o_cot_human_prompt_detailed_results,
                        gpt4o_cot_ape_prompt_detailed_results,
                        gpt4o_meta_system_prompt_detailed_results,
                        gpt4o_meta_meta_prompt_detailed_results,
                        gpt4o_tot_detailed_prompt_1_results,
                        gpt4o_tot_detailed_prompt_2_results,
                        gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt4o_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_detailed_scores + gpt4o_cot_human_prompt_detailed_scores + gpt4o_cot_ape_prompt_detailed_scores + gpt4o_meta_system_prompt_detailed_scores + gpt4o_meta_meta_prompt_detailed_scores + gpt4o_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_3_scores)


GPT-4o not detailed average
Succes rate: 0.504
Avg score: 38.95634920634921
GPT-4o detailed average
Succes rate: 0.718
Avg score: 42.63509749303621


In [168]:
# GPT-4 not detailed average
print("GPT-4 not detailed average")
calculate_success_rate([gpt4_zs_baseline_system_prompt_results,
                        gpt4_zs_role_system_prompt_results,
                        gpt4_zs_role_best_system_prompt_results,
                        gpt4_cot_human_prompt_results,
                        gpt4_cot_ape_prompt_results,
                        gpt4_meta_system_prompt_results,
                        gpt4_meta_meta_prompt_results,
                        gpt4_tot_prompt_1_results,
                        gpt4_tot_prompt_2_results,
                        gpt4_tot_prompt_3_results])
calculate_average_score(gpt4_zs_baseline_system_prompt_scores + gpt4_zs_role_system_prompt_scores + gpt4_zs_role_best_system_prompt_scores + gpt4_cot_human_prompt_scores + gpt4_cot_ape_prompt_scores + gpt4_meta_system_prompt_scores + gpt4_meta_meta_prompt_scores + gpt4_tot_prompt_1_scores + gpt4_tot_prompt_2_scores + gpt4_tot_prompt_3_scores)

# GPT-4 detailed average
print("GPT-4 detailed average")
calculate_success_rate([gpt4_zs_baseline_system_prompt_detailed_results,
                        gpt4_zs_role_system_prompt_detailed_results,
                        gpt4_zs_role_best_system_prompt_detailed_results,
                        gpt4_cot_human_prompt_detailed_results,
                        gpt4_cot_ape_prompt_detailed_results,
                        gpt4_meta_system_prompt_detailed_results,
                        gpt4_meta_meta_prompt_detailed_results,
                        gpt4_tot_detailed_prompt_1_results,
                        gpt4_tot_detailed_prompt_2_results,
                        gpt4_tot_detailed_prompt_3_results])
calculate_average_score(gpt4_zs_baseline_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4_cot_human_prompt_detailed_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4_meta_system_prompt_detailed_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_3_scores)


GPT-4 not detailed average
Succes rate: 0.16
Avg score: 37.0
GPT-4 detailed average
Succes rate: 0.454
Avg score: 42.13215859030837


In [169]:
# GPT-3.5 not detailed average
print("GPT-3.5 not detailed average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results,
                        gpt3_5_zs_role_system_prompt_results,
                        gpt3_5_zs_role_best_system_prompt_results,
                        gpt3_5_cot_human_prompt_results,
                        gpt3_5_cot_ape_prompt_results,
                        gpt3_5_meta_system_prompt_results,
                        gpt3_5_meta_meta_prompt_results,
                        gpt3_5_tot_prompt_1_results,
                        gpt3_5_tot_prompt_2_results,
                        gpt3_5_tot_prompt_3_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt3_5_zs_role_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt3_5_cot_human_prompt_scores + gpt3_5_cot_ape_prompt_scores + gpt3_5_meta_system_prompt_scores + gpt3_5_meta_meta_prompt_scores + gpt3_5_tot_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt3_5_tot_prompt_3_scores)

# GPT-3.5 detailed average
print("GPT-3.5 detailed average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_detailed_results,
                        gpt3_5_zs_role_system_prompt_detailed_results,
                        gpt3_5_zs_role_best_system_prompt_detailed_results,
                        gpt3_5_cot_human_prompt_detailed_results,
                        gpt3_5_cot_ape_prompt_detailed_results,
                        gpt3_5_meta_system_prompt_detailed_results,
                        gpt3_5_meta_meta_prompt_detailed_results,
                        gpt3_5_tot_detailed_prompt_1_results,
                        gpt3_5_tot_detailed_prompt_2_results,
                        gpt3_5_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_detailed_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_detailed_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt3_5_tot_detailed_prompt_3_scores)


GPT-3.5 not detailed average
Succes rate: 0.056
Avg score: 34.32142857142857
GPT-3.5 detailed average
Succes rate: 0.03
Avg score: 37.2


In [170]:
# Totale not detailed average
print("Totale not detailed average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_results,
                        gpt4_zs_baseline_system_prompt_results,
                        gpt4o_zs_baseline_system_prompt_results,
                        gpt3_5_zs_role_system_prompt_results,
                        gpt4_zs_role_system_prompt_results,
                        gpt4o_zs_role_system_prompt_results,
                        gpt3_5_zs_role_best_system_prompt_results,
                        gpt4_zs_role_best_system_prompt_results,
                        gpt4o_zs_role_best_system_prompt_results,
                        gpt3_5_cot_human_prompt_results,
                        gpt4_cot_human_prompt_results,
                        gpt4o_cot_human_prompt_results,
                        gpt3_5_cot_ape_prompt_results,
                        gpt4_cot_ape_prompt_results,
                        gpt4o_cot_ape_prompt_results,
                        gpt3_5_meta_system_prompt_results,
                        gpt4_meta_system_prompt_results,
                        gpt4o_meta_system_prompt_results,
                        gpt3_5_meta_meta_prompt_results,
                        gpt4_meta_meta_prompt_results,
                        gpt4o_meta_meta_prompt_results,
                        gpt3_5_tot_prompt_1_results,
                        gpt4_tot_prompt_1_results,
                        gpt4o_tot_prompt_1_results,
                        gpt3_5_tot_prompt_2_results,
                        gpt4_tot_prompt_2_results,
                        gpt4o_tot_prompt_2_results,
                        gpt3_5_tot_prompt_3_results,
                        gpt4_tot_prompt_3_results,
                        gpt4o_tot_prompt_3_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_scores + gpt3_5_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_scores + gpt3_5_cot_human_prompt_scores + gpt4_cot_human_prompt_scores + gpt4o_cot_human_prompt_scores + gpt3_5_cot_ape_prompt_scores + gpt4_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_scores + gpt3_5_meta_system_prompt_scores + gpt4_meta_system_prompt_scores + gpt4o_meta_system_prompt_scores + gpt3_5_meta_meta_prompt_scores + gpt4_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_scores + gpt3_5_tot_prompt_1_scores + gpt4_tot_prompt_1_scores + gpt4o_tot_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt4_tot_prompt_2_scores + gpt4o_tot_prompt_2_scores + gpt3_5_tot_prompt_3_scores + gpt4_tot_prompt_3_scores + gpt4o_tot_prompt_3_scores)

# Totale detailed average
print("Totale detailed average")
calculate_success_rate([gpt3_5_zs_baseline_system_prompt_detailed_results,
                        gpt4_zs_baseline_system_prompt_detailed_results,
                        gpt4o_zs_baseline_system_prompt_detailed_results,
                        gpt3_5_zs_role_system_prompt_detailed_results,
                        gpt4_zs_role_system_prompt_detailed_results,
                        gpt4o_zs_role_system_prompt_detailed_results,
                        gpt3_5_zs_role_best_system_prompt_detailed_results,
                        gpt4_zs_role_best_system_prompt_detailed_results,
                        gpt4o_zs_role_best_system_prompt_detailed_results,
                        gpt3_5_cot_human_prompt_detailed_results,
                        gpt4_cot_human_prompt_detailed_results,
                        gpt4o_cot_human_prompt_detailed_results,
                        gpt3_5_cot_ape_prompt_detailed_results,
                        gpt4_cot_ape_prompt_detailed_results,
                        gpt4o_cot_ape_prompt_detailed_results,
                        gpt3_5_meta_system_prompt_detailed_results,
                        gpt4_meta_system_prompt_detailed_results,
                        gpt4o_meta_system_prompt_detailed_results,
                        gpt3_5_meta_meta_prompt_detailed_results,
                        gpt4_meta_meta_prompt_detailed_results,
                        gpt4o_meta_meta_prompt_detailed_results,
                        gpt3_5_tot_detailed_prompt_1_results,
                        gpt4_tot_detailed_prompt_1_results,
                        gpt4o_tot_detailed_prompt_1_results,
                        gpt3_5_tot_detailed_prompt_2_results,
                        gpt4_tot_detailed_prompt_2_results,
                        gpt4o_tot_detailed_prompt_2_results,
                        gpt3_5_tot_detailed_prompt_3_results,
                        gpt4_tot_detailed_prompt_3_results,
                        gpt4o_tot_detailed_prompt_3_results])
calculate_average_score(gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_detailed_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt4_cot_human_prompt_detailed_scores + gpt4o_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_detailed_scores + gpt4_cot_ape_prompt_detailed_scores + gpt4o_cot_ape_prompt_detailed_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt4_meta_system_prompt_detailed_scores + gpt4o_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_detailed_scores + gpt4_meta_meta_prompt_detailed_scores + gpt4o_meta_meta_prompt_detailed_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_1_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_2_scores + gpt3_5_tot_detailed_prompt_3_scores + gpt4_tot_detailed_prompt_3_scores + gpt4o_tot_detailed_prompt_3_scores)


Totale not detailed average
Succes rate: 0.24
Avg score: 38.16111111111111
Totale detailed average
Succes rate: 0.40066666666666667
Avg score: 42.30948419301165


# Code for fails table

In [171]:
def extract_fails(input):
    if isinstance(input, list):
        results = aggregate_all_test_results(input)
    else:
        results = input
    other_deployment_errors = results["deployment_failed"] - results["duplicate_resources"]
    mysql_failed = results["MySQL_PVC_unbound"] + results["MySQL_not_ready"]
    other_health_errors = results["Service_failed"] + results["Unknown_error"] + results["Wordpress_PVC_unbound"]
    result_string = "," + str(results["yaml_not_wrapped"]) + "," + str(results["no_mysql_or_wordpress"]) + "," + str(results["invalid_yaml"]) + "," + str(results["kubeconform_failed"]) + "," + str(results["duplicate_resources"]) + "," + str(other_deployment_errors) + "," + str(results["PVC_failed"]) + "," + str(results["Missing_resources"]) + "," + str(results["Missing_key"]) + "," + str(mysql_failed) + "," + str(results["Wordpress_not_ready"]) + "," + str(other_health_errors) + "," + str(results["total_responses"] - results["healthy"])
    if (results["yaml_not_wrapped"] + results["no_mysql_or_wordpress"] + results["invalid_yaml"] + results["kubeconform_failed"] + results["duplicate_resources"] + other_deployment_errors + results["PVC_failed"] + results["Missing_resources"] + results["Missing_key"] + mysql_failed + results["Wordpress_not_ready"] + other_health_errors) != (results["total_responses"] - results["healthy"]):
        raise Exception("Error fail count does not match total responses")
    return result_string

In [172]:

# gpt4o
print("GPT-4o" + extract_fails([gpt4o_zs_baseline_system_prompt_results,
                                gpt4o_zs_baseline_system_prompt_detailed_results,
                                gpt4o_zs_role_system_prompt_results,
                                gpt4o_zs_role_system_prompt_detailed_results,
                                gpt4o_zs_role_best_system_prompt_results,
                                gpt4o_zs_role_best_system_prompt_detailed_results,
                                gpt4o_cot_human_prompt_results,
                                gpt4o_cot_human_prompt_detailed_results,
                                gpt4o_cot_ape_prompt_results,
                                gpt4o_cot_ape_prompt_detailed_results,
                                gpt4o_meta_system_prompt_results,
                                gpt4o_meta_system_prompt_detailed_results,
                                gpt4o_meta_meta_prompt_results,
                                gpt4o_meta_meta_prompt_detailed_results,
                                gpt4o_tot_prompt_1_results,
                                gpt4o_tot_detailed_prompt_1_results,
                                gpt4o_tot_prompt_2_results,
                                gpt4o_tot_detailed_prompt_2_results,
                                gpt4o_tot_prompt_3_results,
                                gpt4o_tot_detailed_prompt_3_results]))
# gpt4o zero-shot baseline
print("Zero-Shot" + extract_fails([gpt4o_zs_baseline_system_prompt_results,
                                   gpt4o_zs_baseline_system_prompt_detailed_results]))

# gpt4o zero-shot role-based
print("Role-Based" + extract_fails([gpt4o_zs_role_system_prompt_results,
                                    gpt4o_zs_role_system_prompt_detailed_results,
                                    gpt4o_zs_role_best_system_prompt_results,
                                    gpt4o_zs_role_best_system_prompt_detailed_results]))

# gpt4o cot
print("Zero-Shot CoT" + extract_fails([gpt4o_cot_human_prompt_results,
                             gpt4o_cot_human_prompt_detailed_results,
                             gpt4o_cot_ape_prompt_results,
                             gpt4o_cot_ape_prompt_detailed_results]))

# gpt4o meta
print("Meta prompting" + extract_fails([gpt4o_meta_system_prompt_results,
                             gpt4o_meta_system_prompt_detailed_results]))

# gpt4o meta-meta
print("Meta meta prompting" + extract_fails([gpt4o_meta_meta_prompt_results,
                             gpt4o_meta_meta_prompt_detailed_results]))

# gpt4o tot
print("ToT-style prompts" + extract_fails([gpt4o_tot_prompt_1_results,
                             gpt4o_tot_detailed_prompt_1_results,
                             gpt4o_tot_prompt_2_results,
                             gpt4o_tot_detailed_prompt_2_results,
                             gpt4o_tot_prompt_3_results,
                             gpt4o_tot_detailed_prompt_3_results]))

# gpt4
print("GPT-4" + extract_fails([gpt4_zs_baseline_system_prompt_results,
                                gpt4_zs_baseline_system_prompt_detailed_results,
                                gpt4_zs_role_system_prompt_results,
                                gpt4_zs_role_system_prompt_detailed_results,
                                gpt4_zs_role_best_system_prompt_results,
                                gpt4_zs_role_best_system_prompt_detailed_results,
                                gpt4_cot_human_prompt_results,
                                gpt4_cot_human_prompt_detailed_results,
                                gpt4_cot_ape_prompt_results,
                                gpt4_cot_ape_prompt_detailed_results,
                                gpt4_meta_system_prompt_results,
                                gpt4_meta_system_prompt_detailed_results,
                                gpt4_meta_meta_prompt_results,
                                gpt4_meta_meta_prompt_detailed_results,
                                gpt4_tot_prompt_1_results,
                                gpt4_tot_detailed_prompt_1_results,
                                gpt4_tot_prompt_2_results,
                                gpt4_tot_detailed_prompt_2_results,
                                gpt4_tot_prompt_3_results,
                                gpt4_tot_detailed_prompt_3_results]))

# gpt4 zero-shot baseline
print("Zero-Shot" + extract_fails([gpt4_zs_baseline_system_prompt_results,
                                   gpt4_zs_baseline_system_prompt_detailed_results]))

# gpt4 zero-shot role-based
print("Role-Based" + extract_fails([gpt4_zs_role_system_prompt_results,
                                    gpt4_zs_role_system_prompt_detailed_results,
                                    gpt4_zs_role_best_system_prompt_results,
                                    gpt4_zs_role_best_system_prompt_detailed_results]))

# gpt4 cot
print("Zero-Shot CoT" + extract_fails([gpt4_cot_human_prompt_results,
                             gpt4_cot_human_prompt_detailed_results,
                             gpt4_cot_ape_prompt_results,
                             gpt4_cot_ape_prompt_detailed_results]))

# gpt4 meta
print("Meta prompting" + extract_fails([gpt4_meta_system_prompt_results,
                             gpt4_meta_system_prompt_detailed_results]))

# gpt4 meta-meta
print("Meta meta prompting" + extract_fails([gpt4_meta_meta_prompt_results,
                             gpt4_meta_meta_prompt_detailed_results]))

# gpt4 tot
print("ToT-style prompts" + extract_fails([gpt4_tot_prompt_1_results,
                             gpt4_tot_detailed_prompt_1_results,
                             gpt4_tot_prompt_2_results,
                             gpt4_tot_detailed_prompt_2_results,
                             gpt4_tot_prompt_3_results,
                             gpt4_tot_detailed_prompt_3_results]))

# gpt3.5
print("GPT-3.5" + extract_fails([gpt3_5_zs_baseline_system_prompt_results,
                                gpt3_5_zs_baseline_system_prompt_detailed_results,
                                gpt3_5_zs_role_system_prompt_results,
                                gpt3_5_zs_role_system_prompt_detailed_results,
                                gpt3_5_zs_role_best_system_prompt_results,
                                gpt3_5_zs_role_best_system_prompt_detailed_results,
                                gpt3_5_cot_human_prompt_results,
                                gpt3_5_cot_human_prompt_detailed_results,
                                gpt3_5_cot_ape_prompt_results,
                                gpt3_5_cot_ape_prompt_detailed_results,
                                gpt3_5_meta_system_prompt_results,
                                gpt3_5_meta_system_prompt_detailed_results,
                                gpt3_5_meta_meta_prompt_results,
                                gpt3_5_meta_meta_prompt_detailed_results,
                                gpt3_5_tot_prompt_1_results,
                                gpt3_5_tot_detailed_prompt_1_results,
                                gpt3_5_tot_prompt_2_results,
                                gpt3_5_tot_detailed_prompt_2_results,
                                gpt3_5_tot_prompt_3_results,
                                gpt3_5_tot_detailed_prompt_3_results]))

# gpt3.5 zero-shot baseline
print("Zero-Shot" + extract_fails([gpt3_5_zs_baseline_system_prompt_results,
                                   gpt3_5_zs_baseline_system_prompt_detailed_results]))

# gpt3.5 zero-shot role-based
print("Role-Based" + extract_fails([gpt3_5_zs_role_system_prompt_results,
                                    gpt3_5_zs_role_system_prompt_detailed_results,
                                    gpt3_5_zs_role_best_system_prompt_results,
                                    gpt3_5_zs_role_best_system_prompt_detailed_results]))

# gpt3.5 cot
print("Zero-Shot CoT" + extract_fails([gpt3_5_cot_human_prompt_results,
                             gpt3_5_cot_human_prompt_detailed_results,
                             gpt3_5_cot_ape_prompt_results,
                             gpt3_5_cot_ape_prompt_detailed_results]))

# gpt3.5 meta
print("Meta prompting" + extract_fails([gpt3_5_meta_system_prompt_results,
                             gpt3_5_meta_system_prompt_detailed_results]))

# gpt3.5 meta-meta
print("Meta meta prompting" + extract_fails([gpt3_5_meta_meta_prompt_results,
                             gpt3_5_meta_meta_prompt_detailed_results]))

# gpt3.5 tot
print("ToT-style prompts" + extract_fails([gpt3_5_tot_prompt_1_results,
                             gpt3_5_tot_detailed_prompt_1_results,
                             gpt3_5_tot_prompt_2_results,
                             gpt3_5_tot_detailed_prompt_2_results,
                             gpt3_5_tot_prompt_3_results,
                             gpt3_5_tot_detailed_prompt_3_results]))

# Total
print("Total" + extract_fails([gpt4o_zs_baseline_system_prompt_results,
                                gpt4o_zs_baseline_system_prompt_detailed_results,
                                gpt4o_zs_role_system_prompt_results,
                                gpt4o_zs_role_system_prompt_detailed_results,
                                gpt4o_zs_role_best_system_prompt_results,
                                gpt4o_zs_role_best_system_prompt_detailed_results,
                                gpt4o_cot_human_prompt_results,
                                gpt4o_cot_human_prompt_detailed_results,
                                gpt4o_cot_ape_prompt_results,
                                gpt4o_cot_ape_prompt_detailed_results,
                                gpt4o_meta_system_prompt_results,
                                gpt4o_meta_system_prompt_detailed_results,
                                gpt4o_meta_meta_prompt_results,
                                gpt4o_meta_meta_prompt_detailed_results,
                                gpt4o_tot_prompt_1_results,
                                gpt4o_tot_detailed_prompt_1_results,
                                gpt4o_tot_prompt_2_results,
                                gpt4o_tot_detailed_prompt_2_results,
                                gpt4o_tot_prompt_3_results,
                                gpt4o_tot_detailed_prompt_3_results,
                                gpt4_zs_baseline_system_prompt_results,
                                gpt4_zs_baseline_system_prompt_detailed_results,
                                gpt4_zs_role_system_prompt_results,
                                gpt4_zs_role_system_prompt_detailed_results,
                                gpt4_zs_role_best_system_prompt_results,
                                gpt4_zs_role_best_system_prompt_detailed_results,
                                gpt4_cot_human_prompt_results,
                                gpt4_cot_human_prompt_detailed_results,
                                gpt4_cot_ape_prompt_results,
                                gpt4_cot_ape_prompt_detailed_results,
                                gpt4_meta_system_prompt_results,
                                gpt4_meta_system_prompt_detailed_results,
                                gpt4_meta_meta_prompt_results,
                                gpt4_meta_meta_prompt_detailed_results,
                                gpt4_tot_prompt_1_results,
                                gpt4_tot_detailed_prompt_1_results,
                                gpt4_tot_prompt_2_results,
                                gpt4_tot_detailed_prompt_2_results,
                                gpt4_tot_prompt_3_results,
                                gpt4_tot_detailed_prompt_3_results,
                                gpt3_5_zs_baseline_system_prompt_results,
                                gpt3_5_zs_baseline_system_prompt_detailed_results,
                                gpt3_5_zs_role_system_prompt_results,
                                gpt3_5_zs_role_system_prompt_detailed_results,
                                gpt3_5_zs_role_best_system_prompt_results,
                                gpt3_5_zs_role_best_system_prompt_detailed_results,
                                gpt3_5_cot_human_prompt_results,
                                gpt3_5_cot_human_prompt_detailed_results,
                                gpt3_5_cot_ape_prompt_results,
                                gpt3_5_cot_ape_prompt_detailed_results,
                                gpt3_5_meta_system_prompt_results,
                                gpt3_5_meta_system_prompt_detailed_results,
                                gpt3_5_meta_meta_prompt_results,
                                gpt3_5_meta_meta_prompt_detailed_results,
                                gpt3_5_tot_prompt_1_results,
                                gpt3_5_tot_detailed_prompt_1_results,
                                gpt3_5_tot_prompt_2_results,
                                gpt3_5_tot_detailed_prompt_2_results,
                                gpt3_5_tot_prompt_3_results,
                                gpt3_5_tot_detailed_prompt_3_results]))

# Total zero-shot
print("Zero-Shot" + extract_fails([gpt4o_zs_baseline_system_prompt_results,
                                gpt4o_zs_baseline_system_prompt_detailed_results,
                                gpt4_zs_baseline_system_prompt_results,
                                gpt4_zs_baseline_system_prompt_detailed_results,
                                gpt3_5_zs_baseline_system_prompt_results,
                                gpt3_5_zs_baseline_system_prompt_detailed_results]))

# Total role-based
print("Role-Based" + extract_fails([gpt4o_zs_role_system_prompt_results,
                                gpt4o_zs_role_system_prompt_detailed_results,
                                gpt4o_zs_role_best_system_prompt_results,
                                gpt4o_zs_role_best_system_prompt_detailed_results,
                                gpt4_zs_role_system_prompt_results,
                                gpt4_zs_role_system_prompt_detailed_results,
                                gpt4_zs_role_best_system_prompt_results,
                                gpt4_zs_role_best_system_prompt_detailed_results,
                                gpt3_5_zs_role_system_prompt_results,
                                gpt3_5_zs_role_system_prompt_detailed_results,
                                gpt3_5_zs_role_best_system_prompt_results,
                                gpt3_5_zs_role_best_system_prompt_detailed_results]))

# Total cot
print("Zero-Shot CoT" + extract_fails([gpt4o_cot_human_prompt_results,
                                gpt4o_cot_human_prompt_detailed_results,
                                gpt4o_cot_ape_prompt_results,
                                gpt4o_cot_ape_prompt_detailed_results,
                                gpt4_cot_human_prompt_results,
                                gpt4_cot_human_prompt_detailed_results,
                                gpt4_cot_ape_prompt_results,
                                gpt4_cot_ape_prompt_detailed_results,
                                gpt3_5_cot_human_prompt_results,
                                gpt3_5_cot_human_prompt_detailed_results,
                                gpt3_5_cot_ape_prompt_results,
                                gpt3_5_cot_ape_prompt_detailed_results]))

# Total meta
print("Meta prompting" + extract_fails([gpt4o_meta_system_prompt_results,
                                gpt4o_meta_system_prompt_detailed_results,
                                gpt4_meta_system_prompt_results,
                                gpt4_meta_system_prompt_detailed_results,
                                gpt3_5_meta_system_prompt_results,
                                gpt3_5_meta_system_prompt_detailed_results]))

# Total meta-meta
print("Meta meta prompting" + extract_fails([gpt4o_meta_meta_prompt_results,
                                gpt4o_meta_meta_prompt_detailed_results,
                                gpt4_meta_meta_prompt_results,
                                gpt4_meta_meta_prompt_detailed_results,
                                gpt3_5_meta_meta_prompt_results,
                                gpt3_5_meta_meta_prompt_detailed_results]))

# Total tot
print("ToT-style prompts" + extract_fails([gpt4o_tot_prompt_1_results,
                                gpt4o_tot_detailed_prompt_1_results,
                                gpt4o_tot_prompt_2_results,
                                gpt4o_tot_detailed_prompt_2_results,
                                gpt4o_tot_prompt_3_results,
                                gpt4o_tot_detailed_prompt_3_results,
                                gpt4_tot_prompt_1_results,
                                gpt4_tot_detailed_prompt_1_results,
                                gpt4_tot_prompt_2_results,
                                gpt4_tot_detailed_prompt_2_results,
                                gpt4_tot_prompt_3_results,
                                gpt4_tot_detailed_prompt_3_results,
                                gpt3_5_tot_prompt_1_results,
                                gpt3_5_tot_detailed_prompt_1_results,
                                gpt3_5_tot_prompt_2_results,
                                gpt3_5_tot_detailed_prompt_2_results,
                                gpt3_5_tot_prompt_3_results,
                                gpt3_5_tot_detailed_prompt_3_results]))

# Total detailed
print("Total detailed prompts" + extract_fails([gpt4o_zs_baseline_system_prompt_detailed_results,
                                gpt4o_zs_role_system_prompt_detailed_results,
                                gpt4o_zs_role_best_system_prompt_detailed_results,
                                gpt4o_cot_human_prompt_detailed_results,
                                gpt4o_cot_ape_prompt_detailed_results,
                                gpt4o_meta_system_prompt_detailed_results,
                                gpt4o_meta_meta_prompt_detailed_results,
                                gpt4o_tot_detailed_prompt_1_results,
                                gpt4o_tot_detailed_prompt_2_results,
                                gpt4o_tot_detailed_prompt_3_results,
                                gpt4_zs_baseline_system_prompt_detailed_results,
                                gpt4_zs_role_system_prompt_detailed_results,
                                gpt4_zs_role_best_system_prompt_detailed_results,
                                gpt4_cot_human_prompt_detailed_results,
                                gpt4_cot_ape_prompt_detailed_results,
                                gpt4_meta_system_prompt_detailed_results,
                                gpt4_meta_meta_prompt_detailed_results,
                                gpt4_tot_detailed_prompt_1_results,
                                gpt4_tot_detailed_prompt_2_results,
                                gpt4_tot_detailed_prompt_3_results,
                                gpt3_5_zs_baseline_system_prompt_detailed_results,
                                gpt3_5_zs_role_system_prompt_detailed_results,
                                gpt3_5_zs_role_best_system_prompt_detailed_results,
                                gpt3_5_cot_human_prompt_detailed_results,
                                gpt3_5_cot_ape_prompt_detailed_results,
                                gpt3_5_meta_system_prompt_detailed_results,
                                gpt3_5_meta_meta_prompt_detailed_results,
                                gpt3_5_tot_detailed_prompt_1_results,
                                gpt3_5_tot_detailed_prompt_2_results,
                                gpt3_5_tot_detailed_prompt_3_results]))

# Total not detailed
print("Total not detailed prompts" + extract_fails([gpt4o_zs_baseline_system_prompt_results,
                                gpt4o_zs_role_system_prompt_results,
                                gpt4o_zs_role_best_system_prompt_results,
                                gpt4o_cot_human_prompt_results,
                                gpt4o_cot_ape_prompt_results,
                                gpt4o_meta_system_prompt_results,
                                gpt4o_meta_meta_prompt_results,
                                gpt4o_tot_prompt_1_results,
                                gpt4o_tot_prompt_2_results,
                                gpt4o_tot_prompt_3_results,
                                gpt4_zs_baseline_system_prompt_results,
                                gpt4_zs_role_system_prompt_results,
                                gpt4_zs_role_best_system_prompt_results,
                                gpt4_cot_human_prompt_results,
                                gpt4_cot_ape_prompt_results,
                                gpt4_meta_system_prompt_results,
                                gpt4_meta_meta_prompt_results,
                                gpt4_tot_prompt_1_results,
                                gpt4_tot_prompt_2_results,
                                gpt4_tot_prompt_3_results,
                                gpt3_5_zs_baseline_system_prompt_results,
                                gpt3_5_zs_role_system_prompt_results,
                                gpt3_5_zs_role_best_system_prompt_results,
                                gpt3_5_cot_human_prompt_results,
                                gpt3_5_cot_ape_prompt_results,
                                gpt3_5_meta_system_prompt_results,
                                gpt3_5_meta_meta_prompt_results,
                                gpt3_5_tot_prompt_1_results,
                                gpt3_5_tot_prompt_2_results,
                                gpt3_5_tot_prompt_3_results]))

GPT-4o,2,12,9,42,4,3,12,3,0,7,280,15,389
Zero-Shot,0,0,0,0,0,0,1,0,0,0,31,0,32
Role-Based,0,0,0,1,1,2,0,0,0,1,58,0,63
Zero-Shot CoT,2,0,4,2,0,0,3,0,0,1,50,3,65
Meta prompting,0,0,0,1,0,0,0,0,0,2,25,0,28
Meta meta prompting,0,6,2,35,0,1,3,2,0,3,22,8,82
ToT-style prompts,0,6,3,3,3,0,5,1,0,0,94,4,119
GPT-4,10,61,22,53,39,7,2,66,4,9,418,2,693
Zero-Shot,0,0,0,5,0,2,0,5,1,0,48,0,61
Role-Based,0,0,1,2,0,1,0,8,2,0,104,0,118
Zero-Shot CoT,0,3,2,6,0,1,0,3,0,0,95,0,110
Meta prompting,0,0,0,2,0,1,0,1,0,0,53,0,57
Meta meta prompting,0,6,0,13,0,0,2,16,0,9,45,0,91
ToT-style prompts,10,52,19,25,39,2,0,33,1,0,73,2,256
GPT-3.5,4,120,22,59,51,53,11,86,9,5,526,11,957
Zero-Shot,0,3,1,1,2,10,1,12,0,0,67,0,97
Role-Based,0,0,3,0,1,20,7,23,1,0,140,0,195
Zero-Shot CoT,0,3,5,3,0,6,0,7,4,0,144,0,172
Meta prompting,0,1,0,2,0,6,2,2,0,1,82,0,96
Meta meta prompting,0,24,1,36,0,2,0,4,0,2,25,4,98
ToT-style prompts,4,89,12,17,48,9,1,38,4,2,68,7,299
Total,16,193,53,154,94,63,25,155,13,21,1224,28,2039
Zero-Shot,0,3,1,6,2,

In [173]:
agg_list = [
    gpt4o_zs_baseline_system_prompt_results,
    gpt4o_zs_baseline_system_prompt_detailed_results,
    gpt4o_zs_role_system_prompt_results,
    gpt4o_zs_role_system_prompt_detailed_results,
    gpt4o_zs_role_best_system_prompt_results,
    gpt4o_zs_role_best_system_prompt_detailed_results
]

results_gpt4o_zs = aggregate_all_test_results(agg_list)

scores_gpt4o_zs = gpt4o_zs_baseline_system_prompt_scores + gpt4o_zs_baseline_system_prompt_detailed_scores + gpt4o_zs_role_system_prompt_scores + gpt4o_zs_role_system_prompt_detailed_scores + gpt4o_zs_role_best_system_prompt_scores + gpt4o_zs_role_best_system_prompt_detailed_scores

agg_list = [
    gpt4o_cot_human_prompt_results,
    gpt4o_cot_human_prompt_detailed_results,
    gpt4o_cot_ape_prompt_results,
    gpt4o_cot_ape_prompt_detailed_results
]

results_gpt4o_cot = aggregate_all_test_results(agg_list)

scores_gpt4o_cot = gpt4o_cot_human_prompt_scores + gpt4o_cot_human_prompt_detailed_scores + gpt4o_cot_ape_prompt_scores + gpt4o_cot_ape_prompt_detailed_scores

agg_list = [
    gpt4o_tot_prompt_1_results,
    gpt4o_tot_prompt_2_results,
    gpt4o_tot_prompt_3_results,
    gpt4o_tot_detailed_prompt_1_results,
    gpt4o_tot_detailed_prompt_2_results,
    gpt4o_tot_detailed_prompt_3_results
]

results_gpt4o_tot = aggregate_all_test_results(agg_list)

scores_gpt4o_tot = gpt4o_tot_prompt_1_scores + gpt4o_tot_prompt_2_scores + gpt4o_tot_prompt_3_scores + gpt4o_tot_detailed_prompt_1_scores + gpt4o_tot_detailed_prompt_2_scores + gpt4o_tot_detailed_prompt_3_scores

agg_list = [
    gpt4o_meta_system_prompt_results,
    gpt4o_meta_system_prompt_detailed_results,
    gpt4o_meta_meta_prompt_results,
    gpt4o_meta_meta_prompt_detailed_results
]

results_gpt4o_meta = aggregate_all_test_results(agg_list)

scores_gpt4o_meta = gpt4o_meta_system_prompt_scores + gpt4o_meta_system_prompt_detailed_scores + gpt4o_meta_meta_prompt_scores + gpt4o_meta_meta_prompt_detailed_scores

agg_list = [
    results_gpt4o_zs,
    results_gpt4o_cot,
    results_gpt4o_tot,
    results_gpt4o_meta
]

results_gpt4o = aggregate_all_test_results(agg_list)

scores_gpt4o = scores_gpt4o_zs + scores_gpt4o_cot + scores_gpt4o_tot + scores_gpt4o_meta


In [174]:
agg_list = [
    gpt4_zs_baseline_system_prompt_results,
    gpt4_zs_baseline_system_prompt_detailed_results,
    gpt4_zs_role_system_prompt_results,
    gpt4_zs_role_system_prompt_detailed_results,
    gpt4_zs_role_best_system_prompt_results,
    gpt4_zs_role_best_system_prompt_detailed_results
]

results_gpt4_zs = aggregate_all_test_results(agg_list)

scores_gpt4_zs = gpt4_zs_baseline_system_prompt_scores + gpt4_zs_baseline_system_prompt_detailed_scores + gpt4_zs_role_system_prompt_scores + gpt4_zs_role_system_prompt_detailed_scores + gpt4_zs_role_best_system_prompt_scores + gpt4_zs_role_best_system_prompt_detailed_scores

agg_list = [
    gpt4_cot_human_prompt_results,
    gpt4_cot_human_prompt_detailed_results,
    gpt4_cot_ape_prompt_results,
    gpt4_cot_ape_prompt_detailed_results
]

results_gpt4_cot = aggregate_all_test_results(agg_list)

scores_gpt4_cot = gpt4_cot_human_prompt_scores + gpt4_cot_human_prompt_detailed_scores + gpt4_cot_ape_prompt_scores + gpt4_cot_ape_prompt_detailed_scores

agg_list = [
    gpt4_tot_prompt_1_results,
    gpt4_tot_prompt_2_results,
    gpt4_tot_prompt_3_results,
    gpt4_tot_detailed_prompt_1_results,
    gpt4_tot_detailed_prompt_2_results,
    gpt4_tot_detailed_prompt_3_results
]

results_gpt4_tot = aggregate_all_test_results(agg_list)

scores_gpt4_tot = gpt4_tot_prompt_1_scores + gpt4_tot_prompt_2_scores + gpt4_tot_prompt_3_scores + gpt4_tot_detailed_prompt_1_scores + gpt4_tot_detailed_prompt_2_scores + gpt4_tot_detailed_prompt_3_scores

agg_list = [
    gpt4_meta_system_prompt_results,
    gpt4_meta_system_prompt_detailed_results,
    gpt4_meta_meta_prompt_results,
    gpt4_meta_meta_prompt_detailed_results
]

results_gpt4_meta = aggregate_all_test_results(agg_list)

scores_gpt4_meta = gpt4_meta_system_prompt_scores + gpt4_meta_system_prompt_detailed_scores + gpt4_meta_meta_prompt_scores + gpt4_meta_meta_prompt_detailed_scores

agg_list = [
    results_gpt4_zs,
    results_gpt4_cot,
    results_gpt4_tot,
    results_gpt4_meta
]

results_gpt4 = aggregate_all_test_results(agg_list)

scores_gpt4 = scores_gpt4_zs + scores_gpt4_cot + scores_gpt4_tot + scores_gpt4_meta

In [175]:
agg_list = [
    gpt3_5_zs_baseline_system_prompt_results,
    gpt3_5_zs_baseline_system_prompt_detailed_results,
    gpt3_5_zs_role_system_prompt_results,
    gpt3_5_zs_role_system_prompt_detailed_results,
    gpt3_5_zs_role_best_system_prompt_results,
    gpt3_5_zs_role_best_system_prompt_detailed_results
]

results_gpt3_5_zs = aggregate_all_test_results(agg_list)

scores_gpt3_5_zs = gpt3_5_zs_baseline_system_prompt_scores + gpt3_5_zs_baseline_system_prompt_detailed_scores + gpt3_5_zs_role_system_prompt_scores + gpt3_5_zs_role_system_prompt_detailed_scores + gpt3_5_zs_role_best_system_prompt_scores + gpt3_5_zs_role_best_system_prompt_detailed_scores

agg_list = [
    gpt3_5_cot_human_prompt_results,
    gpt3_5_cot_human_prompt_detailed_results,
    gpt3_5_cot_ape_prompt_results,
    gpt3_5_cot_ape_prompt_detailed_results
]

results_gpt3_5_cot = aggregate_all_test_results(agg_list)

scores_gpt3_5_cot = gpt3_5_cot_human_prompt_scores + gpt3_5_cot_human_prompt_detailed_scores + gpt3_5_cot_ape_prompt_scores + gpt3_5_cot_ape_prompt_detailed_scores

agg_list = [
    gpt3_5_tot_prompt_1_results,
    gpt3_5_tot_prompt_2_results,
    gpt3_5_tot_prompt_3_results,
    gpt3_5_tot_detailed_prompt_1_results,
    gpt3_5_tot_detailed_prompt_2_results,
    gpt3_5_tot_detailed_prompt_3_results
]

results_gpt3_5_tot = aggregate_all_test_results(agg_list)

scores_gpt3_5_tot = gpt3_5_tot_prompt_1_scores + gpt3_5_tot_prompt_2_scores + gpt3_5_tot_prompt_3_scores + gpt3_5_tot_detailed_prompt_1_scores + gpt3_5_tot_detailed_prompt_2_scores + gpt3_5_tot_detailed_prompt_3_scores

agg_list = [
    gpt3_5_meta_system_prompt_results,
    gpt3_5_meta_system_prompt_detailed_results,
    gpt3_5_meta_meta_prompt_results,
    gpt3_5_meta_meta_prompt_detailed_results
]

results_gpt3_5_meta = aggregate_all_test_results(agg_list)

scores_gpt3_5_meta = gpt3_5_meta_system_prompt_scores + gpt3_5_meta_system_prompt_detailed_scores + gpt3_5_meta_meta_prompt_scores + gpt3_5_meta_meta_prompt_detailed_scores

agg_list = [
    results_gpt3_5_zs,
    results_gpt3_5_cot,
    results_gpt3_5_tot,
    results_gpt3_5_meta
]

results_gpt3_5 = aggregate_all_test_results(agg_list)

scores_gpt3_5 = scores_gpt3_5_zs + scores_gpt3_5_cot + scores_gpt3_5_tot + scores_gpt3_5_meta

## Print aggregated results

### GPT-4o

In [176]:
print(f"Aggregated Results: GPT-4o")
print(json.dumps(results_gpt4o, indent=4))
print(f"Polaris Scores: {scores_gpt4o}")

Aggregated Results: GPT-4o
{
    "total_responses": 1000,
    "secrets_used": 779,
    "base64_encoding_needed": 145,
    "yaml_not_wrapped": 2,
    "invalid_yaml": 9,
    "kubeconform_failed": 42,
    "no_mysql_or_wordpress": 12,
    "deployment_failed": 7,
    "duplicate_resources": 4,
    "healthy": 611,
    "unhealthy": 317,
    "PVC_failed": 12,
    "Missing_resources": 3,
    "Missing_key": 0,
    "MySQL_PVC_unbound": 4,
    "MySQL_not_ready": 3,
    "Wordpress_PVC_unbound": 0,
    "Wordpress_not_ready": 280,
    "Service_failed": 3,
    "Unknown_error": 12
}
Polaris Scores: [40, 40, 37, 37, 35, 36, 40, 40, 40, 40, 37, 35, 37, 40, 35, 35, 40, 37, 35, 35, 43, 35, 40, 40, 40, 42, 42, 42, 44, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 42, 44, 44, 42, 42, 39, 44, 42, 42, 42, 42, 42, 39, 44, 42, 44, 42, 42, 42, 42, 42, 44, 44, 39, 42, 42, 42, 42, 40, 35, 35, 40, 40, 40, 40, 41, 40, 35, 35, 35, 40, 38, 40, 40, 35, 40, 37, 40, 40, 36, 40, 38, 42, 44, 40, 42, 42, 39, 42, 42, 44, 44, 42,