In [1]:
import numpy
import pandas
import requests
import datetime

import random
import subprocess
import shlex
import json


from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus

In [2]:
# Capture the current `oc` session token. IPython's `!cmd` capture returns the
# command output as a list of lines, so the first line is kept as the token.
# The token is a credential: do not display it or save it in cell output.
PROMETHEUS_TOKEN=!oc whoami -t 
PROMETHEUS_TOKEN=PROMETHEUS_TOKEN[0]

In [3]:
# Resolve the thanos-query route host in openshift-monitoring (kubectl + jq) and
# turn the first output line into an https:// base URL.
# NOTE(review): the saved output below exposes the cluster hostname — clear it before sharing.
PROMETHEUS_HOST = !kubectl -n openshift-monitoring get route -l app.kubernetes.io/name=thanos-query -o json | jq --raw-output '.items[0].spec.host'
PROMETHEUS_HOST = "https://" + PROMETHEUS_HOST[0]
PROMETHEUS_HOST

'https://thanos-querier-openshift-monitoring.apps.rosa.eqxvi-f7vz7-2xx.vooh.p3.openshiftapps.com'

In [4]:
# Scratch check: current time as a timezone-aware UTC ISO-8601 string.
datetime.datetime.now(datetime.timezone.utc).isoformat()

'2025-05-21T06:27:42.320565+00:00'

In [5]:
# Scratch check: current time as a POSIX timestamp (float seconds).
datetime.datetime.now().timestamp()

1747808864.289058

In [6]:
# Prometheus client from krkn_lib, pointed at the route resolved above and
# authenticated with the bearer token; later cells query through this object.
krkn_prom = KrknPrometheus(
    prometheus_url=PROMETHEUS_HOST,
    prometheus_bearer_token=PROMETHEUS_TOKEN
)

In [7]:
# Instant query: robot-shop containers whose restart counter increased within the
# last 2 minutes (the saved output is an empty list, i.e. no such containers).
krkn_prom.process_query('increase(kube_pod_container_status_restarts_total{namespace="robot-shop"}[2m]) > 0')

[]

In [8]:
# Range query over an effectively zero-length window (start and end are both
# "now"); the saved output holds one sample of the summed restart counter.
# NOTE(review): `granularity=100` is presumably the query step — confirm its units in krkn_lib.
krkn_prom.process_prom_query_in_range(
    'sum(kube_pod_container_status_restarts_total{namespace="robot-shop"})',
    start_time=datetime.datetime.now(),
    end_time=datetime.datetime.now(),
    granularity=100
)

[{'metric': {}, 'values': [[1747808877, '0']]}]

In [236]:
# Sample configuration for the genetic-algorithm experiment.
#
# SCENARIOS is the search space: every entry names a krkn-hub scenario and maps
# each of its parameters to the list of candidate values an individual may take.
SCENARIOS = [
    {
        "name": "application-outages",
        "params": {
            "namespace": ["robot-shop"],
            "pod-selector": [
                "{service: cart}",
                "{service: catalogue}",
                "{service: dispatch}",
                "{service: mongodb}",
                "{service: mysql}",
                "{service: payment}",
                "{service: rabbitmq}",
                "{service: ratings}",
                "{service: redis}",
                "{service: shipping}",
                "{service: user}",
                "{service: web}",
            ],
            "block-traffic-type": ["[Ingress]", "[Egress]"],
        },
    },
    # Disabled scenario, kept for reference:
    # {
    #     "name": "pod-scenarios",
    #     "params": {
    #         "namespace": ["openshift-dns"],
    #         "name-pattern": ["dns-default.*", "node-resolver.*"],
    #     }
    # }
]

# Kubeconfig mounted into the krkn-hub container.
KUBECONFIG = "../tmp/kubeconfig.yaml"

# Placeholder for a configurable fitness PromQL expression (left empty here).
PROMQL_METRIC = ""

# Value substituted for DURATION in the krkn-hub command.
CHAOS_DURATION = 120

POPULATION_SIZE = 10

# Fixed seed for the `random` module so stochastic choices are repeatable.
random.seed(123)

In [134]:
def create_population(population_size=10, scenarios=None):
    """Build a random initial population of scenario individuals.

    Each individual is a dict ``{"name": ..., "params": {...}}`` produced by
    picking one scenario definition at random and then one candidate value for
    every parameter that definition lists.

    Args:
        population_size: Number of individuals to create. Zero or a negative
            number yields an empty population.
        scenarios: Scenario definitions to sample from (same structure as the
            module-level SCENARIOS). Defaults to SCENARIOS.

    Returns:
        A list with ``population_size`` individuals (duplicates are possible).

    Raises:
        ValueError: If a chosen scenario has a parameter with no candidate values.
    """
    if scenarios is None:
        scenarios = SCENARIOS

    population = []

    # range() is empty for sizes <= 0, so the loop always terminates (the former
    # `while len(...) != size` loop never ended for a negative size).
    for _ in range(population_size):
        random_scenario = random.choice(scenarios)
        member_params = {}
        for param_name, candidates in random_scenario['params'].items():
            if not candidates:
                raise ValueError(
                    f"Scenario {random_scenario['name']!r} has no candidate values "
                    f"for parameter {param_name!r}"
                )
            member_params[param_name] = random.choice(candidates)
        population.append({"name": random_scenario['name'], 'params': member_params})

    return population

In [142]:
# Draw a sample population of 10 individuals and show the first one.
# NOTE(review): the saved output shows a 'pod-scenarios' individual, which is
# commented out in the SCENARIOS config cell — the output looks stale; re-run
# the notebook top to bottom.
sample_population = create_population(10)
sample_population[0]

{'name': 'pod-scenarios',
 'params': {'namespace': 'openshift-dns', 'name-pattern': 'dns-default.*'}}

In [44]:
def run_shell(command):
    """Run a command, echoing its output live, and return the captured output.

    The command string is tokenized with ``shlex.split`` and executed without a
    shell. stderr is merged into stdout, so the returned text interleaves both.

    Args:
        command: Command line as a single string.

    Returns:
        Everything the process wrote to stdout/stderr, as one string. The exit
        status is not inspected.
    """
    argv = shlex.split(command)
    captured_lines = []

    # The context manager closes the pipe and waits for the process on exit,
    # even if echoing the output raises (e.g. on a kernel interrupt).
    with subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as process:
        for line in process.stdout:
            print(line, end="")
            captured_lines.append(line)

    return "".join(captured_lines)

In [45]:
# Smoke test: output is echoed while the command runs and also returned as a string.
run_shell("ls -l")

total 16
-rw-r--r--. 1 rashetty rashetty 13547 May 20 12:57 gen_algorithm.ipynb


'total 16\n-rw-r--r--. 1 rashetty rashetty 13547 May 20 12:57 gen_algorithm.ipynb\n'

In [250]:
# Command line used to launch a krkn-hub scenario container; the placeholders are
# filled in by run_krkn_scenario_with_hub.
PODMAN_TEMPLATE = 'podman run --env-host=true -e TELEMETRY_PROMETHEUS_BACKUP="False" -e WAIT_DURATION=0 -e DURATION={chaos_duration} {env_list} --net=host -v {kubeconfig}:/home/krkn/.kube/config:Z containers.krkn-chaos.dev/krkn-chaos/krkn-hub:{name}'

def run_krkn_scenario_with_hub(scenario, kubeconfig):
    """Run one scenario through the krkn-hub container and record the run.

    Args:
        scenario: Individual of the form ``{"name": ..., "params": {...}}``.
            Every parameter becomes a ``-e NAME=value`` flag, with the name
            upper-cased and dashes replaced by underscores.
        kubeconfig: Path of the kubeconfig file mounted into the container.

    Returns:
        dict with the executed command ("cmd"), the local wall-clock
        "start_time" and "end_time" (naive datetimes taken around the run) and
        the captured output ("log").
    """
    env_flags = []

    for param, param_value in scenario['params'].items():
        # Krkn Hub param are uppercase env name with underscore
        param_name = param.upper().replace('-', '_')
        # shlex.quote keeps the value as a single argument when run_shell
        # tokenizes the command, whatever quotes/backslashes/spaces it contains.
        env_flags.append(f"-e {param_name}={shlex.quote(str(param_value))}")

    krkn_hub_command = PODMAN_TEMPLATE.format(
        kubeconfig=shlex.quote(str(kubeconfig)),
        name=scenario['name'],
        env_list=" ".join(env_flags),
        chaos_duration=CHAOS_DURATION
    )

    print("Running command: ", krkn_hub_command)

    start_time = datetime.datetime.now()

    log_out = run_shell(krkn_hub_command)

    end_time = datetime.datetime.now()

    return {
        "cmd": krkn_hub_command,
        "start_time": start_time,
        "end_time": end_time,
        "log": log_out,
    }

In [144]:
# Show the individual that the next cell runs.
sample_population[0]

{'name': 'pod-scenarios',
 'params': {'namespace': 'openshift-dns', 'name-pattern': 'dns-default.*'}}

In [145]:
# Run a single individual end to end. The literal path is the same value as the
# KUBECONFIG constant in the config cell.
# NOTE(review): the saved output is a long raw krkn log — consider clearing it before sharing.
sample_run = run_krkn_scenario_with_hub(sample_population[0], "../tmp/kubeconfig.yaml")

Running command:  podman run --env-host=true -e WAIT_DURATION=0 -e DURATION=120  -e NAMESPACE="openshift-dns"  -e NAME_PATTERN="dns-default.*"  --net=host -v ../tmp/kubeconfig.yaml:/home/krkn/.kube/config:Z containers.krkn-chaos.dev/krkn-chaos/krkn-hub:pod-scenarios
[1m20-05-2025T14:19:14 Checking if OpenShift client is installed[0m
/usr/bin/oc
 _              _              
| | ___ __ __ _| | _____ _ __  
| |/ / '__/ _` | |/ / _ \ '_ \ 
|   <| | | (_| |   <  __/ | | |
|_|\_\_|  \__,_|_|\_\___|_| |_|
                               

2025-05-20 14:19:15,423 [INFO] Starting kraken
2025-05-20 14:19:15,429 [INFO] Initializing client to talk to the Kubernetes cluster
2025-05-20 14:19:15,429 [INFO] Generated a uuid for the run: 4f3ca26a-bc36-4121-bdf4-002021432839
2025-05-20 14:19:16,279 [INFO] Detected distribution openshift
2025-05-20 14:19:22,840 [INFO] Fetching cluster info
2025-05-20 14:19:25,327 [INFO] 4.18.13
2025-05-20 14:19:25,327 [INFO] Server URL: https://api.ujoep-icmtj-7vx.c2

In [147]:
# Inspect the run record: keys are cmd, start_time, end_time and log.
sample_run

{'cmd': 'podman run --env-host=true -e WAIT_DURATION=0 -e DURATION=120  -e NAMESPACE="openshift-dns"  -e NAME_PATTERN="dns-default.*"  --net=host -v ../tmp/kubeconfig.yaml:/home/krkn/.kube/config:Z containers.krkn-chaos.dev/krkn-chaos/krkn-hub:pod-scenarios',
 'start_time': datetime.datetime(2025, 5, 20, 19, 49, 14, 609442),
 'end_time': datetime.datetime(2025, 5, 20, 19, 53, 0, 933759),
 'log': '\x1b[1m20-05-2025T14:19:14 Checking if OpenShift client is installed\x1b[0m\n/usr/bin/oc\n _              _              \n| | ___ __ __ _| | _____ _ __  \n| |/ / \'__/ _` | |/ / _ \\ \'_ \\ \n|   <| | | (_| |   <  __/ | | |\n|_|\\_\\_|  \\__,_|_|\\_\\___|_| |_|\n                               \n\n2025-05-20 14:19:15,423 [INFO] Starting kraken\n2025-05-20 14:19:15,429 [INFO] Initializing client to talk to the Kubernetes cluster\n2025-05-20 14:19:15,429 [INFO] Generated a uuid for the run: 4f3ca26a-bc36-4121-bdf4-002021432839\n2025-05-20 14:19:16,279 [INFO] Detected distribution openshift\n2025

In [148]:
# Repeat of the earlier range query (start and end both "now"), this time after
# the sample chaos run; the saved output again holds a single sample.
krkn_prom.process_prom_query_in_range(
    'sum(kube_pod_container_status_restarts_total{namespace="robot-shop"})',
    start_time=datetime.datetime.now(),
    end_time=datetime.datetime.now(),
    granularity=100
)

[{'metric': {}, 'values': [[1747751654, '0']]}]

In [None]:
# time_a = (datetime.datetime.now() - datetime.timedelta(minutes=30))
# time_b = datetime.datetime.now()

# (time_b - time_a).total_seconds() // 60

30.0

In [None]:
FITNESS_SCORE = ""

def _robotshop_restart_count_at(moment):
    """Return the summed robot-shop container restart counter at one instant."""
    series = krkn_prom.process_prom_query_in_range(
        'sum(kube_pod_container_status_restarts_total{namespace="robot-shop"})',
        start_time=moment,
        end_time=moment,
        granularity=100
    )
    # Latest sample of the first (only) series; samples are [timestamp, "value"].
    # Going through float() also accepts values rendered like "3.0" — presumably
    # the counter is always integral, but the sample arrives as a string.
    return int(float(series[0]['values'][-1][1]))

def restart_count_fitness_in_robotshop(start, end):
    """Fitness score: number of robot-shop container restarts between two times.

    Args:
        start: Time at which the baseline restart count is sampled.
        end: Time at which the final restart count is sampled.

    Returns:
        int: restart count at `end` minus restart count at `start`.

    Raises:
        Exception: If either query fails or returns no usable sample. The
            original error is attached as ``__cause__``.
    """
    try:
        restart_count_at_beginning = _robotshop_restart_count_at(start)
        restart_count_at_end = _robotshop_restart_count_at(end)
        return restart_count_at_end - restart_count_at_beginning
    except Exception as e:
        print("Error: ", e)
        # Chain the cause so the traceback shows what actually failed
        # (query error, empty result, unparsable value).
        raise Exception(
            "Failed to compute the robot-shop restart-count fitness"
        ) from e

In [159]:
def scenario_unique_key(scenario):
    """Return a hashable identity for a scenario individual.

    The key is a tuple: the scenario name followed by the parameter values,
    ordered by parameter name. Parameter names themselves are not included.
    """
    param_map = scenario['params']
    ordered_values = (param_map[param_name] for param_name in sorted(param_map))
    return (scenario['name'], *ordered_values)

# Example key: scenario name followed by the parameter values in parameter-name order.
scenario_unique_key(sample_population[1])

('pod-scenarios', 'node-resolver.*', 'openshift-dns')

In [160]:
def calculate_fitness(scenario):
    """Execute a scenario and score it.

    Runs the scenario through krkn-hub using KUBECONFIG, then scores it with the
    robot-shop restart-count fitness over the run's start/end times.

    Returns:
        dict with 'id' (unique key of the scenario), 'scenario' (the individual
        itself), 'result' (the run record) and 'fitness_score'.
    """
    run_record = run_krkn_scenario_with_hub(scenario, KUBECONFIG)

    score = restart_count_fitness_in_robotshop(
        run_record['start_time'],
        run_record['end_time'],
    )

    return dict(
        id=scenario_unique_key(scenario),
        scenario=scenario,
        result=run_record,
        fitness_score=score,
    )

In [161]:
# Score one individual end to end (runs the scenario, then queries Prometheus).
fitness_result = calculate_fitness(sample_population[1])

Running command:  podman run --env-host=true -e WAIT_DURATION=0 -e DURATION=120  -e NAMESPACE="openshift-dns"  -e NAME_PATTERN="node-resolver.*"  --net=host -v ../tmp/kubeconfig.yaml:/home/krkn/.kube/config:Z containers.krkn-chaos.dev/krkn-chaos/krkn-hub:pod-scenarios
[1m20-05-2025T14:45:49 Checking if OpenShift client is installed[0m
/usr/bin/oc
 _              _              
| | ___ __ __ _| | _____ _ __  
| |/ / '__/ _` | |/ / _ \ '_ \ 
|   <| | | (_| |   <  __/ | | |
|_|\_\_|  \__,_|_|\_\___|_| |_|
                               

2025-05-20 14:45:49,727 [INFO] Starting kraken
2025-05-20 14:45:49,734 [INFO] Initializing client to talk to the Kubernetes cluster
2025-05-20 14:45:49,734 [INFO] Generated a uuid for the run: 10d66c47-f48a-4cf2-8227-3ea6a6ebf42b
2025-05-20 14:45:50,738 [INFO] Detected distribution openshift
2025-05-20 14:45:54,888 [INFO] Fetching cluster info
2025-05-20 14:45:57,389 [INFO] 4.18.13
2025-05-20 14:45:57,389 [INFO] Server URL: https://api.ujoep-icmtj-7vx.

In [164]:
# Inspect the fitness record: id, scenario, result (run record) and fitness_score.
fitness_result

{'id': ('pod-scenarios', 'node-resolver.*', 'openshift-dns'),
 'scenario': {'name': 'pod-scenarios',
  'params': {'namespace': 'openshift-dns', 'name-pattern': 'node-resolver.*'}},
 'result': {'cmd': 'podman run --env-host=true -e WAIT_DURATION=0 -e DURATION=120  -e NAMESPACE="openshift-dns"  -e NAME_PATTERN="node-resolver.*"  --net=host -v ../tmp/kubeconfig.yaml:/home/krkn/.kube/config:Z containers.krkn-chaos.dev/krkn-chaos/krkn-hub:pod-scenarios',
  'start_time': datetime.datetime(2025, 5, 20, 20, 15, 48, 845789),
  'end_time': datetime.datetime(2025, 5, 20, 20, 19, 30, 18431),
  'log': '\x1b[1m20-05-2025T14:45:49 Checking if OpenShift client is installed\x1b[0m\n/usr/bin/oc\n _              _              \n| | ___ __ __ _| | _____ _ __  \n| |/ / \'__/ _` | |/ / _ \\ \'_ \\ \n|   <| | | (_| |   <  __/ | | |\n|_|\\_\\_|  \\__,_|_|\\_\\___|_| |_|\n                               \n\n2025-05-20 14:45:49,727 [INFO] Starting kraken\n2025-05-20 14:45:49,734 [INFO] Initializing client to ta

In [168]:
def select_parents(fitness_scores):
    """
    Selects two parents using Roulette Wheel Selection (proportionate selection).
    Higher fitness means higher chance of being selected.

    Args:
        fitness_scores: Non-empty list of fitness records, each with a
            'scenario' and a numeric 'fitness_score'.

    Returns:
        Tuple ``(parent1, parent2)`` of scenarios taken from the records. The
        same scenario may be returned twice.

    Negative scores are treated as zero weight: dividing them by the total
    would either give the worst individuals the highest probability (all scores
    negative) or produce negative weights, which ``random.choices`` rejects.
    When no individual has a positive score, parents are drawn uniformly.
    """
    scenarios = [entry['scenario'] for entry in fitness_scores]

    weights = [max(entry['fitness_score'], 0) for entry in fitness_scores]
    total_fitness = sum(weights)

    if total_fitness <= 0:  # no positive score anywhere: fall back to uniform choice
        return random.choice(scenarios), random.choice(scenarios)

    # Normalize fitness scores to get probabilities
    probabilities = [weight / total_fitness for weight in weights]

    # Select both parents (independently, with replacement) based on probabilities
    parent1, parent2 = random.choices(scenarios, weights=probabilities, k=2)
    return parent1, parent2

In [170]:
# Reminder of what an individual looks like before defining crossover.
sample_population[0]

{'name': 'pod-scenarios',
 'params': {'namespace': 'openshift-dns', 'name-pattern': 'dns-default.*'}}

In [178]:
def crossover(scenarioA, scenarioB):
    """Produce two children by swapping shared parameter values between parents.

    Every parameter present in both parents, except 'namespace', is swapped
    with probability 0.8. With no shared parameters the children are plain
    copies of the parents (variation is then left to mutation).

    The parents are never modified: they are usually the scenarios stored in
    fitness records, so changing them in place would corrupt those records and
    let one dict appear several times in the next population.

    Args:
        scenarioA: First parent, ``{"name": ..., "params": {...}}``.
        scenarioB: Second parent, same structure.

    Returns:
        Tuple ``(childA, childB)`` of new scenario dicts.
    """
    childA = {**scenarioA, 'params': dict(scenarioA['params'])}
    childB = {**scenarioB, 'params': dict(scenarioB['params'])}

    common_params = (set(childA['params']) & set(childB['params'])) - {'namespace'}

    # Sorted so the random draws line up with the same parameters on every run;
    # plain set order depends on string hashing and would defeat random.seed().
    for param in sorted(common_params):
        if random.random() < 0.8:
            # swap param values
            childA['params'][param], childB['params'][param] = (
                childB['params'][param],
                childA['params'][param],
            )

    return childA, childB

In [237]:
# Crossover demo: print two individuals before and after crossing them.
sample1 = sample_population[8]
sample2 = sample_population[6]

print(sample1, sample2)

# Keep working with the returned pair.
sample1, sample2 = crossover(sample1, sample2)

print(sample1, sample2)

{'name': 'application-outages', 'params': {'namespace': 'robot-shop', 'pod-selector': '{service: shipping}', 'block-traffic-type': '[Ingress]'}} {'name': 'application-outages', 'params': {'namespace': 'openshift-dns', 'pod-selector': '{service: dispatch}', 'block-traffic-type': '[Egress]'}}
{'name': 'application-outages', 'params': {'namespace': 'robot-shop', 'pod-selector': '{service: dispatch}', 'block-traffic-type': '[Egress]'}} {'name': 'application-outages', 'params': {'namespace': 'openshift-dns', 'pod-selector': '{service: shipping}', 'block-traffic-type': '[Ingress]'}}


In [219]:
# Inspect the search space.
# NOTE(review): the saved output differs from the config cell (payment and redis
# are missing and 'block-traffic-type' is empty) — it appears to come from an
# earlier kernel state; restart the kernel and re-run before trusting it.
SCENARIOS

[{'name': 'application-outages',
  'params': {'namespace': ['robot-shop'],
   'pod-selector': ['{service: cart}',
    '{service: catalogue}',
    '{service: dispatch}',
    '{service: mongodb}',
    '{service: mysql}',
    '{service: rabbitmq}',
    '{service: ratings}',
    '{service: shipping}',
    '{service: user}',
    '{service: web}'],
   'block-traffic-type': []}}]

In [239]:
def look_up_value_for_scenario_param(name, value, param, scenarios=None):
    """Pick a replacement value for one parameter of a named scenario.

    Args:
        name: Scenario name to look up (matched against each entry's 'name').
        value: The parameter's current value; it is excluded from the draw.
        param: Name of the parameter whose candidate list is consulted.
        scenarios: Scenario definitions to search (same structure as the
            module-level SCENARIOS). Defaults to SCENARIOS.

    Returns:
        A randomly chosen candidate different from `value`. `value` itself is
        returned when the scenario or parameter is unknown, or when there is no
        alternative candidate.
    """
    if scenarios is None:
        scenarios = SCENARIOS

    for scenario in scenarios:
        if scenario['name'] != name:
            continue
        # Filtering builds a fresh list, so the scenario definition is never
        # altered, and it copes with a current value that is not among the
        # candidates (list.remove() would raise ValueError in that case).
        alternatives = [
            candidate
            for candidate in scenario['params'].get(param, [])
            if candidate != value
        ]
        if not alternatives:
            return value
        return random.choice(alternatives)
    return value

def mutate(scenario):
    """Return a mutated copy of a scenario individual.

    Every parameter except 'namespace' is, with probability 0.3, replaced by
    whatever ``look_up_value_for_scenario_param`` returns for it.

    The input is left untouched: individuals are shared with fitness records
    and with other population slots, so an in-place change would leak into them.

    Args:
        scenario: Individual of the form ``{"name": ..., "params": {...}}``.

    Returns:
        A new scenario dict with the (possibly) mutated parameters.
    """
    name = scenario['name']
    mutated = {**scenario, 'params': dict(scenario['params'])}

    # The former `isinstance(param, str)` guard tested the parameter *name*,
    # which is always a string for these scenario dicts, so it was dropped.
    for param, value in scenario['params'].items():
        if param != 'namespace' and random.random() < 0.3:
            mutated['params'][param] = look_up_value_for_scenario_param(name, value, param)

    return mutated

In [247]:
# Mutation demo: print an individual before and after mutating it.
sample1 = sample_population[8]

print(sample1)

# Keep working with the returned individual.
sample1 = mutate(sample1)

print(sample1)

{'name': 'application-outages', 'params': {'namespace': 'robot-shop', 'pod-selector': '{service: cart}', 'block-traffic-type': '[Ingress]'}}
{'name': 'application-outages', 'params': {'namespace': 'robot-shop', 'pod-selector': '{service: catalogue}', 'block-traffic-type': '[Egress]'}}


In [248]:
def genetic_algorithm(init_population_size=10, generations=10):
    """Search for high-fitness chaos scenarios with a simple genetic algorithm.

    Each generation evaluates the not-yet-seen individuals of the population
    (every evaluation runs a real chaos scenario), records the best one, then
    breeds the next population through selection, crossover and mutation.
    Individuals whose unique key was already evaluated are never run again.

    Args:
        init_population_size: Size of the initial population and upper bound for
            each following one.
        generations: Maximum number of generations to run.

    Returns:
        Tuple ``(best_fitness, population_processed_so_far)``: the best fitness
        record of each generation, and a dict mapping every evaluated scenario
        key to its fitness record. Both are empty when the arguments are not
        positive.
    """
    if init_population_size <= 0 or generations <= 0:
        print("Please specify population and generations greater than 0.")
        # Same shape as the normal return so callers can always unpack two values.
        return [], {}

    def _copy_scenario(scenario):
        # Independent copy so breeding can never alter a recorded individual.
        return {**scenario, 'params': dict(scenario['params'])}

    population = create_population(init_population_size)

    best_fitness = []
    population_processed_so_far = {}

    for generation in range(generations):
        print("-" * 20)
        print(f"        Generation {generation + 1} (Population Size: {len(population)})")

        if len(population) == 0:
            # No population :(, skip everything
            print("No population found, breaking out!")
            break

        # Evaluate fitness of the current population. A random initial
        # population can contain the same scenario twice; each run is costly, so
        # every distinct scenario is evaluated only once.
        fitness_scores = []
        for member in population:
            if scenario_unique_key(member) in population_processed_so_far:
                continue
            fitness_result = calculate_fitness(member)
            # Track Calculated Population
            population_processed_so_far[fitness_result['id']] = fitness_result
            fitness_scores.append(fitness_result)

        if not fitness_scores:
            print("No new individuals to evaluate, breaking out!")
            break

        # Find the best individual in the current generation
        # Note: If there is no best solution, it will still consider based on sorting order
        fitness_scores = sorted(fitness_scores, key=lambda x: x['fitness_score'], reverse=True)
        best_fitness.append(fitness_scores[0])

        # Repopulate off-springs
        population = []
        queued_keys = set()
        # Round the number of pairs up so an odd population size does not
        # shrink; the size check below caps the result at the requested size.
        for _ in range((init_population_size + 1) // 2):
            parent1, parent2 = select_parents(fitness_scores)
            child1, child2 = crossover(_copy_scenario(parent1), _copy_scenario(parent2))
            child1 = mutate(child1)
            child2 = mutate(child2)

            for child in (child1, child2):
                child_key = scenario_unique_key(child)
                # We don't want to add an already evaluated (or already queued)
                # scenario back to the population
                if child_key in population_processed_so_far or child_key in queued_keys:
                    continue
                if len(population) < init_population_size:
                    queued_keys.add(child_key)
                    population.append(child)

        print(f"  Best Fitness: {fitness_scores[0]['fitness_score']}")

    return best_fitness, population_processed_so_far

In [251]:
# Small end-to-end run: 3 individuals, up to 5 generations. Every individual
# launches a real chaos scenario, so this cell takes a long time.
# NOTE(review): the saved output ends in a KeyboardInterrupt, so best_fitness and
# all_population were not assigned by this run.
best_fitness, all_population = genetic_algorithm(init_population_size=3, generations=5)

--------------------
        Generation 1 (Population Size: 3)
Running command:  podman run --env-host=true -e TELEMETRY_PROMETHEUS_BACKUP="False" -e WAIT_DURATION=0 -e DURATION=120  -e NAMESPACE="robot-shop"  -e POD_SELECTOR="{service: payment}"  -e BLOCK_TRAFFIC_TYPE="[Egress]"  --net=host -v ../tmp/kubeconfig.yaml:/home/krkn/.kube/config:Z containers.krkn-chaos.dev/krkn-chaos/krkn-hub:application-outages
[1m20-05-2025T15:52:56 Checking if OpenShift client is installed[0m
/usr/bin/oc
 _              _              
| | ___ __ __ _| | _____ _ __  
| |/ / '__/ _` | |/ / _ \ '_ \ 
|   <| | | (_| |   <  __/ | | |
|_|\_\_|  \__,_|_|\_\___|_| |_|
                               

2025-05-20 15:52:57,442 [INFO] Starting kraken
2025-05-20 15:52:57,449 [INFO] Initializing client to talk to the Kubernetes cluster
2025-05-20 15:52:57,449 [INFO] Generated a uuid for the run: d67c2223-b517-4720-a7a0-6f67e6dbaabb
2025-05-20 15:52:58,330 [INFO] Detected distribution openshift
2025-05-20 15:53:01,9

KeyboardInterrupt: 