In [2]:
# import modules
from pathlib import Path
import torch
import logging
import shutil
from collections import deque, OrderedDict
from dataclasses import dataclass
from random import shuffle, sample
from typing import Dict, Any, Optional
import json
from functools import partial, update_wrapper
import numpy as np
import time
import pickle
import random

import ase
from ase.db import connect
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from fff.learning.gc.ase import SchnetCalculator
from fff.learning.gc.functions import GCSchNetForcefield
from fff.learning.gc.models import SchNet, load_pretrained_model
from fff.learning.util.messages import TorchMessage
from fff.sampling.md import MolecularDynamics
from fff.simulation import run_calculator, _run_calculator
from fff.simulation.utils import read_from_string, write_to_string


In [3]:
## util functions
## dataclass
@dataclass
class Trajectory:
    """Tracks the state of searching along individual trajectories

    We mark the starting point, the last point produced from sampling,
    and the last point we produced that has been validated
    """
    id: int  # ID number of the
    starting: ase.Atoms  # Starting point of the trajectory
    current_timestep = 0  # How many timesteps have been used so far
    last_validated: ase.Atoms = None  # Last validated point on the trajectory
    current: ase.Atoms = None  # Last point produced along the trajectory
    last_run_length: int = 0  # How long between current and last_validated
    name: str = None  # Name of the trajectory

    def __post_init__(self):
        self.last_validated = self.current = self.starting

    def update_current_structure(self, strc: ase.Atoms, run_length: int):
        """Update the structure that has yet to be updated

        Args:
            strc: Structure produced by sampling
            run_length: How many timesteps were performed in sampling run
        """
        self.current = strc.copy()
        self.last_run_length = run_length

    def set_validation(self, success: bool):
        """Set whether the trajectory was successfully validated

        Args:
            success: Whether the validation was successful
        """
        if success:
            self.last_validated = self.current  # Move the last validated forward
            self.current_timestep += self.last_run_length


@dataclass
class SimulationTask:
    atoms: ase.Atoms  # Structure to be run
    traj_id: int  # Which trajectory this came from
    ml_eng: float  # Energy predicted from machine learning model
    ml_std: Optional[float] = None  # Uncertainty of the model

@dataclass
class my_SimulationTask:
    simu_task: SimulationTask # basic information store in SimulationTask
    dft_energy: Optional[float] = None  # DFT energy of the structure
    dft_time: Optional[dict[int,float]] = None  # Dictionary to store DFT run times for different CPU cores
    

# Apply wrappers to functions that will be used to fix certain requirements
def _wrap(func, **kwargs):
    out = partial(func, **kwargs)
    update_wrapper(out, func)
    return out

In [4]:
# path and varaibles
## pash on wsl
multisite_path = "/home/yxx/work/project/colmena/multisite_"
training_set = multisite_path + \
    "/data/forcefields/starting-model/initial-database.db"
model_path = multisite_path + "/data/forcefields/starting-model/starting-model"
search_path = training_set
out_dir = Path(multisite_path) / f"evt/idea_simple_validate/temp"
fff_temp = Path(multisite_path) / f"evt/idea_simple_validate/fff_temp"
out_dir.mkdir(parents=True, exist_ok=True)

starting_model = torch.load(model_path, map_location='cpu')

num_epochs = 16
huber_deltas = (1, 10)
sampler_kwargs = {'device': "cpu", 'timestep': 0.1, 'log_interval': 10}
sampler = MolecularDynamics()
n_models = 1
n_qc_workers = 8
min_run_length = 200
max_run_length = 2000
energy_tolerance = 0.1
infer_chunk_size = 100

# search space queue
to_audit: dict[int, Trajectory] = {}  # Trajectories that need to be audited
audit_results: deque[float] = deque(maxlen=50)  # Results of the last 50 audits
task_queue_audit = []
inference_pool = []

# Prepare the initial model
StartModelMessage = TorchMessage(starting_model)
ActiveModelMessage = SchnetCalculator(starting_model)

# get model
schnet = GCSchNetForcefield(starting_model)

# copy training data
train_path = out_dir / "train.db"
shutil.copyfile(training_set, train_path)

# Load in the search space
with connect(search_path) as db:
    search_space = [Trajectory(i, x.toatoms(), name=x.get(
        'filename', f'traj-{i}')) for i, x in enumerate(db.select(''))]
    shuffle(search_space)
    search_space = deque(search_space)
    
# Load in the training dataset
with connect(train_path) as db:
    all_examples = np.array([x.toatoms() for x in db.select("")], dtype=object)


In [5]:
## task define
# train task
# Prepare the dataset
train_sets = []
valid_sets = []
n_train = int(len(all_examples) * 0.9)
for _ in range(n_models):
    shuffle(all_examples)
    train_sets.append(all_examples[:n_train])
    valid_sets.append(all_examples[n_train:])

# store model and log
model_msgs = []
train_logs = []

# train model function
my_train_schnet = _wrap(schnet.train, num_epochs=num_epochs, device='cuda',
                        patience=8, reset_weights=False,
                        huber_deltas=huber_deltas)

# train model
print("start train")
start_time = time.time()
for i in range(0, 1):
    for i, train_set in enumerate(valid_sets):
        model_msg, train_log = my_train_schnet(
            model_msg=StartModelMessage, train_data=train_set, valid_data=valid_sets[i])
        model_msgs.append(model_msg)
        train_logs.append(train_log)

    # store model
    # now we just test one model
    model_save_path = out_dir / "model.pth"
    with open(model_save_path, 'wb') as fp:
        torch.save(model_msgs[0].get_model(), fp)
    # Save the training data
    with open(out_dir / 'training-history.json', 'a') as fp:
        print(json.dumps(train_logs[0].to_dict(orient='list')), file=fp)
    # save model message
    with open(out_dir / 'model_msgs.pkl', 'wb') as fp:
        pickle.dump(model_msgs, fp)

    active_model_proxy = SchnetCalculator(model_msgs[0].get_model())
    StartModelMessage = TorchMessage(model_msgs[0].get_model()) 
    
print("end train at"+str(time.time()-start_time))
#-------------------------------------------------------------------------------
# sampling task
# use model sampling
my_run_dynamics = _wrap(sampler.run_sampling, **sampler_kwargs)

# filter small atjjjoms
filtered_search_space = deque()
for i in range(len(search_space)):
    tt = search_space.popleft()
    if (len(tt.starting)<50):
        filtered_search_space.append(tt)
shuffle(filtered_search_space)
search_space=filtered_search_space
# sampling tasks loop, prepare task data
for i in range(0, 100):
    # Pick the next eligible trajectory and start from the last validated structure
    trajectory = search_space.popleft()
    starting_point = trajectory.starting

    # Initialize the structure if need be
    if trajectory.current_timestep == 0:
        MaxwellBoltzmannDistribution(starting_point, temperature_K=100)
        print('Initialized temperature to 100K')
    # Add the structure to a list of those being validated
    to_audit[trajectory.id] = trajectory

    # Determine the run length based on observations of errors
    run_length = min_run_length
    if len(audit_results) > n_qc_workers:
        # Predict run length given audit error
        error_per_step = np.median(audit_results)
        target_error = energy_tolerance * 2
        estimated_run_length = int(target_error / error_per_step)
        print(
            f'Estimated run length of {estimated_run_length} steps to have an error of {target_error:.3f} eV/atom')
        # Keep to within the user-defined bounds
        run_length = max(min_run_length, min(
            max_run_length, estimated_run_length))

    # do sampling
    audit, traj = my_run_dynamics(
        atoms=starting_point, steps=run_length, calc=active_model_proxy)
    # print(audit)
    # print(len(traj))
    # add to list
    to_audit[trajectory.id].update_current_structure(audit, run_length)
    task_queue_audit.append(SimulationTask(
        atoms=traj[-1], traj_id=trajectory.id, ml_eng=traj[-1].get_potential_energy()))
    inference_pool.extend(traj)

print(len(task_queue_audit))
print(len(inference_pool))
## pickletask_queue_audit and inference_pool
with open(out_dir / 'task_queue_audit.pkl', 'wb') as fp:
    pickle.dump(task_queue_audit, fp)
with open(out_dir / 'inference_pool.pkl', 'wb') as fp:
    pickle.dump(inference_pool, fp)

#--------------------------------------------------------------------------------
# simulation task
# get simulation task data
# to_run = task_queue_audit[-1]

def simulation_task(to_run:SimulationTask,tempdir:str="./ffftemp",num_threads:int=8):
    """Run a simulation task

    Args:
        to_run: SimulationTask to be run
        tempdir: Temporary directory for running simulations
        num_threads: Number of threads to use for simulation
    """
    calc = dict(calc='psi4', method='pbe0-d3', basis='aug-cc-pvdz', num_threads=num_threads)
    ml_eng = to_run.ml_eng
    atoms = to_run.atoms
    atoms.set_center_of_mass([0, 0, 0])
    xyz = write_to_string(atoms, 'xyz')
    value = _run_calculator(xyz, calc, tempdir)
    atoms = read_from_string(value, 'json')
    dft_energy = atoms.get_potential_energy()
    diff_energy = abs(dft_energy - ml_eng) / len(atoms)
    return diff_energy


#--------------------------------------------------------------------------------
# inference task
inference_batch = inference_pool[0:100]
inference_proxies: list[None] = [None] * n_models
inference_proxies[0] = SchnetCalculator(model_msgs[0])
my_eval_schnet = _wrap(schnet.evaluate, device='cuda')
my_eval_schnet("/home/yxx/work/project/colmena/multisite_/evt/idea_simple_validate/temp/model.pth",inference_batch)


start train
end train at10.358051538467407
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 100K
Initialized temperature to 10

([-27016.697265625,
  -27016.79296875,
  -27016.935546875,
  -27017.08984375,
  -27017.220703125,
  -27017.3125,
  -27017.369140625,
  -27017.3828125,
  -27017.328125,
  -27017.197265625,
  -27017.044921875,
  -27016.912109375,
  -27016.853515625,
  -27016.884765625,
  -27017.001953125,
  -27017.158203125,
  -27017.283203125,
  -27017.392578125,
  -27017.44921875,
  -22858.14453125,
  -22858.484375,
  -22858.849609375,
  -22858.9921875,
  -22859.005859375,
  -22859.18359375,
  -22859.466796875,
  -22859.59375,
  -22859.37890625,
  -22859.064453125,
  -22858.875,
  -22858.927734375,
  -22859.091796875,
  -22859.1015625,
  -22859.03515625,
  -22859.171875,
  -22859.4921875,
  -22859.71875,
  -22859.671875,
  -31173.955078125,
  -31174.03515625,
  -31174.154296875,
  -31174.27734375,
  -31174.37890625,
  -31174.427734375,
  -31174.40625,
  -31174.31640625,
  -31174.177734375,
  -31174.0546875,
  -31173.96875,
  -31173.916015625,
  -31173.90234375,
  -31173.94921875,
  -31174.056640625,
  

In [17]:
## only run task
# train task
start_time = time.time()
_1,_2=my_train_schnet(model_msg=StartModelMessage, train_data=train_set, valid_data=valid_sets[0])
print("end train at"+str(time.time()-start_time))

# sampling task
start_time = time.time()
# 0~100 random array
random_array = np.random.randint(0, 100, 100)
starting_point = search_space[random_array[0]].starting
 
audit, traj = my_run_dynamics(
    atoms=starting_point, steps=run_length, calc=active_model_proxy)
print("end sampling at"+str(time.time()-start_time))

# simulation task
start_time = time.time()
to_run = task_queue_audit[-1]
diff_energy = simulation_task(to_run)
print("end simulation at"+str(time.time()-start_time))

# inference task
start_time = time.time()
inference_batch = inference_pool[0:100]
inference_proxies: list[None] = [None] * n_models
inference_proxies[0] = SchnetCalculator(model_msgs[0])
my_eval_schnet = _wrap(schnet.evaluate, device='cuda')
my_eval_schnet("/home/yxx/work/project/colmena/multisite_/evt/idea_simple_validate/temp/model.pth",inference_batch)
print("end inference at"+str(time.time()-start_time))

## prepare task input data queue
## append model
# input: structure
# output: model

## append sampling data
# input: model search space 
# output: task_queue_audit

## append simulation data
# input: task_queue_audit
# output: diff_energy

## append inference data
# input: inference_pool
# output: selected structure for trian

end train at6.07136869430542
end sampling at2.787306547164917
  Threads set to 8 by Python driver.
  Threads set to 8 by Python driver.


ValueError: Calculation failed: 
Fatal Error: Matrix::power: C_DSYEV failed
Error occurred in file: /scratch/psilocaluser/conda-builds/psi4-multiout_1657298395608/work/psi4/src/psi4/libmints/matrix.cc on line: 2330
The most recent 5 function calls were:

psi::FittingMetric::form_eig_inverse(double)
psi::DiskDFJK::preiterations()


In [13]:
## total resources
total_cpu = 64
total_gpu = 4

## task resources requirements(not accurate, experience)
train_resources = {'cpu': 1, 'gpu': 1}
sampling_resources = {'cpu': 1, 'gpu': 1}
simulation_resources = {'cpu': 8, 'gpu': 0} # simulation cpu resources is not fixed, it can ranged from 1 to 16
inference_resources = {'cpu': 1, 'gpu': 1}

## total runs
train_totals = 12
sampling_totals = 60
simulation_totals = 60
inference_totals = 12

## a batch runs
train_nums = 4
sampling_nums = 20
simulation_nums = 20
inference_nums = 4

## running sequence
### suppose already get sequence and nums from workflow
tasks = []
# tasks.append({'name': 'train', 'resources': train_resources, 'nums': train_nums})
# tasks.append({'name': 'sampling', 'resources': sampling_resources, 'nums': sampling_nums})
# tasks.append({'name': 'simulation', 'resources': simulation_resources, 'nums': simulation_nums})
# tasks.append({'name': 'inference', 'resources': inference_resources, 'nums': inference_nums})
evo_simulation_task = {'name': 'simulation', 'resources': simulation_resources}
evo_train_task = {'name': 'train', 'resources': train_resources}
evo_sampling_task = {'name': 'sampling', 'resources': sampling_resources}
evo_inference_task = {'name': 'inference', 'resources': inference_resources}

## time consume
trainning_time = 50
sampling_time = 10
inference_time = 10
# simulation time
## nonlinear performance for cpu cores
## nonlinear preformance for atoms length
with open(out_dir / 'length_time', 'rb') as fp:
    length_time = pickle.load(fp)
with open(out_dir / 'cpu_time', 'rb') as fp:
    cpu_time = pickle.load(fp)
print(length_time)
print(cpu_time)

[(12, 5.850254774093628), (15, 7.936183929443359), (18, 14.705122470855713), (21, 16.75410395860672), (24, 21.41688847541809), (27, 24.086347818374634), (30, 31.15443527698517), (33, 40.727830992804634), (36, 48.143983125686646), (39, 59.534986893335976), (42, 72.19054274559021), (45, 83.08321416378021), (48, 97.04547603925069), (51, 107.61373349598476), (54, 133.2091362476349), (57, 133.03949835896492), (60, 174.3393987417221), (63, 203.84687733650208), (66, 245.14138960838318), (69, 267.0610399246216), (72, 330.6697278022766), (75, 359.20263490080833)]
{1: 459.49200105667114, 2: 308.5443150997162, 4: 178.95574569702148, 8: 114.76378226280212, 12: 88.14041328430176, 16: 76.19267272949219, 20: 68.74859023094177, 24: 64.68302941322327, 28: 63.44424271583557, 32: 66.51880431175232, 36: 69.2843005657196, 40: 70.93301844596863, 44: 69.64240193367004, 48: 70.88759207725525, 52: 69.63654398918152, 56: 71.50614309310913, 60: 71.06448984146118, 64: 72.96044158935547}


In [15]:
GLOBAL_TRAIN_ID = 0
GLOBAL_SAMPLING_ID = 0
GLOBAL_SIMULATION_ID = 0
GLOBAL_INFERENCE_ID = 0

def adjust_task_remaining(task_allocation={}, task={}):
    ## adjust task to be processed
    ## if task nums is less, provision more resources
    ## if task nums is more, provision less resources
    ## tasks_nums = min(remain_task, epoch_task * 2)
    ## resources per task = max(1,resources * 2 / task_nums)
    ## task nums = resources / resources per task
    
    task_nums = {"train": 4, "sampling": 20, "simulation": 20, "inference": 4}
    
    return task_nums

def dummy_allocate(tasks_nums,total_resources:dict={'cpu': 64, 'gpu': 4}):
    total_cpus = total_resources['cpu']
    remaining_cpus = total_cpus - sum(tasks_nums.values())
    
    if remaining_cpus < 0:
        raise ValueError("Not enough CPUs for all tasks")
    global GLOBAL_TRAIN_ID
    global GLOBAL_SAMPLING_ID
    global GLOBAL_SIMULATION_ID
    global GLOBAL_INFERENCE_ID
    
    task_queue = []
    task_queue.extend([{'name': 'train', "task_id": i+ GLOBAL_TRAIN_ID} for i in range(tasks_nums['train'])])
    # GLOBAL_TRAIN_ID += tasks_nums['train']
    task_queue.extend([{'name': 'sampling', "task_id": i+ GLOBAL_SAMPLING_ID} for i in range(tasks_nums['sampling'])])
    # GLOBAL_SAMPLING_ID += tasks_nums['sampling']
    task_queue.extend([{'name': 'simulation', "task_id": i+ GLOBAL_SIMULATION_ID} for i in range(tasks_nums['simulation'])])
    # GLOBAL_SIMULATION_ID += tasks_nums['simulation']
    task_queue.extend([{'name': 'inference', "task_id": i+ GLOBAL_INFERENCE_ID} for i in range(tasks_nums['inference'])])
    # GLOBAL_INFERENCE_ID += tasks_nums['inference']

    task_allocation = [{"name": task["name"], "task_id": task["task_id"] ,"resources": {"cpu": 1}} for task in task_queue]
    
    total_time = 0
    for task in task_queue:
        if task['name'] == "train":
            total_time += trainning_time
        if task['name'] == "sampling":
            total_time += sampling_time
        if task['name'] == "simulation":
            total_time += estimate_simulation_time(task_queue_audit[task['task_id']].atoms.get_positions().shape[0], 1, length_time, cpu_time)
        if task['name'] == "inference":
            total_time += inference_time
    
    weights = {}
    for task in task_queue:
        if task['name'] == "train":
            weight = trainning_time / total_time
        elif task['name'] == "sampling":
            weight = sampling_time / total_time
        elif task['name'] == "simulation":
            weight = estimate_simulation_time(task_queue_audit[task['task_id']].atoms.get_positions().shape[0], 1, length_time, cpu_time) / total_time
        elif task['name'] == "inference":
            weight = inference_time / total_time
        
        weights[(task["name"], task["task_id"])] = weight
        extra_cpus = int(weight * remaining_cpus)
        for task_alloc in task_allocation:
            if task_alloc["name"] == task["name"] and task_alloc["task_id"] == task["task_id"]:
                task_alloc["resources"]["cpu"] += extra_cpus
                remaining_cpus -= extra_cpus
                break

    # if there are still remaining CPUs due to rounding, assign them to tasks
    # based on their weights
    for task_key, _ in sorted(weights.items(), key=lambda item: item[1], reverse=True):
        if remaining_cpus <= 0:
            break
        for task_alloc in task_allocation:
            if task_alloc["name"] == task_key[0] and task_alloc["task_id"] == task_key[1]:
                task_alloc["resources"]["cpu"] += 1
                remaining_cpus -= 1
                break

    return task_allocation
## allocate base on NJC fairness
## def njc_fairness(task_allocation, task):
##     pass


def get_piority(task):
    pass # get piority from workflow
    if task['name'] == 'train':
        return 1
    elif task['name'] == 'sampling':
        return 2
    elif task['name'] == 'simulation':
        return 3
    elif task['name'] == 'inference':
        return 4

def estimate_simulation_time(molecule_length, cpu_cores, length_times, core_times):
    closest_length = min(length_times, key=lambda x:abs(x[0]-molecule_length))
    length_time = closest_length[1]

    closest_cores = min(core_times.keys(), key=lambda x:abs(x-cpu_cores))
    core_time = core_times[closest_cores]

    return length_time*core_time/100

def calculate_resource_usage(task_allocation):
    total_cpu_usage, total_gpu_usage = 0, 0
    for task, resources in task_allocation.items():
        total_cpu_usage += resources['cpu']
        total_gpu_usage += resources['gpu']

    return total_cpu_usage, total_gpu_usage

def generate_population(population_size, total_cpu, total_gpu):
    population = []
    ## initial nums for each task can be set manually
    task_nums = adjust_task_remaining() # get nums from workflow
    
    for _ in range(population_size):
        ## allocate by time weight
        task_allocation = dummy_allocate(task_nums, {'cpu': total_cpu, 'gpu': total_gpu})
        
        
        ## random allocate tasks
        # individual = []
        # remaining_cpu, remaining_gpu = total_cpu, total_gpu

        # while remaining_cpu > 0 and remaining_gpu > 0:
        #     # for now, we just set random cpu resources for simulation task
        #     task = random.choice(tasks)
        #     if task['name'] == 'simulation':
        #         cpu_usage = random.randint(1, min(remaining_cpu, task['resources']['cpu']))
        #         individual.append({'name': task['name'], 'id': len(individual)+1, 'resources': {'cpu': cpu_usage, 'gpu': gpu_usage}})
        #         remaining_cpu -= cpu_usage
        #     # cpu_usage = random.randint(1, min(remaining_cpu, task['resources']['cpu']))
        #     # gpu_usage = random.randint(0, min(remaining_gpu, task['resources']['gpu']))
        #     else:
        #         individual.append({'name': task['name'], 'id': len(individual)+1, 'resources': tasks[task['name']].resources})           
        #         remaining_cpu -= tasks[task['name']].resources.cpu
        #         remaining_gpu -= tasks[task['name']].resources.gpu

        population.append(task_allocation)

    return population

def evaluate_score(total_time, max_time):
    pass

def fitness(tasks,info):
    ## get total time as throughput
    ## get max task time as smallest generation time
    total_time = 0
    max_time = 0
    for task in tasks:
        if task['name'] == "train":
            total_time += trainning_time
            # max_time = max(max_time, trainning_time)
        if task['name'] == "sampling":
            total_time += sampling_time
            # max_time = max(max_time, sampling_time)
        if task['name'] == "simulation":
            simulation_time = estimate_simulation_time(task_queue_audit[task['task_id']].atoms.get_positions().shape[0], 1, length_time, cpu_time)
            total_time += simulation_time
            max_time = max(max_time, simulation_time)
        if task['name'] == "inference":
            total_time += inference_time
            max_time = max(max_time, inference_time)
    
    ## weights of this two metric
    # How to measure the throughput that this batch of reduced time can increase?


    return score

def crossover(individual1, individual2):
    # randomly select a crossover point
    crossover_point = random.randint(0, min(len(individual1), len(individual2)))

    # create new individuals by exchanging tasks
    new_individual1 = individual1[:crossover_point] + individual2[crossover_point:]
    new_individual2 = individual2[:crossover_point] + individual1[crossover_point:]

    # make sure new individuals do not exceed resource limits
    new_individual1 = adjust_individual(new_individual1)
    new_individual2 = adjust_individual(new_individual2)

    return new_individual1, new_individual2

def adjust_individual(individual):
    total_cpu_usage, total_gpu_usage = calculate_resource_usage(individual)

    while total_cpu_usage > total_cpu or total_gpu_usage > total_gpu:
        # randomly select a task to decrease resource or remove
        task_index = random.randint(0, len(individual)-1)
        if individual[task_index]['resources']['cpu'] > 1:
            individual[task_index]['resources']['cpu'] -= 1
            total_cpu_usage -= 1
        elif individual[task_index]['resources']['gpu'] > 0:
            individual[task_index]['resources']['gpu'] -= 1
            total_gpu_usage -= 1
        else:
            del individual[task_index]

    return individual

def mutate(task_allocation):
    total_cpu_usage, total_gpu_usage = calculate_resource_usage(task_allocation)

    while total_cpu_usage > total_cpu or total_gpu_usage > total_gpu:
        # randomly select a task to decrease resource
        task = random.choice(list(task_allocation.keys()))
        if task_allocation[task]['cpu'] > 1:
            task_allocation[task]['cpu'] -= 1
            total_cpu_usage -= 1
        if task_allocation[task]['gpu'] > 0:
            task_allocation[task]['gpu'] -= 1
            total_gpu_usage -= 1

    return task_allocation

def run_ga(pop_size, num_generations):
    population = generate_population(pop_size)

    for gen in range(num_generations):
        scores = [fitness(ind) for ind in population]

        population = [population[i] for i in np.argsort(scores)]

        next_population = population[:pop_size//10]

        for i in range(pop_size//10, pop_size):
            if i < pop_size//2:
                ind1, ind2 = random.sample(population[:pop_size//2], 2)
                crossover(ind1, ind2)
                next_population.append(ind1)
            else:
                ind = random.choice(population)
                mutate(ind)
                next_population.append(ind)
        
        population = next_population

    return max(population, key=fitness)

# best_individual = run_ga(100, 100)
# print(best_individual)

In [41]:
def estimate_simulation_time(molecule_length, cpu_cores, length_times, core_times):
    closest_length = min(length_times, key=lambda x:abs(x[0]-molecule_length))
    length_time = closest_length[1]

    closest_cores = min(core_times.keys(), key=lambda x:abs(x-cpu_cores))
    core_time = core_times[closest_cores]

    return length_time*core_time/70

rrtt = estimate_simulation_time(42, 1, length_time, cpu_time)
print(rrtt)

473.8710991934057


In [16]:

task_nums = {"train": 4, "sampling": 8, "simulation": 8, "inference": 4}
# dummy_allocate(tasks_nums=task_nums,total_resources={'cpu': 64, 'gpu': 4})
generate_population(10, 64, 4)

[[{'name': 'train', 'task_id': 0, 'resources': {'cpu': 1}},
  {'name': 'train', 'task_id': 1, 'resources': {'cpu': 1}},
  {'name': 'train', 'task_id': 2, 'resources': {'cpu': 1}},
  {'name': 'train', 'task_id': 3, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 0, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 1, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 2, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 3, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 4, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 5, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 6, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 7, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 8, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 9, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 10, 'resources': {'cpu': 1}},
  {'name': 'sampling', 'task_id': 11, 'resources': {'cpu': 1}},
  