# Code for automating the launching process using Slurm on a GPU cluster
**Author:** Josep María Barbéra Civera

**Date:** September 2022

Based on a similar implementation by Thomas Isensee.

In [1]:
import os
import numpy as np
from scipy.interpolate import griddata
from scipy import special
from scipy import signal
from scipy.optimize import curve_fit
from math import log10,log


import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from matplotlib.collections import PatchCollection
from matplotlib.patches import ConnectionPatch
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.lines import Line2D
import matplotlib.image as mpimg

from scipy import special
from scipy.special import exp1
import scipy.optimize as opt

import math

# import vtk
# from vtk.util.numpy_support import vtk_to_numpy
# print(mpl.__version__)

3.5.3


### Template for the *input file* used in our simulations.
You should use your own template here.

In [2]:
input_template = '''10		number of file outputs 
10000	number of needle tracking outputs
0		number of needles tracked (if <=0: auto x_max and y_max)
0		compress results files to *.tar.gz
1		verbosity: 0 = Nothing ; 1 = Files I/O ; 2 = Time iterations

630		number of grid points in x-direction (including 2 for BC)
510		number of grid points in y-direction (including 2 for BC)
1.00	delta_x

600.	t_end: final time / (s)
.5 		tau:   safety factor for time step adaptation

1000	itermax: maximal number of pressure iterations in one time step
.001	eps :    stopping tolerance for pressure iteration
1.1		omg :    relaxation parameter for SOR iteration
.9		gamma:   upwind differencing factor   

2.e-5	Velocity / (m/s)
1.e4	Gradient / (K/m
4.		Composition / (wt%)
1.5e-9	Diffusion / (m^2/s)
.14		Partition
1.6		Liquidus slope / (K/wt%)    (>0)
2.4e-7	Gibbs-Thomson coefficient / (Km)
.02		Anisotropy
-1		Sigma
5.7e-7	ν: Viscosity is / (m^2/s)
.01		betaC: solutal coefficient of volume expansion (0 if no buoyancy)
0.		betaT: thermal coefficient of volume expansion (0 if no buoyancy)
0.		GX:   volume forces, e.g. gravity / (m/s^2)       
9.81	GY  / (m/s^2)

0.		Δ_0: fields initial values
0.		u_0
0.		v_0 

## Creating files

### Function for creating the *run file*: it is the .sh file for launching simulations from Slurm
The function starts from a reference launch script and substitutes the job name, output file, partition, node, GPU, CUDA module and executable command for each simulation.

In [4]:

def create_run_file(path, server, gpu, executable_name, executables_path, jobname):
    """Write the Slurm launch script ``<path>/<jobname>.sh`` for one simulation.

    Parameters
    ----------
    path : str
        Directory in which the launch script is created.
    server : str
        Target node: 'hulk', 'thor', 'ironman', 'obelix' or 'asterix'.
        It selects the Slurm partition and the CUDA module, and is also
        written as the ``--nodelist`` value.
    gpu : int or str
        GPU index written as ``--gres=gpu:gpu<gpu>:1``. Not used for
        'obelix', which requests ``--gres=gpu:1`` instead.
    executable_name : str
        Name of the simulation executable.
    executables_path : str
        Directory containing the executable, as written in the script.
    jobname : str
        Job name; also used for the script file, the Slurm output file
        (``<jobname>.out``) and the first argument of the executable.

    Raises
    ------
    ValueError
        If ``server`` is not one of the known nodes. The check happens
        before the file is opened, so no partial script is left behind.
    """
    partitions = {
        'hulk': 'Hulk',
        'thor': 'GPU-RTX3090',
        'ironman': 'GPU-RTX3090',
        'obelix': 'Obelix',
        'asterix': 'Asterix',
    }
    cuda_modules = {
        'hulk': 'cuda10.1/toolkit/10.1.243',
        'thor': 'cuda11.2/toolkit/11.2.2',
        'ironman': 'cuda11.2/toolkit/11.2.2',
        'obelix': 'cuda10.1/toolkit/10.1.243',
        'asterix': 'cuda10.1/toolkit/10.1.243',
    }

    if server not in partitions:
        raise ValueError(
            'Specify server! Unknown server ' + repr(server)
            + '; expected one of ' + ', '.join(sorted(partitions))
        )

    # 'obelix' requests a generic GPU; every other node names the GPU explicitly.
    if server == 'obelix':
        gres_line = '#SBATCH --gres=gpu:1'
    else:
        gres_line = '#SBATCH --gres=gpu:gpu' + str(gpu) + ':1'

    script_lines = [
        '#!/bin/bash',
        '',
        '#',
        '#--------- Specify:: NAME for the GPU job -------------',
        '#SBATCH --job-name="' + jobname + '"',
        '#SBATCH --output="' + jobname + '.out"',
        '',
        '#--------------- Specify:: NODE and GPU to run the job on -------------',
        '#SBATCH --partition="' + partitions[server] + '"',
        '#SBATCH --nodelist="' + server + '"',
        gres_line,
        '',
        'module load gcc/8.2.0',
        'module load ' + cuda_modules[server],
        executables_path + '/' + executable_name + ' ' + jobname + ' 0',
    ]

    # The context manager closes the file even if writing fails.
    with open(path + '/' + jobname + '.sh', "w") as file_out:
        file_out.write('\n'.join(script_lines) + '\n')


### Function to modify the *input file* template from above and adapt it to the necessary changes for other simulations.
You will need to adapt it to your own variables and simulations parameters.

In [5]:
def create_input_file(path, Velocity, needle_number, dx, Nx, Ny, time, Difussion, partition, jobname, template=None):
    """Write the simulation input file ``<path>/<jobname>.in``.

    Every line of the template is stripped of surrounding whitespace and
    written out. A line containing one of the known description keys is
    replaced by ``<new value><tab(s)><description>``; the first matching key
    wins. All other lines are copied unchanged.

    Parameters
    ----------
    path : str
        Directory in which the input file is created.
    Velocity, dx, Nx, Ny, time, Difussion, partition
        Values substituted into the 'Velocity', 'delta_x', grid points in
        x / y, 't_end', 'Diffusion' and 'Partition' lines respectively.
    needle_number
        Written with a leading '-' into the 'Initial number of grains' line.
        NOTE(review): no such line appears in the visible part of the
        template; confirm that the full template contains it.
    jobname : str
        Base name of the generated file.
    template : str, optional
        Template text to use. Defaults to the module-level ``input_template``.

    Notes
    -----
    The 'Partition' key includes the template's default value ('.14' followed
    by two tabs), so the substitution only fires while the template keeps
    that exact default.
    """
    if template is None:
        template = input_template

    # (key searched in the line, full replacement line), in matching priority.
    substitutions = (
        ('number of grid points in x-direction (including 2 for BC)',
         str(Nx) + '\tnumber of grid points in x-direction (including 2 for BC)'),
        ('number of grid points in y-direction (including 2 for BC)',
         str(Ny) + '\tnumber of grid points in y-direction (including 2 for BC)'),
        ('delta_x',
         str(dx) + '\t\tdelta_x'),
        ('t_end: final time / (s)',
         str(time) + '\tt_end: final time / (s)'),
        ('Velocity / (m/s)',
         str(Velocity) + '\tVelocity / (m/s)'),
        ('Diffusion / (m^2/s)',
         str(Difussion) + '\tDiffusion / (m^2/s)'),
        ('.14\t\tPartition',
         str(partition) + '\tPartition'),
        ('Initial number of grains !! Warning: Maximum number 256 hard-coded!! (if<0, random distribution)',
         '-' + str(needle_number) + '\tInitial number of grains !! Warning: Maximum number 256 hard-coded!! (if<0, random distribution)'),
    )

    # The context manager closes the file even if writing fails.
    with open(path + '/' + jobname + '.in', "w") as file_out:
        for raw_line in template.splitlines():
            line = raw_line.strip()
            for key, replacement in substitutions:
                if key in line:
                    line = replacement
                    break
            file_out.write(line + '\n')

### Function for creating a global launcher for all the simulations.
The idea is to create one input file and one launch file (also called *sbatch file*) per simulation, so 100 simulations end up as 100*2=200 files. Instead of launching them one by one, we create a global launcher called "sbatch_all.sh" and run it with "sh sbatch_all.sh". This file has as many lines as there are simulations, and each line looks like "sbatch My_sim_name.sh", where "My_sim_name.sh" is the launch file of one simulation.

In [6]:
def create_global_run_file(simulations_path, path, out_list, project_name,
                           remote_home='josep.barbera@kratos:/mnt/beegfs/home/josep.barbera/'):
    """Write the global launcher and the scp helper script for a project.

    Two files are produced:

    * ``<path>/sbatch_all.sh`` with one entry of ``out_list`` per line.
    * ``<simulations_path>/scp_<project_name>.sh`` with two ``scp -rp``
      commands that copy ``<project_name>/*in`` and ``<project_name>/*sh``
      to ``<remote_home><project_name>``.

    Parameters
    ----------
    simulations_path : str
        Directory in which the scp helper script is created.
    path : str
        Directory in which ``sbatch_all.sh`` is created.
    out_list : iterable of str
        Lines of the global launcher (e.g. ``'sbatch <jobname>.sh'``).
    project_name : str
        Project folder name, used in the scp source and destination paths.
    remote_home : str, optional
        ``user@host:/directory/`` prefix of the scp destination. The default
        keeps the destination originally hard-coded in this notebook.
    """
    with open(path + '/sbatch_all.sh', "w") as file_out:
        file_out.writelines(out + '\n' for out in out_list)

    destination = remote_home + project_name
    # Previously this handle was never closed; the context manager guarantees
    # the commands are flushed to disk.
    with open(simulations_path + '/scp_' + project_name + '.sh', "w") as file_scp:
        file_scp.write('scp -rp ' + project_name + '/*in' + ' ' + destination + '\n')
        file_scp.write('scp -rp ' + project_name + '/*sh' + ' ' + destination)

## Project folder structure
Here, we will establish a folder structure to organize the new simulation group efficiently.

In [10]:
# Root directory that holds every simulation project (adapt it to your machine).
simulations_path = '/home/josepbarbera/Documents/Simulations/'

os.chdir(simulations_path)

project_name = 'Partition_coefficient_0.2'
project_path = simulations_path + '/' + project_name

# Create the project directory on first use; otherwise just report it.
if not os.path.isdir(project_path):
    os.mkdir(project_path)
else:
    print(project_name,' folder already exists')

os.chdir(project_path)

main_project_folders = ['input_files', 'output_files', 'launchers']

# Apply the same create-or-report check to each standard sub-folder.
for folder in main_project_folders:
    folder_path = project_path + '/' + folder
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)
        continue
    print(' ', folder, ' folder already exists')
os.chdir(project_path)

Partition_coefficient_0.2  folder already exists
  input_files  folder already exists
  output_files  folder already exists
  launchers  folder already exists


## Parameters
Here, you may consider defining certain fixed parameters or values for your simulations. (I'm omitting mine for privacy reasons.)

## Cluster

In [14]:
# Directory holding the simulation executables. It is written verbatim into
# every generated launch script, so replace it with your own Codes directory.
executables_path = '/mnt/beegfs/home/josep.barbera/Codes'

## Simulations matrix
Here is the most important part of the code.

Define your matrix of simulations. Things will change of course... but the main idea may help you.

In [15]:
# One row per group of simulations. Columns, in the order create_input_case reads them:
#   [0] order     sort key; rows are processed in ascending order of this value
#   [1] server    node name; must be a key of the `executables` dictionary
#   [2] vel       velocity value; it is multiplied by 1.e-5 before being written to the input file
#   [3] Nneedles  list of needle numbers; one simulation is created per entry
#   [4] gpus      list of GPU ids, paired by position with the entries of Nneedles
#   [5] dx   [6] Nx   [7] Ny   [8] time (t_end)
#   [9] end name  string appended to the job name (use 0 for no end name)
simulations = ([1,   'thor',     9, [9],   [0],   0.8,   630, 510, 200, "longer"],
               [2,   'thor',    10, [7],   [1],   0.7,   630, 510, 120, "longer"])

In [None]:
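## Example of a larger matrix that produces 32 simulations in total.
## NOTE: these rows use an older layout without the server and end-name columns, so adapt them before use.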
#simulations = ([1, 4, [14, 18, 21, 30], [4, 5, 6, 7], 1.24, 630, 638, 120],
#               [5, 5, [10, 15, 21, 30], [5, 6, 7, 4], 1.1, 630, 638, 120],
#               [2, 6, [ 9, 15, 21, 30], [6, 7, 4, 5], 1.0, 630, 510, 100],
#               [6, 7, [ 9, 14, 18, 21], [7, 4, 5, 6], 0.94, 630, 510, 100],
#               [3, 8, [ 7, 14, 18, 21], [4, 5, 6, 7], 0.9, 630, 398, 90],
#               [7, 9, [ 7, 10, 14, 18], [5, 6, 7, 4], 0.8, 630, 398, 90],
#               [4, 10,[ 7, 10, 14, 18], [6, 7, 4, 5], 0.7, 630, 398, 90],
#               [8, 9, [14, 18], [4, 5], 0.8, 1470, 398, 90],
#               [9, 10,[10, 14], [6, 7], 0.7, 1470, 398, 90])

### Other minor functions

In [16]:
# Sorting list of tuples according to a key
def first(n):
    """Return the element at index 0 of ``n`` (used as a sort key)."""
    leading = n[0]
    return leading

In [17]:
# function to sort the tuple     
def sort(list_of_tuples):
    """Return a new list with the items ordered by their element at index 0.

    The input is left untouched; items with equal leading elements keep
    their relative order.
    """
    ordered = sorted(list_of_tuples, key=lambda item: item[0])
    return ordered

In [18]:
# Dictionary for executables
# Maps each server name to the name of the executable launched on it.
executables = dict(
    thor='DNNFF2DDIR_IRON_THOR_ELIM',
    ironman='DNNFF2DDIR_IRON_THOR_ELIM',
    obelix='DNNFF2DDIR_OBELIX_ELIM',
    asterix='DNNFF2DDIR_ASTERIX_ELIM',
    hulk='DNNFF2DDIR_HULK_ELIM',
)

## Function for automating the process
This is also a key part of the code. It iterates through your simulations matrix and creates the input file and the launch file for each simulation. At the end, it creates the global launcher.


In [19]:
def create_input_case(project_name, simulations_path, project_path, 
                      executables_path, simulations, k, D):
    """Generate every input file, launch script and the global launcher.

    The rows of ``simulations`` are processed in ascending order of their
    first element. For each needle number in a row, one launch script and
    one input file are written into ``project_path`` and a matching
    ``sbatch <jobname>.sh`` line is collected. Finally the global launcher
    and the scp helper script are written.

    Parameters
    ----------
    project_name : str
        Project folder name, forwarded to ``create_global_run_file``.
    simulations_path : str
        Directory that receives the scp helper script.
    project_path : str
        Directory that receives the per-simulation files and ``sbatch_all.sh``.
    executables_path : str
        Directory of the executables, written into each launch script.
    simulations : sequence of rows
        Each row is ``[order, server, vel, needles, gpus, dx, Nx, Ny, time,
        end_name]``. ``needles`` and ``gpus`` are paired by position;
        ``end_name`` is appended to the job name only when it is a string.
    k
        Value written as the partition entry of the input file; also part
        of the job name.
    D
        Value written as the diffusion entry of the input file; its mantissa
        is part of the job name.

    Raises
    ------
    KeyError
        If a row names a server missing from the ``executables`` dictionary.
    IndexError
        If a row has fewer GPU ids than needle numbers.
    """
    out_list = []

    # Only the mantissa of D goes into the job name ('1.5e-09' -> '1.5').
    # partition() also copes with values whose text has no exponent, where
    # slicing at rfind("e") == -1 would silently drop the last character.
    diffusion_tag = str(D).partition('e')[0]

    for sim in sort(simulations):
        server = sim[1]
        v = sim[2]
        V = v * 1.e-5  # the matrix stores the velocity in units of 1e-5
        Needle_array = sim[3]
        GPUs = sim[4]
        dx = sim[5]
        Nx = sim[6]
        Ny = sim[7]
        t_end = sim[8]
        end_name = sim[9]

        for j, needle in enumerate(Needle_array):
            GPU = GPUs[j]

            # The expression must be parenthesised: without it, each
            # continuation line is a separate statement applying unary '+'
            # to a str, which raises TypeError and truncates the job name.
            jobname = ('v' + str(v) + '_N' + str(needle) + '_Nx' + str(Nx)
                       + '_dx' + str(dx) + '_D' + diffusion_tag + '_k' + str(k)
                       + '_AlCu')
            if isinstance(end_name, str):
                jobname += '_' + end_name

            create_run_file(project_path, server, GPU, executables[server], 
                            executables_path, jobname)
            create_input_file(project_path, V, needle, dx, Nx, Ny, t_end, D, 
                              k, jobname)
            out_list.append('sbatch '+ jobname + '.sh')

    create_global_run_file(simulations_path, project_path, out_list, project_name)

## Here execution begins
Run the cell below with care: it writes two files per simulation plus the global launcher into the project folder, so a large simulations matrix creates a large number of files.

In [20]:
# NOTE(review): `k` and `D` are not defined in any cell shown in this notebook.
# They are forwarded as the partition and diffusion values of the input file,
# so define them (e.g. in the "Parameters" section) before running this cell.
create_input_case(project_name, simulations_path, project_path, executables_path, simulations, k, D)