In [1]:
import numpy as np
import os
import csv
import subprocess

In [2]:
def tail(filename, n=100, chunk_size=1024):
    """Read the last `n` lines of a file efficiently."""
    lines = []
    buffer = ''
    
    with open(filename, 'rb') as f:
        f.seek(0, 2)  # move to end of file
        file_size = f.tell()
        block_end = file_size

        while len(lines) <= n and block_end > 0:
            # Calculate how much to read (avoid negative seek)
            block_start = max(0, block_end - chunk_size)
            f.seek(block_start)
            chunk = f.read(block_end - block_start).decode('utf-8', errors='replace')
            buffer = chunk + buffer
            lines = buffer.splitlines()
            block_end -= chunk_size

    return lines[-n:]

## Important Paths to set

In [3]:
output_location = "/vera/u/jerbo/my_ptmp/L25n256_suite"
run_location = "/vera/u/jerbo/TNG-arepo/run/L25n256_suite"
template_location = run_location + "/template"

## Check status of runs

In [4]:
# check status of all runs:
completed_runs = []
failed_runs = []
cancelled_runs = []
still_running = []
pending_runs = []

with open(run_location+"/slurm_job_ids.txt", "r") as file:
    for row in file:
        count = int(row.split()[0][:-1])
        job_id = row.split()[-1]
        result = subprocess.run(["sacct", "-j", f"{job_id}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
        sbatch_output = result.stdout.strip()
        
        if "FAILED" in sbatch_output:
            failed_runs.append(count)
        elif "CANCELLED" in sbatch_output:
            cancelled_runs.append(count)
        elif "RUNNING" in sbatch_output:
            still_running.append(count)
        elif "PENDING" in sbatch_output:
            pending_runs.append(count)
        if not "FAILED" in sbatch_output and not "PENDING" in sbatch_output and not "CANCELLED" in sbatch_output and not "RUNNING" in sbatch_output:
            completed_runs.append(count)
            
        #print(count)
        #print(sbatch_output)

In [5]:
print(f"Completed runs: {len(completed_runs)}")
print(f"Failed runs: {len(failed_runs)}")
print(f"Cancelled runs: {len(cancelled_runs)}")
print(f"Still running: {len(still_running)}")
print(f"Pending runs: {len(pending_runs)}")

Completed runs: 78
Failed runs: 7
Cancelled runs: 0
Still running: 0
Pending runs: 0


## Anaylsis of failed runs

In [6]:
grid_csv_file_path = "/vera/u/jerbo/code/TNG-arepo-scripts/running_sims/L25n256/grid_lhs_constrained.csv"
grid_csv_stats_file_path = "/vera/u/jerbo/code/TNG-arepo-scripts/running_sims/L25n256/grid_lhs_constrained_basic_stats.csv"
grid_point_indices = failed_runs

full_grid_stats = []
with open(grid_csv_stats_file_path) as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader, None)
    for i in reader:
        full_grid_stats.append(i)

with open(grid_csv_file_path, newline='') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader, None)
    interestingrows=[row for idx, row in enumerate(reader) if idx in grid_point_indices]

for i in range(len(interestingrows)):
    for j in range(len(interestingrows[i])):
        interestingrows[i][j] = float(interestingrows[i][j])
    
interestingrows = np.array(interestingrows).T
print("-------------- Analysis of failed runs --------------")
for i, x in enumerate(interestingrows):
    print("Parameter:", header[i])
    print(f"Failed runs:       Mean = {x.mean():.5f}        std  = {x.std():.5f}")
    print(f"From full grid:    Mean = {float(full_grid_stats[i][2]):.5f}        std  = {float(full_grid_stats[i][3]):.5f}")
    
    diff_mean_percent = abs(x.mean()-float(full_grid_stats[i][2]))/x.mean() * 100
    diff_std_percent = abs(x.std()-float(full_grid_stats[i][3]))/x.std() * 100
    print(f"Difference:       dMean = {diff_mean_percent:.1f}%          dstd  = {diff_std_percent:.1f}%")
    print("")
    print("List of values:")
    print(x)
    print("-----------------------------")

-------------- Analysis of failed runs --------------
Parameter: Omega_m
Failed runs:       Mean = 0.33366        std  = 0.09960
From full grid:    Mean = 0.29551        std  = 0.10380
Difference:       dMean = 11.4%          dstd  = 4.2%

List of values:
[0.31483586 0.17851445 0.41271069 0.42858483]
-----------------------------
Parameter: Omega_b
Failed runs:       Mean = 0.02427        std  = 0.01451
From full grid:    Mean = 0.04946        std  = 0.03044
Difference:       dMean = 103.8%          dstd  = 109.7%

List of values:
[0.00197535 0.02408584 0.02870767 0.04231638]
-----------------------------
Parameter: Omega_Lambda
Failed runs:       Mean = 0.66634        std  = 0.09960
From full grid:    Mean = 0.70449        std  = 0.10380
Difference:       dMean = 5.7%          dstd  = 4.2%

List of values:
[0.68516414 0.82148555 0.58728931 0.57141517]
-----------------------------
Parameter: Hubble_parameter
Failed runs:       Mean = 0.74291        std  = 0.07015
From full grid:    Me

## Analysus of sucessful runs

In [7]:
finished_runs = []
restart_runs = []
missclassified_runs = []

for gp in completed_runs:
    print(f"------------ {gp} --------------")
    gp_run_location = run_location + f'/gridpoint{gp}'
    files_in_gp_run_location = os.listdir(gp_run_location)
    
    if 'running_done' in files_in_gp_run_location:
        tail_cpu_txt_file = tail(output_location + f'/gridpoint{gp}/cpu.txt', n=100)
        last_step_row = [row for row in tail_cpu_txt_file if 'Step' in row][-1].split(",")
        last_step_list = [part for part in last_step_row if 'Time' in part][0].split()
        time = float(last_step_list[-1])
        if np.isclose(time, 1):
            print("Finished run")
            finished_runs.append(gp)
        else:
            last_cumm_time = [row for row in tail_cpu_txt_file if 'total' in row][-1].split(",")
            time_needed_so_far = float(last_cumm_time[0].split()[3])/3600  # in hours
            expected_runtime_for_rest = time_needed_so_far * (1 - time)
            print(f"Restart needed. Currently at Time {time}")
            print(f"Expected further run time {expected_runtime_for_rest:.1f} hours")
            restart_runs.append(gp)
    
    if 'running' in files_in_gp_run_location:
        tail_cpu_txt_file = tail(output_location + f'/gridpoint{gp}/cpu.txt', n=100)
        last_step_row = [row for row in tail_cpu_txt_file if 'Step' in row][-1].split(",")
        last_step_list = [part for part in last_step_row if 'Time' in part][0].split()
        time = float(last_step_list[-1])
        if np.isclose(time, 1):
            print("Finished run")
            finished_runs.append(gp)
        else:
            print("Something went wrong in the classification with this one! Not sure what happened here...")
            missclassified_runs.append(gp)

------------ 0 --------------
Finished run
Finished run
------------ 1 --------------
Finished run
Finished run
------------ 2 --------------
Finished run
------------ 3 --------------
Finished run
Finished run
------------ 4 --------------
Finished run
Finished run
------------ 5 --------------
Finished run
Finished run
------------ 6 --------------
Finished run
------------ 7 --------------
Finished run
Finished run
------------ 8 --------------
Finished run
Finished run
------------ 9 --------------
Finished run
Finished run
------------ 10 --------------
Finished run
------------ 11 --------------
Finished run
------------ 12 --------------
Finished run
Finished run
------------ 13 --------------
Finished run
------------ 14 --------------
Finished run
Finished run
------------ 15 --------------
Finished run
------------ 16 --------------
Finished run
Finished run
------------ 17 --------------
Finished run
Finished run
------------ 18 --------------
Finished run
Finished run
-----

## Final Results

In [8]:
print("Sucessful runs:", len(set(completed_runs)))
print(" -- of that finished:", len(set(finished_runs)))
print(" -- of that to restart:", len(restart_runs))
print(" -- of that missclassified:", len(set(missclassified_runs)))
print(f"Failed runs: {len(set(failed_runs))}")
print(f"Cancelled runs: {len(set(cancelled_runs))}")
print(f"Still running: {len(set(still_running))}")
print(f"Pending runs: {len(set(pending_runs))}")

Sucessful runs: 47
 -- of that finished: 46
 -- of that to restart: 1
 -- of that missclassified: 1
Failed runs: 4
Cancelled runs: 0
Still running: 0
Pending runs: 0


In [11]:
print(set(finished_runs))
print("--- Not finished ---")
for i in range(50):
    if i not in finished_runs:
        print(i)
print("--- Not finished ---")
print(set(restart_runs))
print("Failed:", failed_runs)
print(missclassified_runs)
# Runs that I manually delted the restart files from :
# [2, 6, 10, 11, 13, 15, 19, 23, 24, 25, 27, 29, 30, 31]

# 40 is done indeed
# 45 has the peano grid error
# 48 has the peano grid error
# 34 also has peano grid error - but was the one I restarted because of the error! :(

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 43, 44, 46, 47, 49}
--- Not finished ---
34
40
45
48
--- Not finished ---
{48}
Failed: [34, 40, 45, 48, 34, 45, 48]
[48]


## Restart the runs that were not completed (but sucessful)

In [10]:
assert False, "Dont run this carelessly"
# edit scrip.slurm for the reruns
# also add the still running ones, as they will likely also need a restart once they are completed
restart_runs = restart_runs + still_running

for gp in restart_runs:
    run_gridpoint_path = run_location + f"/gridpoint{gp}"
    
    path_to_slurm_script = run_gridpoint_path + "/script.slurm"
    
    file_content = []
    with open(path_to_slurm_script, "r", encoding="utf-8") as file:
        for row in file:
            if "srun ./Arepo_L25n256 param_L25n256.txt" in row:
                row = f"srun ./Arepo_L25n256 param_L25n256.txt 1"
            file_content.append(row)

    with open(path_to_slurm_script, "w") as file:
        for row in file_content:
            file.write(row)

AssertionError: Dont run this carelessly

In [None]:
assert False, "Dont run this carelessly"
# actually submit the sbatch jobs for the reruns
for gp in restart_runs:
    run_gridpoint_path = run_location + f"/gridpoint{gp}"
    
    os.chdir(run_gridpoint_path)
    
    # submit the job script to slurm
    slurm_script = "script.slurm"
    result = subprocess.run(["sbatch", slurm_script], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    sbatch_output = result.stdout.strip()
    
    with open(run_location+"/slurm_job_ids.txt", "a") as myfile:
        myfile.write(f"{gp}: {sbatch_output}\n")
