In [7]:
import re
import json
from pathlib import Path as pt

In [None]:
# Ask where the runtime directory tree lives. Only the keyword "same" is
# matched case-insensitively; any other answer is kept verbatim, because
# lower-casing a real path would point at a different (usually missing)
# directory on case-sensitive filesystems.
_runtime_path_raw = input("Enter the runtime path ('same', '<path>'): ").strip()
runtime_path_inp = "same" if _runtime_path_raw.lower() == "same" else _runtime_path_raw

# Ask which model configuration to process. Lower-casing is safe here: the
# answer is either the keyword "all" or a UUID, and the file-name patterns used
# further down only accept lowercase hex digits.
runtime_uuid_inp = input("Enter the model configuration ('<uuid>'/'all'): ").strip().lower()

In [9]:
########################
# Runtime variables
########################

# The keyword "same" selects the current working directory; any other answer
# is taken as the runtime root unchanged.
runtime_path = "." if runtime_path_inp == "same" else runtime_path_inp

# Requested model configuration, exactly as entered at the prompt.
runtime_uuid = runtime_uuid_inp

# Sub-directories of the runtime root.
config_path = runtime_path + "/configs"
tokenizers_path = runtime_path + "/tokenizers"
inputs_path = runtime_path + "/inputs"
outputs_path = runtime_path + "/outputs"
models_path = runtime_path + "/models"

In [10]:
########################
# Extract file components
########################
#
# For every selected report_<uuid>.json in the outputs directory:
#   * copy its 'config' section to configs/config_<uuid>.json, and
#   * rewrite the report in place, keeping only the training statistics and
#     the batch/epoch logs.

# Collect the UUIDs to process. `runtime_uuid` is only read in this cell
# (the loop below uses its own variable), so it still holds the user's choice
# for any cell that runs afterwards.
runtime_uuids = []
if runtime_uuid == "all":
    # Directory holding the report files
    directory = pt(outputs_path)

    # A report file is named report_<36 characters of lowercase hex/dashes>.json
    pattern = re.compile(r'report_([a-f0-9\-]{36})\.json')

    # sorted() gives a deterministic processing order; fullmatch() rejects
    # names with trailing text such as "report_<uuid>.json.json".
    for file in sorted(directory.glob('report_*.json')):
        match = pattern.fullmatch(file.name)
        if match:
            runtime_uuids.append(match.group(1))
else:
    # A specific UUID was requested: process just that one
    runtime_uuids.append(runtime_uuid)

# Split each selected report into a config file and a trimmed report
for report_uuid in runtime_uuids:
    # Load the combined report
    output_path_inp = f"{outputs_path}/report_{report_uuid}.json"
    with open(output_path_inp, "r", encoding="utf-8") as f:
        output = json.load(f)

    # The report is overwritten below without its 'config' and 'analysis'
    # sections, so a file lacking them cannot be split (again). Skip it
    # instead of failing with a KeyError, which keeps this cell re-runnable.
    if "config" not in output or "analysis" not in output:
        print(f"Skipping {output_path_inp}: no 'config'/'analysis' section found")
        continue

    # Build both payloads before writing anything, so a report with missing
    # keys fails here without leaving a half-finished result on disk.
    config = output['config']
    analysis = output['analysis']
    report = {
        "num_epochs": analysis['num_epochs'],
        "train_batches_per_epoch": analysis['train_batches_per_epoch'],
        "val_batches_per_epoch": analysis['val_batches_per_epoch'],
        "average_train_loss_per_epoch": analysis['average_train_loss_per_epoch'],
        "average_val_loss_per_epoch": analysis['average_val_loss_per_epoch'],
        # Stored under a shorter key than the one it is read from
        "average_time_per_epoch": analysis['average_train_time_per_epoch'],
        "batch_logs": output['batch_logs'],
        "epoch_logs": output['epoch_logs'],
    }

    # Store the configuration in its own file (create the directory if needed)
    pt(config_path).mkdir(parents=True, exist_ok=True)
    config_path_out = f"{config_path}/config_{report_uuid}.json"
    with open(config_path_out, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4)

    # Replace the combined report with the trimmed one (same file name)
    report_path_out = f"{outputs_path}/report_{report_uuid}.json"
    with open(report_path_out, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4)


In [11]:
########################
# Duplicate file components
########################
#
# Copy one template tokenizer file to tokenizer_<uuid>.json for every selected
# configuration.

# UUID of the tokenizer file that serves as the template for all copies
template_tokenizer_uuid = "d25f76fe-eef0-45b1-8584-d4e9dda078ce"

# Select the target UUIDs. The raw prompt answer is used rather than
# `runtime_uuid`, so the selection does not depend on whether another cell has
# rebound `runtime_uuid` in the meantime.
runtime_uuids = []
if runtime_uuid_inp == "all":
    # Directory holding the config files
    directory = pt(config_path)

    # A config file is named config_<36 characters of lowercase hex/dashes>.json
    pattern = re.compile(r'config_([a-f0-9\-]{36})\.json')

    # sorted() gives a deterministic order; fullmatch() rejects names with
    # trailing text such as "config_<uuid>.json.json".
    for file in sorted(directory.glob('config_*.json')):
        match = pattern.fullmatch(file.name)
        if match:
            runtime_uuids.append(match.group(1))
else:
    # A specific UUID was requested: duplicate the tokenizer for that one only
    runtime_uuids.append(runtime_uuid_inp)

if runtime_uuids:
    # Load the template tokenizer once; it is the same for every target
    tokenizer_path_inp = f"{tokenizers_path}/tokenizer_{template_tokenizer_uuid}.json"
    with open(tokenizer_path_inp, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    # Write one copy per selected configuration
    for config_uuid in runtime_uuids:
        # Never overwrite the template with a copy of itself
        if config_uuid == template_tokenizer_uuid:
            continue

        tokenizer_path_out = f"{tokenizers_path}/tokenizer_{config_uuid}.json"
        with open(tokenizer_path_out, "w", encoding="utf-8") as f:
            json.dump(tokenizer, f, indent=4)
