In [10]:
using CSV
Pkg.add(path="../../GTF")
using GTF
using DataFrames
using NPZ
using FileIO
using StatsBase  # for unique
using SCYFI

"""
    find_run_folders(base_path::String)

Collect the run directories located exactly three levels below `base_path`,
i.e. `base_path/<outer>/<inner>/<run>`, where `<run>` is a directory whose name
consists of exactly three digits (for example `001`).

Entries that are not directories are skipped at every level.

# Arguments
- `base_path`: root directory to scan. It is passed straight to `readdir`, so it
  must exist and be readable.

# Returns
A `Vector` of named tuples `(full_path, outer_folder, inner_folder, run_number)`,
all fields being `String`s, in the order in which `readdir` lists the entries.
The vector is empty when no matching run directory exists.
"""
function find_run_folders(base_path::String)
    # Concrete element type: avoids the `Vector{Any}` an untyped `[]` would produce.
    RunInfo = NamedTuple{
        (:full_path, :outer_folder, :inner_folder, :run_number),NTuple{4,String}
    }
    run_folders = RunInfo[]

    # Level 1: top-level folders
    for outer_folder in readdir(base_path)
        outer_path = joinpath(base_path, outer_folder)
        isdir(outer_path) || continue

        # Level 2: folders inside each top-level folder
        for inner_folder in readdir(outer_path)
            inner_path = joinpath(outer_path, inner_folder)
            isdir(inner_path) || continue

            # Level 3: run folders, recognised by a name of exactly three digits
            for run_folder in readdir(inner_path)
                run_path = joinpath(inner_path, run_folder)
                if isdir(run_path) && occursin(r"^\d{3}$", run_folder)
                    push!(run_folders, (
                        full_path=run_path,
                        outer_folder=outer_folder,
                        inner_folder=inner_folder,
                        run_number=run_folder
                    ))
                end
            end
        end
    end

    return run_folders
end

"""
    check_model_criteria(m, O, data, A, W, h, num_relus)

Decide whether a trained model passes three acceptance criteria.

1. A trajectory is produced with `generate(m, O, data[10, 1:end], 10000000)`.
   The first column of its rows `9999500:end` is rounded to 3 digits; the model
   is rejected if more than 70 distinct values remain.
2. `find_cycles(A, W, h, num_relus, 1; outer_loop_iterations=200,
   inner_loop_iterations=500)` is called and `res[2][1]` must contain at least
   two entries.
3. At least one entry of `res[2][1]` must have all of its values with absolute
   value below 1.

NOTE(review): the comments in this function call the entries of `res[2][1]`
"cycles"; whether they hold cycle points or some other per-cycle quantity
depends on the return layout of `find_cycles` — confirm against SCYFI.

# Arguments
- `m`, `O`: model objects forwarded unchanged to `generate`.
- `data`: indexable as `data[10, 1:end]`; that row is used as the initial input
  of `generate`.
- `A`, `W`, `h`, `num_relus`: forwarded unchanged to `find_cycles`.

# Returns
A tuple `(passed::Bool, reason::String)`. `reason` is `"All criteria met"` on
success and otherwise names the first criterion that failed.

Exceptions raised during the cycle analysis are caught and reported as a failed
check; exceptions raised by `generate` are not caught here.
"""
function check_model_criteria(m, O, data, A, W, h, num_relus)
    # Criterion 1: number of distinct (rounded) values at the end of a long trajectory
    traj = generate(m, O, data[10,1:end], 10000000)
    unique_points = unique(round.(traj[9999500:end,1], digits=3), dims=1)
    n_unique = size(unique_points, 1)
    if n_unique > 70
        return false, "Too many unique points: $n_unique"
    end

    # Criteria 2 & 3: cycle analysis
    try
        res = find_cycles(A, W, h, num_relus, 1,
                          outer_loop_iterations=200,
                          inner_loop_iterations=500)
        cycles = res[2][1]

        # Criterion 2: at least two entries were found
        if length(cycles) < 2
            return false, "Insufficient cycles found: $(length(cycles))"
        end

        # Criterion 3: at least one entry lies entirely inside the unit bound.
        # Previously this flag was computed but never used, so the function
        # reported success even when no entry satisfied the bound.
        if !any(c -> all(abs.(c) .< 1), cycles)
            return false, "No cycle with all values of magnitude below 1"
        end

        return true, "All criteria met"
    catch e
        return false, "Error in cycle analysis: $e"
    end
end

"""
    find_best_models(base_path::String, pe_threshold::Real, data)

Scan every run folder found by `find_run_folders(base_path)` and sort the model
checkpoints of each run into accepted and rejected ones.

For each run folder that contains a `LossMetrics.csv`, the rows whose `PE`
column is below `pe_threshold` are kept. For every such row the checkpoint
`checkpoints/model_<Epoch>.bson` is loaded with `load_model` and evaluated with
`check_model_criteria`. Run folders without a metrics file are skipped and get
no entry in the results.

# Arguments
- `base_path`: root directory handed to `find_run_folders`.
- `pe_threshold`: strict upper bound on the `PE` column.
- `data`: forwarded unchanged to `check_model_criteria`.

# Returns
A tuple `(selected_models, rejected_models)`, both keyed by
`"<outer_folder>/<inner_folder>/<run_number>"`:
- `selected_models::Dict{String,Vector{Int}}`: epochs that passed all criteria.
- `rejected_models::Dict{String,Dict{Int,String}}`: epoch => reason for every
  model that failed a criterion or raised an error while being loaded or
  evaluated.

Progress is reported on standard output with `println`.
"""
function find_best_models(base_path::String, pe_threshold::Real, data)
    selected_models = Dict{String,Vector{Int}}()
    rejected_models = Dict{String,Dict{Int,String}}()

    run_folders = find_run_folders(base_path)

    for run_info in run_folders
        # Was `printtln`, which raised UndefVarError on the first run folder.
        println("Current Path", run_info.full_path)
        metrics_path = joinpath(run_info.full_path, "LossMetrics.csv")

        if !isfile(metrics_path)
            println("No metrics file found in $(run_info.run_number), skipping...")
            continue
        end

        metrics = CSV.read(metrics_path, DataFrame)
        folder_key = "$(run_info.outer_folder)/$(run_info.inner_folder)/$(run_info.run_number)"
        selected_models[folder_key] = Int[]
        rejected_models[folder_key] = Dict{Int, String}()

        # First filter by PE threshold
        good_models = filter(row -> row.PE < pe_threshold, metrics)

        if !isempty(good_models)
            println("Run $(run_info.run_number): Testing $(nrow(good_models)) models that passed PE threshold")

            for row in eachrow(good_models)
                epoch = Int(row.Epoch)
                model_path = joinpath(run_info.full_path, "checkpoints", "model_$epoch.bson")

                # A failure on one checkpoint must not abort the whole scan, so
                # errors are recorded as a rejection reason instead.
                try
                    m, O = load_model(model_path)

                    # NOTE(review): assumes the loaded model exposes the fields
                    # `A`, `W`, `h` and `n` — confirm against the model type.
                    passes_criteria, reason = check_model_criteria(m, O, data, m.A, m.W, m.h, m.n)

                    if passes_criteria
                        push!(selected_models[folder_key], epoch)
                        println("  Model $epoch passed all criteria")
                    else
                        rejected_models[folder_key][epoch] = reason
                        println("  Model $epoch failed: $reason")
                    end
                catch e
                    rejected_models[folder_key][epoch] = "Error loading/evaluating model: $e"
                    println("  Error processing model $epoch: $e")
                end
            end
        end
    end

    return selected_models, rejected_models
end



LibGit2.Error.GitError: GitError(Code:ERROR, Class:SSL, Your Julia is built with a SSL/TLS engine that libgit2 doesn't know how to configure to use a file or directory of certificate authority roots, but your environment specifies one via the SSL_CERT_DIR variable. If you believe your system's root certificates are safe to use, you can `export JULIA_SSL_CA_ROOTS_PATH=""` in your environment to use those instead.)

In [2]:

# Example driver: select models from a results tree and report the outcome.
base_path = "C:/Users/Lukas/Documents/PhD/Code/SCYFI-01_25/Results_empirical"
pe_threshold = 0.8

# Reference data handed to the selection routine
data_path = "C:/Users/Lukas/Documents/PhD/Code/SCYFI-01_25/Figures/Figure4/example cell/ExampleCell/lukas_data.npy"
data = npzread(data_path)

# Run the selection over every run folder below `base_path`
selected_models, rejected_models = find_best_models(base_path, pe_threshold, data)

# Report, per run, either the accepted epochs or why each candidate was rejected
println("\nSelection Summary:")
for (run_folder, epochs) in pairs(selected_models)
    if !isempty(epochs)
        println("Run $run_folder: $(length(epochs)) models passed all criteria")
        println("  Selected models: $epochs")
        continue
    end
    println("Run $run_folder: No models passed all criteria")
    println("Rejected models and reasons:")
    foreach(pairs(rejected_models[run_folder])) do (epoch, reason)
        println("  Model $epoch: $reason")
    end
end

# Evaluate the selected models

UndefVarError: UndefVarError: `npzread` not defined