# Run BERDL Tables Pipeline

This notebook runs the BERDL tables pipeline outside the KBase container.
It uses `util.py` as a proxy for `KBDatalakeAppsImpl`, replicating
how the deployed app initializes `KBDataLakeUtils` and imports the same
standalone worker functions (`run_model_reconstruction`, `run_phenotype_simulation`).

**Workflow:**
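1. **Cell 1**: Save the run parameters and configs, write `input_params.json` to the scratch directory, and print the genome-pipeline command to run in the container
2. **Cell 2**: Detect the pangenome clade ID from the genome-pipeline output and print the pangenome-pipeline commands to run in the container
3. **Cell 3**: Annotate the user genome and pangenome member FAA files with RAST
4. **Cell 4**: Convert the user genome to TSV and run model reconstruction in parallel
5. **Cell 5**: Run phenotype simulation in parallel on the reconstructed models
6. **Cell 6**: Build the phenotype tables
7. **Cell 7**: Build the model tables
8. **Cell 8**: Build the BERDL database (`berdl_tables.db`)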

In [1]:
%run util.py
import os
import json
from pathlib import Path

# ---- EDIT THESE PARAMETERS ----
workspace_id = "76990"
genome_name = "Acinetobacter_baylyi_ADP1_RAST"
# --------------------------------

genome_ref = f"{workspace_id}/{genome_name}"

parameters = {
    "workspace_id": workspace_id,
    "genome_name": genome_name,
}

sdk_config = {
    "kbversion":"appdev",
    "scratch":"/kb/module/work/shared/chenry/scratch",
    "reference_path": "/data",
    "module_path": "/kb/module/work/shared/chenry",
    "max_phenotypes": 5,
    "experimental_phenotype_datafile":"/kb/module/data/experimental_data.json",
    "phenotypeset_file":"/kb/module/data/full_phenotype_set.json",
    "fitness_genomes_dir":"/data/reference_data/phenotype_data/fitness_genomes/",
    "reference_phenosim_dir":"/data/reference_data/phenotype_data/phenosims"
}

poplar_config = {
    "kbversion":"appdev",
    "scratch":"/home/chenry/BERDLFiles/scratch",
    "reference_path": "/home/chenry/GSP",
    "module_path": "/home/chenry/BERDLFiles",
    "max_phenotypes": 5,
    "experimental_phenotype_datafile":"/home/chenry/Dropbox/Projects/KBDatalakeApps/data/experimental_data.json",
    "phenotypeset_file":"/home/chenry/Dropbox/Projects/KBDatalakeApps/data/full_phenotype_set.json",
    "fitness_genomes_dir":"/home/chenry/GSP/reference_data/phenotype_data/fitness_genomes/",
    "reference_phenosim_dir":"/home/chenry/GSP/reference_data/phenotype_data/phenosims/"
}

sdk_config["output_dir"] = sdk_config["scratch"] + "/" + parameters["genome_name"]
poplar_config["output_dir"] = poplar_config["scratch"] + "/" + parameters["genome_name"]

util.save("parameters", parameters)
util.save("sdk_config", sdk_config)
util.save("poplar_config", poplar_config)

# Write input_params.json (mirrors how Impl creates the parameter file)
token = util.get_token('kbase')
input_params = {
    "input_refs": [genome_ref],
    "_ctx": {"token": token},
    "_config": {"scratch": sdk_config["output_dir"]}
}
output_dir = Path(poplar_config["output_dir"])
output_dir.mkdir(parents=True, exist_ok=True)
params_file = output_dir / "input_params.json"
with open(params_file, 'w') as f:
    json.dump(input_params, f, indent=2)
print(f"Wrote: {params_file}")

# Remind user of container commands
print("\n" + "=" * 70)
print("Run this commands in the container BEFORE continuing to the next cell:")
print(f'  {sdk_config["module_path"]}/KBDatalakeApps/scripts/run_genome_pipeline.sh {sdk_config["output_dir"]}/input_params.json')
print("=" * 70)

[KBUtilLib] Failed to import rcsb_pdb_utils: ModuleNotFoundError: No module named 'aiohttp'


modelseedpy 0.4.2


2026-02-12 22:11:32,209 - __main__.NotebookUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:11:32,211 - __main__.NotebookUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-12 22:11:32,212 - __main__.NotebookUtil - INFO - Notebook name: RunBERDLTablesPipeline
2026-02-12 22:11:32,213 - __main__.NotebookUtil - INFO - Notebook environment detected


Wrote: /home/chenry/BERDLFiles/scratch/Acinetobacter_baylyi_ADP1_RAST/input_params.json

Run this commands in the container BEFORE continuing to the next cell:
  /kb/module/work/shared/chenry/KBDatalakeApps/scripts/run_genome_pipeline.sh /kb/module/work/shared/chenry/scratch/Acinetobacter_baylyi_ADP1_RAST/input_params.json


In [3]:
%run util.py
from pathlib import Path

parameters = util.load("parameters")
sdk_config = util.load("sdk_config")
poplar_config = util.load("poplar_config")
output_dir = Path(poplar_config["output_dir"])

# Discover pangenome clade ID from the folder created by run_genome_pipeline.sh
pangenome_dir = output_dir / "pangenome"
if pangenome_dir.exists():
    clade_folders = [d.name for d in pangenome_dir.iterdir() if d.is_dir()]
    if len(clade_folders) == 1:
        pangenome_clade_id = clade_folders[0]
    elif len(clade_folders) > 1:
        print(f"Multiple pangenome clades found: {clade_folders}")
        pangenome_clade_id = clade_folders[0]
        print(f"Using first: {pangenome_clade_id}")
    else:
        raise FileNotFoundError(f"No clade subdirectories found in {pangenome_dir}")
else:
    raise FileNotFoundError(
        f"{pangenome_dir} does not exist. Run the genome pipeline first:\n"
        f'  {sdk_config["module_path"]}/KBDatalakeApps/scripts/run_genome_pipeline.sh {sdk_config["output_dir"]}/input_params.json'
    )

print(f"Detected pangenome clade ID: {pangenome_clade_id}")
sdk_config["pangenome_clade_id"] = pangenome_clade_id
poplar_config["pangenome_clade_id"] = pangenome_clade_id
util.save("sdk_config", sdk_config)
util.save("poplar_config", poplar_config)

print("\n" + "=" * 70)
print("Run these commands in the container BEFORE continuing to the next cell:")
print(f'  {sdk_config["module_path"]}/KBDatalakeApps/scripts/run_pangenome_pipeline.sh {sdk_config["output_dir"]}/input_params.json {pangenome_clade_id}')
print("=" * 70)
print(f'  {sdk_config["module_path"]}/KBDatalakeApps/scripts/build_pangenome_data_tsv.sh {sdk_config["output_dir"]} {pangenome_clade_id}')
print("=" * 70)

2026-02-12 22:12:18,115 - __main__.NotebookUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:12:18,116 - __main__.NotebookUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-12 22:12:18,118 - __main__.NotebookUtil - INFO - Notebook name: RunBERDLTablesPipeline
2026-02-12 22:12:18,119 - __main__.NotebookUtil - INFO - Notebook environment detected


Detected pangenome clade ID: RS_GCF_000368685.1

Run these commands in the container BEFORE continuing to the next cell:
  /kb/module/work/shared/chenry/KBDatalakeApps/scripts/run_pangenome_pipeline.sh /kb/module/work/shared/chenry/scratch/Acinetobacter_baylyi_ADP1_RAST/input_params.json RS_GCF_000368685.1
  /kb/module/work/shared/chenry/KBDatalakeApps/scripts/build_pangenome_data_tsv.sh /kb/module/work/shared/chenry/scratch/Acinetobacter_baylyi_ADP1_RAST/RS_GCF_000368685.1


In [1]:
%run util.py
from pathlib import Path

parameters = util.load("parameters")
poplar_config = util.load("poplar_config")
output_dir = Path(poplar_config["output_dir"])

# Annotate user genome FAA files with RAST (robust to genomes without RAST annotation)
user_genome_dir = output_dir / "genome"
if user_genome_dir.exists():
    faa_files = sorted(user_genome_dir.glob("*.faa"))
    print(f"Found {len(faa_files)} user genome FAA files in {user_genome_dir}")
    for faa_file in faa_files:
        tsv_path = util.annotate_faa_with_rast(faa_file)
        print(f"  {faa_file.name} -> {Path(tsv_path).name}")
else:
    print(f"Warning: {user_genome_dir} does not exist yet.")

# Annotate pangenome member FAA files with RAST
genome_dir = output_dir / "pangenome" / poplar_config["pangenome_clade_id"] / "genome"
if genome_dir.exists():
    faa_files = sorted(genome_dir.glob("*.faa"))
    print(f"\nFound {len(faa_files)} pangenome FAA files in {genome_dir}")
    for faa_file in faa_files:
        tsv_path = util.annotate_faa_with_rast(faa_file)
        print(f"  {faa_file.name} -> {Path(tsv_path).name}")
else:
    print(f"\nWarning: {genome_dir} does not exist yet.")
    print("Run the pangenome pipeline command above first, then re-run this cell.")

print("\nRAST annotation complete.")

[KBUtilLib] Failed to import rcsb_pdb_utils: ModuleNotFoundError: No module named 'aiohttp'


modelseedpy 0.4.2


2026-02-12 22:19:16,570 - __main__.NotebookUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:19:16,572 - __main__.NotebookUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-12 22:19:16,573 - __main__.NotebookUtil - INFO - Notebook name: RunBERDLTablesPipeline
2026-02-12 22:19:16,574 - __main__.NotebookUtil - INFO - Notebook environment detected


Found 1 user genome FAA files in /home/chenry/BERDLFiles/scratch/Acinetobacter_baylyi_ADP1_RAST/genome
  user_Acinetobacter_baylyi_ADP1_RAST.faa -> user_Acinetobacter_baylyi_ADP1_RAST.tsv

Found 13 pangenome FAA files in /home/chenry/BERDLFiles/scratch/Acinetobacter_baylyi_ADP1_RAST/pangenome/RS_GCF_000368685.1/genome
  GB_GCA_002694305.1.faa -> GB_GCA_002694305.1.tsv
  RS_GCF_000046845.1.faa -> RS_GCF_000046845.1.tsv
  RS_GCF_000302115.1.faa -> RS_GCF_000302115.1.tsv
  RS_GCF_000368685.1.faa -> RS_GCF_000368685.1.tsv
  RS_GCF_000621045.1.faa -> RS_GCF_000621045.1.tsv
  RS_GCF_001485005.1.faa -> RS_GCF_001485005.1.tsv
  RS_GCF_010577805.1.faa -> RS_GCF_010577805.1.tsv
  RS_GCF_010577855.1.faa -> RS_GCF_010577855.1.tsv
  RS_GCF_010577875.1.faa -> RS_GCF_010577875.1.tsv
  RS_GCF_010577895.1.faa -> RS_GCF_010577895.1.tsv
  RS_GCF_010577925.1.faa -> RS_GCF_010577925.1.tsv
  RS_GCF_010577955.1.faa -> RS_GCF_010577955.1.tsv
  RS_GCF_900465415.1.faa -> RS_GCF_900465415.1.tsv

RAST annotation 

In [None]:
%run util.py
import os
import traceback
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from KBDatalakeApps.KBDatalakeUtils import run_model_reconstruction

parameters = util.load("parameters")
poplar_config = util.load("poplar_config")
kbversion = poplar_config["kbversion"]
output_dir = Path(poplar_config["output_dir"])

genome_ref = f"{parameters['workspace_id']}/{parameters['genome_name']}"
kbdl = util.create_kbdl_utils(poplar_config["reference_path"], poplar_config["module_path"], kbversion)
classifier_dir = os.path.join("/home/chenry/Dropbox/Projects/KBUtilLib", "data", "ms_classifier")

# Step 1: Convert user genome to full-format TSV from workspace
user_genome_dir = output_dir / "genome"
user_genome_dir.mkdir(parents=True, exist_ok=True)
user_genome_tsv = user_genome_dir / f"user_{parameters['genome_name']}_kbasedump.tsv"
print(f"Converting genome {genome_ref} to TSV...")
kbdl.run_user_genome_to_tsv(genome_ref, str(user_genome_tsv))
print(f"  Wrote: {user_genome_tsv}")

# Step 2: Collect simple RAST TSVs only (exclude kbasedump, bakta, KOfamscan, PSORT, genome_data)
_skip_suffixes = ("_kbasedump.tsv", "_bakta.tsv", "_KOfamscan.tsv", "_PSORT.tsv", "_genome_data.tsv")
all_tsvs = [f for f in sorted(user_genome_dir.glob("user_*.tsv"))
            if not f.name.endswith(_skip_suffixes)]
print(f"Found {len(all_tsvs)} RAST-annotated user genome TSVs in {user_genome_dir}")

# Add pangenome member TSVs (only simple RAST files, not enriched _genome_data.tsv)
pangenome_genome_dir = output_dir / "pangenome" / poplar_config["pangenome_clade_id"] / "genome"
if pangenome_genome_dir.exists():
    rast_tsvs = [f for f in sorted(pangenome_genome_dir.glob("*.tsv"))
                 if not f.name.endswith(_skip_suffixes)]
    all_tsvs.extend(rast_tsvs)
    print(f"Found {len(rast_tsvs)} RAST-annotated TSVs in {pangenome_genome_dir}")
print(f"Total TSVs to process: {len(all_tsvs)}")

# Step 3: Build work items for parallel processing
results_dir = output_dir / "models"
results_dir.mkdir(parents=True, exist_ok=True)

work_items = []
for tsv_path in all_tsvs:
    stem = tsv_path.stem
    output_base = str(results_dir / stem)
    work_items.append((str(tsv_path), output_base))

# Step 4: Run model reconstruction in parallel (10 workers)
print(f"\n--- Model Reconstruction ({len(work_items)} genomes, 10 workers) ---")
with ProcessPoolExecutor(max_workers=10) as executor:
    futures = {
        executor.submit(run_model_reconstruction, inp, outp, classifier_dir, kbversion): (inp, outp)
        for inp, outp in work_items
    }
    for future in as_completed(futures):
        inp, outp = futures[future]
        try:
            result = future.result()
            status = "OK" if result.get('success') else f"FAIL: {str(result.get('error', '?'))[:80]}"
            print(f"  {Path(inp).name}: {status}")
        except Exception as e:
            print(f"  {Path(inp).name}: ERROR - {e}")
            traceback.print_exc()

# Summary
cobra_files = sorted(results_dir.glob("*_cobra.json"))
data_files = sorted(results_dir.glob("*_data.json"))
print(f"\nDone. {len(cobra_files)} cobra models, {len(data_files)} data files in: {results_dir}")

2026-02-12 22:19:51,212 - __main__.NotebookUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:19:51,213 - __main__.NotebookUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-12 22:19:51,214 - __main__.NotebookUtil - INFO - Notebook name: RunBERDLTablesPipeline
2026-02-12 22:19:51,215 - __main__.NotebookUtil - INFO - Notebook environment detected
2026-02-12 22:19:51,231 - KBDatalakeApps.KBDatalakeUtils.KBDataLakeUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:19:51,232 - KBDatalakeApps.KBDatalakeUtils.KBDataLakeUtils - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


loading biochemistry database from /home/chenry/Dropbox/Projects/ModelSEEDDatabase


2026-02-12 22:19:59,395 - KBDatalakeApps.KBDatalakeUtils.KBDataLakeUtils - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-12 22:20:00,016 - KBDatalakeApps.KBDatalakeUtils.KBDataLakeUtils - CRITICAL - KBase version not set up for modeling!


cobrakbase 0.4.0
Converting genome 76990/Acinetobacter_baylyi_ADP1_RAST to TSV...
  Wrote: /home/chenry/BERDLFiles/scratch/Acinetobacter_baylyi_ADP1_RAST/genome/user_Acinetobacter_baylyi_ADP1_RAST_kbasedump.tsv
Found 1 RAST-annotated user genome TSVs in /home/chenry/BERDLFiles/scratch/Acinetobacter_baylyi_ADP1_RAST/genome
Found 13 RAST-annotated TSVs in /home/chenry/BERDLFiles/scratch/Acinetobacter_baylyi_ADP1_RAST/pangenome/RS_GCF_000368685.1/genome
Total TSVs to process: 14

--- Model Reconstruction (14 genomes, 10 workers) ---


2026-02-12 22:20:03,825 - kbutillib.ms_reconstruction_utils.MSReconstructionUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:20:03,824 - kbutillib.ms_reconstruction_utils.MSReconstructionUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:20:03,825 - kbutillib.ms_reconstruction_utils.MSReconstructionUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:20:03,825 - kbutillib.ms_reconstruction_utils.MSReconstructionUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:20:03,825 - kbutillib.ms_reconstruction_utils.MSReconstructionUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:20:03,825 - kbutillib.ms_reconstruction_utils.MSReconstructionUtils - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-12 22:20:03,825 - kbutillib.ms_reconstruction_utils.MSReconstruction

genome_id: user_Acinetobacter_baylyi_ADP1_RASTgenome_id: RS_GCF_010577855.1genome_id: RS_GCF_000368685.1


genome_id: RS_GCF_010577805.1
genome_id: RS_GCF_000621045.1genome_id: RS_GCF_001485005.1

TSV format detected: simple (id, function)
TSV format detected: simple (id, function)genome_id: RS_GCF_010577875.1

TSV format detected: simple (id, function)
genome_id: GB_GCA_002694305.1
genome_id: RS_GCF_000302115.1TSV format detected: simple (id, function)
genome_id: RS_GCF_000046845.1

TSV format detected: simple (id, function)
TSV format detected: simple (id, function)
TSV format detected: simple (id, function)
TSV format detected: simple (id, function)
TSV format detected: simple (id, function)
TSV format detected: simple (id, function)
  Genome user_Acinetobacter_baylyi_ADP1_RAST: 3209 features, 3146 with RAST annotations
  Genome RS_GCF_000368685.1: 3275 features, 3190 with RAST annotations
  Genome RS_GCF_000621045.1: 3318 features, 3220 with RAST annotations
  Genome RS_GCF_0105778

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


N
N
NN

N
NN

N
N
N




Tests: [{'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d876e09a0>, 'is_max_threshold': True, 'threshold': 1e-05, 'objective': 'rxn00062_c0'}, {'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d855c04f0>, 'is_max_threshold': True, 'threshold': 7.799999999999997, 'objective': 'rxn00062_c0'}, {'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d8559a5f0>, 'is_max_threshold': True, 'threshold': 14.7, 'objective': 'rxn00062_c0'}, {'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d8556eec0>, 'is_max_threshold': True, 'threshold': 12.45, 'objective': 'rxn00062_c0'}, {'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d855201c0>, 'is_max_threshold': True, 'threshold': 16.2, 'objective': 'rxn00062_c0'}, {'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d856c40a0>, 'is_max_threshold': True, 'threshold': 14.7, 'objective': 'rxn00062_c0'}, {'media': <modelseedpy.core.msmedia.MSMedia object at 0x779d8569d420>, 'is_max_threshold': True, 'threshold

In [None]:
%run util.py
import os
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from KBDatalakeApps.KBDatalakeUtils import run_phenotype_simulation

poplar_config = util.load("poplar_config")
kbversion = poplar_config["kbversion"]
max_phenotypes = poplar_config["max_phenotypes"]
max_phenotypes = None
output_dir = Path(poplar_config["output_dir"])
results_dir = output_dir / "models"

# Discover cobra model files on disk (independent of model reconstruction results)
cobra_files = sorted(results_dir.glob("*_cobra.json"))
pheno_items = []
phenopath = output_dir / "phenotypes"
phenopath.mkdir(parents=True, exist_ok=True)
data_path = os.path.join(poplar_config["module_path"], "KBDatalakeApps", "data")
for cobra_path in cobra_files:
    genome_id = cobra_path.name.replace("_cobra.json", "")
    pheno_output = str(phenopath / f"{genome_id}_phenosim.json")
    pheno_items.append((str(cobra_path), pheno_output))

print(f"--- Phenotype Simulation ({len(pheno_items)} models, 10 workers) ---")
with ProcessPoolExecutor(max_workers=10) as executor:
    futures = {
        executor.submit(run_phenotype_simulation, model_file, pheno_file, data_path, max_phenotypes, kbversion): model_file
        for model_file, pheno_file in pheno_items
    }
    for future in as_completed(futures):
        model_path = futures[future]
        try:
            result = future.result()
            status = "OK" if result.get('success') else f"FAIL: {str(result.get('error', '?'))[:80]}"
            print(f"  {Path(model_path).name}: {status}")
        except Exception as e:
            print(f"  {Path(model_path).name}: ERROR - {e}")

pheno_files = sorted(phenopath.glob("*_phenosim.json"))
print(f"\nDone. {len(pheno_files)} phenotype files saved in: {phenopath}")

[KBUtilLib] Failed to import rcsb_pdb_utils: ModuleNotFoundError: No module named 'aiohttp'


modelseedpy 0.4.2


2026-02-13 05:38:57,591 - __main__.NotebookUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:57,592 - __main__.NotebookUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-13 05:38:57,594 - __main__.NotebookUtil - INFO - Notebook name: RunBERDLTablesPipeline
2026-02-13 05:38:57,595 - __main__.NotebookUtil - INFO - Notebook environment detected


--- Phenotype Simulation (14 models, 10 workers) ---


2026-02-13 05:38:58,338 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,339 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,341 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-13 05:38:58,342 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token
2026-02-13 05:38:58,341 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,341 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,343 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbu

loading biochemistry database fromloading biochemistry database from

2026-02-13 05:38:58,344 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,346 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


 

2026-02-13 05:38:58,346 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml


 loading biochemistry database from

2026-02-13 05:38:58,348 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


loading biochemistry database from/home/chenry/Dropbox/Projects/ModelSEEDDatabase /home/chenry/Dropbox/Projects/ModelSEEDDatabase

2026-02-13 05:38:58,349 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


 loading biochemistry database from
/home/chenry/Dropbox/Projects/ModelSEEDDatabase
/home/chenry/Dropbox/Projects/ModelSEEDDatabase 
loading biochemistry database from 
/home/chenry/Dropbox/Projects/ModelSEEDDatabaseloading biochemistry database from/home/chenry/Dropbox/Projects/ModelSEEDDatabase 

/home/chenry/Dropbox/Projects/ModelSEEDDatabase


2026-02-13 05:38:58,358 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,360 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


loading biochemistry database from /home/chenry/Dropbox/Projects/ModelSEEDDatabase


2026-02-13 05:38:58,367 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,369 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


loading biochemistry database from /home/chenry/Dropbox/Projects/ModelSEEDDatabase


2026-02-13 05:38:58,375 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded configuration from: /home/chenry/.kbutillib/config.yaml
2026-02-13 05:38:58,377 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - Loaded kbase tokens from /home/chenry/.kbase/token


loading biochemistry database from /home/chenry/Dropbox/Projects/ModelSEEDDatabase


2026-02-13 05:39:06,344 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-13 05:39:06,359 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-13 05:39:06,375 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-13 05:39:06,380 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-13 05:39:06,384 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-13 05:39:06,389 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase
2026-02-13

cobrakbase 0.4.0
cobrakbase 0.4.0
cobrakbase 0.4.0
cobrakbase 0.4.0cobrakbase
cobrakbase  0.4.00.4.0

cobrakbase 0.4.0


2026-02-13 05:39:07,044 - KBDatalakeApps.KBDatalakeUtils.PhenotypeWorkerUtil - INFO - ModelSEED database loaded from /home/chenry/Dropbox/Projects/ModelSEEDDatabase


cobrakbase 0.4.0
cobrakbase 0.4.0
cobrakbase 0.4.0




cpd00020 optimal 4.262434146991229
cpd00020 optimal 4.257944787527006
cpd00020 optimal 2.8561752294088167
cpd00020 optimal 4.257944787527006
cpd00020 optimal 4.262434146991229
cpd00020 optimal 2.8592793394582143
cpd00020 optimal 2.8561752294088167
cpd00020 optimal 2.85617522940877
cpd00020 optimal 4.257944787527006
cpd00020 optimal 4.257944787527006
cpd00023 optimal 5.501834416221673
cpd00023 optimal 5.503325082207447
cpd00023 optimal 3.308604028199567
cpd00023 optimal 5.503325082207447cpd00023
 optimal 5.501834416221673
cpd00023 optimal 3.308604028199567
cpd00023 optimal 5.503325082207447
cpd00023 optimal 5.503325082207447
cpd00023 optimal 3.3086040281996305
cpd00023 optimal 3.3086040281995683
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
cpd00024 optimal 0.0
Removing unneeded gapfilled reaction: rxn00907_c0
Removing unneeded gapfilled reaction

In [None]:
%run util.py
import os
import shutil
from pathlib import Path

poplar_config = util.load("poplar_config")
output_dir = Path(poplar_config["output_dir"])
phenopath = output_dir / "phenotypes"
kbdl = util.create_kbdl_utils(poplar_config["reference_path"], poplar_config["module_path"], poplar_config["kbversion"])

pheno_files = sorted(phenopath.glob("*_phenosim.json"))
print(f"Found {len(pheno_files)} phenotype simulation files in {phenopath}")

# Build phenotype tables
print(f"\nBuilding phenotype tables...")
kbdl.build_phenotype_tables(
    output_dir=str(phenopath),
    phenosim_directory=str(phenopath),
    experiment_data_file=poplar_config["experimental_phenotype_datafile"],
    phenoset_file=poplar_config["phenotypeset_file"],
    fitness_mapping_dir=str(output_dir / "genome"),
    fitness_genomes_dir=poplar_config["fitness_genomes_dir"],
    model_data_dir=str(output_dir / "models"),
    reference_phenosim_dir=poplar_config["reference_phenosim_dir"]
)

# List output files
if phenotype_tables_dir.exists():
    for f in sorted(phenotype_tables_dir.iterdir()):
        print(f"  {f.name} ({f.stat().st_size:,} bytes)")

In [None]:
%run util.py
from pathlib import Path

poplar_config = util.load("poplar_config")
output_dir = Path(poplar_config["output_dir"])
results_dir = output_dir / "models"
kbdl = util.create_kbdl_utils(poplar_config["reference_path"], poplar_config["module_path"], poplar_config["kbversion"])

# Build model tables from *_data.json files in the models directory
# Outputs genome_reactions.tsv and gene_reaction_data.tsv
print(f"Building model tables from {results_dir}...")
data_files = sorted(results_dir.glob("*_data.json"))
print(f"Found {len(data_files)} model data files")

kbdl.build_model_tables(model_path=str(results_dir))

# List output files
for tsv in sorted(results_dir.glob("*.tsv")):
    print(f"  {tsv.name} ({tsv.stat().st_size:,} bytes)")

In [None]:
%run util.py
import sys
from pathlib import Path

poplar_config = util.load("poplar_config")
output_dir = Path(poplar_config["output_dir"])

# Add lib to path so we can import build_berdl_db
sys.path.insert(0, str(Path.cwd().parent / "lib"))
from KBDatalakeApps.build_berdl_db import build_berdl_database

db_path = output_dir / "berdl_tables.db"
build_berdl_database(str(output_dir), str(db_path))