# NB01b: Run remaining cells from NB01

NB01 errored at cell-23 due to wrong column name (term_id → ko_id). This notebook runs the corrected versions of cells 23-25.

In [1]:
spark = get_spark_session()
import pandas as pd

PHB_KOS = {
    'K03821': 'phaC - PHA synthase (committed step)',
    'K00023': 'phaB - acetoacetyl-CoA reductase',
    'K00626': 'phaA - beta-ketothiolase',
    'K05973': 'phaZ - PHB depolymerase',
    'K14205': 'phaP - phasin (granule protein)',
    'K18080': 'phaR - PHB transcriptional regulator',
}
print('Ready')

Ready


In [2]:
# Cell 23 fix: Check KEGG KO terms — verify our PHB KOs exist in NMDC
phb_ko_list = "', '".join(PHB_KOS.keys())
nmdc_kos = spark.sql(f"""
    SELECT * FROM nmdc_arkin.kegg_ko_terms 
    WHERE ko_id IN ('{phb_ko_list}')
""").toPandas()
print('PHB KEGG KOs in NMDC reference:')
nmdc_kos

PHB KEGG KOs in NMDC reference:


Unnamed: 0,ko_id,name,description,category
0,K00023,,,
1,K00626,,,
2,K03821,,,
3,K05973,,,
4,K14205,,,


In [3]:
# Cell 24 fix: Check metabolomics_gold schema and search for 3-hydroxybutyrate
metab_schema = spark.sql("DESCRIBE nmdc_arkin.metabolomics_gold").toPandas()
print('metabolomics_gold columns:')
print(metab_schema[['col_name', 'data_type']].to_string(index=False))

name_cols = [c for c in metab_schema['col_name'] 
             if any(kw in c.lower() for kw in ['name', 'compound', 'metabolite', 'label'])]
print(f'\nPotential name columns: {name_cols}')

if name_cols:
    name_col = name_cols[0]
    hb_metabolites = spark.sql(f"""
        SELECT DISTINCT *
        FROM nmdc_arkin.metabolomics_gold
        WHERE LOWER({name_col}) LIKE '%hydroxybutyrate%'
           OR LOWER({name_col}) LIKE '%hydroxybutyr%'
           OR LOWER({name_col}) LIKE '%phb%'
        LIMIT 20
    """).toPandas()
    print(f'\n3-hydroxybutyrate-related metabolites in NMDC (using {name_col}):')
    print(hb_metabolites)
else:
    print('\nNo obvious name column found. Showing sample rows:')
    sample = spark.sql("SELECT * FROM nmdc_arkin.metabolomics_gold LIMIT 3").toPandas()
    print(sample)

metabolomics_gold columns:
                                    col_name data_type
                                     file_id    string
                                   file_name    string
                                  feature_id    string
                            Apex Scan Number    double
                                        Area    double
Associated Mass Features after Deconvolution    string
                              Calculated m/z    double
                            Confidence Score    double
                            Dispersity Index    double
                          Entropy Similarity    double
                                   Intensity    double
                                 Ion Formula    string
                                    Ion Type    string
          Is Largest Ion after Deconvolution   boolean
                     Isotopologue Similarity    double
                           Isotopologue Type    string
             Library mzs in Query (fra


3-hydroxybutyrate-related metabolites in NMDC (using file_name):
Empty DataFrame
Columns: [file_id, file_name, feature_id, Apex Scan Number, Area, Associated Mass Features after Deconvolution, Calculated m/z, Confidence Score, Dispersity Index, Entropy Similarity, Intensity, Ion Formula, Ion Type, Is Largest Ion after Deconvolution, Isotopologue Similarity, Isotopologue Type, Library mzs in Query (fraction), MS2 Spectrum, Mass Feature ID, Molecular Formula, Monoisotopic Mass Feature ID, Persistence, Polarity, Retention Time (min), Sample Name, Spectra with Annotation (n), Tailing Factor, chebi, database_name, final_scan, inchi, inchikey, kegg, m/z, m/z Error (ppm), m/z Error Score, name, noise_score, noise_score_max, noise_score_min, normalized_dispersity_index, ref_ms_id, smiles, start_scan, Peak Area, Traditional Name, Spectral Similarity Score, Similarity Score, Retention Time Ref, Retention Index Score, Kegg Compound ID, Retention index Ref, Retention index, Half Height Width (min

In [4]:
# Cell 25: Check ncbi_env harmonized_name categories
env_categories = spark.sql("""
    SELECT harmonized_name, COUNT(*) as n
    FROM kbase_ke_pangenome.ncbi_env
    GROUP BY harmonized_name
    ORDER BY n DESC
""").toPandas()
print('NCBI environment metadata categories:')
env_categories

NCBI environment metadata categories:


Unnamed: 0,harmonized_name,n
0,,1600369
1,collection_date,273042
2,geo_loc_name,272707
3,isolation_source,245435
4,strain,205650
...,...,...
329,turbidity,1
330,link_class_info,1
331,microbial_biomass_meth,1
332,standing_water_regm,1
