In [1]:
import os,sys,glob
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import pandas as pd # 'v2.2.3'

import pyexeggutor as exe

In [2]:
# Identifiers of the source databases processed by this notebook. Each one is
# used as a directory name under .../SourceDatabases/ by the cells below.
genomic_databases = [
    "EnsemblFungi",
    "EnsemblProtists",
    "Haroon_2016-PRJNA289734-RedSea",
    "JGI-MycoCosm",
    "JGI-PhycoCosm",
    "Jungbluth_2017-PRJNA269163-JuanDeFucaRidgeSubsurface",
    "Li_2024-PRJNA1111327-KermadecDiamantinaTrench",
    "Liu_2024-PRJNA1081583-EastChinaSeaLowOxygenRegion",
    "MMP-MarFun",
    "MMP-MarRef",
    "Nathani_2021-PRJEB26614_PRJEB26615-GulfKuthiawarPeninsula",
    "Nenasheva_2024-10.5281_zenodo.13933292-Diatoms",
    "OceanDNA",
    "TARA-BAC_ARC_MAGs",
    "TARA-Delmont_2018",
    "TARA-SAGv1",
    "TARA-SMAGv1",
    "TARA-TOPAZ",
    "Xu_2024-PRJNA880762-SouthChinaSea",
    "Zhang_2022-PRJNA707313-ColdSeepSouthChinaSea",
    "Zhang_2023-PRJNA808646-CCFZ",
    "Zhang_2024-PRJNA113162029-YapMarianaTrenchSediment",
    "Zhou_2022-PRJNA730330-SouthChinaEstuary",
]

# Alternative (disabled): discover the identifiers from the gene_to_genome
# pickles on disk instead of hard-coding them. Start from an empty list if
# this is ever re-enabled, otherwise every identifier is appended twice.
# for filepath_gene_to_genome in glob.glob("/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases/*/Metadata/gene_to_genome.pkl.gz"):
#     id_database = filepath_gene_to_genome.split("/")[-3]
#     genomic_databases.append(id_database)
    


In [22]:
# Creating per-database genomic traits matrices
# =============================================
# For each source database, count KOfam annotations per genome and write two
# sparse genome x KO matrices to <database>/Traits: integer counts and boolean
# presence/absence.
source_databases_directory = "/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases"

# for id_database in ['Jungbluth_2017-PRJNA269163-JuanDeFucaRidgeSubsurface']:
for id_database in genomic_databases:
    database_directory = os.path.join(source_databases_directory, id_database)
    metadata_directory = os.path.join(database_directory, "Metadata")
    output_directory = os.path.join(database_directory, "Traits")
    # exist_ok=True replaces the check-then-create pattern, which can fail if
    # the directory appears between the check and the creation.
    os.makedirs(output_directory, exist_ok=True)

    # NOTE(review): presumably gene_to_genome maps gene id -> genome id and
    # protein_annotations maps gene id -> {"KOfam": iterable of KO ids, ...};
    # confirm against the pickles.
    gene_to_genome = exe.read_pickle(os.path.join(metadata_directory, "gene_to_genome.pkl.gz"))
    protein_annotations = exe.read_pickle(os.path.join(metadata_directory, "protein_annotations.pkl.gz"))

    # genomic_traits[id_ko][id_genome] -> number of KOfam hits for that KO in that genome
    genomic_traits = defaultdict(lambda: defaultdict(int))
    for id_gene, annotations in tqdm(protein_annotations.items(), desc=f"Building genomic traits matrix: {id_database}"):
        id_genome = gene_to_genome[id_gene]
        for id_ko in annotations["KOfam"]:
            genomic_traits[id_ko][id_genome] += 1

    # Outer keys (KOs) become columns and inner keys (genomes) become rows;
    # genome/KO pairs that were never observed come out as NaN and are set to 0.
    X_genomic_traits = pd.DataFrame(genomic_traits).fillna(0).astype(int)
    memory_before = X_genomic_traits.memory_usage(deep=True).sum()  # dense int matrix

    # Integer count matrix, stored sparsely (zeros are implicit).
    X_genomic_traits = X_genomic_traits.astype(pd.SparseDtype(dtype=int, fill_value=0))
    X_genomic_traits.index.name = "id_genome"
    X_genomic_traits.columns.name = "id_feature"
    X_genomic_traits.to_pickle(os.path.join(output_directory, "genomic_traits.kofam.int.pkl.gz"))

    # Presence/absence matrix derived from the counts.
    X_genomic_traits = X_genomic_traits > 0
    X_genomic_traits = X_genomic_traits.astype(pd.SparseDtype(dtype=bool, fill_value=False))
    X_genomic_traits.to_pickle(os.path.join(output_directory, "genomic_traits.kofam.bool.pkl.gz"))

    # The report compares the dense int matrix with the sparse *boolean* matrix.
    memory_after = X_genomic_traits.memory_usage(deep=True).sum()
    print(f"{id_database}: Memory before: {memory_before / 1e6:.2f} MB, after: {memory_after / 1e6:.2f} MB, reduction: {100 * (memory_before - memory_after) / memory_before:.2f}%")


Building genomic traits matrix: EnsemblFungi: 100%|██████████| 5703801/5703801 [00:11<00:00, 496428.12it/s]


EnsemblFungi: Memory before: 100.66 MB, after: 22.96 MB, reduction: 77.19%


Building genomic traits matrix: EnsemblProtists: 100%|██████████| 568245/568245 [00:01<00:00, 533150.03it/s]


EnsemblProtists: Memory before: 12.99 MB, after: 2.06 MB, reduction: 84.12%


Building genomic traits matrix: Haroon_2016-PRJNA289734-RedSea: 100%|██████████| 144598/144598 [00:00<00:00, 613390.45it/s]


Haroon_2016-PRJNA289734-RedSea: Memory before: 5.21 MB, after: 0.64 MB, reduction: 87.68%


Building genomic traits matrix: JGI-MycoCosm: 100%|██████████| 9893803/9893803 [00:19<00:00, 511522.02it/s]


JGI-MycoCosm: Memory before: 162.00 MB, after: 39.19 MB, reduction: 75.81%


Building genomic traits matrix: JGI-PhycoCosm: 100%|██████████| 645327/645327 [00:01<00:00, 539859.98it/s]


JGI-PhycoCosm: Memory before: 13.14 MB, after: 2.38 MB, reduction: 81.88%


Building genomic traits matrix: Jungbluth_2017-PRJNA269163-JuanDeFucaRidgeSubsurface: 100%|██████████| 78073/78073 [00:00<00:00, 622241.20it/s]


Jungbluth_2017-PRJNA269163-JuanDeFucaRidgeSubsurface: Memory before: 3.97 MB, after: 0.35 MB, reduction: 91.29%


Building genomic traits matrix: Li_2024-PRJNA1111327-KermadecDiamantinaTrench: 100%|██████████| 1465998/1465998 [00:02<00:00, 540026.09it/s]


Li_2024-PRJNA1111327-KermadecDiamantinaTrench: Memory before: 61.87 MB, after: 6.33 MB, reduction: 89.76%


Building genomic traits matrix: Liu_2024-PRJNA1081583-EastChinaSeaLowOxygenRegion: 100%|██████████| 1488219/1488219 [00:02<00:00, 567288.16it/s]


Liu_2024-PRJNA1081583-EastChinaSeaLowOxygenRegion: Memory before: 84.59 MB, after: 6.55 MB, reduction: 92.25%


Building genomic traits matrix: MMP-MarFun: 100%|██████████| 67173/67173 [00:00<00:00, 578325.48it/s]


MMP-MarFun: Memory before: 0.92 MB, after: 0.28 MB, reduction: 68.95%


Building genomic traits matrix: MMP-MarRef: 100%|██████████| 884664/884664 [00:01<00:00, 584171.48it/s]


MMP-MarRef: Memory before: 32.89 MB, after: 3.51 MB, reduction: 89.34%


Building genomic traits matrix: Nathani_2021-PRJEB26614_PRJEB26615-GulfKuthiawarPeninsula: 100%|██████████| 416359/416359 [00:00<00:00, 606581.36it/s]


Nathani_2021-PRJEB26614_PRJEB26615-GulfKuthiawarPeninsula: Memory before: 13.17 MB, after: 1.62 MB, reduction: 87.66%


Building genomic traits matrix: Nenasheva_2024-10.5281_zenodo.13933292-Diatoms: 100%|██████████| 192127/192127 [00:00<00:00, 576512.68it/s]


Nenasheva_2024-10.5281_zenodo.13933292-Diatoms: Memory before: 1.81 MB, after: 0.59 MB, reduction: 67.12%


Building genomic traits matrix: OceanDNA: 100%|██████████| 49915369/49915369 [01:46<00:00, 467585.88it/s]


OceanDNA: Memory before: 3834.36 MB, after: 221.91 MB, reduction: 94.21%


Building genomic traits matrix: TARA-BAC_ARC_MAGs: 100%|██████████| 2049601/2049601 [00:04<00:00, 511524.29it/s]


TARA-BAC_ARC_MAGs: Memory before: 125.69 MB, after: 8.97 MB, reduction: 92.86%


Building genomic traits matrix: TARA-Delmont_2018: 100%|██████████| 426766/426766 [00:00<00:00, 547704.96it/s]


TARA-Delmont_2018: Memory before: 32.61 MB, after: 1.87 MB, reduction: 94.25%


Building genomic traits matrix: TARA-SAGv1: 100%|██████████| 10593/10593 [00:00<00:00, 726316.98it/s]


TARA-SAGv1: Memory before: 0.18 MB, after: 0.04 MB, reduction: 76.16%


Building genomic traits matrix: TARA-SMAGv1: 100%|██████████| 1257612/1257612 [00:02<00:00, 497913.97it/s]


TARA-SMAGv1: Memory before: 53.22 MB, after: 5.00 MB, reduction: 90.61%


Building genomic traits matrix: TARA-TOPAZ: 100%|██████████| 6090596/6090596 [00:12<00:00, 485005.79it/s]


TARA-TOPAZ: Memory before: 560.51 MB, after: 25.55 MB, reduction: 95.44%


Building genomic traits matrix: Xu_2024-PRJNA880762-SouthChinaSea: 100%|██████████| 1559302/1559302 [00:02<00:00, 553393.89it/s]


Xu_2024-PRJNA880762-SouthChinaSea: Memory before: 106.73 MB, after: 6.88 MB, reduction: 93.56%


Building genomic traits matrix: Zhang_2022-PRJNA707313-ColdSeepSouthChinaSea: 100%|██████████| 601976/601976 [00:01<00:00, 569651.76it/s]


Zhang_2022-PRJNA707313-ColdSeepSouthChinaSea: Memory before: 41.26 MB, after: 2.55 MB, reduction: 93.83%


Building genomic traits matrix: Zhang_2023-PRJNA808646-CCFZ: 100%|██████████| 207906/207906 [00:00<00:00, 606790.02it/s]


Zhang_2023-PRJNA808646-CCFZ: Memory before: 6.93 MB, after: 0.88 MB, reduction: 87.34%


Building genomic traits matrix: Zhang_2024-PRJNA113162029-YapMarianaTrenchSediment: 100%|██████████| 123249/123249 [00:00<00:00, 602850.58it/s]


Zhang_2024-PRJNA113162029-YapMarianaTrenchSediment: Memory before: 3.72 MB, after: 0.54 MB, reduction: 85.50%


Building genomic traits matrix: Zhou_2022-PRJNA730330-SouthChinaEstuary: 100%|██████████| 461193/461193 [00:00<00:00, 582296.30it/s]


Zhou_2022-PRJNA730330-SouthChinaEstuary: Memory before: 26.53 MB, after: 2.03 MB, reduction: 92.35%


In [None]:
ls ../data/training/

In [24]:
# Creating global genomic traits matrices
# =============================================
# Pool the KOfam counts of every source database into a single genome x KO
# matrix and write it as sparse integer counts and sparse boolean
# presence/absence.
source_databases_directory = "/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases"
output_directory="../data/training/"
# to_pickle does not create missing directories, so make sure the target exists.
os.makedirs(output_directory, exist_ok=True)

# genomic_traits[id_ko][id_genome] -> number of KOfam hits for that KO in that genome.
# NOTE(review): counts are keyed by genome id only, so this assumes genome ids
# are unique across databases - confirm.
genomic_traits = defaultdict(lambda: defaultdict(int))
for id_database in genomic_databases:
    metadata_directory = os.path.join(source_databases_directory, id_database, "Metadata")
    gene_to_genome = exe.read_pickle(os.path.join(metadata_directory, "gene_to_genome.pkl.gz"))
    protein_annotations = exe.read_pickle(os.path.join(metadata_directory, "protein_annotations.pkl.gz"))
    for id_gene, annotations in tqdm(protein_annotations.items(), desc=f"Building genomic traits matrix: {id_database}"):
        id_genome = gene_to_genome[id_gene]
        for id_ko in annotations["KOfam"]:
            genomic_traits[id_ko][id_genome] += 1

# Outer keys (KOs) become columns and inner keys (genomes) become rows;
# genome/KO pairs that were never observed come out as NaN and are set to 0.
X_genomic_traits = pd.DataFrame(genomic_traits).fillna(0).astype(int)
memory_before = X_genomic_traits.memory_usage(deep=True).sum()  # dense int matrix

# Integer count matrix, stored sparsely (zeros are implicit).
X_genomic_traits = X_genomic_traits.astype(pd.SparseDtype(dtype=int, fill_value=0))
X_genomic_traits.index.name = "id_genome"
X_genomic_traits.columns.name = "id_feature"
X_genomic_traits.to_pickle(os.path.join(output_directory, "global.genomic_traits.kofam.int.pkl.gz"))

# Presence/absence matrix derived from the counts; X_genomic_traits is boolean from here on.
X_genomic_traits = X_genomic_traits > 0
X_genomic_traits = X_genomic_traits.astype(pd.SparseDtype(dtype=bool, fill_value=False))
X_genomic_traits.to_pickle(os.path.join(output_directory, "global.genomic_traits.kofam.bool.pkl.gz"))
memory_after = X_genomic_traits.memory_usage(deep=True).sum()  # sparse bool matrix
# print(f"Global: Memory before: {memory_before / 1e6:.2f} MB, after: {memory_after / 1e6:.2f} MB, reduction: {100 * (memory_before - memory_after) / memory_before:.2f}%")

# Building genomic traits matrix: EnsemblFungi: 100%|██████████| 5703801/5703801 [00:11<00:00, 497211.63it/s]
# Building genomic traits matrix: EnsemblProtists: 100%|██████████| 568245/568245 [00:01<00:00, 496999.58it/s]
# Building genomic traits matrix: Haroon_2016-PRJNA289734-RedSea: 100%|██████████| 144598/144598 [00:00<00:00, 567287.28it/s]
# Building genomic traits matrix: JGI-MycoCosm: 100%|██████████| 9893803/9893803 [00:20<00:00, 487394.26it/s]
# Building genomic traits matrix: JGI-PhycoCosm: 100%|██████████| 645327/645327 [00:01<00:00, 518319.21it/s]
# Building genomic traits matrix: Jungbluth_2017-PRJNA269163-JuanDeFucaRidgeSubsurface: 100%|██████████| 78073/78073 [00:00<00:00, 545903.57it/s]
# Building genomic traits matrix: Li_2024-PRJNA1111327-KermadecDiamantinaTrench: 100%|██████████| 1465998/1465998 [00:02<00:00, 515112.63it/s]
# Building genomic traits matrix: Liu_2024-PRJNA1081583-EastChinaSeaLowOxygenRegion: 100%|██████████| 1488219/1488219 [00:03<00:00, 488871.12it/s]
# Building genomic traits matrix: MMP-MarFun: 100%|██████████| 67173/67173 [00:00<00:00, 507934.19it/s]
# Building genomic traits matrix: MMP-MarRef: 100%|██████████| 884664/884664 [00:01<00:00, 518931.52it/s]
# Building genomic traits matrix: Nathani_2021-PRJEB26614_PRJEB26615-GulfKuthiawarPeninsula: 100%|██████████| 416359/416359 [00:00<00:00, 501986.49it/s]
# Building genomic traits matrix: Nenasheva_2024-10.5281_zenodo.13933292-Diatoms: 100%|██████████| 192127/192127 [00:00<00:00, 523761.86it/s]
# Building genomic traits matrix: OceanDNA: 100%|██████████| 49915369/49915369 [01:49<00:00, 457551.74it/s]
# Building genomic traits matrix: TARA-BAC_ARC_MAGs: 100%|██████████| 2049601/2049601 [00:04<00:00, 425087.15it/s]
# Building genomic traits matrix: TARA-Delmont_2018: 100%|██████████| 426766/426766 [00:00<00:00, 431224.75it/s]
# Building genomic traits matrix: TARA-SAGv1: 100%|██████████| 10593/10593 [00:00<00:00, 588761.03it/s]
# Building genomic traits matrix: TARA-SMAGv1: 100%|██████████| 1257612/1257612 [00:02<00:00, 455021.33it/s]
# Building genomic traits matrix: TARA-TOPAZ: 100%|██████████| 6090596/6090596 [00:14<00:00, 426014.21it/s]
# Building genomic traits matrix: Xu_2024-PRJNA880762-SouthChinaSea: 100%|██████████| 1559302/1559302 [00:03<00:00, 441657.68it/s]
# Building genomic traits matrix: Zhang_2022-PRJNA707313-ColdSeepSouthChinaSea: 100%|██████████| 601976/601976 [00:01<00:00, 448012.27it/s]
# Building genomic traits matrix: Zhang_2023-PRJNA808646-CCFZ: 100%|██████████| 207906/207906 [00:00<00:00, 442045.29it/s]
# Building genomic traits matrix: Zhang_2024-PRJNA113162029-YapMarianaTrenchSediment: 100%|██████████| 123249/123249 [00:00<00:00, 377980.13it/s]
# Building genomic traits matrix: Zhou_2022-PRJNA730330-SouthChinaEstuary: 100%|██████████| 461193/461193 [00:01<00:00, 409541.41it/s]
# Global: Memory before: 9885.71 MB, after: 362.68 MB, reduction: 96.33%

# Dimensions of the global matrix as (rows = id_genome, columns = id_feature).
X_genomic_traits.shape
# Recorded result: (71706, 17225)

Building genomic traits matrix: EnsemblFungi: 100%|██████████| 5703801/5703801 [00:11<00:00, 497211.63it/s]
Building genomic traits matrix: EnsemblProtists: 100%|██████████| 568245/568245 [00:01<00:00, 496999.58it/s]
Building genomic traits matrix: Haroon_2016-PRJNA289734-RedSea: 100%|██████████| 144598/144598 [00:00<00:00, 567287.28it/s]
Building genomic traits matrix: JGI-MycoCosm: 100%|██████████| 9893803/9893803 [00:20<00:00, 487394.26it/s]
Building genomic traits matrix: JGI-PhycoCosm: 100%|██████████| 645327/645327 [00:01<00:00, 518319.21it/s]
Building genomic traits matrix: Jungbluth_2017-PRJNA269163-JuanDeFucaRidgeSubsurface: 100%|██████████| 78073/78073 [00:00<00:00, 545903.57it/s]
Building genomic traits matrix: Li_2024-PRJNA1111327-KermadecDiamantinaTrench: 100%|██████████| 1465998/1465998 [00:02<00:00, 515112.63it/s]
Building genomic traits matrix: Liu_2024-PRJNA1081583-EastChinaSeaLowOxygenRegion: 100%|██████████| 1488219/1488219 [00:03<00:00, 488871.12it/s]
Building genom

Global: Memory before: 9885.71 MB, after: 362.68 MB, reduction: 96.33%


In [57]:
# Write one genome FASTA path per line into a list file per domain
# (prokaryotic, then eukaryotic) under ../data/cluster/ani/.
source_databases_directory = "/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases"
for domain in ["Prokaryotic", "Eukaryotic"]:
    with exe.open_file_writer(f"../data/cluster/ani/genome_filepaths.{domain.lower()}.list") as f:
        for id_database in tqdm(genomic_databases):
            pattern = os.path.join(source_databases_directory, id_database, "Genomes", domain, "*.fa.gz")
            # glob returns matches in arbitrary order; sorting keeps the list
            # files identical between runs.
            for filepath in sorted(glob.glob(pattern)):
                print(filepath, file=f)

100%|██████████| 23/23 [00:26<00:00,  1.15s/it]
100%|██████████| 23/23 [00:05<00:00,  3.91it/s]


In [3]:
output_directory = "../data/training"

# Reload the global integer KOfam count matrix written by the earlier cell.
filepath_traits = os.path.join(output_directory, "global.genomic_traits.kofam.int.pkl.gz")
X_genomic_traits = pd.read_pickle(filepath_traits)

# Provenance (disabled): how genome_to_ani-cluster.tsv.gz was assembled from
# the prokaryotic and eukaryotic ANI clustering tables.
# genome_to_clusterani = pd.concat([
#     pd.read_csv("../data/cluster/ani/prokaryotic/genome_clusters.tsv", sep="\t", index_col=0, header=None).iloc[:,0],
#     pd.read_csv("../data/cluster/ani/eukaryotic/genome_clusters.tsv", sep="\t", index_col=0, header=None).iloc[:,0],
# ])
# genome_to_clusterani.index.name = "id_genome"
# genome_to_clusterani.name = "id_cluster-ani"
# genome_to_clusterani.to_csv(os.path.join(output_directory, "genome_to_ani-cluster.tsv.gz"), sep="\t")

# Genome -> ANI cluster labels as a categorical Series, reordered to the rows
# of X_genomic_traits (raises KeyError if a genome has no cluster entry).
filepath_clusters = os.path.join(output_directory, "genome_to_ani-cluster.tsv.gz")
genome_to_clusterani = (
    pd.read_csv(filepath_clusters, sep="\t", index_col=0)
    .iloc[:, 0]
    .astype("category")
    .loc[X_genomic_traits.index]
)


In [4]:
def fast_groupby_sum(X:pd.DataFrame, y:pd.Series):
    """
    Sum the rows of `X` within the groups defined by `y`.

    Parameters
    ----------
    X : pd.DataFrame
        Numeric matrix with one row per observation.
    y : pd.Series
        Group label for every row of `X`. Its index must hold the same labels
        in the same order as `X.index`.

    Returns
    -------
    pd.DataFrame
        One row per observed group label (sorted, as returned by `np.unique`)
        and one column per column of `X`. Values are float64 because
        `np.bincount` returns floats when `weights` is given.

    Raises
    ------
    IndexError
        If `X.index` and `y.index` differ, including when their lengths differ.
    """
    # Index.equals compares length, labels and order in one call; an elementwise
    # `==` raises ValueError when the lengths differ, before the intended
    # IndexError can be raised.
    if not X.index.equals(y.index):
        raise IndexError("X.index must equal y.index")

    # Sorted unique labels plus an integer group code per row. np.unique
    # accepts plain and categorical label Series alike, so no category
    # conversion is needed (the previous isinstance check against
    # pd.CategoricalDtype could never be true for a Series).
    unique_classes, y_indices = np.unique(np.asarray(y), return_inverse=True)
    n_classes = len(unique_classes)

    # One weighted bincount per column: entry k is the sum of the column's
    # values over the rows whose group code is k.
    arrays = list()
    for _, column in tqdm(X.items(), "Summing rows by groups", total=X.shape[1], unit=" column"):
        summed_values = np.bincount(y_indices, weights=column.values, minlength=n_classes)
        arrays.append(summed_values)

    if arrays:
        X_grouped = np.vstack(arrays).T
    else:
        # np.vstack rejects an empty list; a frame without columns groups to
        # an (n_classes, 0) result.
        X_grouped = np.empty((n_classes, 0), dtype=float)

    # Convert to DataFrame
    return pd.DataFrame(X_grouped, index=unique_classes, columns=X.columns)


In [5]:
%%time
# Collapse the genome-level count matrix to one row per ANI cluster by summing
# the rows of the genomes in each cluster, then cast the float sums returned by
# fast_groupby_sum back to int.
X_genomic_traits_clusterani = fast_groupby_sum(X_genomic_traits, genome_to_clusterani).astype(int)
# Recorded output of an earlier run:
# Summing rows by groups: 100%|██████████| 17225/17225 [00:03<00:00, 4318.08 column/s]
# CPU times: user 4.33 s, sys: 1.88 s, total: 6.21 s
# Wall time: 6.21 s

Summing rows by groups: 100%|██████████| 17225/17225 [00:05<00:00, 2987.23 column/s]


CPU times: user 5.47 s, sys: 3.43 s, total: 8.9 s
Wall time: 9.02 s


In [7]:
%%time
# Store the cluster-level counts sparsely (zeros implicit) and pickle them
# into output_directory.
X_genomic_traits_clusterani = X_genomic_traits_clusterani.astype(pd.SparseDtype(dtype=int, fill_value=0))
X_genomic_traits_clusterani.to_pickle(os.path.join(output_directory, "global.genomic_traits.kofam.int.cluster-ani.pkl.gz"))

CPU times: user 1min 15s, sys: 165 ms, total: 1min 16s
Wall time: 1min 16s


In [8]:
%%time
# Derive the presence/absence matrix (count > 0) and pickle it.
# NOTE(review): this rebinds X_genomic_traits_clusterani to the boolean matrix,
# so re-running the cell that writes the "int" pickle afterwards would save 0/1
# values instead of counts - run the cells in order.
X_genomic_traits_clusterani = (X_genomic_traits_clusterani > 0).astype(pd.SparseDtype(dtype=bool, fill_value=False))
X_genomic_traits_clusterani.to_pickle(os.path.join(output_directory, "global.genomic_traits.kofam.bool.cluster-ani.pkl.gz"))

CPU times: user 2.95 s, sys: 0 ns, total: 2.95 s
Wall time: 2.95 s


In [12]:
# Preview the cluster-level matrix (rows: ANI clusters, columns: id_feature).
X_genomic_traits_clusterani

id_feature,K00003,K00004,K00006,K00008,K05351,K25880,K00009,K00010,K00013,K14152,...,K04086,K08728,K10353,K22393,K22394,K20112,K25641,K24454,K20138,K26920
NAL-ESLC_00045683b92daedb002220d52bbda9fd,False,False,False,True,True,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
NAL-ESLC_006afe0e5f8d7c0842250533643c833a,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
NAL-ESLC_006e500052464c108064a94078e3ba0f,False,False,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
NAL-ESLC_0086aaed51baf1e518acd08b5f0365e0,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
NAL-ESLC_00a497cd8b369af17a9fec5cd6110f48,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NAL-PSLC_ffdfa3aca04e718fbb1d833dc4fbf540,True,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
NAL-PSLC_ffe776aad3a3cd5ae5237b0d66af4048,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
NAL-PSLC_ffec697fdd32a03692f2dcb88b5fa9ea,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
NAL-PSLC_ffedbc30575e734d3df3003b8a419d48,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


# Fixing database discrepancies

In [35]:
# with exe.open_file_writer("../data/cluster/ani/genome_filepaths.prokaryotic.list") as f:
#     for id_database in genomic_databases:
        
#         for glob.glob("/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases/

# Genome ids present in the traits matrix.
genome_identifiers = set(X_genomic_traits.index)

# Genome id -> database whose organisms.<domain>.list names it. When an id
# appears in several lists, the one read last wins.
genome_to_database = dict()
for id_database in tqdm(genomic_databases):
    for domain in ("prokaryotic", "eukaryotic"):
        # glob on a literal path yields the path itself when the file exists and nothing otherwise
        for filepath in glob.glob(f"/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases/{id_database}/organisms.{domain}.list"):
            with open(filepath, "r") as f:
                # one genome id per line; blank lines are skipped
                for id_genome in filter(None, map(str.strip, f)):
                    genome_to_database[id_genome] = id_database

# Every id found in any organisms list.
query = set(genome_to_database)

# (shared ids, only in the lists, only in the matrix, matrix ids are a proper subset of the list ids)
len(query & genome_identifiers), len(query - genome_identifiers), len(genome_identifiers - query), genome_identifiers < query

# NOTE(review): the two set literals below look like results pasted from the
# recorded run (9 and 52 entries, matching the counts in the cell output) - confirm.
# query - genome_identifiers: IDs listed in the organisms.*.list files but absent from the genomic traits matrix
{
 # Poor quality   
 'GCA_041246275.1',
 # Fixed
 'TARA_CHRYSOPHYTE-H1', 
 'TARA_CHRYSOPHYTE-H2',
 'TARA_MAST-3A',
 'TARA_MAST-3F',
 'TARA_MAST-4A1',
 'TARA_MAST-4A2',
 'TARA_MAST-4C',
 'TARA_MAST-4E'}

# genome_identifiers - query: IDs present in the genomic traits matrix but absent from the organisms.*.list files
genome_identifiers - query
{'Amoce1_1',
 'Aspfu_P4SB_1',
 'Auxpr25_1_1',
 'Bigna1',
 'ChlNC64A_1',
 'Chleu1_1',
 'ChloDOE1412_1_1',
 'Chloso1228_1_1',
 'Chloso1230_1_1',
 'Chloso1602_1_1',
 'Chrpa1_1',
 'Corcom2',
 'Diptot1',
 'Fisso2',
 'Graco1_1',
 'Hetan2',
 'Ldo8726_1',
 'Liparx1',
 'Lipchi1',
 'Lipjap1',
 'Lipmes1',
 'Lipodoor1',
 'Lipokono1',
 'Lipoli1',
 'Lipotetr1',
 'Lst7536_1',
 'Lst7851_1',
 'Lst8064_1',
 'Micco1_1',
 'Nanoce84910_1',
 'Ost9901_3',
 'OstRCC809_2',
 'Penrib_P3SB_1',
 'Phyca11',
 'Praco1_1',
 'Pyrye1_1',
 'Semro1_1',
 'Treb6682_1',
 'Trebou4402_1',
 'Trebou4966_1',
 'Trebou978_1',
 'Ulvmu1_1',
 'Undpi1_1',
 'Zymtr1',
 'chryh1',
 'chryh2',
 'm3a',
 'm3f',
 'm4a1',
 'm4a2',
 'm4c',
 'm4e'}

# d = {
#     'chryh1':'TARA_CHRYSOPHYTE-H1',
#  'chryh2':'TARA_CHRYSOPHYTE-H2',
#  'm3a':'TARA_MAST-3A',
#  'm3f':'TARA_MAST-3F',
#  'm4a1':'TARA_MAST-4A1',
#  'm4a2':'TARA_MAST-4A2',
#  'm4c':'TARA_MAST-4C',
#  'm4e':'TARA_MAST-4E',
# }

100%|██████████| 23/23 [00:03<00:00,  6.36it/s]


(71654, 9, 52, False)

In [81]:
# errors = defaultdict(list)
# working = defaultdict(set)
# for id_database in ["JGI-MycoCosm", "JGI-PhycoCosm"]:
#     file_count = defaultdict(lambda: defaultdict(bool))
    
#     for filepath in tqdm(glob.glob(f"/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases/{id_database}/Genomes/Eukaryotic/*.*.gz"), desc=id_database):
#         if ("orphaned" not in filepath) and ("MODIFIED" not in filepath):
#             fields = filepath.rsplit("/", maxsplit=1)[1].rsplit(".", maxsplit=2)
#             try:
#                 id_genome, ext, gz = fields

#                 if ext in {"fa", "faa", "ffn", "gff"}:
#                     file_count[id_genome][ext] = True

#             except ValueError as e:
#                 print(fields, e)
#                 break
#     for id_genome, file_check in file_count.items():
#         if len(file_check) == 4:
#             working[id_database].add(id_genome)
#         else:
#             errors[id_database].append((id_genome, file_check.keys()))
# for id_database, genomes in working.items():
#     with exe.open_file_writer(f"/home/ec2-user/SageMaker/s3/newatlantis-genomics-db-prod/SourceDatabases/{id_database}/organisms.eukaryotic.list") as f:
#         for id_genome in genomes:
#             print(id_genome, file=f)

JGI-MycoCosm: 100%|██████████| 17849/17849 [00:00<00:00, 245044.82it/s]
JGI-PhycoCosm: 100%|██████████| 1123/1123 [00:00<00:00, 417053.60it/s]


In [None]:
X_genomic_