# Process CNN embeddings from Ray/Bram

In [2]:
import polars as pl
import h5py
import os
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa

In [6]:
# Paths
# NOTE(review): these are absolute paths on a single machine; adjust before running elsewhere.
cpg0037_path = "/Users/jewald/Desktop/cpg0037"
sc_path = "/Users/jewald/Desktop/cpg0037/axiom_merged_sc.parquet"
well_input_path = "/Users/jewald/Desktop/cpg0037/well_means_medians.h5"
well_output_path = "/Users/jewald/Desktop/cpg0037/axiom_merged_well.parquet"
meta_path = "/Users/jewald/repos/2024_09_09_Axiom_OASIS/1_snakemake/inputs/metadata/metadata.parquet"

# Per-plate inputs: every directory entry whose name contains "plate".
# os.listdir returns entries in arbitrary order, so sort to make the
# processing order (and the row order of the output file) reproducible.
plates = sorted(i for i in os.listdir(cpg0037_path) if "plate" in i)

# Feature columns are named f_001 ... f_672.
N_FEATURES = 672
feature_names = [f"f_{str(i).zfill(3)}" for i in range(1, N_FEATURES + 1)]

# Column names applied to the raw per-plate array (index column first).
column_names = ["Metadata_Index"] + feature_names

# Column names of the parquet output (plate and well labels, then features).
final_names = ["Metadata_Plate", "Metadata_Well"] + feature_names

# Arrow schema for the output: two string columns followed by float32 features.
data_types = [pa.string(), pa.string()] + [pa.float32()] * N_FEATURES
schema = pa.schema(list(zip(final_names, data_types)))

In [28]:
# Implement writing in chunks
# Each plate file is read, joined with its well labels, and appended to a single
# parquet file as one table, so only one plate is held in memory at a time.
# The writer is used as a context manager so that it is closed exactly once,
# after the last plate (or when an error interrupts the loop). Closing it inside
# the loop would leave it unusable for every plate after the first.
with pq.ParquetWriter(sc_path, schema, compression='gzip') as writer:
    for plate in tqdm(plates):
        # File name -> plate label: strip the "plate_" prefix and ".h5" suffix.
        plate_nm = plate.replace("plate_", "").replace(".h5", "")
        plate_path = f"{cpg0037_path}/{plate}"

        # Pull everything needed into memory, then release the HDF5 handle.
        with h5py.File(plate_path, 'r') as h5_file:
            meta = [i.decode() for i in h5_file["meta/Metadata_Well"][:]]
            data = h5_file["deepprofiler"][:]

        # One row per entry of meta/Metadata_Well, keyed by its position.
        meta_df = pl.DataFrame({
            "Metadata_Plate": plate_nm,
            "Metadata_Well": meta,
            "Metadata_Index": range(len(meta)),
        })

        # NOTE(review): assumes the first column of "deepprofiler" holds the row
        # position of the matching meta/Metadata_Well entry -- TODO confirm.
        data_df = pl.DataFrame(data, schema=column_names).with_columns(
            pl.col("Metadata_Index").cast(pl.Int64).alias("Metadata_Index"),
        )
        data_df = meta_df.join(data_df, on="Metadata_Index").drop("Metadata_Index").to_pandas()

        # write to parquet
        table = pa.Table.from_pandas(data_df, preserve_index=False)
        writer.write_table(table)

100%|██████████| 68/68 [47:32<00:00, 41.96s/it]


In [34]:
# Create well-aggregated profiles
# Lazily scan the single-cell parquet file and take the per-well median of
# every column other than the plate/well keys.
# NOTE(review): well_df is not written out in the visible cells, and
# well_output_path is defined but unused -- confirm whether a write step is missing.
well_keys = ["Metadata_Plate", "Metadata_Well"]
well_df = (
    pl.scan_parquet(sc_path)
    .group_by(well_keys)
    .agg(pl.exclude(well_keys).median())
    .collect()
)

: 

In [8]:
# Annotate well-level data
# Read the well-level median embeddings and their plate/well labels from the
# HDF5 file, then place the labels next to the feature columns.
# Intermediate names are distinct from `plates` (the plate file list defined
# in the paths cell) so that re-running the chunk-writing cell after this one
# still iterates over the plate files.
with h5py.File(well_input_path, 'r') as h5_file:
    well_names = [i.decode() for i in h5_file["meta/Metadata_Well"][:]]
    well_plates = [i.decode() for i in h5_file["meta/Metadata_Plate"][:]]
    well_features = h5_file["deepprofiler_median"][:]

well_meta_df = pl.DataFrame({
    "Metadata_Plate": well_plates,
    "Metadata_Well": well_names,
})

# Feature columns are named f_001 ... f_672.
well_feature_df = pl.DataFrame(
    well_features,
    schema=[f"f_{str(i).zfill(3)}" for i in range(1, 673)],
)

# NOTE(review): the horizontal concat pairs rows by position, so this assumes
# the meta datasets and "deepprofiler_median" share the same row order -- TODO confirm.
data_df = pl.concat([well_meta_df, well_feature_df], how="horizontal")

In [9]:
# Attach the experiment metadata to the well-level embeddings and save the result.
join_keys = ["Metadata_Plate", "Metadata_Well"]
raw_profile_path = "/Users/jewald/repos/2024_09_09_Axiom_OASIS/1_snakemake/inputs/profiles/cpcnn/raw.parquet"

meta = pl.read_parquet(meta_path)

# Inner join: only plate/well pairs present in both tables are kept.
merge_df = meta.join(data_df, on=join_keys, how="inner")
merge_df.write_parquet(raw_profile_path)