# Load and inspect integrated data

`cpg0016` comprises three modalities: CRISPR, ORF, and compound perturbations. For each modality, the corresponding plates have been integrated into a single table (one Parquet file per modality, loaded below).

In [13]:
import polars as pl
import pprint

In [14]:
# Locations of the integrated profile tables in the `cellpainting-gallery`
# S3 bucket. For every modality the full URI is "<source_dir>/<source_file>".

# --- ORF profiles ---
orf_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/orf"
orf_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
orf = "/".join((orf_source_dir, orf_source_file))

# --- CRISPR profiles ---
crispr_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/crispr"
crispr_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
crispr = "/".join((crispr_source_dir, crispr_source_file))

# --- Compound profiles (different profile set and file name than above) ---
compound_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/arevalo_2023_e834481/compound"
compound_source_file = "mad_int_featselect_harmony.parquet"
compound = "/".join((compound_source_dir, compound_source_file))

In [15]:
def preview(file, *, well="K08", n_features=2, max_rows=200, n_sample=20):
    """Print a glimpse of a few rows from one well of a profiles parquet file.

    The file is scanned lazily, so only the selected columns and the rows
    that survive the filter are materialised.

    Args:
        file: Path or URI of the parquet file, passed to ``pl.scan_parquet``.
        well: Value of ``Metadata_Well`` to keep.
        n_features: Number of non-metadata columns (taken in file order) to
            show alongside all ``Metadata*`` columns.
        max_rows: Upper bound on the number of filtered rows collected.
        n_sample: Maximum number of collected rows to sample for display.

    Returns:
        None. The sampled rows are printed with ``DataFrame.glimpse()``.
    """
    lazy_df = pl.scan_parquet(file)

    # Resolve the schema once. Newer polars versions expose
    # `collect_schema()` and warn on `LazyFrame.columns`; older versions
    # only have `.columns`.
    if hasattr(lazy_df, "collect_schema"):
        column_names = lazy_df.collect_schema().names()
    else:
        column_names = lazy_df.columns

    metadata_columns = [c for c in column_names if c.startswith("Metadata")]
    feature_columns = [c for c in column_names if not c.startswith("Metadata")]
    selected_columns = metadata_columns + feature_columns[:n_features]

    rows = (
        lazy_df.filter(pl.col("Metadata_Well") == well)
        .select(selected_columns)
        .limit(max_rows)
        .collect()
    )

    # Sampling without replacement raises if more rows are requested than
    # exist, so cap the sample size at the number of rows collected.
    rows.sample(n=min(n_sample, rows.height)).glimpse()

In [16]:
# Preview the ORF table: all Metadata* columns plus the first two feature
# columns, for rows in well K08.
# NOTE(review): the recorded output below looks like a schema dump rather than
# `glimpse()` output — confirm this cell was re-run after `preview` changed.
preview(orf)

OrderedDict([('Metadata_Source', String),
             ('Metadata_Plate', String),
             ('Metadata_Well', String),
             ('Metadata_JCP2022', String),
             ('Metadata_broad_sample', String),
             ('Metadata_Name', String),
             ('Metadata_Vector', String),
             ('Metadata_Transcript', String),
             ('Metadata_Symbol', String),
             ('Metadata_NCBI_Gene_ID', String),
             ('Metadata_Taxon_ID', String),
             ('Metadata_Gene_Description', String),
             ('Metadata_Prot_Match', String),
             ('Metadata_Insert_Length', String),
             ('Metadata_pert_type', String),
             ('Metadata_Batch', String),
             ('Metadata_PlateType', String),
             ('Metadata_Row', String),
             ('Metadata_Column', String),
             ('X_harmony_0000', Float32),
             ('X_harmony_0001', Float32),
             ('X_harmony_0002', Float32),
             ('X_harmony_0003', Float32

In [17]:
# Preview the CRISPR table: all Metadata* columns plus the first two feature
# columns, for rows in well K08.
# NOTE(review): the recorded output below looks like a schema dump rather than
# `glimpse()` output — confirm this cell was re-run after `preview` changed.
preview(crispr)

OrderedDict([('Metadata_Source', String),
             ('Metadata_Plate', String),
             ('Metadata_Well', String),
             ('Metadata_JCP2022', String),
             ('Metadata_Batch', String),
             ('Metadata_PlateType', String),
             ('Metadata_Row', String),
             ('Metadata_Column', String),
             ('X_harmony_0000', Float32),
             ('X_harmony_0001', Float32),
             ('X_harmony_0002', Float32),
             ('X_harmony_0003', Float32),
             ('X_harmony_0004', Float32),
             ('X_harmony_0005', Float32),
             ('X_harmony_0006', Float32),
             ('X_harmony_0007', Float32),
             ('X_harmony_0008', Float32),
             ('X_harmony_0009', Float32),
             ('X_harmony_0010', Float32),
             ('X_harmony_0011', Float32),
             ('X_harmony_0012', Float32),
             ('X_harmony_0013', Float32),
             ('X_harmony_0014', Float32),
             ('X_harmony_0015', Float

In [18]:
# Preview the compound table: all Metadata* columns plus the first two feature
# columns, for rows in well K08.
# NOTE(review): the recorded output below looks like a schema dump rather than
# `glimpse()` output — confirm this cell was re-run after `preview` changed.
preview(compound)

OrderedDict([('harmony_0', Float32),
             ('harmony_1', Float32),
             ('harmony_2', Float32),
             ('harmony_3', Float32),
             ('harmony_4', Float32),
             ('harmony_5', Float32),
             ('harmony_6', Float32),
             ('harmony_7', Float32),
             ('harmony_8', Float32),
             ('harmony_9', Float32),
             ('harmony_10', Float32),
             ('harmony_11', Float32),
             ('harmony_12', Float32),
             ('harmony_13', Float32),
             ('harmony_14', Float32),
             ('harmony_15', Float32),
             ('harmony_16', Float32),
             ('harmony_17', Float32),
             ('harmony_18', Float32),
             ('harmony_19', Float32),
             ('harmony_20', Float32),
             ('harmony_21', Float32),
             ('harmony_22', Float32),
             ('harmony_23', Float32),
             ('harmony_24', Float32),
             ('harmony_25', Float32),
             ('harmony