# Load and inspect integrated data

`cpg0016` comprises three perturbation modalities: CRISPR, ORF, and compound. The plates for each modality have been integrated into a single table per modality, stored as one parquet file each.

In [19]:
import polars as pl
import pprint

In [20]:
# Location of each modality's profile table in the cellpainting-gallery S3
# bucket. Every full URI is "<source directory>/<parquet file name>".

# ORF
orf_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/orf"
orf_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
orf = "/".join((orf_source_dir, orf_source_file))

# CRISPR (same file name as ORF, different directory)
crispr_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/crispr"
crispr_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
crispr = "/".join((crispr_source_dir, crispr_source_file))

# Compound (lives under a different profiles sub-directory and file name)
compound_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/arevalo_2023_e834481/compound"
compound_source_file = "mad_int_featselect_harmony.parquet"
compound = "/".join((compound_source_dir, compound_source_file))

In [21]:
def preview(file, well="K08", n=5, seed=0):
    """Print a glimpse of a few rows of a profile parquet file.

    Lazily scans ``file``, keeps the rows whose ``Metadata_Well`` equals
    ``well``, selects every column whose name starts with ``"Metadata"`` plus
    the first two remaining columns, collects at most 200 such rows, and
    prints a glimpse of a random sample of them.

    Parameters
    ----------
    file : str
        Path or URI of the parquet file to scan.
    well : str, default "K08"
        Value of ``Metadata_Well`` to filter on.
    n : int, default 5
        Maximum number of rows to sample. If fewer rows match the filter,
        all matching rows are shown instead of raising.
    seed : int or None, default 0
        Seed passed to the sampler so that re-running the notebook shows the
        same rows. Pass ``None`` for a different sample on every call.

    Returns
    -------
    None
        The glimpse is printed; nothing is returned.
    """
    # scan_parquet already returns a lazy frame, so no extra .lazy() is needed.
    lazy_df = pl.scan_parquet(file)

    # Resolve the column names once; each access may re-read the remote schema.
    columns = lazy_df.columns

    metadata_columns = [col for col in columns if col.startswith("Metadata")]
    feature_columns = [col for col in columns if not col.startswith("Metadata")]

    # All metadata plus just two feature columns keeps the preview narrow.
    selected_columns = metadata_columns + feature_columns[:2]

    matched = (
        lazy_df.filter(pl.col("Metadata_Well") == well)
        .select(selected_columns)
        .limit(200)  # cap how many matching rows are materialized
        .collect()
    )

    # Sampling more rows than exist (without replacement) raises, so clamp n.
    result = matched.sample(n=min(n, matched.height), seed=seed)

    result.glimpse()

In [22]:
# Glimpse a few well-K08 rows of the ORF table.
preview(orf)

Rows: 20
Columns: 22
$ Metadata_Source           <str> 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4'
$ Metadata_Plate            <str> 'BR00123506', 'BR00126545', 'BR00123512', 'BR00123947', 'BR00123621', 'BR00126539', 'BR00124794', 'BR00126547', 'BR00126393', 'BR00123535'
$ Metadata_Well             <str> 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08'
$ Metadata_JCP2022          <str> 'JCP2022_902207', 'JCP2022_914354', 'JCP2022_904910', 'JCP2022_905062', 'JCP2022_911203', 'JCP2022_911001', 'JCP2022_909408', 'JCP2022_914354', 'JCP2022_913483', 'JCP2022_902865'
$ Metadata_broad_sample     <str> 'ccsbBroad304_02356', 'ccsbBroad304_15350', 'ccsbBroad304_05251', 'ccsbBroad304_05410', 'ccsbBroad304_12016', 'ccsbBroad304_11797', 'ccsbBroad304_10107', 'ccsbBroad304_15350', 'ccsbBroad304_14446', 'ccsbBroad304_03052'
$ Metadata_Name             <str> 'ORF015971.1_TRC304.1', 'ORF014427.1_TRC304.1', 'O

In [23]:
# Glimpse a few well-K08 rows of the CRISPR table.
preview(crispr)

Rows: 20
Columns: 10
$ Metadata_Source    <str> 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13'
$ Metadata_Plate     <str> 'CP-CC9-R1-14', 'CP-CC9-R1-03', 'CP-CC9-R5-08', 'CP-CC9-R5-04', 'CP-CC9-R4-15', 'CP-CC9-R1-19', 'CP-CC9-R3-08', 'CP-CC9-R5-15', 'CP-CC9-R2-17', 'CP-CC9-R3-03'
$ Metadata_Well      <str> 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08'
$ Metadata_JCP2022   <str> 'JCP2022_805264', 'JCP2022_805308', 'JCP2022_806802', 'JCP2022_800107', 'JCP2022_801595', 'JCP2022_800303', 'JCP2022_806802', 'JCP2022_801595', 'JCP2022_802523', 'JCP2022_805308'
$ Metadata_Batch     <str> '20220914_Run1', '20220914_Run1', '20221109_Run5', '20221109_Run5', '20221024_Run4', '20220914_Run1', '20221017_Run3', '20221109_Run5', '20221009_Run2', '20221017_Run3'
$ Metadata_PlateType <str> 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR'
$ Metadata_Ro

In [24]:
# Glimpse a few well-K08 rows of the compound table.
preview(compound)

Rows: 20
Columns: 12
$ Metadata_Source     <cat> source_2, source_2, source_2, source_2, source_2, source_2, source_2, source_2, source_2, source_2
$ Metadata_Plate      <cat> 1086293447, 1086292754, 1086293812, 1053600834, 1086292891, 1086289785, 1053601848, 1086292464, 1053600766, 1053598001
$ Metadata_Well       <cat> K08, K08, K08, K08, K08, K08, K08, K08, K08, K08
$ Metadata_JCP2022    <cat> JCP2022_017468, JCP2022_063632, JCP2022_100570, JCP2022_098838, JCP2022_018721, JCP2022_111119, JCP2022_060243, JCP2022_064143, JCP2022_020865, JCP2022_044720
$ Metadata_Batch      <cat> 20210726_Batch_7, 20210808_Batch_4, 20210719_Batch_6, 20210614_Batch_1, 20210808_Batch_4, 20211003_Batch_13, 20210607_Batch_2, 20210816_Batch_9, 20210614_Batch_1, 20210712_Batch_5
$ Metadata_PlateType  <cat> COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND
$ Metadata_PertType   <cat> trt, trt, trt, trt, trt, trt, trt, trt, trt, trt
$ Metadata_Row        <cat> K