# Load and inspect integrated data

`cpg0016` comprises three perturbation modalities: CRISPR, ORF, and compound. For each modality, the corresponding plates have been integrated into a single table, giving three tables in total (one per modality).

In [7]:
import polars as pl
import pprint

In [8]:
# Location of each integrated profile table under the `s3://cellpainting-gallery`
# bucket. Every modality is described by a directory and a file name; the full
# path is the two joined with "/".

# ORF profiles
orf_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/orf"
orf_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
orf = "/".join((orf_source_dir, orf_source_file))

# CRISPR profiles
crispr_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/crispr"
crispr_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
crispr = "/".join((crispr_source_dir, crispr_source_file))

# Compound profiles
compound_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/arevalo_2023_e834481/compound"
compound_source_file = "mad_int_featselect_harmony.parquet"
compound = "/".join((compound_source_dir, compound_source_file))

In [9]:
def preview(file, well="K08", n_features=2, max_rows=200, n_samples=20, seed=None):
    """Print a glimpse of a few sampled rows from a profiles parquet file.

    The file is scanned lazily; the well filter, the column selection and the
    row limit are all applied to the lazy query before ``collect()`` is called.

    Parameters
    ----------
    file : str
        Path or URI of the parquet file, passed to ``pl.scan_parquet``.
    well : str, default "K08"
        Value of the ``Metadata_Well`` column to keep.
    n_features : int, default 2
        Number of leading non-metadata columns shown alongside all columns
        whose name starts with ``"Metadata"``.
    max_rows : int, default 200
        Upper bound on the number of matching rows collected before sampling.
    n_samples : int, default 20
        Number of rows to sample from the collected rows. Capped at the
        number of rows actually collected.
    seed : int or None, default None
        Seed forwarded to ``DataFrame.sample``. Pass an integer to get the
        same preview on every run.

    Returns
    -------
    None
        The sampled rows are printed with ``DataFrame.glimpse``.
    """
    # scan_parquet already yields a LazyFrame, so no extra .lazy() call is needed.
    lazy_df = pl.scan_parquet(file)

    # Look the column names up once: reading them from a LazyFrame requires
    # resolving its schema, so the result is reused for both partitions below.
    columns = lazy_df.columns
    metadata_columns = [col for col in columns if col.startswith("Metadata")]
    feature_columns = [col for col in columns if not col.startswith("Metadata")]

    selected_columns = metadata_columns + feature_columns[:n_features]

    matched = (
        lazy_df.filter(pl.col("Metadata_Well") == well)
        .select(selected_columns)
        .limit(max_rows)
        .collect()
    )

    # Sampling without replacement fails when asked for more rows than exist,
    # so never request more than were collected.
    result = matched.sample(n=min(n_samples, matched.height), seed=seed)

    result.glimpse()

In [10]:
# Glimpse a sample of rows from the integrated ORF profiles table.
preview(orf)

Rows: 20
Columns: 22
$ Metadata_Source           <str> 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4', 'source_4'
$ Metadata_Plate            <str> 'BR00121560', 'BR00126395', 'BR00123951', 'BR00123948', 'BR00125626', 'BR00121539', 'BR00126044', 'BR00124773', 'BR00126542', 'BR00123950'
$ Metadata_Well             <str> 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08'
$ Metadata_JCP2022          <str> 'JCP2022_904359', 'JCP2022_910866', 'JCP2022_905062', 'JCP2022_905062', 'JCP2022_907581', 'JCP2022_901564', 'JCP2022_915129', 'JCP2022_909248', 'JCP2022_911001', 'JCP2022_905062'
$ Metadata_broad_sample     <str> 'ccsbBroad304_04654', 'ccsbBroad304_11655', 'ccsbBroad304_05410', 'ccsbBroad304_05410', 'ccsbBroad304_08106', 'ccsbBroad304_01680', 'ccsbBroad304_99988', 'ccsbBroad304_09927', 'ccsbBroad304_11797', 'ccsbBroad304_05410'
$ Metadata_Name             <str> 'ORF001201.1_TRC304.1', 'ORF014714.1_TRC304.1', 'O

In [11]:
# Glimpse a sample of rows from the integrated CRISPR profiles table.
preview(crispr)

Rows: 20
Columns: 10
$ Metadata_Source    <str> 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13', 'source_13'
$ Metadata_Plate     <str> 'CP-CC9-R2-11', 'CP-CC9-R3-15', 'CP-CC9-R3-21', 'CP-CC9-R2-13', 'CP-CC9-R8-01', 'CP-CC9-R1-10', 'CP-CC9-R5-16', 'CP-CC9-R4-19', 'CP-CC9-R7-01', 'CP-CC9-R5-05'
$ Metadata_Well      <str> 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08', 'K08'
$ Metadata_JCP2022   <str> 'JCP2022_801478', 'JCP2022_801595', 'JCP2022_802645', 'JCP2022_803989', 'JCP2022_801135', 'JCP2022_801191', 'JCP2022_804724', 'JCP2022_800303', 'JCP2022_801135', 'JCP2022_802155'
$ Metadata_Batch     <str> '20221009_Run2', '20221017_Run3', '20221017_Run3', '20221009_Run2', '20221109_Run5', '20220914_Run1', '20221109_Run5', '20221024_Run4', '20221109_Run5', '20221109_Run5'
$ Metadata_PlateType <str> 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR'
$ Metadata_Ro

In [12]:
# Glimpse a sample of rows from the integrated compound profiles table.
preview(compound)

Rows: 20
Columns: 12
$ Metadata_Source     <cat> source_2, source_2, source_2, source_2, source_2, source_2, source_2, source_2, source_2, source_2
$ Metadata_Plate      <cat> 1086293447, 1053601947, 1086289655, 1086293386, 1086292846, 1053600742, 1086293522, 1086292112, 1053601923, 1086292488
$ Metadata_Well       <cat> K08, K08, K08, K08, K08, K08, K08, K08, K08, K08
$ Metadata_JCP2022    <cat> JCP2022_017468, JCP2022_079337, DMSO, DMSO, JCP2022_043213, JCP2022_015567, JCP2022_072903, JCP2022_064143, JCP2022_031513, JCP2022_096951
$ Metadata_Batch      <cat> 20210726_Batch_7, 20210607_Batch_2, 20211003_Batch_13, 20210726_Batch_7, 20210808_Batch_4, 20210614_Batch_1, 20210726_Batch_7, 20210823_Batch_10, 20210607_Batch_2, 20210816_Batch_9
$ Metadata_PlateType  <cat> COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND
$ Metadata_PertType   <cat> trt, trt, negcon, negcon, trt, trt, trt, trt, trt, trt
$ Metadata_Row        <cat> K, K, K, K, K,