# Load and inspect integrated data

`cpg0016` comprises three modalities: CRISPR, ORF, and compound perturbations. The plates belonging to each modality have been integrated into a single table, giving one table per modality.

In [25]:
import polars as pl
import pprint

In [26]:
# Each modality's integrated profiles live in one parquet file; the full
# location is the source directory and the file name joined with "/".

# ORF profiles
orf_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/orf"
orf_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
orf = "/".join((orf_source_dir, orf_source_file))

# CRISPR profiles
crispr_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/chandrasekaran_2024_0000000/crispr"
crispr_source_file = "wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
crispr = "/".join((crispr_source_dir, crispr_source_file))

# Compound profiles
compound_source_dir = "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles/arevalo_2023_e834481/compound"
compound_source_file = "mad_int_featselect_harmony.parquet"
compound = "/".join((compound_source_dir, compound_source_file))

In [27]:
def preview(file):
    """Print a glimpse of a few random rows from well K08 of a parquet table.

    The table is scanned lazily, filtered to rows whose ``Metadata_Well``
    equals ``"K08"``, and reduced to every column whose name starts with
    ``"Metadata"`` plus the first two remaining columns. At most 200 matching
    rows are collected, and up to five of them are sampled at random and
    shown with ``glimpse()``.

    Parameters
    ----------
    file : str
        Location of the parquet file, passed straight to ``pl.scan_parquet``.

    Returns
    -------
    None
    """
    # scan_parquet already returns a LazyFrame, so no extra .lazy() is needed.
    lazy_df = pl.scan_parquet(file)

    # Resolve the column names once instead of once per list below; each
    # resolution has to read the file's schema. Prefer collect_schema() where
    # the installed polars provides it and fall back to .columns otherwise.
    if hasattr(lazy_df, "collect_schema"):
        columns = lazy_df.collect_schema().names()
    else:
        columns = lazy_df.columns

    metadata_columns = [col for col in columns if col.startswith("Metadata")]
    feature_columns = [col for col in columns if not col.startswith("Metadata")]

    # Keep all metadata columns but only the first two feature columns.
    selected_columns = metadata_columns + feature_columns[:2]

    candidates = (
        lazy_df.filter(pl.col("Metadata_Well") == "K08")
        .select(selected_columns)
        .limit(200)
        .collect()
    )

    # Sampling without replacement raises when n exceeds the number of rows,
    # so cap n at the number of rows that actually matched.
    result = candidates.sample(n=min(5, candidates.height))

    result.glimpse()

In [28]:
# Preview the ORF table.
preview(orf)

Rows: 5
Columns: 22
$ Metadata_Source           <str> 'source_4', 'source_4', 'source_4', 'source_4', 'source_4'
$ Metadata_Plate            <str> 'BR00126053', 'BR00123516', 'BR00121553', 'BR00126542', 'BR00126044'
$ Metadata_Well             <str> 'K08', 'K08', 'K08', 'K08', 'K08'
$ Metadata_JCP2022          <str> 'JCP2022_910990', 'JCP2022_904910', 'JCP2022_905618', 'JCP2022_911001', 'JCP2022_915129'
$ Metadata_broad_sample     <str> 'ccsbBroad304_11785', 'ccsbBroad304_05251', 'ccsbBroad304_06010', 'ccsbBroad304_11797', 'ccsbBroad304_99988'
$ Metadata_Name             <str> 'ORF005171.1_TRC304.1', 'ORF003011.1_TRC304.1', 'ORF016170.1_TRC304.1', 'ORF017833.1_TRC304.1', 'ORFC00002.1_TRC304.1'
$ Metadata_Vector           <str> 'pLX_304', 'pLX_304', 'pLX_304', 'pLX_304', 'pLX_304'
$ Metadata_Transcript       <str> 'NM_015473.4', 'NM_174928.2', 'NM_001286.4', 'NM_015527.4', 'HcRed.1'
$ Metadata_Symbol           <str> 'HEATR5A', 'EEF1AKMT1', 'CLCN6', 'TBC1D10B', 'HcRed'
$ Metadata_NCBI_Ge

In [29]:
# Preview the CRISPR table.
preview(crispr)

Rows: 5
Columns: 10
$ Metadata_Source    <str> 'source_13', 'source_13', 'source_13', 'source_13', 'source_13'
$ Metadata_Plate     <str> 'CP-CC9-R2-18', 'CP-CC9-R2-27', 'CP-CC9-R1-25', 'CP-CC9-R5-15', 'CP-CC9-R2-17'
$ Metadata_Well      <str> 'K08', 'K08', 'K08', 'K08', 'K08'
$ Metadata_JCP2022   <str> 'JCP2022_800437', 'JCP2022_800001', 'JCP2022_807471', 'JCP2022_801595', 'JCP2022_802523'
$ Metadata_Batch     <str> '20221009_Run2', '20221009_Run2', '20220914_Run1', '20221109_Run5', '20221009_Run2'
$ Metadata_PlateType <str> 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR', 'CRISPR'
$ Metadata_Row       <str> 'K', 'K', 'K', 'K', 'K'
$ Metadata_Column    <str> '08', '08', '08', '08', '08'
$ X_harmony_0000     <f32> -9.455662727355957, -21.31245231628418, 5.230257987976074, -19.21879005432129, -8.955716133117676
$ X_harmony_0001     <f32> -6.56337833404541, 3.395615339279175, -23.338926315307617, 7.264920234680176, 6.719682216644287



In [30]:
# Preview the compound table.
preview(compound)

Rows: 5
Columns: 12
$ Metadata_Source     <cat> source_2, source_2, source_2, source_2, source_2
$ Metadata_Plate      <cat> 1053601756, 1086292327, 1053599688, 1086289693, 1086290347
$ Metadata_Well       <cat> K08, K08, K08, K08, K08
$ Metadata_JCP2022    <cat> JCP2022_110802, JCP2022_109156, JCP2022_103545, JCP2022_084754, JCP2022_019390
$ Metadata_Batch      <cat> 20210607_Batch_2, 20210816_Batch_9, 20210621_Batch_3, 20211003_Batch_13, 20210920_Batch_12
$ Metadata_PlateType  <cat> COMPOUND, COMPOUND, COMPOUND, COMPOUND, COMPOUND
$ Metadata_PertType   <cat> trt, trt, trt, trt, trt
$ Metadata_Row        <cat> K, K, K, K, K
$ Metadata_Column     <cat> 08, 08, 08, 08, 08
$ Metadata_Microscope <cat> CV8000, CV8000, CV8000, CV8000, CV8000
$ harmony_0           <f32> 0.3439345359802246, 0.2985861599445343, -0.6214839220046997, 0.39734596014022827, -1.2525172233581543
$ harmony_1           <f32> -0.8251946568489075, -1.2919988632202148, 0.4527185559272766, 0.12860026955604553, 0.5053775906