In [1]:
import pandas as pd

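This notebook counts the number of wells (samples) and features in the ORF and CRISPR profiles at each processing level. The profile files are not stored in this repository; download them into directories outside the repo with the commands below.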

```bash
# Create the target directories outside the repo; -p makes this safe to re-run
# when the directories already exist.
mkdir -p ../../orf_profiles
mkdir -p ../../crispr_profiles
# Copy the ORF and CRISPR profile directories from the S3 bucket into them.
aws s3 sync s3://cellpainting-gallery/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/ORF/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony/ ../../orf_profiles/
aws s3 sync s3://cellpainting-gallery/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/CRISPR/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected/ ../../crispr_profiles/
```

In [2]:
def get_featurecols(df):
    """List the feature columns of a profile table.

    A column counts as a feature when its name does not start with
    "Metadata". The returned list keeps the order of ``df.columns``.
    """
    feature_names = []
    for name in df.columns:
        if name.startswith("Metadata"):
            # Metadata columns describe the sample, not a measured feature.
            continue
        feature_names.append(name)
    return feature_names

In [3]:
# ORF processing levels in order, each paired with the piece of text that the
# level adds to the parquet file stem. A level's file stem is the
# concatenation of all pieces up to and including its own.
_orf_steps = [
    ("well-level", "profiles"),
    ("well-layout-corrected", "_wellpos"),
    ("cell-count-regressed", "_cc"),
    ("Low-variance-features-removed", "_var"),
    ("normalized", "_mad"),
    ("outlier-features-removed", "_outlier"),
    ("feature-selected", "_featselect"),
    ("sphered", "_sphering"),
    ("batch-corrected", "_harmony"),
]

# Maps level label -> parquet file stem (without the ".parquet" extension).
orf_profiles = {
    label: "".join(piece for _, piece in _orf_steps[: position + 1])
    for position, (label, _) in enumerate(_orf_steps)
}

In [4]:
# CRISPR processing levels in order, each paired with the piece of text that
# the level adds to the parquet file stem. A level's file stem is the
# concatenation of all pieces up to and including its own.
_crispr_steps = [
    ("well-level", "profiles"),
    ("well-layout-corrected", "_wellpos"),
    ("cell-count-regressed", "_cc"),
    ("Low-variance-features-removed", "_var"),
    ("normalized", "_mad"),
    ("outlier-features-removed", "_outlier"),
    ("feature-selected", "_featselect"),
    ("sphered", "_sphering"),
    ("batch-corrected", "_harmony"),
    ("PCA-dimension-reduced", "_PCA"),
    ("CRISPR-profiles", "_corrected"),
]

# Maps level label -> parquet file stem (without the ".parquet" extension).
crispr_profiles = {
    label: "".join(piece for _, piece in _crispr_steps[: position + 1])
    for position, (label, _) in enumerate(_crispr_steps)
}

In [5]:
# Build the ORF part of the summary table: one row per processing level with
# the number of feature columns and the number of rows in that level's
# parquet file. Rows are gathered in a list and turned into a DataFrame once,
# instead of concatenating onto an (initially empty) frame on every iteration.
orf_records = []

for level in orf_profiles.keys():
    parquet_file_name = f"{orf_profiles[level]}.parquet"
    df = pd.read_parquet(f"../../orf_profiles/{parquet_file_name}")
    orf_records.append(
        {
            "modality": "ORF",
            "level": level,
            "num_features": len(get_featurecols(df)),
            "num_samples": len(df),
        }
    )

# From ../03.retrieve-annotations/0.0.phenotypic-activity-orf.ipynb, after removing empty wells and low infection efficiency wells, there are 74273 wells.
# These counts are entered by hand rather than computed from a file here.
orf_records.append(
    {
        "modality": "ORF",
        "level": "ORF-profiles",
        "num_features": 722,
        "num_samples": 74273,
    }
)

# Explicit column list keeps the column order fixed even if no rows exist.
table_df = pd.DataFrame(
    orf_records,
    columns=["modality", "level", "num_features", "num_samples"],
)

In [6]:
# Append the CRISPR part of the summary table: one row per processing level
# with the number of feature columns and the number of rows in that level's
# parquet file. Rows are gathered in a list and concatenated once, instead of
# re-concatenating the whole table on every iteration.
crispr_records = []

for level in crispr_profiles.keys():
    parquet_file_name = f"{crispr_profiles[level]}.parquet"
    df = pd.read_parquet(f"../../crispr_profiles/{parquet_file_name}")
    crispr_records.append(
        {
            "modality": "CRISPR",
            "level": level,
            "num_features": len(get_featurecols(df)),
            "num_samples": len(df),
        }
    )

# Drop CRISPR rows left over from a previous execution of this cell so that
# re-running it does not duplicate them; on a fresh run nothing is removed.
table_df = pd.concat(
    [
        table_df[table_df["modality"] != "CRISPR"],
        pd.DataFrame(crispr_records),
    ],
    ignore_index=True,
)

In [7]:
# Render the summary as markdown text (index column omitted) and print it, so
# the raw markdown source is shown rather than a rendered DataFrame.
summary_markdown = table_df.to_markdown(index=False)
print(summary_markdown)

| modality   | level                         |   num_features |   num_samples |
|:-----------|:------------------------------|---------------:|--------------:|
| ORF        | well-level                    |           3673 |         81663 |
| ORF        | well-layout-corrected         |           3673 |         81660 |
| ORF        | cell-count-regressed          |           3674 |         81660 |
| ORF        | Low-variance-features-removed |           3649 |         81660 |
| ORF        | normalized                    |           3649 |         81660 |
| ORF        | outlier-features-removed      |           3636 |         81660 |
| ORF        | feature-selected              |            722 |         81660 |
| ORF        | sphered                       |            722 |         81660 |
| ORF        | batch-corrected               |            722 |         81660 |
| ORF        | ORF-profiles                  |            722 |         74273 |
| CRISPR     | well-level               