In [1]:
import pathlib
import warnings

import pandas as pd
import statsmodels.formula.api as smf

# Silence every warning for the rest of the session; both calls install a
# blanket "ignore" filter.
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")

# `get_ipython` is only defined when running under IPython/Jupyter, so a
# NameError here means this file is being executed as a plain script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

# Import the tqdm flavour that matches the detected environment.
if not in_notebook:
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm
# Locate the Git root: the first directory, starting at the current working
# directory and walking up through its parents, that contains a ".git" folder.
cwd = pathlib.Path.cwd()
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Fail loudly when neither cwd nor any ancestor is a Git checkout.
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

In [2]:
# Metadata column names that appear, in this order, in both profile entries.
_shared_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "image_set",
    "Well",
    "Therapeutic_Categories",
]

# Per-profile configuration: the parquet file to read (relative to the Git
# root) and the metadata column names listed for that profile.
profile_dict = {
    "organoid_fs": {
        "input_profile_path": pathlib.Path(
            root_dir, "5.EDA/results/linear_modeling/organoid_fs.parquet"
        ),
        "metadata_columns": [*_shared_metadata_columns, "single_cell_count"],
    },
    "single_cell_fs": {
        "input_profile_path": pathlib.Path(
            root_dir, "5.EDA/results/linear_modeling/sc_fs.parquet"
        ),
        "metadata_columns": [*_shared_metadata_columns, "parent_organoid"],
    },
}

## Filter significant features
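- p-value threshold: `pvalue < 0.05` - keep only statistically significant features.
- R-squared threshold: `rsquared > 0.5` for single-cell features and `rsquared > 0.4` for organoid features - the model explains more than 50% (single cell) or 40% (organoid) of the total variance.
- Adjusted R-squared threshold: `rsquared_adj > 0` - the model performs better than the mean.
- Coefficient threshold: `|coefficient| > 1` - keep only features with an absolute effect size greater than 1.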


### Single Cell

In [3]:
# Read the single-cell linear-modeling results and report (rows, columns).
single_cell_profile_path = profile_dict["single_cell_fs"]["input_profile_path"]
df = pd.read_parquet(single_cell_profile_path)
print(df.shape)

(165584, 15)


In [4]:
# Cutoffs applied to the single-cell linear-model results.

# Effect size: keep rows whose absolute coefficient exceeds this value.
coefficient_threshold_min = 1
# Adjusted R-squared must be positive (model performs better than the null model).
rsquared_adj_threshold_min = 0
# R-squared: the model must explain more than 50% of the variance.
rsquared_threshold_min = 0.5
# Significance: keep rows whose p-value is below this value.
pvalue_threshold_max = 0.05

In [5]:
# Build one boolean mask per cutoff, then keep the rows that pass all of them.
passes_pvalue = df["pvalue"] < pvalue_threshold_max
passes_rsquared = df["rsquared"] > rsquared_threshold_min
passes_rsquared_adj = df["rsquared_adj"] > rsquared_adj_threshold_min
passes_coefficient = df["coefficient"].abs() > coefficient_threshold_min

keep_rows = (
    passes_pvalue & passes_rsquared & passes_rsquared_adj & passes_coefficient
)
# Copy so the filtered frame is independent of the full results frame.
df_filtered = df.loc[keep_rows].copy()
print(df_filtered.shape)
df_filtered.head()

(35, 15)


Unnamed: 0,patient,treatment,feature,rsquared,rsquared_adj,fvalue,pvalue,coefficient,intercept,Feature_type,Compartment,Channel,Measurement,Extra_info,pvalue_fdr
51127,NF0018,Copanlisib,Colocalization_Cell_DNAMito_MEANK2,0.504153,0.50071,146.412102,1.0786620000000001e-23,59.369253,-1.05211e-15,Colocalization,Cell,DNAMito,MEANK2,,
51128,NF0018,Copanlisib,Colocalization_Cell_DNAMito_MEDIANK2,0.504153,0.50071,146.412102,1.0786620000000001e-23,59.369253,-1.05211e-15,Colocalization,Cell,DNAMito,MEDIANK2,,
51129,NF0018,Copanlisib,Colocalization_Cell_DNAMito_MINK2,0.504153,0.50071,146.412102,1.0786620000000001e-23,59.369253,-1.05211e-15,Colocalization,Cell,DNAMito,MINK2,,
51130,NF0018,Copanlisib,Colocalization_Cell_DNAMito_MAXK2,0.504153,0.50071,146.412102,1.0786620000000001e-23,59.369253,-1.05211e-15,Colocalization,Cell,DNAMito,MAXK2,,
51174,NF0018,Copanlisib,Intensity_Cell_AGP_CMIX,0.569113,0.566121,190.19458,4.1335560000000005e-28,80.03613,2.706169e-16,Intensity,Cell,AGP,CMIX,,


In [6]:
# Distinct treatments remaining after filtering.
pd.unique(df_filtered["treatment"])

array(['Copanlisib', 'Selumetinib'], dtype=object)

In [7]:
# Distinct patients remaining after filtering.
pd.unique(df_filtered["patient"])

array(['NF0018', 'NF0030'], dtype=object)

In [8]:
# Distinct features remaining after filtering.
pd.unique(df_filtered["feature"])

array(['Colocalization_Cell_DNAMito_MEANK2',
       'Colocalization_Cell_DNAMito_MEDIANK2',
       'Colocalization_Cell_DNAMito_MINK2',
       'Colocalization_Cell_DNAMito_MAXK2', 'Intensity_Cell_AGP_CMIX',
       'Intensity_Cell_AGP_CMIY', 'Intensity_Cell_AGP_CMIZ',
       'Granularity_Cell_AGP_GRANULARITY1',
       'Granularity_Cell_AGP_GRANULARITY2',
       'Colocalization_Cytoplasm_DNAMito_MEANK2',
       'Colocalization_Cytoplasm_DNAMito_MEDIANK2',
       'Colocalization_Cytoplasm_DNAMito_MINK2',
       'Colocalization_Cytoplasm_DNAMito_MAXK2',
       'Intensity_Cytoplasm_AGP_CMIX',
       'Granularity_Cytoplasm_AGP_GRANULARITY1',
       'Colocalization_Nuclei_DNAMito_MEANK2',
       'Colocalization_Nuclei_DNAMito_MEDIANK2',
       'Colocalization_Nuclei_DNAMito_MINK2',
       'Colocalization_Nuclei_DNAMito_MAXK2',
       'Colocalization_Cell_AGPMito_MEANK2',
       'Colocalization_Cell_AGPMito_MEDIANK2',
       'Colocalization_Cell_AGPMito_MINK2',
       'Colocalization_Cell_AGPM

### Organoid

In [9]:
# Read the organoid linear-modeling results and report (rows, columns).
# Note: this rebinds `df`, replacing the single-cell results loaded earlier.
organoid_profile_path = profile_dict["organoid_fs"]["input_profile_path"]
df = pd.read_parquet(organoid_profile_path)
print(df.shape)

(57024, 15)


In [10]:
# Cutoffs applied to the organoid linear-model results.

# Effect size: keep rows whose absolute coefficient exceeds this value.
coefficient_threshold_min = 1
# Adjusted R-squared must be positive.
rsquared_adj_threshold_min = 0
# R-squared: 0.4 here, lower than the 0.5 used for the single-cell results.
rsquared_threshold_min = 0.4
# Significance: keep rows whose p-value is below this value.
pvalue_threshold_max = 0.05

In [11]:
# Build one boolean mask per cutoff, then keep the rows that pass all of them.
passes_pvalue = df["pvalue"] < pvalue_threshold_max
passes_rsquared = df["rsquared"] > rsquared_threshold_min
passes_rsquared_adj = df["rsquared_adj"] > rsquared_adj_threshold_min
passes_coefficient = df["coefficient"].abs() > coefficient_threshold_min

keep_rows = (
    passes_pvalue & passes_rsquared & passes_rsquared_adj & passes_coefficient
)
# Copy so the filtered frame is independent of the full results frame.
df_filtered = df.loc[keep_rows].copy()
print(df_filtered.shape)
df_filtered.head()

(390, 15)


Unnamed: 0,patient,treatment,feature,rsquared,rsquared_adj,fvalue,pvalue,coefficient,intercept,Feature_type,Compartment,Channel,Measurement,Extra_info,pvalue_fdr
283,NF0014,Mirdametinib,Intensity_Organoid_ER_STDINTENSITY,0.447425,0.4234,18.623301,0.000256,-1.745595,1.536424e-16,Intensity,Organoid,ER,STDINTENSITY,,
284,NF0014,Mirdametinib,Intensity_Organoid_ER_STDINTENSITYEDGE,0.402125,0.37613,15.469554,0.000664,-1.665961,2.317867e-17,Intensity,Organoid,ER,STDINTENSITYEDGE,,
434,NF0014,Fimepinostat,AreaSizeShape_Organoid_CENTERZ,0.427737,0.397618,14.201498,0.0013,-16.857658,22.80302,AreaSizeShape,Organoid,,CENTERZ,,
435,NF0014,Fimepinostat,AreaSizeShape_Organoid_VOLUME,0.456506,0.427901,15.958957,0.000775,-2.108811,5.228555e-16,AreaSizeShape,Organoid,,VOLUME,,
438,NF0014,Fimepinostat,AreaSizeShape_Organoid_EQUIVALENTDIAMETER,0.62819,0.608621,32.101402,1.8e-05,-3.092539,6.538228000000001e-17,AreaSizeShape,Organoid,,EQUIVALENTDIAMETER,,


In [12]:
# Distinct treatments remaining after filtering.
pd.unique(df_filtered["treatment"])

array(['Mirdametinib', 'Fimepinostat', 'Staurosporine', 'Binimetinib',
       'Copanlisib', 'Cabozantinib', 'Onalespib', 'Rapamycin', 'Digoxin',
       'Ketotifen', 'Trametinib', 'Linsitinib', 'Imatinib', 'Everolimus',
       'Selumetinib'], dtype=object)

In [13]:
# Distinct patients remaining after filtering.
pd.unique(df_filtered["patient"])

array(['NF0014', 'NF0016', 'NF0018', 'NF0021', 'NF0030', 'SARCO219'],
      dtype=object)

In [14]:
# Distinct features remaining after filtering.
pd.unique(df_filtered["feature"])

array(['Intensity_Organoid_ER_STDINTENSITY',
       'Intensity_Organoid_ER_STDINTENSITYEDGE',
       'AreaSizeShape_Organoid_CENTERZ', 'AreaSizeShape_Organoid_VOLUME',
       'AreaSizeShape_Organoid_EQUIVALENTDIAMETER',
       'Colocalization_Organoid_AGPER_MEANMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_AGPER_MEDIANMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_AGPER_MINMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_AGPER_MAXMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_DNAAGP_MEANCORRELATIONCOEFF',
       'Colocalization_Organoid_DNAAGP_MEDIANCORRELATIONCOEFF',
       'Colocalization_Organoid_DNAAGP_MINCORRELATIONCOEFF',
       'Colocalization_Organoid_DNAAGP_MAXCORRELATIONCOEFF',
       'Colocalization_Organoid_DNAAGP_MEANMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_DNAAGP_MEDIANMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_DNAAGP_MINMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_DNAAGP_MAXMANDERSCOEFFCOSTESM1',
       '