In [1]:
import os
import glob
import pandas as pd
import numpy as np
import pandas as pd
import pycytominer

# Enter the output directory only when we are not already inside it, so that
# re-running this cell in the same kernel does not try to enter
# ./output/output and fail. All later relative paths resolve against it.
if os.path.basename(os.getcwd()) != 'output':
    os.chdir('./output/')

In [2]:
## Processing Steps (https://github.com/cytodata/single-cell-classifier/blob/9d43dfb15bae9dd66a3e9fce502c6cb878b31d08/2.process-data/scripts/nbconverted/1.process-ebimage-features.py)

# https://pycytominer.readthedocs.io/en/latest/pycytominer.html

# 1. Remove features that have high missingness
#   * Remove features that have a proportion of missing values greater than 1%

# 2. Remove redundant features (high correlation)
#   * Remove features that have correlations with other features greater than 0.95 Pearson correlation
#   * Retain the feature with the lowest correlation in each highly correlated block of features

# 3. Remove low variance features
#   * Remove features with a ratio of second most common value / most common less than 1%
#     * Removes features that have a common and high outlier
#   * Remove features with a ratio of second max count / max count less than 0.1%
#     * Removes features that have a very high number of redundant values

# 4. Apply robust normalization
#   * subtract median and divide by IQR
#   * robust to outliers

In [3]:
# For fingerprint generation, all features can be used (Gustafsdottir et al., 2013),
# or an optional feature selection step may be included in the analysis pipeline to
# exclude features that carry no information (median absolute deviation [MAD] close
# to 0) or are highly redundant (Pearson correlation > 0.9 or > 0.95)
# (Hughes et al., 2020; Warchal et al., 2020).

In [4]:
def blocklist_features(df):
    """Apply the pycytominer ``blocklist`` selection to ``df`` and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles to filter.

    Returns
    -------
    The object returned by ``pycytominer.feature_select`` for this operation.
    """
    selected = pycytominer.feature_select(operation="blocklist", profiles=df)

    # The reported count is the difference in column count between input and output.
    n_before = df.shape[1]
    n_after = selected.shape[1]

    # NOTE(review): the "55" below is a hard-coded literal, not read from the
    # blocklist actually applied -- confirm it matches the blocklist in use.
    print(
        f"A total of {n_before - n_after} features are removed from a total of 55 blocklisted features. "
        f"{n_after} features remaining"
    )
    return selected

def correlate_features(df, corr_threshold=0.99):
    """Apply the pycytominer ``correlation_threshold`` selection and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles to filter.
    corr_threshold : float, optional
        Forwarded to ``pycytominer.feature_select`` as ``corr_threshold``.
        Defaults to 0.99, the value this notebook previously hard-coded.

    Returns
    -------
    The object returned by ``pycytominer.feature_select`` for this operation.
    """
    cleaned_df = pycytominer.feature_select(
        profiles=df,
        operation="correlation_threshold",
        corr_threshold=corr_threshold,
    )
    # The reported count is the difference in column count between input and output.
    num_removed = df.shape[1] - cleaned_df.shape[1]
    print(f"A total of {num_removed} correlated features are removed. {cleaned_df.shape[1]} features remaining")
    return cleaned_df

def remove_outliers(df, outlier_cutoff=100):
    """Apply the pycytominer ``drop_outliers`` selection and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles to filter.
    outlier_cutoff : int or float, optional
        Forwarded to ``pycytominer.feature_select`` as ``outlier_cutoff``.
        Defaults to 100, the value this notebook previously hard-coded.

    Returns
    -------
    The object returned by ``pycytominer.feature_select`` for this operation.
    """
    cleaned_df = pycytominer.feature_select(
        profiles=df,
        operation="drop_outliers",
        outlier_cutoff=outlier_cutoff,
    )
    # The reported count is the difference in column count between input and output.
    num_removed = df.shape[1] - cleaned_df.shape[1]
    print(f"A total of {num_removed} outlier features are removed. {cleaned_df.shape[1]} features remaining")
    return cleaned_df

def filter_variance(df, unique_cut=0.1):
    """Apply the pycytominer ``variance_threshold`` selection and print a summary.

    The selection is run with ``samples="all"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles to filter.
    unique_cut : float, optional
        Forwarded to ``pycytominer.feature_select`` as ``unique_cut``.
        Defaults to 0.1, the value this notebook previously hard-coded.

    Returns
    -------
    The object returned by ``pycytominer.feature_select`` for this operation.
    """
    cleaned_df = pycytominer.feature_select(
        profiles=df,
        operation="variance_threshold",
        samples="all",
        unique_cut=unique_cut,
    )
    # The reported count is the difference in column count between input and
    # output, i.e. the columns this step dropped.
    num_removed = df.shape[1] - cleaned_df.shape[1]
    print(f"A total of {num_removed} invariant features are detected. {cleaned_df.shape[1]} features remaining")
    return cleaned_df

def remove_noisy_features(
    df,
    noise_removal_stdev_cutoff=3,
    noise_removal_perturb_groups="Metadata_cmpdName",
):
    """Apply the pycytominer ``noise_removal`` selection and print a summary.

    The selection is run with ``samples="all"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles to filter.
    noise_removal_stdev_cutoff : int or float, optional
        Forwarded to ``pycytominer.feature_select`` under the same name.
        Defaults to 3, the value this notebook previously hard-coded.
    noise_removal_perturb_groups : optional
        Forwarded to ``pycytominer.feature_select`` under the same name.
        Defaults to ``"Metadata_cmpdName"``, the value this notebook previously
        hard-coded. Presumably a column of ``df`` that identifies the
        perturbation of each row -- verify against the input data.

    Returns
    -------
    The object returned by ``pycytominer.feature_select`` for this operation.
    """
    cleaned_df = pycytominer.feature_select(
        profiles=df,
        operation="noise_removal",
        noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
        samples="all",
        noise_removal_perturb_groups=noise_removal_perturb_groups,
    )
    # The reported count is the difference in column count between input and output.
    num_removed = df.shape[1] - cleaned_df.shape[1]
    print(f"A total of {num_removed} noisy features are removed. {cleaned_df.shape[1]} features remaining")
    return cleaned_df

def drop_na_columns(df, na_cutoff=0):
    """Apply the pycytominer ``drop_na_columns`` selection and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles to filter.
    na_cutoff : float, optional
        Forwarded to ``pycytominer.feature_select`` as ``na_cutoff``.
        Defaults to 0, the value this notebook previously hard-coded.

    Returns
    -------
    The object returned by ``pycytominer.feature_select`` for this operation.
    """
    cleaned_df = pycytominer.feature_select(
        profiles=df,
        na_cutoff=na_cutoff,
        operation="drop_na_columns",
    )
    # The reported count is the difference in column count between input and output.
    num_removed = df.shape[1] - cleaned_df.shape[1]
    print(f"A total of {num_removed} NAN features are removed. {cleaned_df.shape[1]} features remaining")
    return cleaned_df

In [5]:
# Load the input profiles (path is relative to the current working directory)
# and report the starting column count.
df = pd.read_parquet('level5_AllPlates_mad_robustize_DMSO.parquet')
print("A total of", df.shape[1], "features are detected")

# Apply the selection steps in a fixed order; each step prints its own summary
# and its result feeds the next step.
selection_steps = (
    blocklist_features,
    correlate_features,
    remove_outliers,
    filter_variance,
    remove_noisy_features,
    drop_na_columns,
)
for step in selection_steps:
    df = step(df)

A total of 2122 features are detected
A total of 55 features are removed from a total of 55 blocklisted features. 2067 features remaining
A total of 280 correlated features are removed. 1787 features remaining
A total of 276 outlier features are removed. 1511 features remaining
A total of 308 invariant features are detected. 1203 features remaining
A total of 0 noisy features are removed. 1203 features remaining
A total of 9 NAN features are removed. 1194 features remaining


In [7]:
# Write the current `df` to level6.parquet; the relative path resolves against
# the current working directory.
df.to_parquet("level6.parquet")