In [8]:
from nbdev import *

In [9]:
#default_exp load

# Load

> Loaders and column-tidying helpers for EhV IDEAS feature exports (FCS files) and their population labels.

In [10]:
#hide
from nbdev.showdoc import *

In [11]:
# %load common.py
# export

import pandas
import os
import numpy
import seaborn
import logging

In [12]:
#export
import fcsparser
from pathlib import Path
import dask.dataframe as dd
from dask.delayed import delayed

In [13]:
logging.basicConfig(level=logging.INFO)

## Load EhV IDEAS exports

In [14]:
# export

def load_raw_ideas_tree(tree_path, load_labels=False):
    """Load every ``focused.fcs`` export in a ``<timepoint>/<replicate>`` directory tree.

    Each sub-directory of ``tree_path`` is treated as a timepoint and each of its
    entries as a replicate. Replicate entries without a ``focused.fcs`` file are
    skipped. Directories are visited in sorted order so the row order of the
    result is reproducible.

    Parameters
    ----------
    tree_path : str or path-like
        Root directory of the export tree.
    load_labels : bool, default False
        When true, add a ``meta_label`` column initialised to ``"unknown"``. For
        every ``.txt`` file in the replicate directory, the rows listed in that
        file receive the file's base name as label.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all loaded frames, restricted to the columns common
        to every frame (in the column order of the first frame). Adds
        ``meta_timepoint`` (the digits of the timepoint directory name, as a
        string) and ``meta_replicate`` (the replicate directory name).

    Raises
    ------
    ValueError
        If no ``focused.fcs`` file is found under ``tree_path``.
    """
    logger = logging.getLogger(__name__)

    data = []
    # None means "no frame seen yet". An empty set is a legitimate intersection
    # result and must not be mistaken for the initial state.
    common_columns = None

    timepoint_dirs = sorted(
        p for p in os.listdir(tree_path) if os.path.isdir(os.path.join(tree_path, p))
    )
    for timepoint_path in timepoint_dirs:
        for replicate_path in sorted(os.listdir(os.path.join(tree_path, timepoint_path))):
            path = os.path.join(tree_path, timepoint_path, replicate_path)
            fcs_path = os.path.join(path, "focused.fcs")

            if not os.path.isfile(fcs_path):
                continue

            logger.info(f"Loading dir {path}")

            _meta, features = fcsparser.parse(fcs_path)
            features["meta_timepoint"] = "".join(filter(str.isdigit, timepoint_path))
            features["meta_replicate"] = replicate_path

            if load_labels:
                features["meta_label"] = "unknown"
                for file in sorted(p for p in os.listdir(path) if p.endswith(".txt")):
                    label = os.path.splitext(file)[0]
                    # First line of the label file is skipped; the first column
                    # of the remaining tab-separated table is used as the index.
                    object_numbers = pandas.read_csv(
                        os.path.join(path, file), skiprows=1, delimiter="\t", index_col=0
                    ).index
                    # NOTE(review): this selects rows by the frame's index labels.
                    # It presumably relies on the parsed frame's index matching
                    # the object numbers in the label file - confirm.
                    features.loc[object_numbers, "meta_label"] = label

            logger.debug(f"Loaded dataframe with shape {features.shape}")

            frame_columns = set(features.columns.values.tolist())
            if common_columns is None:
                common_columns = frame_columns
            else:
                common_columns &= frame_columns

            data.append(features)

    if not data:
        raise ValueError(f"No focused.fcs files found under {tree_path!r}")

    # pandas does not accept a set as a column indexer, so select with an
    # ordered list instead.
    ordered_columns = [c for c in data[0].columns if c in common_columns]
    return pandas.concat(data)[ordered_columns]

In [None]:
# Load both export trees with the tree loader defined above; labels are only
# requested for the 201 tree.
df_201 = load_raw_ideas_tree("D:/Experiment_data/weizmann/EhV/201", load_labels=True)
df_control = load_raw_ideas_tree("D:/Experiment_data/weizmann/EhV/control", load_labels=False)

In [15]:
# export

def check_should_load(cif, load_df):
    """Tell whether the sample behind a ``.cif`` path is listed in ``load_df``.

    The sample is identified from the path alone:

    * timepoint - the digits of the second ``_``-separated token of the file
      name, as an integer;
    * replicate - ``"R"`` followed by the second character of the first token;
    * group - the name of the directory containing the file.

    Parameters
    ----------
    cif : pathlib.Path
        Path to a ``.cif`` file.
    load_df : pandas.DataFrame
        Table with ``meta_timepoint``, ``meta_replicate`` and ``meta_group``
        columns.

    Returns
    -------
    bool
        True when at least one row of ``load_df`` matches all three values.
    """
    name_tokens = cif.parts[-1].split("_")
    timepoint = int("".join(ch for ch in name_tokens[1] if ch.isdigit()))
    replicate = "R" + name_tokens[0][1]
    group = cif.parts[-2]

    is_match = (
        (load_df["meta_timepoint"] == timepoint)
        & (load_df["meta_replicate"] == replicate)
        & (load_df["meta_group"] == group)
    )
    return len(load_df[is_match]) > 0

def load_raw_ideas_fcs(cif, feature_dir, feature_postfix, label_dir):
    """Read the feature export of one ``.cif`` sample and join it with its labels.

    Parameters
    ----------
    cif : pathlib.Path
        Path to the ``.cif`` file; only its file name and parent directory name
        are used.
    feature_dir : pathlib.Path
        Root of the feature exports. The file read is
        ``<feature_dir>/<group>/<cif stem>_<feature_postfix>.fcs``.
    feature_postfix : str
        Suffix appended (after an underscore) to the cif stem.
    label_dir : pathlib.Path
        Root of the label tables. The file read is
        ``<label_dir>/<group>/<cif name with .csv suffix>``.

    Returns
    -------
    pandas.DataFrame
        Inner join (on ``Object Number``) of features and labels, with
        ``Object Number`` restored as a column. Label columns are prefixed with
        ``meta_label_``; ``meta_timepoint``, ``meta_replicate`` and
        ``meta_group`` are derived from the cif path.
    """
    group = cif.parts[-2]

    fcs_name = str(cif.stem) + "_%s" % feature_postfix
    fcs_path = (feature_dir / group / fcs_name).with_suffix(".fcs")
    _, features = fcsparser.parse(fcs_path)
    features = features.astype({"Object Number": int}).set_index("Object Number")

    # File names look like "<X><replicate digit>_T<timepoint>_<n>.cif".
    name_tokens = cif.parts[-1].split("_")
    features["meta_timepoint"] = int("".join(filter(str.isdigit, name_tokens[1])))
    features["meta_replicate"] = "R" + name_tokens[0][1]
    features["meta_group"] = group

    label_path = label_dir / group / cif.with_suffix(".csv").parts[-1]
    labels = pandas.read_csv(label_path).set_index("Object Number")
    labels = labels.rename(columns=lambda c: "meta_label_" + c)

    joined = features.join(labels, how="inner")
    return joined.reset_index()

def load_raw_ideas_dir(path: Path, feature_dir: Path, feature_postfix: str, label_dir: Path, load_df: pandas.DataFrame = None, glob: str = "*.cif"):
    """Load and concatenate the features of every ``.cif`` sample under ``path``.

    Parameters
    ----------
    path : Path or str
        Directory searched recursively with ``glob``.
    feature_dir : Path or str
        Root directory of the feature exports, forwarded to
        ``load_raw_ideas_fcs``.
    feature_postfix : str
        Suffix of the feature export file names, forwarded to
        ``load_raw_ideas_fcs``.
    label_dir : Path or str
        Root directory of the label tables, forwarded to ``load_raw_ideas_fcs``.
    load_df : pandas.DataFrame, optional
        When given, only samples accepted by ``check_should_load`` are loaded;
        when ``None`` every matching file is loaded.
    glob : str, default "*.cif"
        Pattern passed to ``Path.rglob``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-sample frames, in sorted path order.

    Raises
    ------
    ValueError
        If no sample is selected for loading.
    """
    logger = logging.getLogger(__name__)

    # Accept plain strings for all three directories; the per-file loader
    # builds paths with the "/" operator and therefore needs Path objects.
    path = Path(path)
    feature_dir = Path(feature_dir)
    label_dir = Path(label_dir)

    dfs = []
    # rglob yields files in file-system order; sort for a reproducible result.
    for cif in sorted(path.rglob(glob)):
        logger.info(cif)
        if (load_df is None) or check_should_load(cif, load_df):
            dfs.append(load_raw_ideas_fcs(cif, feature_dir, feature_postfix, label_dir))

    if not dfs:
        raise ValueError(f"No samples to load under {path} matching {glob!r}")

    return pandas.concat(dfs)

In [None]:
# Load every *.cif sample under high_time_res: no load_df filter is passed and
# the default glob is used.
df = load_raw_ideas_dir(
    Path("/data/weizmann/EhV/high_time_res"), 
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/representations/ideas_features/"), 
    "ALL", 
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/cell_populations/manual_gating/"))

In [None]:
df.shape

In [16]:
# Same loader, restricted twice: the glob only matches files inside a "Low"
# directory, and only samples listed in data/selected_samples.csv are loaded.
df = load_raw_ideas_dir(
    Path("/data/weizmann/EhV/high_time_res"), 
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/representations/ideas_features/"), 
    "ALL", 
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/cell_populations/manual_gating/"),
    pandas.read_csv("data/selected_samples.csv"), "Low/*.cif")

INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T0_1.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T11_11.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T12_12.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T13_13.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T14_14.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T15_15.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T16_16.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T1_2.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T2_3.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T3_4.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T4_5.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T5_6.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T6_7.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T7_8.cif
INFO:__main__:/data/weizmann/EhV/high_time_res/Low/L1_T8_9.cif
INFO:__main__:/data/weizmann/EhV/high_time_

In [17]:
df.shape

(691705, 150)

In [None]:
# export
def load_raw_ideas_dir_dask(path: Path, feature_dir: Path, feature_postfix: str, label_dir: Path, load_df: pandas.DataFrame = None, glob: str = "*.cif"):
    """Lazy (dask) counterpart of ``load_raw_ideas_dir``.

    Builds one delayed ``load_raw_ideas_fcs`` call per selected ``.cif`` file and
    wraps them with ``dask.dataframe.from_delayed``, so no file is read until the
    result is computed.

    Parameters
    ----------
    path : Path or str
        Directory searched recursively with ``glob``.
    feature_dir : Path or str
        Root directory of the feature exports, forwarded to
        ``load_raw_ideas_fcs``.
    feature_postfix : str
        Suffix of the feature export file names, forwarded to
        ``load_raw_ideas_fcs``.
    label_dir : Path or str
        Root directory of the label tables, forwarded to ``load_raw_ideas_fcs``.
    load_df : pandas.DataFrame, optional
        When given, only samples accepted by ``check_should_load`` are included;
        when ``None`` (the default, matching ``load_raw_ideas_dir``) every
        matching file is included.
    glob : str, default "*.cif"
        Pattern passed to ``Path.rglob``.

    Returns
    -------
    dask.dataframe.DataFrame
        The result of ``dd.from_delayed`` over the per-file delayed frames, in
        sorted path order.
    """
    # Accept plain strings for all three directories; the per-file loader
    # builds paths with the "/" operator and therefore needs Path objects.
    path = Path(path)
    feature_dir = Path(feature_dir)
    label_dir = Path(label_dir)

    # rglob yields files in file-system order; sort so the parts are stable
    # between runs.
    dfs = [
        delayed(load_raw_ideas_fcs)(cif, feature_dir, feature_postfix, label_dir)
        for cif in sorted(path.rglob(glob))
        if (load_df is None) or check_should_load(cif, load_df)
    ]

    return dd.from_delayed(dfs)

In [None]:
# Lazily load every sample; load_df=None is passed explicitly because the
# loader takes it as a positional parameter.
df = load_raw_ideas_dir_dask(
    Path("/data/weizmann/EhV/high_time_res"),
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/representations/ideas_features/"),
    "ALL",
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/cell_populations/manual_gating/"),
    load_df=None)

In [None]:
df.shape[0].compute()

## Feature organization

### Raw features
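Raw features contain values that are not corrected for camera background intensity. They are dropped below, together with the background (`bkgd`) and saturation features.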

In [None]:
# export
def remove_unwanted_features(df):
    """Return ``df`` without the raw, background and saturation columns.

    A column is dropped when its name contains ``raw``, ``bkgd`` or
    ``saturation`` (case-insensitive). The input frame is not modified.
    """
    unwanted_pattern = "(?i).*(raw|bkgd|saturation).*"
    unwanted = df.filter(regex=unwanted_pattern)
    return df.drop(columns=unwanted.columns)

In [None]:
df_201 = remove_unwanted_features(df_201)
df_control = remove_unwanted_features(df_control)

In [None]:
df = remove_unwanted_features(df)

In [None]:
# export

def tag_columns(df):
    """Prefix column names with ``meta_`` or ``feat_`` (in place) and return ``df``.

    * ``Flow Speed``, ``Time`` and ``Object Number`` get a ``meta_`` prefix.
    * Columns already starting with ``meta_`` or ``feat_`` are left unchanged,
      so applying the function more than once has no further effect.
    * Every other column gets a ``feat_`` prefix.

    The prefix is tested at the start of the name only: a feature whose name
    merely contains ``meta_`` is still tagged as a feature.

    Parameters
    ----------
    df : DataFrame
        Frame whose ``columns`` attribute is reassigned.

    Returns
    -------
    DataFrame
        The same object that was passed in.
    """
    system_cols = {"Flow Speed", "Time", "Object Number"}

    def _tag(name):
        if name in system_cols:
            return "meta_" + name
        if name.startswith(("meta_", "feat_")):
            return name
        return "feat_" + name

    # Map every name by position instead of looking names up by value while the
    # list is being rewritten.
    df.columns = [_tag(c) for c in df.columns.values.tolist()]
    return df

In [None]:
tag_columns(df).head()

In [None]:
df = df.map_partitions(tag_columns)

In [None]:
# export

def add_merged_col(df, cols):
    """Add a ``meta_id`` column that concatenates ``cols`` row by row.

    The selected columns are converted to strings and joined without a
    separator. ``df`` is modified in place and also returned.
    """
    as_text = df[cols].astype(str)
    df["meta_id"] = as_text.agg("".join, axis=1)
    return df

In [None]:
df = df.map_partitions(add_merged_col, cols=["meta_group", "meta_timepoint", "meta_replicate", "meta_Object Number"])

In [None]:
df.head()

In [None]:
# export

def clean_column_names(df):
    """Lower-case all column names and replace spaces with underscores.

    The ``columns`` attribute of ``df`` is reassigned in place; the same object
    is returned.
    """
    def _normalise(name):
        return name.lower().replace(" ", "_")

    df.columns = df.columns.map(_normalise)
    return df

In [None]:
df = clean_column_names(df)

# Stats

In [None]:
df_201.shape

In [None]:
df_control.shape

## Number of cells in controls per replicate and timepoint

In [None]:
# The tree loader stores replicate and timepoint as meta_replicate / meta_timepoint.
seaborn.countplot(x="meta_replicate", hue="meta_timepoint", data=df_control)

## MCP/PSBA labels per replicate and timepoint

In [None]:
# The tree loader stores these as meta_label / meta_replicate / meta_timepoint;
# rows still carrying the default "unknown" label are left out.
seaborn.catplot(
    x="meta_label",
    hue="meta_replicate",
    col="meta_timepoint",
    data=df_201[df_201["meta_label"] != "unknown"],
    kind="count",
)