# WBC Quality control

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from scip_workflows.common import *
from scip_workflows.core import plot_gate_zarr, plot_gate_zarr_channels


In [None]:
import zarr
from scip.features import texture


## Load processed frame

In [None]:
def _reroot_meta_paths(frame, image_root):
    """Rewrite ``frame["meta_path"]`` in place so it points below ``image_root``.

    For every path, the components up to and including the one named
    ``images`` are dropped and the remainder is joined onto ``image_root``.
    ``list.index`` raises ``ValueError`` for a path without an ``images``
    component.
    """

    def _reroot(p):
        parts = Path(p).parts  # split once instead of twice per row
        return image_root.joinpath(*parts[parts.index("images") + 1 :])

    frame["meta_path"] = frame["meta_path"].apply(_reroot)


# Only the lookup of `snakemake` is guarded. A NameError raised anywhere else
# (e.g. a missing import) must surface instead of silently switching the
# notebook over to the hard-coded development paths below.
try:
    _workflow = snakemake
except NameError:
    _workflow = None

if _workflow is not None:
    # Workflow run: inputs, outputs and config come from the `snakemake` object.
    df = pq.read_table(_workflow.input[0]).to_pandas()

    # The image location is only rewritten when the config asks for it.
    if "image_root" in _workflow.config:
        image_root = Path(_workflow.config["image_root"])
        _reroot_meta_paths(df, image_root)

    output_index = _workflow.output.index
    output_cols = _workflow.output.columns
else:
    # Interactive run: fall back to fixed local paths.
    # Alternative dataset location:
    # data_dir = Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/wbc/scip/20220713131400/")
    data_dir = Path("/home/maximl/scratch/data/vsc/datasets/wbc/scip/131020222139/")
    df = pq.read_table(data_dir / "features.parquet").to_pandas()

    image_root = Path("/home/maximl/scratch/data/vsc/datasets/wbc/images/")
    _reroot_meta_paths(df, image_root)

    output_index = data_dir / "indices/index.npy"
    output_cols = data_dir / "indices/columns.npy"


# Filter on number of regions

In [None]:
# Region-count mask: exactly one region in BF1 and in BF2, at least one in SSC.
sel = (
    df["meta_li_regions_BF1"].eq(1)
    & df["meta_li_regions_BF2"].eq(1)
    & df["meta_li_regions_SSC"].ge(1)
)


In [None]:
# Keep only the rows that pass the region-count mask, then show the new size.
df = df.loc[sel]
df.shape


# NaN values

In [None]:
# show all NaN columns
# List the columns in which every value is missing.
df.columns[df.isnull().all(axis="index")]


# Detecting multiplets that are missed during segmentation

In [None]:
# Ratio of major to minor axis length in BF1, plotted as a distribution.
feat_majorminor_ratio = df["feat_li_major_axis_length_BF1"].div(
    df["feat_li_minor_axis_length_BF1"]
)
# NOTE(review): seaborn.displot is figure-level and returns a FacetGrid,
# so `ax` is not a matplotlib Axes despite its name.
ax = seaborn.displot(data=feat_majorminor_ratio)


In [None]:
# Gate the events whose BF1 major axis is more than twice the minor axis.
sel1 = feat_majorminor_ratio.gt(2)
# presumably renders example images of the gated events — confirm in scip_workflows.core
plot_gate_zarr(sel1, df, "li")


In [None]:
# Remove the events selected by the axis-ratio gate and show the new size.
df = df.loc[~sel1]
df.shape


In [None]:
# Distribution of (area - convex area) in BF1, in 100 bins.
seaborn.displot(
    data=df["feat_li_area_BF1"].sub(df["feat_li_convex_area_BF1"]),
    bins=100,
)


In [None]:
# Gate the events whose BF1 area is more than 200 below the convex area.
sel1 = df["feat_li_area_BF1"].sub(df["feat_li_convex_area_BF1"]).lt(-200)
# presumably renders up to `maxn` example images of the gate — confirm in scip_workflows.core
plot_gate_zarr(sel1, df, "li", maxn=20)


In [None]:
# Remove the events selected by the area/convex-area gate and show the new size.
df = df.loc[~sel1]
df.shape


In [None]:
# Histogram of the BF1 eccentricity feature, in 100 bins.
plt.hist(x=df["feat_li_eccentricity_BF1"], bins=100)


In [None]:
# Gate the events whose BF1 eccentricity exceeds 0.75.
sel1 = df["feat_li_eccentricity_BF1"].gt(0.75)
# presumably renders up to `maxn` example images of the gate — confirm in scip_workflows.core
plot_gate_zarr(sel1, df, "li", maxn=50)


In [None]:
# Remove the events selected by the eccentricity gate and show the new size.
df = df.loc[~sel1]
df.shape


# Identifying (near-)zero-variance features

In [None]:
# Columns whose name matches the regex "feat" and whose variance is below 0.001.
feat_df = df.filter(regex="feat")
low_var = feat_df.columns[feat_df.var() < 0.001]


In [None]:
# Number of columns flagged as low-variance.
low_var.size


In [None]:
# Discard the low-variance columns from the frame.
df = df.drop(low_var, axis="columns")


# Export

In [None]:
# Persist the index labels of the rows that survived quality control.
numpy.save(file=output_index, arr=df.index.values)


In [None]:
# Persist the names of the columns that survived quality control.
# NOTE(review): if the labels are held in an object-dtype array, numpy.save
# pickles them and numpy.load needs allow_pickle=True — confirm in the consumer.
numpy.save(file=output_cols, arr=df.columns.values)
