# BBBC comparison

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from scip_workflows.common import *


In [None]:
# Resolve the four input locations. If a `snakemake` object is defined its
# `input` attributes are used; if the name does not exist (NameError), fall
# back to fixed paths below a hard-coded dataset root.
try:
    path_gt, moa_path, image_path, path = (
        snakemake.input.gt,
        snakemake.input.moa,
        snakemake.input.image,
        snakemake.input.features,
    )
except NameError:
    data_root = Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/BBBC021")
    # Alternative root kept for reference:
    # data_root = Path("/home/maximl/scratch/data/vsc/datasets/BBBC021/")

    # Directories first, then the files inside them.
    data_dir_gt = data_root.joinpath("cellprofiler")
    data_dir = data_root.joinpath("results", "images_subset_v4")

    path_gt = data_dir_gt.joinpath("db.sqlite3")
    moa_path = data_root.joinpath("BBBC021_v1_moa.csv")
    image_path = data_root.joinpath("BBBC021_v1_image.csv")
    path = data_dir.joinpath("features.parquet")


In [None]:
# Read the two CSV inputs (moa table first, image table second).
moa, image = (pandas.read_csv(csv_path) for csv_path in (moa_path, image_path))


In [None]:
# Pair every moa row with the image rows that share its compound and
# concentration. The image-side key columns are dropped after the merge.
moa_image = pandas.merge(
    left=moa,
    right=image,
    left_on=["compound", "concentration"],
    right_on=["Image_Metadata_Compound", "Image_Metadata_Concentration"],
)
moa_image = moa_image.drop(
    columns=["Image_Metadata_Compound", "Image_Metadata_Concentration"]
)


In [None]:
# Batch label: take the text before the first underscore of the DAPI plate
# name, strip its leading len("Week") characters, and read the rest as an
# integer. Stored as a categorical column.
moa_image["batch"] = (
    moa_image["Image_Metadata_Plate_DAPI"]
    .map(lambda plate: int(plate.partition("_")[0][len("Week") :]))
    .astype("category")
)


In [None]:
# Prefix every column name with "meta_".
moa_image = moa_image.add_prefix("meta_")


In [None]:
import sqlite3

# Open the SQLite database at `path_gt`; `con` is the handle passed to the
# SQL queries in the following cells.
con = sqlite3.connect(path_gt)


In [None]:
# Load every supplement_Object row together with its supplement_Image row;
# the join matches on both TableNumber and ImageNumber.
df_gt = pandas.read_sql_query(
    sql="""
    SELECT * 
    FROM supplement_Object
    INNER JOIN supplement_Image ON (supplement_Object.TableNumber = supplement_Image.TableNumber) AND (supplement_Object.ImageNumber = supplement_Image.ImageNumber)
""",
    con=con,
)


In [None]:
# Load the supplement_Image table on its own; it is used further down to
# look up the metadata row of an individual image file.
df_gt_image = pandas.read_sql_query("SELECT * FROM supplement_Image", con)

# Both tables are now held in memory as DataFrames and no later cell in this
# notebook reads from `con`, so release the database handle instead of
# leaving it open for the lifetime of the kernel.
con.close()


In [None]:
# Show the (rows, columns) size of the joined object/image table.
df_gt.shape


# Actin + DAPI segmentation

In [None]:
%%time
# Read the feature table from `path` and attach the prefixed moa/image
# metadata by matching each row's "meta_filename" to the DAPI file name.
df1 = (
    pq.read_table(path)
    .to_pandas()
    .merge(
        moa_image,
        left_on="meta_filename",
        right_on="meta_Image_FileName_DAPI",
    )
)

In [None]:
# Row counts per plate in the database table ("gt") and in the feature table
# ("my"), aligned on the plate name through an index-on-index merge.
plate_counts = pandas.merge(
    df_gt.groupby("Image_Metadata_Plate_DAPI").size().to_frame("gt"),
    df1.groupby("meta_Image_Metadata_Plate_DAPI").size().to_frame("my"),
    left_index=True,
    right_index=True,
)

# Positive values mean the database table has more rows for that plate.
plate_counts["diff"] = plate_counts["gt"] - plate_counts["my"]


In [None]:
# Display the per-plate counts ordered by the gt - my difference.
plate_counts.sort_values("diff")


In [None]:
# Row counts per image file in the database table ("gt") and in the feature
# table ("my"), aligned on the file name through an index-on-index merge.
image_counts = pandas.merge(
    df_gt.groupby("Image_FileName_DAPI").size().to_frame("gt"),
    df1.groupby("meta_filename").size().to_frame("my"),
    left_index=True,
    right_index=True,
)

# Positive values mean the database table has more rows for that image.
image_counts["diff"] = image_counts["gt"] - image_counts["my"]


In [None]:
# Display the per-image counts ordered by the gt - my difference.
image_counts.sort_values("diff")


In [None]:
# Pick the file name sitting second from the end when images are sorted by
# "diff", then take the first supplement_Image row with that DAPI file name.
r = df_gt_image.loc[
    df_gt_image["Image_FileName_DAPI"] == image_counts.sort_values("diff").index[-2]
].iloc[0]


In [None]:
# Plate name of the selected image row.
r["Image_Metadata_Plate_DAPI"]


In [None]:
# DAPI file name of the selected image row.
r["Image_FileName_DAPI"]


# Value comparisons

In [None]:
# ECDF of Cells_AreaShape_Area from the database table, keeping only rows
# whose value is below 20000 (presumably to trim outliers - confirm).
seaborn.ecdfplot(
    x="Cells_AreaShape_Area",
    data=df_gt.loc[df_gt["Cells_AreaShape_Area"] < 20000],
)


In [None]:
# ECDF of feat_area_actin from the feature table, using the same < 20000
# cut-off as the Cells_AreaShape_Area plot.
seaborn.ecdfplot(
    x="feat_area_actin",
    data=df1.loc[df1["feat_area_actin"] < 20000],
)


In [None]:
# ECDF of Cytoplasm_AreaShape_Area from the database table, keeping only
# rows whose value is below 10000.
seaborn.ecdfplot(
    x="Cytoplasm_AreaShape_Area",
    data=df_gt.loc[df_gt["Cytoplasm_AreaShape_Area"] < 10000],
)


In [None]:
# ECDF of feat_area_tubulin from the feature table, using the same < 10000
# cut-off as the Cytoplasm_AreaShape_Area plot.
seaborn.ecdfplot(
    x="feat_area_tubulin",
    data=df1.loc[df1["feat_area_tubulin"] < 10000],
)


In [None]:
# ECDF of feat_sum_actin over all rows of the feature table (no cut-off).
seaborn.ecdfplot(data=df1, x="feat_sum_actin")


In [None]:
# ECDF of Cells_Intensity_MaxIntensity_CorrActin over all rows of the
# database table (no cut-off).
# NOTE(review): the preceding cell plots a *sum* feature (feat_sum_actin)
# while this column is a *max* intensity - confirm these are the intended
# counterparts for the comparison.
seaborn.ecdfplot(data=df_gt, x="Cells_Intensity_MaxIntensity_CorrActin")
