In [None]:
import pandas as pd
import plotly.express as px
import numpy as np

Scatter plot of mean average precision vs. viral infection efficiency.

Cell counts are also plotted against viral infection efficiency and mean average precision.

### Mean Average Precision vs. Viral Infection Efficiency

In [None]:
# Load the per-well infection efficiency table, rename its identifier columns
# to the Metadata_* names used by the merges further down, and keep only wells
# whose Minus_Avg_Background value is below 2.

inf_eff_df_all = pd.read_csv(
    "output/gpp-infection-efficiencies.csv.gz",
    usecols=[
        "Plate_name",
        "Batch_name",
        "Well",
        "broad_sample",
        "Plus_Blast",
        "Minus_Blast",
        "Minus_Avg_Background",
    ],
)
inf_eff_df_all = inf_eff_df_all.rename(
    columns={
        "broad_sample": "Metadata_broad_sample",
        "Plate_name": "Metadata_plate_map_name",
        "Batch_name": "Metadata_Batch",
        "Well": "Metadata_Well",
    }
)
# Boolean mask instead of .query(); rows with a missing value are dropped either way.
inf_eff_df_all = inf_eff_df_all[inf_eff_df_all["Minus_Avg_Background"] < 2]

inf_eff_df_all.head()

In [None]:
# Add metadata (perturbation type) to the infection efficiency dataframe

# Exact duplicate metadata rows are removed so that the many-to-one left merge
# below cannot multiply well rows.
orf_metadata_df = pd.read_csv(
    "../datasets/metadata/orf.csv.gz",
    usecols=["Metadata_broad_sample", "Metadata_pert_type"],
).drop_duplicates()

# Merge metadata with the dataframe.
# Any Metadata_pert_type column left over from a previous run of this cell is
# dropped first; otherwise a re-run would create Metadata_pert_type_x and
# Metadata_pert_type_y, and later queries on Metadata_pert_type would fail.
inf_eff_df_all = inf_eff_df_all.drop(
    columns=["Metadata_pert_type"], errors="ignore"
).merge(orf_metadata_df, on="Metadata_broad_sample", how="left")

inf_eff_df_all.head()

In [None]:
# Mean Minus_Avg_Background per broad sample, leaving out the EMPTY and
# BAD CONSTRUCT entries.
inf_eff_mean_df = (
    inf_eff_df_all.loc[
        lambda d: d["Metadata_broad_sample"].ne("EMPTY")
        & d["Metadata_broad_sample"].ne("BAD CONSTRUCT"),
        ["Metadata_broad_sample", "Minus_Avg_Background"],
    ]
    .groupby("Metadata_broad_sample", as_index=False)
    .mean()
)

inf_eff_mean_df.head()

In [None]:
# Load the mean average precision (mAP) value of each broad sample

map_df = pd.read_csv(
    filepath_or_buffer="../05.retrieve-orf-annotations/old_notebooks/output/replicate-retrieval-mAP-baseline-profiles.csv.gz",
    usecols=["Metadata_broad_sample", "mean_average_precision"],
)

map_df.head()

In [None]:
# Keep only the broad samples present in both the mAP and the mean-efficiency tables
map_inf_eff_mean_df = pd.merge(
    map_df,
    inf_eff_mean_df,
    how="inner",
    on="Metadata_broad_sample",
)

In [None]:
# Scatter of mAP against infection efficiency, with marginal histograms

fig = px.scatter(
    data_frame=map_inf_eff_mean_df,
    y="mean_average_precision",
    x="Minus_Avg_Background",
    marginal_y="histogram",
    marginal_x="histogram",
).update_layout(
    title="Correlation between viral infection efficiency and mean average precision",
    xaxis_title="Viral Infection Efficiency",
    yaxis_title="Mean Average Precision",
)

# Render a static PNG inline, then save the same figure to disk
fig.show("png")
fig.write_image("figures/mean-average-precision-viral-infection-efficiency.png")

Find threshold using Otsu's method

In [None]:
# Functions from https://bic-berkeley.github.io/psych-214-fall-2016/otsu_threshold.html

def ssd(hist, bin_centers):
    """Sum of squared deviations of binned data from its count-weighted mean.

    Parameters
    ----------
    hist : array-like
        Count in each bin.
    bin_centers : array-like
        Center value of each bin; same length as ``hist``.

    Returns
    -------
    float
        ``sum(hist * (bin_centers - mu) ** 2)``, where ``mu`` is the mean of
        ``bin_centers`` weighted by ``hist``. Returns ``0.0`` when the total
        count is zero.
    """
    # Accept plain Python sequences as well as numpy arrays.
    hist = np.asarray(hist)
    bin_centers = np.asarray(bin_centers)

    n = np.sum(hist)
    if n == 0:
        # With no observations the mean would be 0/0 = NaN, and a NaN result
        # would be picked up as the "minimum" by np.argmin in the caller.
        return 0.0

    mu = np.sum(bin_centers * hist) / n
    return np.sum(hist * ((bin_centers - mu) ** 2))

def otsu_threshold(values, n_bins=100):
    """Find a two-class threshold for ``values`` with Otsu's method.

    The values are histogrammed into ``n_bins`` equal-width bins. Every split
    of the bins into a lower and an upper group is scored by the summed
    within-group sum of squared deviations (see ``ssd``), and the split with
    the smallest score is selected.

    Parameters
    ----------
    values : array-like
        One-dimensional numeric data.
    n_bins : int, optional
        Number of histogram bins (default 100). Must be at least 2.

    Returns
    -------
    float
        Center of the highest bin belonging to the lower group of the best
        split.

    Raises
    ------
    ValueError
        If ``values`` is empty or ``n_bins`` is smaller than 2.
    """
    values = np.asarray(values)
    if values.size == 0:
        raise ValueError("otsu_threshold requires at least one value")
    if n_bins < 2:
        raise ValueError("otsu_threshold requires n_bins >= 2")

    hist, bin_edges = np.histogram(values, bins=n_bins)
    bin_centers = bin_edges[:-1] + np.diff(bin_edges) / 2

    # Entry i scores the split "bins 0..i" vs "bins i+1..end".
    total_ssds = [
        ssd(hist[:split], bin_centers[:split])
        + ssd(hist[split:], bin_centers[split:])
        for split in range(1, n_bins)
    ]

    z = np.argmin(total_ssds)
    # bin_centers[z] is the center of the last bin on the lower side of the
    # best split (same convention as the tutorial this code was taken from).
    threshold = bin_centers[z]
    return threshold

In [None]:
# Otsu threshold computed over the Minus_Avg_Background values of all wells
otsu_threshold_value = otsu_threshold(
    inf_eff_df_all["Minus_Avg_Background"].to_numpy()
)
print(otsu_threshold_value)

In [None]:
# Flag samples whose mean infection efficiency is strictly above the Otsu threshold
map_inf_eff_mean_df["OverTheOtsuThreshold"] = map_inf_eff_mean_df[
    "Minus_Avg_Background"
].gt(otsu_threshold_value)

In [None]:
# Same scatter as before, one panel per OverTheOtsuThreshold value, each with
# its own OLS trendline drawn in red

fig = px.scatter(
    data_frame=map_inf_eff_mean_df,
    y="mean_average_precision",
    x="Minus_Avg_Background",
    facet_col="OverTheOtsuThreshold",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between viral infection efficiency and mean average precision",
    yaxis_title="Mean Average Precision",
    xaxis1_title="Viral Infection Efficiency",
    xaxis2_title="Viral Infection Efficiency",
)

fig.show("png")
fig.write_image("figures/mean-average-precision-viral-infection-efficiency-facet-otsu.png")

Plot density plots with Otsu threshold

In [None]:
# Density contours of mAP vs infection efficiency, coloured by whether the
# sample is above the Otsu threshold
fig = px.density_contour(
    data_frame=map_inf_eff_mean_df,
    y="mean_average_precision",
    x="Minus_Avg_Background",
    color="OverTheOtsuThreshold",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline="ols",
).update_layout(
    title="Correlation between viral infection efficiency and mean average precision",
    yaxis_title="Mean Average Precision",
    xaxis_title="Viral Infection Efficiency",
)

fig.show("png")
fig.write_image("figures/mean-average-precision-viral-infection-efficiency-density-contour-otsu.png")

### Cell Counts vs. Viral Infection Efficiency

Plot cell count and infection efficiency

In [None]:
# Load the cell counts together with the keys needed to join them to other tables
cell_count_df = pd.read_csv(
    "../00.0.explore-data/output/cell_counts.csv.gz",
    usecols=[
        "Metadata_plate_map_name",
        "Metadata_Well",
        "Metadata_Count_Cells",
        "Metadata_Plate",
    ],
)

cell_count_df.head()

In [None]:
# Attach the full batch name to every cell count row

# Short batch label (as found in the experiment metadata) -> full batch name
batch_map = {
    "Batch1": "2021_04_26_Batch1",
    "Batch2": "2021_05_31_Batch2",
    "Batch3": "2021_05_10_Batch3",
    "Batch4": "2021_05_17_Batch4",
    "Batch5": "2021_06_07_Batch5",
    "Batch6": "2021_06_14_Batch6",
    "Batch7": "2021_06_21_Batch7",
    "Batch8": "2021_07_12_Batch8",
    "Batch9": "2021_07_26_Batch9",
    "Batch10": "2021_08_02_Batch10",
    "Batch11": "2021_08_09_Batch11",
    "Batch12": "2021_08_23_Batch12",
    "Batch13": "2021_08_30_Batch13",
}

# Plate barcode -> full batch name lookup table
experiment_df = pd.read_csv(
    "../00.0.explore-data/output/experiment-metadata.tsv",
    sep="\t",
    usecols=["Batch", "Assay_Plate_Barcode"],
)
experiment_df["Metadata_Batch"] = experiment_df["Batch"].map(batch_map)
experiment_df = experiment_df.drop(columns=["Batch"]).rename(
    columns={"Assay_Plate_Barcode": "Metadata_Plate"}
)

# Join on the plate barcode; the barcode is only a join key, so it is removed
# again right after the merge.
cell_count_df = cell_count_df.merge(
    experiment_df,
    how="left",
    on="Metadata_Plate",
).drop(columns=["Metadata_Plate"])

cell_count_df.head()

In [None]:
# Average the cell count over all rows sharing the same batch, plate map and well

cell_count_df = cell_count_df.groupby(
    ["Metadata_Batch", "Metadata_plate_map_name", "Metadata_Well"],
    as_index=False,
)[["Metadata_Count_Cells"]].mean()

cell_count_df.head()

In [None]:
# Combine infection efficiency and mean cell count for wells present in both
# tables, then flag wells strictly above the Otsu threshold

inf_eff_cell_count = pd.merge(
    inf_eff_df_all,
    cell_count_df,
    how="inner",
    on=["Metadata_Batch", "Metadata_plate_map_name", "Metadata_Well"],
)
inf_eff_cell_count["OverTheOtsuThreshold"] = inf_eff_cell_count[
    "Minus_Avg_Background"
].gt(otsu_threshold_value)

inf_eff_cell_count.head()

In [None]:
# Mean cell count against infection efficiency for every merged well, with an
# OLS trendline in red

fig = px.scatter(
    data_frame=inf_eff_cell_count,
    y="Metadata_Count_Cells",
    x="Minus_Avg_Background",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between viral infection efficiency and cell count",
    yaxis_title="Mean Cell Count",
    xaxis_title="Viral Infection Efficiency",
)

fig.show("png")
fig.write_image("figures/cell-count-viral-infection-efficiency.png")

### Plot Cell Painting cell count vs. Cell viability assay cell count

In [None]:
# Mean cell count against the Plus_Blast count for every merged well

fig = px.scatter(
    data_frame=inf_eff_cell_count,
    y="Metadata_Count_Cells",
    x="Plus_Blast",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between cell count and plus blast cell count",
    yaxis_title="Mean Cell Count",
    xaxis_title="Plus Blast Cell Count",
)

fig.show("png")
fig.write_image("figures/cell-count-plus-blast.png")

In [None]:
# Mean cell count against the Minus_Blast count for every merged well

fig = px.scatter(
    data_frame=inf_eff_cell_count,
    y="Metadata_Count_Cells",
    x="Minus_Blast",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between cell count and minus blast cell count",
    yaxis_title="Mean Cell Count",
    xaxis_title="Minus Blast Cell Count",
)

fig.show("png")
fig.write_image("figures/cell-count-minus-blast.png")

Plotting the last plot with only the ORF treatments.

In [None]:
# Restrict to ORF treatment wells: drop rows whose pert_type is 'control',
# EMPTY wells, and rows without a broad sample identifier.
merged_all_df_filtered = inf_eff_cell_count.loc[
    lambda d: d["Metadata_pert_type"].ne("control")
    & d["Metadata_broad_sample"].ne("EMPTY")
].dropna(subset=["Metadata_broad_sample"])

# Mean cell count against Minus_Blast count, ORF treatments only

fig = px.scatter(
    data_frame=merged_all_df_filtered,
    y="Metadata_Count_Cells",
    x="Minus_Blast",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between cell count and minus blast cell count - ORF treatments",
    yaxis_title="Mean Cell Count",
    xaxis_title="Minus Blast Cell Count",
)

fig.show("png")
fig.write_image("figures/cell-count-minus-blast-orf-treatments.png")


In [None]:
# ORF treatment wells only (same selection as the previous plot): no controls,
# no EMPTY wells, no rows without a broad sample identifier.

merged_all_df_filtered_cell_count = inf_eff_cell_count.loc[
    lambda d: d["Metadata_pert_type"].ne("control")
    & d["Metadata_broad_sample"].ne("EMPTY")
].dropna(subset=["Metadata_broad_sample"])

# Mean cell count against Minus_Blast count, restricted to wells whose
# Minus_Blast count exceeds 1.5e6

fig = px.scatter(
    data_frame=merged_all_df_filtered_cell_count[
        merged_all_df_filtered_cell_count["Minus_Blast"] > 1.5e6
    ],
    y="Metadata_Count_Cells",
    x="Minus_Blast",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between cell count and minus blast cell count - ORF treatments",
    yaxis_title="Mean Cell Count",
    xaxis_title="Minus Blast Cell Count",
)

fig.show("png")
fig.write_image("figures/cell-count-minus-blast-orf-treatments-filtered-cell_count.png")

In [None]:
# ORF treatment wells only: no controls, no EMPTY wells, no rows without a
# broad sample identifier.

merged_all_df_filtered_viability = inf_eff_cell_count.loc[
    lambda d: d["Metadata_pert_type"].ne("control")
    & d["Metadata_broad_sample"].ne("EMPTY")
].dropna(subset=["Metadata_broad_sample"])

# Mean cell count against Minus_Blast count, restricted to wells whose
# infection efficiency is above the Otsu threshold

fig = px.scatter(
    data_frame=merged_all_df_filtered_viability[
        merged_all_df_filtered_viability["OverTheOtsuThreshold"] == True  # noqa: E712
    ],
    y="Metadata_Count_Cells",
    x="Minus_Blast",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between cell count and minus blast cell count - ORF treatments",
    yaxis_title="Mean Cell Count",
    xaxis_title="Minus Blast Cell Count",
)

fig.show("png")
fig.write_image("figures/cell-count-minus-blast-orf-treatments-filtered-viability.png")

In [None]:
# One panel per plate map name, ORF treatment wells only (no controls, no
# EMPTY wells, no rows without a broad sample identifier)

merged_all_df_filtered_viability = inf_eff_cell_count.loc[
    lambda d: d["Metadata_pert_type"].ne("control")
    & d["Metadata_broad_sample"].ne("EMPTY")
].dropna(subset=["Metadata_broad_sample"])

# Panels are wrapped seven to a row with tight spacing; the figure is only
# written to disk, at a large canvas size.
fig = px.scatter(
    data_frame=merged_all_df_filtered_viability,
    y="Metadata_Count_Cells",
    x="Minus_Blast",
    facet_col="Metadata_plate_map_name",
    facet_col_wrap=7,
    facet_row_spacing=0.02,
    facet_col_spacing=0.02,
    trendline_color_override="red",
    trendline="ols",
)

fig.write_image(
    "figures/cell-count-minus-blast-orf-treatments-filtered-cell_count-facet.png",
    height=2800,
    width=4200,
)

In [None]:
# Minus_Blast count against Plus_Blast count for every merged well

fig = px.scatter(
    data_frame=inf_eff_cell_count,
    y="Minus_Blast",
    x="Plus_Blast",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between plus blast and minus blast cell count",
    yaxis_title="Minus Blast Cell Count",
    xaxis_title="Plus Blast Cell Count",
)

fig.show("png")
fig.write_image("figures/plus-blast-minus-blast.png")

Since most reagents have a high infection efficiency, it makes sense that the plus blast cell count correlates strongly with the minus blast cell count.

### `EMPTY` wells

Looking at the distribution of cells in `EMPTY` wells to determine if blasticidin selection was performed. If selection was performed, we would expect `EMPTY` wells to have very few cells.

In [None]:
# Boolean flag marking the EMPTY wells; used as the colour grouping below
inf_eff_cell_count["EMPTY"] = inf_eff_cell_count["Metadata_broad_sample"].eq("EMPTY")

# Probability-normalised cell count histograms of EMPTY vs non-EMPTY wells,
# with a rug plot on the margin
fig = px.histogram(
    data_frame=inf_eff_cell_count,
    x="Metadata_Count_Cells",
    color="EMPTY",
    color_discrete_map={
        True: "rgba(0, 114, 178, 0.7)",
        False: "rgba(213, 94, 0, 0.7)",
    },
    histnorm="probability",
    marginal="rug",
    opacity=0.8,
).update_layout(
    title="Distribution of cell count color by EMPTY",
    yaxis_title="Probability",
    xaxis_title="Cell Count",
)

fig.show("png")
fig.write_image("figures/cell-count-histogram-EMPTY.png")

Plot the cell count and infection efficiency of `EMPTY` wells.

In [None]:
# Mean cell count against infection efficiency, EMPTY wells only
fig = px.scatter(
    data_frame=inf_eff_cell_count[
        inf_eff_cell_count["Metadata_broad_sample"] == "EMPTY"
    ],
    y="Metadata_Count_Cells",
    x="Minus_Avg_Background",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between viral infection efficiency and cell count for EMPTY wells",
    yaxis_title="Mean Cell Count",
    xaxis_title="Viral Infection Efficiency",
)

fig.show("png")
fig.write_image("figures/cell-count-viral-infection-efficiency-EMPTY_with_outliers.png")

In [None]:
# Same plot as above, keeping only EMPTY wells with Minus_Avg_Background below 0.2

fig = px.scatter(
    data_frame=inf_eff_cell_count[
        (inf_eff_cell_count["Metadata_broad_sample"] == "EMPTY")
        & (inf_eff_cell_count["Minus_Avg_Background"] < 0.2)
    ],
    y="Metadata_Count_Cells",
    x="Minus_Avg_Background",
    marginal_y="histogram",
    marginal_x="histogram",
    trendline_color_override="red",
    trendline="ols",
).update_layout(
    title="Correlation between viral infection efficiency and cell count for EMPTY wells",
    yaxis_title="Mean Cell Count",
    xaxis_title="Viral Infection Efficiency",
)

fig.show("png")
fig.write_image("figures/cell-count-viral-infection-efficiency-EMPTY.png")

#### Identify the plate map of these EMPTY wells with high infection efficiency

In [None]:
# List every (batch, plate map) pair that has at least one EMPTY well above the
# Otsu threshold. The whole de-duplicated table is shown rather than only its
# first row, so additional affected plate maps are not hidden and an empty
# result displays as an empty table instead of raising IndexError.
(
    inf_eff_cell_count.query(
        'Metadata_broad_sample == "EMPTY" and Minus_Avg_Background > @otsu_threshold_value'
    )[["Metadata_Batch", "Metadata_plate_map_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

#### Rotate the plate map to check whether the infection efficiency of the `EMPTY` wells decreases.

##### First plot infection efficiency for this plate map

In [None]:
# All wells of the plate map under investigation.
# NOTE(review): selection is by plate map name only, not by batch — confirm
# this name does not occur in more than one batch.
defective_platemap_df = inf_eff_cell_count[
    inf_eff_cell_count["Metadata_plate_map_name"] == "OAB41.OAC17.OAB78.79.A"
]

# How many EMPTY wells on this plate map are above / below the Otsu threshold

defective_platemap_df.loc[
    defective_platemap_df["EMPTY"] == True,  # noqa: E712
    "OverTheOtsuThreshold",
].value_counts()

In [None]:
# Copy of the plate map in which the EMPTY flags are assigned in reverse row
# order, while every other column stays where it is.
# NOTE(review): reversing the column corresponds to a 180-degree plate rotation
# only if the rows are in well order and no wells are missing — confirm.
defective_platemap_rotated_df = defective_platemap_df.assign(
    EMPTY=defective_platemap_df["EMPTY"].to_numpy()[::-1]
)

# How many wells flagged EMPTY after the reversal are above / below the Otsu threshold
defective_platemap_rotated_df.loc[
    defective_platemap_rotated_df["EMPTY"] == True,  # noqa: E712
    "OverTheOtsuThreshold",
].value_counts()