In [None]:
# Merge the experiment and FoV metadata tables into a single table.

# Replace the FoV table's own "Batch" column with its "FoV_Batch" column so
# that both tables are keyed by one "Batch" column after concatenation.
fov = (
    pd.read_csv("output/FoV-experiment-metadata.tsv", sep="\t")
    .drop(columns="Batch")
    .rename(columns={"FoV_Batch": "Batch"})
)
orig = pd.read_csv('output/experiment-metadata-updated.csv')

comb = pd.concat([fov, orig], ignore_index=True)

# "sites" takes the "Sites-SubSampled" value where there is one and falls back
# to "Images_per_well" otherwise. The filled Series is assigned back explicitly:
# an in-place fillna on comb["sites"] acts on an intermediate object and is not
# guaranteed to update comb (it does not under pandas copy-on-write).
comb["sites"] = comb["Sites-SubSampled"].fillna(comb["Images_per_well"])

# index=False already leaves the index out of the file, so no index_label is needed.
comb.to_csv("output/all-profile-metadata.csv", index=False)

# Read new experiment df
experiment_df = pd.read_csv("output/all-profile-metadata.csv")

experiment_df

In [None]:
def create_moa_dataframe(experiment_metadata, profile_parent_dir, batch_col="Batch", match_or_rep_or_both="replicating", enable_sphering="both"):
    """
    Score every plate listed in the metadata and merge the scores onto the metadata.

    For each unique Vendor / batch_col / Assay_Plate_Barcode combination the profile
    <profile_parent_dir>/<batch>/<plate>/<plate>_normalized_feature_select_negcon_batch.csv.gz
    is read and passed to the scoring helpers in `utils` (no sphering) and/or
    `utilssphering` (sphering).

    Parameters
    ----------
    experiment_metadata : pandas.DataFrame
        Needs the columns "Vendor", batch_col and "Assay_Plate_Barcode".
    profile_parent_dir : str
        Folder that contains one sub-folder per batch.
    batch_col : str
        Name of the column to distinguish the profile parent folder.
        Eg. "Scope1_MolDev_10X" or "1siteSubSample_Scope1_MolDev_10X".
        The output df will also use this batch_col name.
    match_or_rep_or_both : str
        "replicating", "matching" or "both" (case-insensitive).
    enable_sphering : str
        "yes" (utilssphering scores only), "no" (utils scores only) or "both"
        (case-insensitive).

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        experiment_metadata inner-merged with the scores on Vendor, batch_col and
        Assay_Plate_Barcode. The score columns are "Replicating"/"Matching",
        "Null_*", "Percent_*", "Value_95" and the boolean "sphering".
        With "both" a (replicating, matching) tuple is returned.

    Raises
    ------
    ValueError
        If an option value is not recognised, or if no plate produced a score
        for a requested metric.

    Notes
    -----
    A profile that cannot be read raises immediately. A plate whose scoring
    fails is reported and skipped; rows already appended for that plate are kept.
    """
    n_samples = 10000
    n_replicates = 4  # number of sample replicates within each plate
    metadata_common = 'Metadata_moa'
    metadata_perturbation = 'Metadata_broad_sample'
    group_by_feature = 'Metadata_pert_iname'

    # Normalise and validate the options once, before any file is read.
    mode = match_or_rep_or_both.casefold()
    sphering = enable_sphering.casefold()
    if mode not in ("replicating", "matching", "both"):
        raise ValueError(
            'match_or_rep_or_both must be "replicating", "matching" or "both", '
            + f"got {match_or_rep_or_both!r}"
        )
    if sphering not in ("yes", "no", "both"):
        raise ValueError(
            'enable_sphering must be "yes", "no" or "both", '
            + f"got {enable_sphering!r}"
        )
    want_replicating = mode != "matching"
    want_matching = mode != "replicating"
    with_sphering = sphering != "no"
    without_sphering = sphering != "yes"

    merge_columns = ['Vendor', batch_col, 'Assay_Plate_Barcode']

    def _score_frame(vendor, batch, plate, kind, observed, null, percent, value_95, sphered):
        # One-row frame for one plate; kind is "Replicating" or "Matching".
        # The observed and null values are wrapped in a list so each occupies a single cell.
        return pd.DataFrame({'Vendor': vendor,
                             batch_col: batch,
                             'Assay_Plate_Barcode': plate,
                             kind: [observed],
                             f'Null_{kind}': [null],
                             f'Percent_{kind}': percent,
                             'Value_95': value_95,
                             'sphering': sphered})

    def _merge_scores(frames, label):
        # Stack the per-plate rows and attach them to the experiment metadata.
        if not frames:
            raise ValueError(f"No {label} scores were computed; nothing to concatenate")
        scores = pd.concat(frames, ignore_index=True)
        return experiment_metadata.merge(scores, how="inner", on=merge_columns)

    corr_replicating_list = list()
    corr_matching_list = list()

    for a_vendor in experiment_metadata["Vendor"].unique():
        print(f"Processing {a_vendor}")
        vendor_data = experiment_metadata.loc[experiment_metadata["Vendor"] == a_vendor]
        for a_batch in vendor_data[batch_col].unique():
            batch_data = vendor_data.loc[vendor_data[batch_col] == a_batch]
            for a_plate in batch_data["Assay_Plate_Barcode"].unique():
                data_path = os.path.join(profile_parent_dir, a_batch, a_plate, a_plate+"_normalized_feature_select_negcon_batch.csv.gz")
                # Reading stays outside the try block: a missing profile is an error, not a skip.
                load_data = pd.read_csv(data_path)
                try:
                    if want_replicating:
                        if with_sphering:
                            replicate_corr_sphere, null_replicating_sphere, prop_95_replicating_sphere, value_95_replicating_sphere = utilssphering.calculate_percent_replicating_MOA("", "", data_df=load_data)
                            corr_replicating_list.append(_score_frame(
                                a_vendor, a_batch, a_plate, 'Replicating',
                                replicate_corr_sphere, null_replicating_sphere,
                                prop_95_replicating_sphere, value_95_replicating_sphere, True))

                        if without_sphering:
                            plate_df = utils.remove_negcon_empty_wells(load_data)
                            replicate_corr = list(utils.corr_between_replicates(plate_df, group_by_feature))
                            null_replicating = list(utils.corr_between_non_replicates(plate_df, n_samples=n_samples, n_replicates=n_replicates, metadata_compound_name=group_by_feature))
                            prop_95_replicating, value_95_replicating = utils.percent_score(null_replicating, replicate_corr, how='right')
                            corr_replicating_list.append(_score_frame(
                                a_vendor, a_batch, a_plate, 'Replicating',
                                replicate_corr, null_replicating,
                                prop_95_replicating, value_95_replicating, False))

                    if want_matching:
                        if with_sphering:
                            matching_corr_sphere, null_matching_sphere, prop_95_matching_sphere, value_95_matching_sphere = utilssphering.calculate_percent_matching_MOA("", "", data_df=load_data)
                            corr_matching_list.append(_score_frame(
                                a_vendor, a_batch, a_plate, 'Matching',
                                matching_corr_sphere, null_matching_sphere,
                                prop_95_matching_sphere, value_95_matching_sphere, True))

                        if without_sphering:
                            plate_df = utils.remove_negcon_empty_wells(load_data)
                            matching_corr = list(utils.corr_between_perturbation_pairs(plate_df, metadata_common, metadata_perturbation))
                            null_matching = list(utils.corr_between_perturbation_non_pairs(plate_df, n_samples=n_samples, metadata_common=metadata_common, metadata_perturbation=metadata_perturbation))
                            prop_95_matching, value_95_matching = utils.percent_score(null_matching, matching_corr, how='right')
                            corr_matching_list.append(_score_frame(
                                a_vendor, a_batch, a_plate, 'Matching',
                                matching_corr, null_matching,
                                prop_95_matching, value_95_matching, False))
                except Exception as err:
                    # Best effort per plate: report why it was skipped and carry on.
                    # Catching Exception (not a bare except) lets Ctrl-C still stop the run.
                    print(f"Passed: {data_path} ({type(err).__name__}: {err})")

    # Concatenate the per-plate rows and merge them with the metadata
    if mode == "both":
        return (_merge_scores(corr_replicating_list, "replicating"),
                _merge_scores(corr_matching_list, "matching"))
    if mode == "replicating":
        return _merge_scores(corr_replicating_list, "replicating")
    return _merge_scores(corr_matching_list, "matching")

# Compute both metrics for every plate, with and without sphering.
df_replicating, df_matching = create_moa_dataframe(
    experiment_df,
    "../jump-scope/profiles/",
    match_or_rep_or_both="both",
    enable_sphering="both",
)
# Single-metric variant, kept for reference:
# df = create_moa_dataframe(pd.read_csv('output/experiment-metadata.tsv', sep='\t'), "../jump-scope/profiles/", match_or_rep_or_both="replicating", enable_sphering="no")

# df


In [None]:
def add_total_cell_counts(df, profile_path, batch_col="Batch"):
    """
    Return a copy of df with a numeric "cell_count" column added.

    For each row the profile
    <profile_path>/<batch>/<barcode>/<barcode>_normalized_negcon.csv.gz
    is read and its "Metadata_Count_Cells" column is summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns batch_col and "Assay_Plate_Barcode".
    profile_path : str
        Folder that contains one sub-folder per batch.
    batch_col : str
        Column holding the batch folder name (default "Batch").

    Returns
    -------
    pandas.DataFrame
        Copy of df with the extra "cell_count" column; df itself is not modified.
        The value is NaN for a profile that has no usable "Metadata_Count_Cells" column.

    Notes
    -----
    A profile file that cannot be read raises; only missing cell count data
    is tolerated.
    """
    totals_by_plate = {}  # (batch, barcode) -> summed cell count, so each file is read once
    cell_counts = []
    # Walk the rows by position so a non-unique index cannot turn a lookup into a Series.
    for batch, barcode in zip(df[batch_col], df["Assay_Plate_Barcode"]):
        plate_key = (batch, barcode)
        if plate_key not in totals_by_plate:
            load_path = os.path.join(profile_path, batch, barcode, f"{barcode}_normalized_negcon.csv.gz")
            load_df = pd.read_csv(load_path)
            try:
                totals_by_plate[plate_key] = sum(load_df["Metadata_Count_Cells"])
            except (KeyError, TypeError):
                # In case a profile is missing cell count data
                totals_by_plate[plate_key] = np.nan
        cell_counts.append(totals_by_plate[plate_key])

    out_df = df.copy()
    # Assigning the whole list at once gives a numeric column rather than an
    # object column seeded with empty strings.
    out_df["cell_count"] = cell_counts
    return out_df

# Attach the total cell count per plate to each of the two score tables.
df_replicating = add_total_cell_counts(
    df=df_replicating,
    profile_path="../jump-scope/profiles/",
)
df_matching = add_total_cell_counts(
    df=df_matching,
    profile_path="../jump-scope/profiles/",
)

In [None]:
# Merge sites and subsample columns: rows with no "Sites-SubSampled" value take
# their "Images_per_well" value, then the column is converted to a numeric dtype
# (downcast to the smallest integer type when possible).
df_replicating["Sites-SubSampled"] = pd.to_numeric(
    df_replicating["Sites-SubSampled"].fillna(df_replicating["Images_per_well"]),
    downcast="integer",
)

df_matching["Sites-SubSampled"] = pd.to_numeric(
    df_matching["Sites-SubSampled"].fillna(df_matching["Images_per_well"]),
    downcast="integer",
)


In [None]:
## Checkpoint save

# exist_ok=True makes this a no-op when the folder is already there and avoids
# the gap between checking for the folder and creating it.
os.makedirs("checkpoints", exist_ok=True)

# index=False already leaves the index out of the file, so no index_label is needed.
df_replicating.to_csv("checkpoints/moa-replicating-sphering.csv", index=False)

df_matching.to_csv("checkpoints/moa-matching-sphering.csv", index=False)

In [None]:
# Rename the "Value_95" column in each table so the replicating and matching
# values keep distinct names once the two tables are merged.
df_replicating = df_replicating.rename(columns={"Value_95": "value_95_replicating"})
df_matching = df_matching.rename(columns={"Value_95": "value_95_matching"})

# Join key for the two score tables. Each column is listed exactly once
# ('vs-brightfield' and 'sites' were previously repeated).
merge_cols = [
    'Vendor',
    'Batch',
    'Plate_Map_Name',
    'Assay_Plate_Barcode',
    'Modality',
    'Images_per_well',
    'Binning',
    'Magnification',
    'Number_of_channels',
    'z_plane',
    'Anomaly',
    'spinning-disc',
    'vs-brightfield',
    'sites',
    "BF_Zplanes",
    "dry-immersion",
    "simultaneous-excitation",
    "cell_count",
    "aperture",
    'sphering',
]

# Inner join: only rows present in both tables for every key column are kept.
match_rep_df = pd.merge(df_replicating, df_matching, on=merge_cols, how="inner")
match_rep_df

# Implement QC information into dataset