In [1]:
import os
import glob
import pandas as pd
import numpy as np
#import pycytominer
import pyarrow.parquet as pq

# Make ./output/ the working directory: the parquet files saved later in this
# notebook are written with relative file names.
# NOTE(review): the path is relative to the current directory, so running this
# cell a second time without restarting the kernel tries to enter output/output.
os.chdir('./output/')

In [2]:
def dividePlates(lst, level=6):
    """Return the sorted, de-duplicated plate identifiers found in a list of paths.

    Each path is split on '/' and the component at position ``level`` is taken
    as the identifier.

    Parameters
    ----------
    lst : iterable of str
        File paths using '/' as separator.
    level : int, default 6
        Index of the '/'-separated component to extract. The default matches
        absolute paths below "/share/data/cellprofiler/automation/results/",
        where component 0 is the empty string before the leading slash.

    Returns
    -------
    list of str
        Unique identifiers in ascending order; an empty list for empty input.

    Raises
    ------
    IndexError
        If a path has fewer than ``level + 1`` components.
    """
    return sorted({p.split('/')[level] for p in lst})

In [3]:
# Root of the CellProfiler results tree; only folders whose name contains "P1035" are searched.
path = "/share/data/cellprofiler/automation/results/"
feat_folder = path + '*P1035*'

# One sorted list of feature files per object type, looked up two folder
# levels below every folder matched by `feat_folder`.
cells, cyto, nuclei = (
    sorted(glob.glob(feat_folder + pattern))
    for pattern in ('/*/*/featICF_cells*', '/*/*/featICF_cyto*', '/*/*/featICF_nuclei*')
)

In [6]:
# Unique plate identifiers taken from the cells feature files, skipping the
# first 22 entries of the sorted list.
# NOTE(review): the offset 22 is positional, so the selection shifts if the set
# of matched folders changes - confirm against the plates listed below.
list_of_plates = dividePlates(cells)[22:]

list_of_plates

['P103590', 'P103591', 'P103592', 'P103593', 'P103595', 'P103597']

In [7]:
def _newest_match(paths, plate):
    """Return the path in ``paths`` that contains ``plate`` and has the largest ctime.

    Matching is a plain substring test on the whole path. "Newest" is decided by
    ``os.path.getctime``.
    NOTE(review): on Unix-like systems ctime is the last metadata change, not the
    creation time - confirm this is the intended notion of "latest run".

    Raises
    ------
    ValueError
        If no path contains ``plate``.
    """
    matches = [s for s in paths if plate in s]
    if not matches:
        # max() on an empty list would raise a ValueError that does not say which plate failed.
        raise ValueError("no feature file found for plate {}".format(plate))
    return max(matches, key=os.path.getctime)


# Map each plate to its [nuclei, cytoplasm, cells] feature files; later cells
# read the three paths by position, so the order must stay as is.
d = {}

for plate in list_of_plates:
    d[plate] = [
        _newest_match(nuclei, plate),
        _newest_match(cyto, plate),
        _newest_match(cells, plate),
    ]


d

{'P103590': ['/share/data/cellprofiler/automation/results/P103590/4324/6078/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/P103590/4324/6078/featICF_cytoplasm.parquet',
  '/share/data/cellprofiler/automation/results/P103590/4324/6078/featICF_cells.parquet'],
 'P103591': ['/share/data/cellprofiler/automation/results/P103591/4320/6108/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/P103591/4320/6108/featICF_cytoplasm.parquet',
  '/share/data/cellprofiler/automation/results/P103591/4320/6108/featICF_cells.parquet'],
 'P103592': ['/share/data/cellprofiler/automation/results/P103592/4325/6082/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/P103592/4325/6082/featICF_cytoplasm.parquet',
  '/share/data/cellprofiler/automation/results/P103592/4325/6082/featICF_cells.parquet'],
 'P103593': ['/share/data/cellprofiler/automation/results/P103593/4321/6072/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/

In [8]:
import datetime

# Capture the current wall-clock time once and keep its locale-formatted date
# and time parts; a later cell prints `date` and `time` again.
x = datetime.datetime.now()
date, time = (x.strftime(fmt) for fmt in ("%x", "%X"))

print(date, time)

11/23/23 11:24:58


In [10]:
def _merge_plate_objects(nuclei_path, cytoplasm_path, cells_path):
    """Combine the nuclei, cytoplasm and cells feature tables of one plate.

    Steps, in order:
      1. Read the three parquet files and prefix their columns with
         'Nuclei_', 'Cytoplasm_' and 'Cells_'.
      2. Average all nuclei rows that share barcode, well, site and parent cell,
         and record how many rows were averaged in 'Nuclei_Nucleicount'.
      3. Left-merge cytoplasm onto the aggregated nuclei, then cells onto that,
         matching the parent-cell number with the other tables' ObjectNumber.
      4. Move barcode / well / site to the front as 'Metadata_plate_map_name',
         'Metadata_Well' and 'Metadata_Site', and add an 'ImageID' column.
      5. Drop bookkeeping columns and rename the channel names.

    Parameters
    ----------
    nuclei_path, cytoplasm_path, cells_path : str
        Parquet files holding the per-object tables of one plate.
        NOTE(review): assumes each file has Metadata_Barcode, Metadata_Well,
        Metadata_Site and ObjectNumber columns (plus Parent_cells for nuclei).

    Returns
    -------
    pandas.DataFrame
        One row per (barcode, well, site, parent cell) group of the nuclei table.
    """
    # Local names deliberately differ from the notebook-level `nuclei` / `cells`
    # file lists, so those lists survive and earlier cells stay re-runnable.
    nuclei_df = pd.read_parquet(nuclei_path).add_prefix('Nuclei_').reset_index()
    cytoplasm_df = pd.read_parquet(cytoplasm_path).add_prefix('Cytoplasm_').reset_index()
    cells_df = pd.read_parquet(cells_path).add_prefix('Cells_').reset_index()

    nuclei_keys = ["Nuclei_Metadata_Barcode", "Nuclei_Metadata_Well", "Nuclei_Metadata_Site", "Nuclei_Parent_cells"]
    cytoplasm_keys = ['Cytoplasm_Metadata_Barcode', 'Cytoplasm_Metadata_Well', "Cytoplasm_Metadata_Site", "Cytoplasm_ObjectNumber"]
    cells_keys = ['Cells_Metadata_Barcode', 'Cells_Metadata_Well', "Cells_Metadata_Site", "Cells_ObjectNumber"]

    #------------------- merge NUCLEI CYTOPLASM and CELL objects ----------------#

    # step 1: take the mean over multiple nuclei belonging to one cell and add a
    # column with the nuclei count per cell. File/path name columns are removed
    # first so they are not part of the aggregation.
    nuclei_df = nuclei_df.loc[:, ~nuclei_df.columns.str.startswith(('Nuclei_FileName_', 'Nuclei_PathName_'))]
    per_cell = nuclei_df.groupby(nuclei_keys)  # built once, used for both count and mean
    nuclei_count = per_cell.size().reset_index(name='Nuclei_Nucleicount')
    nuclei_grouped = per_cell.mean().reset_index()
    nuclei_merged = pd.merge(nuclei_grouped, nuclei_count, on=nuclei_keys)

    # step 2: merge nuclei and cytoplasm objects
    merged = pd.merge(nuclei_merged, cytoplasm_df, how='left', left_on=nuclei_keys, right_on=cytoplasm_keys)

    # step 3: merge cells objects
    merged = pd.merge(merged, cells_df, how='left', left_on=cytoplasm_keys, right_on=cells_keys)

    #------------------- reorder to bring meta columns to front  --------------------------------#
    first_column = merged.pop('Cells_Metadata_Barcode')
    second_column = merged.pop('Cells_Metadata_Well')
    third_column = merged.pop('Cells_Metadata_Site')

    merged.insert(0, 'Metadata_plate_map_name', first_column)
    merged.insert(1, 'Metadata_Well', second_column)
    merged.insert(2, 'Metadata_Site', third_column)

    # ------------------ add image identifier for metadata and QC -----------------------------#
    # The site is rendered as text and a trailing ".0" is stripped from it.
    merged['ImageID'] = merged['Metadata_plate_map_name'] + "_" + merged['Metadata_Well'] + "_s" + merged['Metadata_Site'].astype(str).replace(r'\.0$', '', regex=True)

    # clean up: drop every column whose name contains one of these fragments
    merged = merged.loc[:, ~merged.columns.str.contains('Unnamed|index|Cytoplasm_Meta|Cells_Meta|Nuclei_Meta|FileName|PathName|_ImageNumber|Location|Children|Parent|Object_Number|ObjectNumber')]

    # change to BROAD names; regex=False pins literal matching, because the
    # default of `regex` differs between pandas versions
    for channel, broad_name in (
        ("illumSYTO", "RNA"),
        ("illumCONC", "ER"),
        ("illumHOECHST", "DNA"),
        ("illumPHAandWGA", "AGP"),
        ("illumMITO", "Mito"),
    ):
        merged.columns = merged.columns.str.replace(channel, broad_name, regex=False)

    return merged


collected_df = []

for key in d:
    # d[key] holds the nuclei, cytoplasm and cells file paths, in that order
    nuclei_path, cytoplasm_path, cells_path = d[key][:3]
    new_df = _merge_plate_objects(nuclei_path, cytoplasm_path, cells_path)

    # label with the dictionary key itself, i.e. the plate actually being processed
    print("plate", key, "contains a total of", len(new_df), "rows")

    new_df.to_parquet("level1_agg_{}.parquet".format(key))  # save as parquet

    # save to master df
    collected_df.append(new_df)

dfs = pd.concat(collected_df)
# NOTE(review): concat keeps each plate's own row labels, so drop(0) removes the
# row labelled 0 from every plate, not a single row - confirm this is intended.
dfs = dfs.drop(0)
dfs.to_parquet("level1_merge.parquet", index=False)  # save as parquet

plate P103590 contains a total of 281790 rows
plate P103591 contains a total of 367883 rows
plate P103592 contains a total of 285003 rows
plate P103593 contains a total of 370691 rows
plate P103595 contains a total of 399346 rows
plate P103597 contains a total of 368550 rows


In [11]:
# `date` and `time` were captured before the merge cell ran, so on their own
# they only repeat the start time. Print the current time on a second line so
# the duration of the run can be read off.
finished = datetime.datetime.now()
print(date, time)
print(finished.strftime("%x"), finished.strftime("%X"))

11/23/23 11:24:58
