# RAM-12,13,14
#### Jonathan Ramos 5/29/2024

This set of experiments was similar to the older RAM groups, but rather than assessing the differences in ensembles between last train and VR5, here we examine the ensembles on the reactivation (FR1 or VR5) and the CUE (FR1) reinstatement in ABC or Vehicle treated animals. 

In [1]:
import numpy as np
import pandas as pd
import glob
import sys

# loading some functions we wrote before
sys.path.append("/Users/jonathanramos/Desktop/LRI/Image ROI Data Wrangling/")
from clean import *
from norm import *
from count import *


# Cleaning, Wrangling Data
## loading data, stitching sets together

From examining the sets we've determined that the columns corresponding to stains are as follows:
- cFos: cFos
- MicrogliaCortex: EGFP
- 8-oxo-DG: mKate

colocalized columns are always presented in monotonically decreasing order such that cFos > EGFP > mKate. That is, from left to right, coloc col ordering is always in the order: cFos, then EGFP, then mKate.
For example: if an EGFP double labeled with mKate csv contains only 'hand-drawn' labels, then we can infer that the first coloc col corresponds to EGFP and the second to mKate, and we may relabel accordingly.

Using this information we can appropriately relabel all columns designated as "hand-drawn" by inferring the correct label based on the column's position.

THE HAND-DRAWN LABEL IS AMBIGUOUS. PLEASE REMOVE THIS IN A FUTURE UPDATE

In [2]:
# Read every per-image CSV. glob.glob returns paths in arbitrary order, so
# sort them: list positions (e.g. df_triple[0]) and any later "keep first"
# de-duplication then give the same result on every run and machine.
csv_paths = sorted(glob.glob('colocalized data/*/*.csv'))
df_data = [pd.read_csv(path) for path in csv_paths]


def _n_coloc_cols(frame):
    # number of columns whose name contains 'coloc'
    return sum('coloc' in col for col in frame.columns)


# split the files by how many colocalization columns they carry
df_double = [df for df in df_data if _n_coloc_cols(df) == 2]
df_triple = [df for df in df_data if _n_coloc_cols(df) == 3]

# inspect col labels for coloc cols; we only want to check cols WITHOUT 'hand-drawn'
labeled = [df for df in df_triple if not any('hand-drawn' in col for col in df.columns)]
for df in labeled:
    print([col for col in df.columns if 'coloc' in col])

def preprocessing(coloc_dfs):
    '''
    small fn to automate the preprocessing steps that must be done on each
    df before we can concat.

    For every frame in coloc_dfs:
        1. every space character is removed from the column names
           (note: ALL spaces, not only leading ones); this renames the
           columns of the frames passed in, in place
        2. rows with a null 'mean-background' (no intensity measurement)
           are dropped
        3. every space character is removed from the 'filename' values

    args:
        coloc_dfs: list of pd.core.DataFrame
    return:
        list of pd.core.DataFrame; new, independent frames (same order as
        the input) holding only the rows that have an intensity measurement
    '''
    # remove whitespace from col names
    for df in coloc_dfs:
        df.columns = [col.replace(' ', '') for col in df.columns]

    cleaned = []
    for df in coloc_dfs:
        # Boolean filtering returns an object pandas may still treat as a
        # slice of `df`, and writing to it (even through .loc) can raise
        # SettingWithCopyWarning. Take an explicit copy before writing.
        kept = df[df['mean-background'].notna()].copy()

        # remove whitespace from filenames
        kept['filename'] = kept['filename'].str.replace(' ', '', regex=False)
        cleaned.append(kept)

    return cleaned

# run the shared clean-up over both collections of frames
df_double, df_triple = (preprocessing(frames) for frames in (df_double, df_triple))

# preview the first triple-colocalization frame
df_triple[0]

['coloc w/ c-Fos', 'coloc w/ Microglia Cortex', 'coloc w/ 8-oxo-dG']
['coloc w/ c-Fos', 'coloc w/ Microglia Cortex', 'coloc w/ 8-oxo-dG']
['coloc w/ c-Fos', 'coloc w/ Microglia Cortex', 'coloc w/ 8-oxo-dG']
['coloc w/ c-Fos', 'coloc w/ Microglia Cortex', 'coloc w/ 8-oxo-dG']


Unnamed: 0,stain,colocw/hand-drawn,colocw/hand-drawn.1,colocw/hand-drawn.2,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,feret_angle,feret_min,circularity,aspect_ratio,roundness,solidity,skewness,kurtosis,filename,analysis_date
0,c-Fos hand-drawn,,0-FFF-00030,0-FFF-00019,0-FFF-00140,145.01,276.29,61.0,322.8468,411.2940,...,0.0,10.0,0.7767,1.0000,0.7625,0.7262,0.3641,-0.7820,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
1,c-Fos hand-drawn,,0-01c-00026,0-FFF-00017,0-FFF-00107,351.24,319.62,77.0,322.8468,728.2460,...,0.0,10.0,0.9804,1.0000,0.9625,0.9167,-0.1310,-1.1841,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
2,c-Fos hand-drawn,,0-01c-00007,0-FFF-00015,0-FFF-00090,465.98,318.40,41.0,322.8468,287.8058,...,0.0,10.0,0.5220,1.0000,0.5125,0.4881,1.1532,1.6963,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
3,c-Fos hand-drawn,,0-FFF-00029,0-FFF-00030,0-FFF-00058,22.36,277.28,40.0,322.8468,273.6023,...,0.0,10.0,0.5093,1.0000,0.5000,0.4762,0.5347,-1.1035,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
4,c-Fos hand-drawn,-,-,-,0-FFF-00182,170.88,200.20,79.0,322.8468,345.7466,...,0.0,10.0,1.0000,1.0000,0.9875,0.9405,0.1981,-0.6553,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,8-oxo-dG hand-drawn,-,-,-,0-FFF-00005,247.05,206.49,65.0,38.9865,36.9134,...,0.0,12.0,0.4883,1.1442,0.4177,0.4924,0.8711,0.0252,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
311,8-oxo-dG hand-drawn,-,-,-,0-FFF-00004,306.93,80.71,87.0,38.9865,61.4913,...,0.0,12.0,0.6535,1.1442,0.5591,0.6591,0.9822,1.4934,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
312,8-oxo-dG hand-drawn,-,-,-,0-FFF-00003,459.14,211.04,89.0,38.9865,87.0218,...,0.0,12.0,0.6685,1.1442,0.5719,0.6742,0.7031,-0.5700,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
313,8-oxo-dG hand-drawn,-,-,-,0-FFF-00002,342.62,275.87,72.0,38.9865,44.4182,...,0.0,12.0,0.5408,1.1442,0.4627,0.5455,0.6865,-0.2463,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024


### renaming stain type labels

In [3]:
def infer_stains(df):
    '''
    Overwrite the 'stain' column of df with a label inferred from the
    filename. The filename is split on "_" and the last piece selects
    the stain:
        files ending in 2.tif are cFos
        files ending in 3.tif are EGFP
        files ending in 4.tif are mKate

    The column is written on the frame that was passed in, and that same
    frame is returned. A filename whose last piece is not one of the three
    suffixes above raises KeyError.

    args:
        df(N,M): pd.core.DataFrame
    return:
        df(N,M) pd.core.DataFrame; replaced with new stain col
    '''
    suffix_to_stain = {'2.tif': 'cFos', '3.tif': 'EGFP', '4.tif': 'mKate'}

    # list of "_"-separated pieces per filename; the last piece is the channel suffix
    pieces = df.filename.str.split('_')
    df['stain'] = pieces.apply(lambda parts: suffix_to_stain[parts[-1]])

    return df

# relabel the stain column of every frame from its filename
df_double = list(map(infer_stains, df_double))
df_triple = list(map(infer_stains, df_triple))

# preview the first triple-colocalization frame
df_triple[0]

Unnamed: 0,stain,colocw/hand-drawn,colocw/hand-drawn.1,colocw/hand-drawn.2,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,feret_angle,feret_min,circularity,aspect_ratio,roundness,solidity,skewness,kurtosis,filename,analysis_date
0,cFos,,0-FFF-00030,0-FFF-00019,0-FFF-00140,145.01,276.29,61.0,322.8468,411.2940,...,0.0,10.0,0.7767,1.0000,0.7625,0.7262,0.3641,-0.7820,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
1,cFos,,0-01c-00026,0-FFF-00017,0-FFF-00107,351.24,319.62,77.0,322.8468,728.2460,...,0.0,10.0,0.9804,1.0000,0.9625,0.9167,-0.1310,-1.1841,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
2,cFos,,0-01c-00007,0-FFF-00015,0-FFF-00090,465.98,318.40,41.0,322.8468,287.8058,...,0.0,10.0,0.5220,1.0000,0.5125,0.4881,1.1532,1.6963,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
3,cFos,,0-FFF-00029,0-FFF-00030,0-FFF-00058,22.36,277.28,40.0,322.8468,273.6023,...,0.0,10.0,0.5093,1.0000,0.5000,0.4762,0.5347,-1.1035,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
4,cFos,-,-,-,0-FFF-00182,170.88,200.20,79.0,322.8468,345.7466,...,0.0,10.0,1.0000,1.0000,0.9875,0.9405,0.1981,-0.6553,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,mKate,-,-,-,0-FFF-00005,247.05,206.49,65.0,38.9865,36.9134,...,0.0,12.0,0.4883,1.1442,0.4177,0.4924,0.8711,0.0252,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
311,mKate,-,-,-,0-FFF-00004,306.93,80.71,87.0,38.9865,61.4913,...,0.0,12.0,0.6535,1.1442,0.5591,0.6591,0.9822,1.4934,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
312,mKate,-,-,-,0-FFF-00003,459.14,211.04,89.0,38.9865,87.0218,...,0.0,12.0,0.6685,1.1442,0.5719,0.6742,0.7031,-0.5700,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
313,mKate,-,-,-,0-FFF-00002,342.62,275.87,72.0,38.9865,44.4182,...,0.0,12.0,0.5408,1.1442,0.4627,0.5455,0.6865,-0.2463,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024


### renaming coloc col names

In [4]:
def rename_coloc(df):
    '''
    Rename the positional colocalization columns after the stains found
    in df.

    The distinct values of df.stain are taken in order of first appearance;
    the columns at positions 1..k (k = number of distinct stains) are
    renamed, in that order, to 'coloc_w_<stain>'.

    The known stains that are present (cFos, EGFP, mKate) are expected to
    first appear in exactly that relative order. Every present pair is
    checked; if any is out of order, 'Check your ordering assumption' and
    the stain -> first-position mapping are printed. The renaming is
    performed either way.

    args:
        df: pd.core.DataFrame with a 'stain' column at position 0 followed
            by the colocalization columns
    return:
        pd.core.DataFrame; copy of df with the coloc columns renamed
    '''
    stains = list(df.stain.unique())
    first_seen = {stain: position for position, stain in enumerate(stains)}
    coloc_cols = [f'coloc_w_{stain}' for stain in stains]

    # check that our ordering is correct: sorting the present known stains
    # by where they first appear must reproduce the canonical order
    expected = [stain for stain in ('cFos', 'EGFP', 'mKate') if stain in first_seen]
    observed = sorted(expected, key=first_seen.get)
    if observed != expected:
        print('Check your ordering assumption')
        print(first_seen)

    # replace colnames (positions 1..k) in a single rename
    targets = df.columns[1:1 + len(coloc_cols)]
    return df.rename(columns=dict(zip(targets, coloc_cols)))

# give every frame stain-named coloc columns, then preview the first triple frame
df_double_renamed = list(map(rename_coloc, df_double))
df_triple_renamed = list(map(rename_coloc, df_triple))
df_triple_renamed[0]

Unnamed: 0,stain,coloc_w_cFos,coloc_w_EGFP,coloc_w_mKate,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,feret_angle,feret_min,circularity,aspect_ratio,roundness,solidity,skewness,kurtosis,filename,analysis_date
0,cFos,,0-FFF-00030,0-FFF-00019,0-FFF-00140,145.01,276.29,61.0,322.8468,411.2940,...,0.0,10.0,0.7767,1.0000,0.7625,0.7262,0.3641,-0.7820,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
1,cFos,,0-01c-00026,0-FFF-00017,0-FFF-00107,351.24,319.62,77.0,322.8468,728.2460,...,0.0,10.0,0.9804,1.0000,0.9625,0.9167,-0.1310,-1.1841,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
2,cFos,,0-01c-00007,0-FFF-00015,0-FFF-00090,465.98,318.40,41.0,322.8468,287.8058,...,0.0,10.0,0.5220,1.0000,0.5125,0.4881,1.1532,1.6963,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
3,cFos,,0-FFF-00029,0-FFF-00030,0-FFF-00058,22.36,277.28,40.0,322.8468,273.6023,...,0.0,10.0,0.5093,1.0000,0.5000,0.4762,0.5347,-1.1035,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
4,cFos,-,-,-,0-FFF-00182,170.88,200.20,79.0,322.8468,345.7466,...,0.0,10.0,1.0000,1.0000,0.9875,0.9405,0.1981,-0.6553,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,mKate,-,-,-,0-FFF-00005,247.05,206.49,65.0,38.9865,36.9134,...,0.0,12.0,0.4883,1.1442,0.4177,0.4924,0.8711,0.0252,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
311,mKate,-,-,-,0-FFF-00004,306.93,80.71,87.0,38.9865,61.4913,...,0.0,12.0,0.6535,1.1442,0.5591,0.6591,0.9822,1.4934,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
312,mKate,-,-,-,0-FFF-00003,459.14,211.04,89.0,38.9865,87.0218,...,0.0,12.0,0.6685,1.1442,0.5719,0.6742,0.7031,-0.5700,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024
313,mKate,-,-,-,0-FFF-00002,342.62,275.87,72.0,38.9865,44.4182,...,0.0,12.0,0.5408,1.1442,0.4627,0.5455,0.6865,-0.2463,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024


### build rat_n, image_name, treatment, dox, react, cols

In [5]:
# load in key, standardize cols and labels
df_key = pd.read_csv('RAM Ensembles Complete Cohort Key.csv')
df_key['Group'] = df_key.Group.replace({'RAM-14_a': 'RAM-14a', 'RAM-14_b': 'RAM-14b'})
# rat_n is "<Group>_<Rat ID>", the same prefix that is parsed out of the image filenames
df_key['rat_n'] = df_key['Group'] + '_' + df_key['Rat ID'].astype(str)
df_key['treatment'] = df_key['ABC/Veh'].str.upper()
df_key['dox'] = df_key['Dox Tx'].str.replace(' ', '_').str.upper()
df_key['react'] = df_key['React'].str.upper()

# select out the cols we are interested in; .copy() so the frame we extend
# below is independent of the raw key rather than a slice of it
df_key = df_key[['rat_n', 'react', 'treatment', 'dox']].copy()

### add missing rat information; this was confirmed by AG
# the extra rat is placed first and the index is renumbered from 0
missing_rat = pd.DataFrame([['RAM-13_7', 'NR', 'ABC', 'OFF_DOX']], columns=df_key.columns)
df_key = pd.concat([missing_rat, df_key], ignore_index=True)


def get_cols(df, df_key):
    '''
    Parse rat and image identifiers out of the filename and attach the
    per-rat metadata from the key.

    Two columns are added to df (on the frame passed in):
        rat_n:      first two "_"-separated pieces of the filename
        image_name: the filename without its last "_"-separated piece

    Every rat_n found in df must be present in df_key.rat_n. Frames holding
    several rats, or no rows at all, are accepted.

    args:
        df: pd.core.DataFrame with a 'filename' column
        df_key: pd.core.DataFrame with a 'rat_n' column plus the metadata
            columns to attach
    return:
        pd.core.DataFrame; df left-merged with df_key on rat_n
    raises:
        ValueError: if any rat_n parsed from df is missing from df_key
    '''
    # parse filename to build rat_n, image_name
    pieces = df.filename.str.split('_')
    df['rat_n'] = pieces.str[:2].str.join('_')
    df['image_name'] = pieces.str[:-1].str.join('_')

    # check that every target rat is in df_key; otherwise the left merge
    # below would silently leave its metadata empty
    known_rats = set(df_key.rat_n)
    missing = [rat for rat in df['rat_n'].unique() if rat not in known_rats]
    if missing:
        raise ValueError(f'rat_n value(s) not found in df_key: {missing}')

    # merge with key on rat_n
    return df.merge(df_key, how='left', on='rat_n')

# attach rat / image identifiers and key metadata to every frame
df_double_parsed = [get_cols(frame, df_key) for frame in df_double_renamed]
df_triple_parsed = [get_cols(frame, df_key) for frame in df_triple_renamed]

# preview the first triple-colocalization frame
df_triple_parsed[0]

Unnamed: 0,stain,coloc_w_cFos,coloc_w_EGFP,coloc_w_mKate,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,solidity,skewness,kurtosis,filename,analysis_date,rat_n,image_name,react,treatment,dox
0,cFos,,0-FFF-00030,0-FFF-00019,0-FFF-00140,145.01,276.29,61.0,322.8468,411.2940,...,0.7262,0.3641,-0.7820,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
1,cFos,,0-01c-00026,0-FFF-00017,0-FFF-00107,351.24,319.62,77.0,322.8468,728.2460,...,0.9167,-0.1310,-1.1841,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
2,cFos,,0-01c-00007,0-FFF-00015,0-FFF-00090,465.98,318.40,41.0,322.8468,287.8058,...,0.4881,1.1532,1.6963,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
3,cFos,,0-FFF-00029,0-FFF-00030,0-FFF-00058,22.36,277.28,40.0,322.8468,273.6023,...,0.4762,0.5347,-1.1035,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
4,cFos,-,-,-,0-FFF-00182,170.88,200.20,79.0,322.8468,345.7466,...,0.9405,0.1981,-0.6553,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,mKate,-,-,-,0-FFF-00005,247.05,206.49,65.0,38.9865,36.9134,...,0.4924,0.8711,0.0252,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
311,mKate,-,-,-,0-FFF-00004,306.93,80.71,87.0,38.9865,61.4913,...,0.6591,0.9822,1.4934,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
312,mKate,-,-,-,0-FFF-00003,459.14,211.04,89.0,38.9865,87.0218,...,0.6742,0.7031,-0.5700,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
313,mKate,-,-,-,0-FFF-00002,342.62,275.87,72.0,38.9865,44.4182,...,0.5455,0.6865,-0.2463,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX


### relabeling roi_ids

In [6]:
# scratch deep copy of the first double-coloc frame
# NOTE(review): this name does not appear to be read before it is reassigned further down — confirm before removing
df = df_double_parsed[0].copy(deep=True)
# rank of each stain (cFos, then EGFP, then mKate); used as the sort key that
# orders roi_ids inside grouping tuples by the stain suffix after the last "_"
sort_order = {'cFos':0, 'EGFP':1, 'mKate':2}

def update_roi_ids(df):
    '''
    Make roi ids self-describing by appending the stain they belong to.

    On a copy of df:
        - cells that contain only '-' are replaced with np.nan
        - roi_id becomes '<roi_id>_<stain>' using the row's stain
        - every non-null value in a column whose name contains 'coloc'
          becomes '<value>_<suffix>', where <suffix> is the part of the
          column name after its last "_" (e.g. coloc_w_EGFP -> EGFP);
          null values are left as they are

    Built column-wise instead of with a row-wise DataFrame.apply, so it
    also works on frames without rows.

    args:
        df: pd.core.DataFrame with 'roi_id', 'stain' and coloc columns
    return:
        pd.core.DataFrame; relabeled copy, the input is not modified
    '''
    # fill cells only containing '-' with np.nan
    df = df.replace('-', np.nan)

    # add stain type label to end of roi_id
    # (str.join keeps the TypeError for a non-string id instead of writing 'nan_<stain>')
    df['roi_id'] = ['_'.join(pair) for pair in zip(df['roi_id'], df['stain'])]

    # add stain type label to end of coloc roi_ids if coloc stain type label is not null
    coloc_cols = [col for col in df.columns if 'coloc' in col]
    for col in coloc_cols:
        suffix = col.split('_')[-1]
        df[col] = df[col].map(
            lambda value, suffix=suffix: value if pd.isnull(value) else '_'.join((value, suffix))
        )

    return df

# suffix every roi id with its stain, then preview the first triple frame
df_double_relabel = list(map(update_roi_ids, df_double_parsed))
df_triple_relabel = list(map(update_roi_ids, df_triple_parsed))

df_triple_relabel[0]

Unnamed: 0,stain,coloc_w_cFos,coloc_w_EGFP,coloc_w_mKate,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,solidity,skewness,kurtosis,filename,analysis_date,rat_n,image_name,react,treatment,dox
0,cFos,,0-FFF-00030_EGFP,0-FFF-00019_mKate,0-FFF-00140_cFos,145.01,276.29,61.0,322.8468,411.2940,...,0.7262,0.3641,-0.7820,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
1,cFos,,0-01c-00026_EGFP,0-FFF-00017_mKate,0-FFF-00107_cFos,351.24,319.62,77.0,322.8468,728.2460,...,0.9167,-0.1310,-1.1841,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
2,cFos,,0-01c-00007_EGFP,0-FFF-00015_mKate,0-FFF-00090_cFos,465.98,318.40,41.0,322.8468,287.8058,...,0.4881,1.1532,1.6963,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
3,cFos,,0-FFF-00029_EGFP,0-FFF-00030_mKate,0-FFF-00058_cFos,22.36,277.28,40.0,322.8468,273.6023,...,0.4762,0.5347,-1.1035,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
4,cFos,,,,0-FFF-00182_cFos,170.88,200.20,79.0,322.8468,345.7466,...,0.9405,0.1981,-0.6553,RAM-13_8_PFC_3.7_E_2.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,mKate,,,,0-FFF-00005_mKate,247.05,206.49,65.0,38.9865,36.9134,...,0.4924,0.8711,0.0252,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
311,mKate,,,,0-FFF-00004_mKate,306.93,80.71,87.0,38.9865,61.4913,...,0.6591,0.9822,1.4934,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
312,mKate,,,,0-FFF-00003_mKate,459.14,211.04,89.0,38.9865,87.0218,...,0.6742,0.7031,-0.5700,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX
313,mKate,,,,0-FFF-00002_mKate,342.62,275.87,72.0,38.9865,44.4182,...,0.5455,0.6865,-0.2463,RAM-13_8_PFC_3.7_E_4.tif,Thu Feb 01 12:44:37 PST 2024,RAM-13_8,RAM-13_8_PFC_3.7_E,FR1,VEH,OFF_DOX


### aggregate across images, build adjacency list

In [7]:
def get_adjacency(df, coloc_type, stain_order=None):
    '''
    Build the adjacency ('grouping') tuple of every row and drop the coloc
    columns.

    For each row, the non-null values found at column positions 1..k+1
    (k = 2 for 'double', 3 for 'triple') are collected and sorted by the
    rank of their stain suffix (the text after the last "_"). The callers
    in this notebook pass frames where those positions hold the k coloc
    columns followed by roi_id.

    The 'grouping' column is added to the frame passed in; the returned
    frame is that frame without the columns whose name contains 'coloc'.

    args:
        df: pd.core.DataFrame laid out as described above
        coloc_type: 'double' or 'triple'
        stain_order: optional dict mapping stain suffix -> rank; defaults
            to the module-level sort_order
    return:
        pd.core.DataFrame; df without coloc columns, with 'grouping' added
    raises:
        ValueError: if coloc_type is neither 'double' nor 'triple'
    '''
    n_coloc_by_type = {'double': 2, 'triple': 3}
    if coloc_type not in n_coloc_by_type:
        raise ValueError('coloc_type must be either "double" or "triple"')

    order = sort_order if stain_order is None else stain_order

    def stain_rank(roi_id):
        return order[roi_id.split('_')[-1]]

    # positions 1..k are read as coloc columns, position k+1 as roi_id
    positions = list(range(1, n_coloc_by_type[coloc_type] + 2))
    rows = df.iloc[:, positions].itertuples(index=False, name=None)
    groupings = [
        tuple(sorted((value for value in row if not pd.isnull(value)), key=stain_rank))
        for row in rows
    ]
    # explicit object Series so each tuple stays a single cell
    df['grouping'] = pd.Series(groupings, index=df.index, dtype=object)

    return df.drop(columns=[col for col in df.columns if 'coloc' in col])

# adjacency tuples for every double-coloc frame, stacked into one long frame
df_double_adj = pd.concat(
    [get_adjacency(frame, coloc_type='double') for frame in df_double_relabel]
)
df_double_adj

Unnamed: 0,stain,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,mode_intensity,stdev,...,skewness,kurtosis,filename,analysis_date,rat_n,image_name,react,treatment,dox,grouping
0,cFos,0-FFF-00289_cFos,359.55,415.47,91.0,268.9914,531.4795,513.3634,221.8932,238.1019,...,0.2173,-1.1709,RAM-13_7_PFC_3.7_D_2.tif,Thu Feb 01 12:37:52 PST 2024,RAM-13_7,RAM-13_7_PFC_3.7_D,NR,ABC,OFF_DOX,"(0-FFF-00289_cFos, 0-01c-00003_EGFP)"
1,cFos,0-FFF-00284_cFos,359.08,168.89,71.0,268.9914,239.4242,229.0855,207.7186,51.4417,...,0.6234,-0.3682,RAM-13_7_PFC_3.7_D_2.tif,Thu Feb 01 12:37:52 PST 2024,RAM-13_7,RAM-13_7_PFC_3.7_D,NR,ABC,OFF_DOX,"(0-FFF-00284_cFos, 0-01c-00042_EGFP)"
2,cFos,0-FFF-00283_cFos,455.98,177.70,68.0,268.9914,235.5893,231.9074,174.4994,51.5495,...,0.7101,-0.0190,RAM-13_7_PFC_3.7_D_2.tif,Thu Feb 01 12:37:52 PST 2024,RAM-13_7,RAM-13_7_PFC_3.7_D,NR,ABC,OFF_DOX,"(0-FFF-00283_cFos, 0-01c-00035_EGFP)"
3,cFos,0-FFF-00280_cFos,69.38,386.92,67.0,268.9914,252.8609,249.5636,246.0562,53.1270,...,0.1612,-0.7985,RAM-13_7_PFC_3.7_D_2.tif,Thu Feb 01 12:37:52 PST 2024,RAM-13_7,RAM-13_7_PFC_3.7_D,NR,ABC,OFF_DOX,"(0-FFF-00280_cFos, 0-FFF-00256_EGFP)"
4,cFos,0-FFF-00277_cFos,216.91,145.75,54.0,268.9914,271.4481,274.0393,205.3216,65.4765,...,0.2494,-0.8877,RAM-13_7_PFC_3.7_D_2.tif,Thu Feb 01 12:37:52 PST 2024,RAM-13_7,RAM-13_7_PFC_3.7_D,NR,ABC,OFF_DOX,"(0-FFF-00277_cFos, 0-FFF-00245_EGFP)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,EGFP,0-01c-00027_EGFP,432.23,144.24,148.0,1.2013,33.9209,31.894,37.9127,21.1863,...,0.399,-0.7197,RAM-14a_8_PFC_3.5_C_3.tif,Thu Feb 01 13:39:15 PST 2024,RAM-14a_8,RAM-14a_8_PFC_3.5_C,FR1,VEH,OFF_DOX,"(0-005-00063_cFos, 0-01c-00027_EGFP)"
222,EGFP,0-FFF-00053_EGFP,223.92,25.3,96.0,1.2013,6.5439,5.9674,6.9663,3.6182,...,0.6187,0.0251,RAM-14a_8_PFC_3.5_C_3.tif,Thu Feb 01 13:39:15 PST 2024,RAM-14a_8,RAM-14a_8_PFC_3.5_C,FR1,VEH,OFF_DOX,"(0-005-00046_cFos, 0-FFF-00053_EGFP)"
223,EGFP,0-FFF-00070_EGFP,91.04,375.7,80.0,1.2013,3.4551,2.975,1.9619,2.1140,...,1.1027,0.8299,RAM-14a_8_PFC_3.5_C_3.tif,Thu Feb 01 13:39:15 PST 2024,RAM-14a_8,RAM-14a_8_PFC_3.5_C,FR1,VEH,OFF_DOX,"(0-005-00024_cFos, 0-FFF-00070_EGFP)"
224,EGFP,0-FFF-00056_EGFP,367.91,49.18,90.0,1.2013,13.191,11.9709,1.0509,9.2798,...,0.8227,0.6604,RAM-14a_8_PFC_3.5_C_3.tif,Thu Feb 01 13:39:15 PST 2024,RAM-14a_8,RAM-14a_8_PFC_3.5_C,FR1,VEH,OFF_DOX,"(0-005-00023_cFos, 0-FFF-00056_EGFP)"


### aggregate adjacency tuples, merge with remaining data cols

In [8]:
def _canonical_grouping(roi_ids):
    # de-duplicate, order alphabetically, then (stably) by stain rank
    return tuple(sorted(sorted(set(roi_ids)), key=lambda rid: sort_order[rid.split('_')[-1]]))


# an roi can be listed in several files of the same image; summing its
# grouping tuples concatenates them, and _canonical_grouping tidies the result
agg_groupings = (
    df_double_adj.groupby(['image_name', 'roi_id'])['grouping'].sum()
    .apply(_canonical_grouping)
    .reset_index()
    .rename(columns={'grouping': 'agg_grouping'})
)

# bring back the remaining data cols, keeping the first row seen per (image, roi)
first_rows = df_double_adj.drop_duplicates(subset=['image_name', 'roi_id'])
df_double_agg_groupings = agg_groupings.merge(
    first_rows, on=['image_name', 'roi_id'], how='left'
)

df_double_agg_groupings

Unnamed: 0,image_name,roi_id,agg_grouping,stain,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,solidity,skewness,kurtosis,filename,analysis_date,rat_n,react,treatment,dox,grouping
0,RAM-12_11_PFC_3.5_A,0-002-00000_mKate,"(0-01c-00028_EGFP, 0-002-00000_mKate)",mKate,490.69,263.66,91.0,54.2523,130.8234,133.4745,...,0.9286,0.0462,-0.9715,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)"
1,RAM-12_11_PFC_3.5_A,0-002-00001_mKate,"(0-002-00001_mKate,)",mKate,359.15,417.22,77.0,54.2523,252.4054,263.5753,...,0.9167,0.1464,-1.004,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-002-00001_mKate,)"
2,RAM-12_11_PFC_3.5_A,0-002-00002_mKate,"(0-01c-00047_EGFP, 0-002-00002_mKate)",mKate,149.95,354.17,125.0,54.2523,910.3442,837.9077,...,0.9470,0.2517,-1.4962,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00047_EGFP, 0-002-00002_mKate)"
3,RAM-12_11_PFC_3.5_A,0-002-00003_mKate,"(0-01c-00128_EGFP, 0-002-00003_mKate)",mKate,193.05,160.84,80.0,54.2523,280.5944,262.2604,...,1.0526,0.3551,-0.7112,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00128_EGFP, 0-002-00003_mKate)"
4,RAM-12_11_PFC_3.5_A,0-002-00004_mKate,"(0-FFF-00238_EGFP, 0-002-00004_mKate)",mKate,221.09,254.73,90.0,54.2523,237.5106,230.533,...,0.9184,0.1787,-1.1646,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-FFF-00238_EGFP, 0-002-00004_mKate)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41248,RAM-14b_8_PFC_3.9_B,0-FFF-00229_cFos,"(0-FFF-00229_cFos,)",cFos,31.40,410.3,75.0,163.3016,437.2036,495.5051,...,0.8929,-0.5086,-1.0140,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00229_cFos,)"
41249,RAM-14b_8_PFC_3.9_B,0-FFF-00230_cFos,"(0-FFF-00230_cFos, 0-FFF-00067_EGFP)",cFos,23.08,426.77,80.0,163.3016,311.7118,323.7442,...,0.9524,-0.4302,-0.0583,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00230_cFos,)"
41250,RAM-14b_8_PFC_3.9_B,0-FFF-00231_cFos,"(0-FFF-00231_cFos,)",cFos,54.17,439.03,77.0,163.3016,646.9208,672.5289,...,0.9167,-0.1466,-0.9051,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00231_cFos,)"
41251,RAM-14b_8_PFC_3.9_B,0-FFF-00232_cFos,"(0-FFF-00232_cFos,)",cFos,30.05,470.72,75.0,163.3016,459.072,463.2822,...,0.8929,-0.0804,-0.2764,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00232_cFos,)"


### check that previously offending queries give valid results

In [9]:
# set types
for numeric_col in ('CoM_x', 'CoM_y', 'background', 'mean_intensity'):
    df_double_agg_groupings[numeric_col] = df_double_agg_groupings[numeric_col].astype(float)

# check previously offending queries
offending = [
    ('RAM-13_1_PFC_3.7_A', '0-005-00024_cFos'),
    ('RAM-13_1_PFC_3.7_A', '0-002-00076_mKate'),
    ('RAM-14a_3_PFC_3.2_C', '0-FFF-00054_cFos'),
]
pd.concat([
    df_double_agg_groupings.query(f"image_name == '{image}' and roi_id == '{roi}'")
    for image, roi in offending
])

Unnamed: 0,image_name,roi_id,agg_grouping,stain,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,solidity,skewness,kurtosis,filename,analysis_date,rat_n,react,treatment,dox,grouping
13462,RAM-13_1_PFC_3.7_A,0-005-00024_cFos,"(0-005-00024_cFos, 0-FFF-00190_mKate)",cFos,481.17,150.81,78.0,229.0655,382.5926,337.9207,...,0.7959,0.7495,-0.6422,RAM-13_1_PFC_3.7_A_2.tif,Thu Feb 01 12:19:20 PST 2024,RAM-13_1,NR,VEH,OFF_DOX,"(0-005-00024_cFos, 0-FFF-00190_mKate)"
13398,RAM-13_1_PFC_3.7_A,0-002-00076_mKate,"(0-01c-00053_EGFP, 0-01c-00105_EGFP, 0-002-000...",mKate,341.31,97.87,95.0,146.8669,300.5107,310.3995,...,0.9694,-0.3685,-0.3652,RAM-13_1_PFC_3.7_A_4.tif,Thu Feb 01 12:19:31 PST 2024,RAM-13_1,NR,VEH,OFF_DOX,"(0-01c-00105_EGFP, 0-002-00076_mKate)"
26549,RAM-14a_3_PFC_3.2_C,0-FFF-00054_cFos,"(0-FFF-00054_cFos, 0-01c-00034_EGFP, 0-01c-000...",cFos,334.51,372.63,103.0,386.6327,528.301,524.3964,...,0.9196,0.2832,-0.6025,RAM-14a_3_PFC_3.2_C_2.tif,Thu Feb 01 12:56:15 PST 2024,RAM-14a_3,VR5,VEH,ON_DOX,"(0-FFF-00054_cFos, 0-01c-00044_EGFP)"


## Enforcing complete subgraph colocalization

In [10]:
# per image: how many roi_ids report exactly the same aggregated grouping
df_grouped_counts = (
    df_double_agg_groupings
    .groupby(['image_name', 'agg_grouping'])['agg_grouping']
    .count()
    .rename('counts')
    .to_frame()
    .reset_index()
)
# number of roi ids listed in each grouping tuple
df_grouped_counts['len'] = df_grouped_counts['agg_grouping'].apply(len)

# If a grouping's length (the number of roi ids listed in the tuple) equals the
# number of times it appears in a given image, that grouping is plausible.
# That is, if some mKate cell points to some cFos cell, and that cFos cell points
# back to the same mKate cell, then that adjacency tuple must appear exactly twice
# in the image (we qualify per image since roi_ids start from 0 for each image).

# Now consider the case where the counts and the lengths do not match. The
# mismatch means that either a row was duplicated (counts > len) or that the
# subgraph defined by the adjacency tuple is not complete (counts < len); i.e.
# some mKate cell points to some cFos cell, but that cFos cell says it is single
# labeled.

# duplicates were already dropped, so we expect no rows at all with counts > len
too_many = df_grouped_counts['counts'] > df_grouped_counts['len']
assert not too_many.any()

# let's examine only the cases of incomplete subgraphs
too_few = df_grouped_counts['counts'] < df_grouped_counts['len']
df_mismatched = df_grouped_counts[too_few]
df_mismatched

Unnamed: 0,image_name,agg_grouping,counts,len
38,RAM-12_11_PFC_3.5_A,"(0-01c-00028_EGFP, 0-002-00000_mKate)",1,2
154,RAM-12_11_PFC_3.5_A,"(0-FFF-00024_cFos, 0-01c-00028_EGFP)",1,2
155,RAM-12_11_PFC_3.5_A,"(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",1,3
405,RAM-12_11_PFC_3.7_B,"(0-01c-00001_EGFP, 0-002-00014_mKate)",1,2
533,RAM-12_11_PFC_3.7_B,"(0-FFF-00015_cFos, 0-FFF-00234_EGFP)",1,2
...,...,...,...,...
34358,RAM-14b_8_PFC_3.9_B,"(0-005-00103_cFos, 0-FFF-00046_EGFP)",1,2
34359,RAM-14b_8_PFC_3.9_B,"(0-005-00103_cFos, 0-FFF-00046_EGFP, 0-FFF-001...",1,3
34415,RAM-14b_8_PFC_3.9_B,"(0-FFF-00046_EGFP, 0-FFF-00100_mKate)",1,2
34450,RAM-14b_8_PFC_3.9_B,"(0-FFF-00083_EGFP, 0-FFF-00081_mKate)",1,2


### explode out all roi_ids contained in adjacency grouping tuples, merge with other data cols

In [11]:
# one row per (image, roi_id) that is named in any mismatched grouping tuple
mismatched_rois = (
    df_mismatched[['image_name', 'agg_grouping']]
    .explode('agg_grouping')
    .drop_duplicates()
    .rename(columns={'agg_grouping': 'roi_id'})
)

# attach the remaining data cols; left merge, no rows are dropped afterwards
# (a trailing .dropna() was left disabled here)
df_coloc_mismatch = mismatched_rois.merge(
    df_double_agg_groupings, how='left', on=['image_name', 'roi_id']
)

df_coloc_mismatch

Unnamed: 0,image_name,roi_id,agg_grouping,stain,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,solidity,skewness,kurtosis,filename,analysis_date,rat_n,react,treatment,dox,grouping
0,RAM-12_11_PFC_3.5_A,0-01c-00028_EGFP,"(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",EGFP,489.90,262.61,176.0,431.2980,1104.5052,1143.469,...,1.0000,0.0549,-1.0023,RAM-12_11_PFC_3.5_A_3.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)"
1,RAM-12_11_PFC_3.5_A,0-002-00000_mKate,"(0-01c-00028_EGFP, 0-002-00000_mKate)",mKate,490.69,263.66,91.0,54.2523,130.8234,133.4745,...,0.9286,0.0462,-0.9715,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)"
2,RAM-12_11_PFC_3.5_A,0-FFF-00024_cFos,"(0-FFF-00024_cFos, 0-01c-00028_EGFP)",cFos,489.67,266.64,35.0,518.5723,541.0287,536.5906,...,0.4167,0.8046,0.5241,RAM-12_11_PFC_3.5_A_2.tif,Thu Feb 01 11:15:18 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-FFF-00024_cFos,)"
3,RAM-12_11_PFC_3.7_B,0-01c-00001_EGFP,"(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",EGFP,375.84,304.48,198.0,641.2907,1425.8392,1399.5972,...,0.9083,0.0895,-1.5263,RAM-12_11_PFC_3.7_B_3.tif,Thu Feb 01 11:16:34 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00001_EGFP, 0-002-00014_mKate)"
4,RAM-12_11_PFC_3.7_B,0-002-00014_mKate,"(0-01c-00001_EGFP, 0-002-00014_mKate)",mKate,376.27,305.64,75.0,49.4218,93.7627,93.9137,...,0.7653,0.2264,-0.137,RAM-12_11_PFC_3.7_B_4.tif,Thu Feb 01 11:16:23 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-002-00014_mKate,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729,RAM-14b_8_PFC_3.9_B,0-FFF-00083_EGFP,"(0-005-00080_cFos, 0-FFF-00083_EGFP, 0-FFF-000...",EGFP,250.16,270.02,113.0,2.1699,10.5380,9.596,...,0.8561,0.6978,-0.0223,RAM-14b_8_PFC_3.9_B_3.tif,Thu Feb 01 15:01:59 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00083_EGFP, 0-FFF-00081_mKate)"
1730,RAM-14b_8_PFC_3.9_B,0-FFF-00081_mKate,"(0-FFF-00083_EGFP, 0-FFF-00081_mKate)",mKate,249.83,270.32,80.0,108.3995,1417.8975,1712.0447,...,0.9524,-0.3809,-1.3175,RAM-14b_8_PFC_3.9_B_4.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00081_mKate,)"
1731,RAM-14b_8_PFC_3.9_B,0-005-00103_cFos,"(0-005-00103_cFos, 0-FFF-00046_EGFP)",cFos,27.32,225.03,67.0,163.3016,235.2092,247.6649,...,0.6837,0.1937,-1.1679,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-005-00103_cFos,)"
1732,RAM-14b_8_PFC_3.9_B,0-FFF-00046_EGFP,"(0-005-00103_cFos, 0-FFF-00046_EGFP, 0-FFF-001...",EGFP,28.21,224.15,97.0,2.1699,6.4613,4.9509,...,0.7348,1.0979,0.7079,RAM-14b_8_PFC_3.9_B_3.tif,Thu Feb 01 15:01:59 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00046_EGFP, 0-FFF-00100_mKate)"


### Get implied groupings for mismatched subgraphs

In [12]:
### testing image
# example image / roi pair and frame, kept for trying implied_grouping by hand
# NOTE(review): the call further down passes its own arguments, so these three
# names look unused by the pipeline — confirm before relying on them
im = 'RAM-13_1_PFC_3.7_A'
rid = '0-005-00024_cFos'
df = df_coloc_mismatch

def implied_grouping(df, im, rid, stain_order=None):
    '''
    Collect every roi_id that can be reached from rid by repeatedly
    following agg_grouping tuples within image im.

    Starting from rid, the agg_grouping of each reached roi_id is looked up
    in df (rows of image im only) and every roi_id it lists is added, until
    nothing new turns up. Each roi_id is looked up once.

    args:
        df: pd.core.DataFrame with 'image_name', 'roi_id' and
            'agg_grouping' (tuple of roi_ids) columns
        im: image_name to search in
        rid: roi_id to start from
        stain_order: optional dict mapping stain suffix -> rank; defaults
            to the module-level sort_order
    return:
        tuple of the reached roi_ids (rid included), ordered alphabetically
        and then by stain rank. If a reached roi_id does not have exactly
        one row in df for this image, the lookup is printed and the string
        'network search failed' is returned instead.
    '''
    order = sort_order if stain_order is None else stain_order

    # restrict to this image once; roi_ids are only meaningful per image
    image_rows = df[df['image_name'] == im]

    reached = {rid}
    pending = [rid]
    while pending:
        current = pending.pop()
        neighbors = image_rows.loc[image_rows['roi_id'] == current, 'agg_grouping']

        if len(neighbors) != 1:
            # same diagnostics as the original query-based lookup
            print(f"image_name == '{im}' and roi_id == '{current}'")
            print(neighbors, '\n')
            return 'network search failed'

        for n in neighbors.iloc[0]:
            if n not in reached:
                reached.add(n)
                pending.append(n)

    return tuple(sorted(sorted(reached), key=lambda x: order[x.split('_')[-1]]))

def _implied_for_row(row):
    # neighbours are looked up in the full aggregated frame, not only the mismatched rows
    return implied_grouping(df_double_agg_groupings, row.image_name, row.roi_id)


df_coloc_mismatch['implied_grouping'] = df_coloc_mismatch.apply(_implied_for_row, axis=1)

### Consider differently sized groups of mismatched roi_ids separately

In [13]:
# size of each implied grouping, and how often each size occurs
df_coloc_mismatch['len'] = df_coloc_mismatch['implied_grouping'].apply(len)
print(df_coloc_mismatch['len'].value_counts())

df_coloc_mismatch_3way = df_coloc_mismatch.query('len == 3')
df_coloc_mismatch_4way = df_coloc_mismatch.query('len == 4')
df_coloc_mismatch_5way = df_coloc_mismatch.query('len == 5')


def _rows_per_grouping(frame):
    # the single row count shared by all implied groupings in frame
    # (.item() raises unless every grouping has the same number of rows)
    return frame.groupby('implied_grouping').implied_grouping.apply(len).unique().item()


# check that the number of instances of each erroneous implied grouping is equal
# to the size of that grouping (i.e., a mismatched implied grouping of size 4 should
# appear exactly 4 times, once for each of the roi_ids in the grouping)
assert _rows_per_grouping(df_coloc_mismatch_3way) == 3
assert _rows_per_grouping(df_coloc_mismatch_4way) == 4
assert _rows_per_grouping(df_coloc_mismatch_5way) == 5

df_coloc_mismatch_3way.head()


len
3    1674
4      40
5      20
Name: count, dtype: int64


Unnamed: 0,image_name,roi_id,agg_grouping,stain,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,kurtosis,filename,analysis_date,rat_n,react,treatment,dox,grouping,implied_grouping,len
0,RAM-12_11_PFC_3.5_A,0-01c-00028_EGFP,"(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",EGFP,489.9,262.61,176.0,431.298,1104.5052,1143.469,...,-1.0023,RAM-12_11_PFC_3.5_A_3.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",3
1,RAM-12_11_PFC_3.5_A,0-002-00000_mKate,"(0-01c-00028_EGFP, 0-002-00000_mKate)",mKate,490.69,263.66,91.0,54.2523,130.8234,133.4745,...,-0.9715,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",3
2,RAM-12_11_PFC_3.5_A,0-FFF-00024_cFos,"(0-FFF-00024_cFos, 0-01c-00028_EGFP)",cFos,489.67,266.64,35.0,518.5723,541.0287,536.5906,...,0.5241,RAM-12_11_PFC_3.5_A_2.tif,Thu Feb 01 11:15:18 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-FFF-00024_cFos,)","(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",3
3,RAM-12_11_PFC_3.7_B,0-01c-00001_EGFP,"(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",EGFP,375.84,304.48,198.0,641.2907,1425.8392,1399.5972,...,-1.5263,RAM-12_11_PFC_3.7_B_3.tif,Thu Feb 01 11:16:34 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00001_EGFP, 0-002-00014_mKate)","(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",3
4,RAM-12_11_PFC_3.7_B,0-002-00014_mKate,"(0-01c-00001_EGFP, 0-002-00014_mKate)",mKate,376.27,305.64,75.0,49.4218,93.7627,93.9137,...,-0.137,RAM-12_11_PFC_3.7_B_4.tif,Thu Feb 01 11:16:23 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-002-00014_mKate,)","(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",3


#### the 3way case

In [14]:
import itertools

def dist(p1, p2):
    """Return the Euclidean distance between two 2-D points.

    p1, p2 : (x, y) pairs.
    """
    (x1, y1), (x2, y2) = p1, p2
    dx = x2 - x1
    dy = y2 - y1
    return np.sqrt(dx**2 + dy**2)

def tie_breaker_3way(df_3way, current_grp):
    """Split one mismatched 3-way implied grouping into a closest pair plus the rest.

    df_3way     : frame holding (at least) roi_id, CoM_x, CoM_y and implied_grouping.
    current_grp : the implied grouping (tuple of roi_ids) to resolve.

    Returns a copy of the rows of `df_3way` whose implied_grouping equals
    `current_grp`, with a new `updated_grouping` column.

    NOTE: the winning pair is chosen purely by centre-of-mass distance among
    differently-stained rois; the `grouping` column is not consulted here.
    """
    # work on a copy so the caller's frame is never modified
    grp = df_3way.loc[df_3way['implied_grouping'] == current_grp].copy(deep=True)

    # roi_id -> (CoM_x, CoM_y)
    coords = {
        rid: (x, y)
        for rid, x, y in zip(grp.roi_id, grp.CoM_x.values, grp.CoM_y.values)
    }

    # pairwise distances, only between rois of different stain type
    d = {}
    for p1, p2 in itertools.combinations(coords, 2):
        if p1.split('_')[-1] == p2.split('_')[-1]:
            print(f'{p1} and {p2} cannot be colocalized; skipping distance computation for this pair')
            continue
        d[(p1, p2)] = dist(coords[p1], coords[p2])

    # closest pair wins; everything else in the implied grouping is left over
    winning_pair = set(min(d, key=d.get))
    leftover = tuple(set(current_grp) - winning_pair)
    winner = tuple(sorted(winning_pair, key=lambda rid: sort_order[rid.split('_')[-1]]))

    def assign(row):
        # rois in the winning pair share that pair; any other roi gets the leftover tuple
        if row.roi_id in winner:
            return winner
        return leftover

    # update groupings
    grp['updated_grouping'] = grp.apply(assign, axis=1)

    return grp

# resolve every distinct mismatched 3-way implied grouping, then stack the
# per-grouping results into a single frame
mismatched_3ways = df_coloc_mismatch_3way.implied_grouping.unique()
tie_broken_3ways = [tie_breaker_3way(df_coloc_mismatch_3way, grp) for grp in mismatched_3ways]
df_3way_tiebreak = pd.concat(tie_broken_3ways)

0-01c-00106_EGFP and 0-FFF-00256_EGFP cannot be colocalized; skipping distance computation for this pair
0-01c-00015_EGFP and 0-01c-00090_EGFP cannot be colocalized; skipping distance computation for this pair
0-01c-00064_EGFP and 0-FFF-00091_EGFP cannot be colocalized; skipping distance computation for this pair
0-002-00008_mKate and 0-FFF-00036_mKate cannot be colocalized; skipping distance computation for this pair


#### the 4way case

In [15]:
def tie_breaker_4way(df_4way, current_grp):
    """Resolve one mismatched 4-way implied grouping into consistent groupings.

    The closest pair of differently-stained rois (by centre of mass) is the
    "winner". The two remaining rois are then classified with the `grouping`
    column:

    * different stains and each lists the other      -> two pairs
    * one of them and both winners list each other   -> triple + one single
    * anything else                                  -> winning pair + two singles

    df_4way     : frame holding (at least) roi_id, CoM_x, CoM_y, grouping and
                  implied_grouping.
    current_grp : the implied grouping (tuple of 4 roi_ids) to resolve.

    Returns a copy of the rows of `df_4way` whose implied_grouping equals
    `current_grp`, with a new `updated_grouping` column.

    Raises ValueError if no two rois in the grouping have different stains.
    """
    def stain_of(rid):
        return rid.split('_')[-1]

    def grouping_of(rid):
        # the `grouping` recorded for this roi (exactly one row is expected)
        return grp[grp.roi_id == rid]['grouping'].item()

    def by_stain(rids):
        return tuple(sorted(rids, key=lambda rid: sort_order[stain_of(rid)]))

    def check_same(leftover):
        # True when both leftover rois carry the same stain
        p1, p2 = leftover
        return stain_of(p1) == stain_of(p2)

    def check_pair(leftover):
        # True when the two leftover rois list each other
        p1, p2 = leftover
        return p2 in grouping_of(p1) and p1 in grouping_of(p2)

    def check_triple(leftover, winner):
        # first leftover roi that lists, and is listed by, both winners
        w1, w2 = winner
        for rid in leftover:
            if w1 in grouping_of(rid) and w2 in grouping_of(rid) \
                    and rid in grouping_of(w1) and rid in grouping_of(w2):
                return True, rid
        return False, None

    grp = df_4way[df_4way['implied_grouping'] == current_grp].copy(deep=True)
    coords = dict(zip(grp.roi_id, zip(grp.CoM_x.values, grp.CoM_y.values)))

    # pairwise distances, only between rois of different stain type
    d = {}
    for p1, p2 in itertools.combinations(coords, 2):
        if stain_of(p1) == stain_of(p2):
            print(f'{p1} and {p2} cannot be colocalized; skipping distance computation for this pair')
        else:
            d[(p1, p2)] = dist(coords[p1], coords[p2])

    if not d:
        raise ValueError(f'no differently-stained pair found in grouping {current_grp}')

    # Keep tuple order (the order of current_grp) rather than set order: set
    # iteration over strings changes between interpreter sessions, which made
    # the choice of triple member irreproducible when both leftovers qualified.
    winner = min(d, key=d.get)
    leftover = tuple(rid for rid in dict.fromkeys(current_grp) if rid not in winner)
    lonely_leftovers = False

    # Leftovers stay together only if they have different stains AND list each
    # other; in every other case the winning pair may still be a triple.
    if check_same(leftover) or not check_pair(leftover):
        triple, triple_rid = check_triple(leftover, winner)
        if triple:
            # promote the winning pair to a triple; one single roi remains
            winner += (triple_rid,)
            leftover = tuple(rid for rid in leftover if rid != triple_rid)
        else:
            # neither a pair nor part of a triple: split the leftovers up
            lonely_leftovers = True

    # update groupings
    winner = by_stain(winner)
    if lonely_leftovers:
        grp['updated_grouping'] = grp.apply(lambda x: winner if x.roi_id in winner else (x.roi_id,), axis=1)
    else:
        leftover = by_stain(leftover)
        grp['updated_grouping'] = grp.apply(lambda x: winner if x.roi_id in winner else leftover, axis=1)

    return grp

# resolve every distinct mismatched 4-way implied grouping, then stack the
# per-grouping results into a single frame
mismatched_4ways = df_coloc_mismatch_4way.implied_grouping.unique()
tie_broken_4ways = [tie_breaker_4way(df_coloc_mismatch_4way, grp) for grp in mismatched_4ways]
df_4way_tiebreak = pd.concat(tie_broken_4ways)

0-01c-00048_EGFP and 0-01c-00092_EGFP cannot be colocalized; skipping distance computation for this pair
0-01c-00001_EGFP and 0-01c-00063_EGFP cannot be colocalized; skipping distance computation for this pair
0-01c-00053_EGFP and 0-01c-00105_EGFP cannot be colocalized; skipping distance computation for this pair
0-01c-00102_EGFP and 0-01c-00145_EGFP cannot be colocalized; skipping distance computation for this pair
0-005-00060_cFos and 0-FFF-00142_cFos cannot be colocalized; skipping distance computation for this pair
0-01c-00036_EGFP and 0-FFF-00258_EGFP cannot be colocalized; skipping distance computation for this pair
0-005-00042_cFos and 0-005-00190_cFos cannot be colocalized; skipping distance computation for this pair
0-FFF-00118_EGFP and 0-01c-00055_EGFP cannot be colocalized; skipping distance computation for this pair
0-FFF-00164_EGFP and 0-FFF-00165_EGFP cannot be colocalized; skipping distance computation for this pair
0-FFF-00112_EGFP and 0-FFF-00113_EGFP cannot be colocal

#### the 5way case
Thankfully, we have no mismatched groupings larger than this.

In [16]:
def tie_breaker_5way(df_5way, current_grp):
    """Resolve one mismatched 5-way implied grouping into consistent groupings.

    The closest pair of differently-stained rois (by centre of mass) is the
    first "winner". Using the `grouping` column the remaining three rois are
    then classified:

    * one of them forms a triple with the winner; the last two are either a
      pair (they list each other) or two singles
    * no triple and none of the three lists another -> winning pair + three singles
    * otherwise the next-closest pair that shares no roi with the winner becomes
      a second winner, and the final roi either completes a triple with that
      second winner or stays single

    df_5way     : frame holding (at least) roi_id, CoM_x, CoM_y, grouping and
                  implied_grouping.
    current_grp : the implied grouping (tuple of 5 roi_ids) to resolve.

    Returns a copy of the rows of `df_5way` whose implied_grouping equals
    `current_grp`, with a new `updated_grouping` column.

    Raises ValueError if no two rois in the grouping have different stains.
    """
    def stain_of(rid):
        return rid.split('_')[-1]

    def grouping_of(rid):
        # the `grouping` recorded for this roi (exactly one row is expected)
        return grp[grp.roi_id == rid]['grouping'].item()

    def by_stain(rids):
        return tuple(sorted(rids, key=lambda rid: sort_order[stain_of(rid)]))

    def check_pair(leftover):
        # True when the two leftover rois list each other
        p1, p2 = leftover
        return p2 in grouping_of(p1) and p1 in grouping_of(p2)

    def check_triple(leftover, winner):
        # first leftover roi that lists, and is listed by, both winners;
        # works for any number of leftovers
        w1, w2 = winner
        for rid in leftover:
            if w1 in grouping_of(rid) and w2 in grouping_of(rid) \
                    and rid in grouping_of(w1) and rid in grouping_of(w2):
                return True, rid
        return False, None

    def check_lonely(leftover):
        # leftover rids are lonely if none of them point to each other
        return not any(a in grouping_of(b) for a, b in itertools.permutations(leftover, 2))

    def map_updated_groupings(roi_id, groups):
        # first group that contains this roi
        for g in groups:
            if roi_id in g:
                return g

    grp = df_5way[df_5way['implied_grouping'] == current_grp].copy(deep=True)
    coords = dict(zip(grp.roi_id, zip(grp.CoM_x.values, grp.CoM_y.values)))

    # pairwise distances, only between rois of different stain type
    d = {}
    for p1, p2 in itertools.combinations(coords, 2):
        if stain_of(p1) == stain_of(p2):
            print(f'{p1} and {p2} cannot be colocalized; skipping distance computation for this pair')
        else:
            d[(p1, p2)] = dist(coords[p1], coords[p2])

    if not d:
        raise ValueError(f'no differently-stained pair found in grouping {current_grp}')

    # Keep tuple order (the order of current_grp) rather than set order: set
    # iteration over strings changes between interpreter sessions, which made
    # the choice of triple member irreproducible when several leftovers qualified.
    winner = min(d, key=d.get)
    leftover = tuple(rid for rid in dict.fromkeys(current_grp) if rid not in winner)
    lonely_leftovers = False
    next_winner = False

    # check if winning pair is triple
    triple, triple_rid = check_triple(leftover, winner)
    if triple:
        winner += (triple_rid,)
        leftover = tuple(rid for rid in leftover if rid != triple_rid)
        # the remaining two rois either form a pair of their own or are singles
        lonely_leftovers = not check_pair(leftover)

    elif check_lonely(leftover):
        # first winner is a plain pair and no leftover lists another
        lonely_leftovers = True

    else:
        # first winner is not a triple and the leftovers are not all lonely, so
        # there must be at least one other pairing: walk up the distance ranking
        # until we reach a pair that shares no roi with the first winner
        next_winner = winner
        while set(winner) & set(next_winner):
            del d[min(d, key=d.get)]
            next_winner = min(d, key=d.get)
        leftover = tuple(rid for rid in leftover if rid not in next_winner)

        # the one roi still unassigned may complete a triple with the second winner;
        # if not we have 2 pairs and one single lonely cell
        next_triple, next_triple_rid = check_triple(leftover, next_winner)
        if next_triple:
            next_winner += (next_triple_rid,)
            leftover = tuple(rid for rid in leftover if rid not in next_winner)

    ### update groupings
    winner = by_stain(winner)

    # the case where our first winner was a pair, and the leftovers held a second pair or triple
    if next_winner:
        next_winner = by_stain(next_winner)

        # the case where our next winner was a pair, and we had one single leftover
        if len(leftover) > 0:
            leftover = by_stain(leftover)
            groups = [winner, next_winner, leftover]
            grp['updated_grouping'] = grp.apply(lambda x: map_updated_groupings(x.roi_id, groups), axis=1)

        # the case where our next winner was a triple and we had no singles leftover
        else:
            grp['updated_grouping'] = grp.apply(lambda x: winner if x.roi_id in winner else next_winner, axis=1)

    # the case where every roi outside the winner stands alone
    elif lonely_leftovers:
        grp['updated_grouping'] = grp.apply(lambda x: winner if x.roi_id in winner else (x.roi_id,), axis=1)

    # the case where our first winner was triple, and the leftovers formed a pair
    else:
        leftover = by_stain(leftover)
        grp['updated_grouping'] = grp.apply(lambda x: winner if x.roi_id in winner else leftover, axis=1)

    return grp

# resolve every distinct mismatched 5-way implied grouping, then stack the
# per-grouping results into a single frame
mismatched_5ways = df_coloc_mismatch_5way.implied_grouping.unique()
tie_broken_5ways = [tie_breaker_5way(df_coloc_mismatch_5way, grp) for grp in mismatched_5ways]
df_5way_tiebreak = pd.concat(tie_broken_5ways)

0-01c-00014_EGFP and 0-01c-00071_EGFP cannot be colocalized; skipping distance computation for this pair
0-FFF-00032_mKate and 0-FFF-00033_mKate cannot be colocalized; skipping distance computation for this pair
0-FFF-00315_cFos and 0-FFF-00316_cFos cannot be colocalized; skipping distance computation for this pair
0-01c-00051_EGFP and 0-FFF-00247_EGFP cannot be colocalized; skipping distance computation for this pair
0-01c-00032_EGFP and 0-01c-00047_EGFP cannot be colocalized; skipping distance computation for this pair
0-FFF-00173_mKate and 0-FFF-00175_mKate cannot be colocalized; skipping distance computation for this pair
0-01c-00034_EGFP and 0-01c-00044_EGFP cannot be colocalized; skipping distance computation for this pair
0-002-00094_mKate and 0-FFF-00227_mKate cannot be colocalized; skipping distance computation for this pair


In [17]:
# final concatenation of the resolved 3-, 4- and 5-way groupings
df_tiebreak = pd.concat([df_3way_tiebreak,df_4way_tiebreak,df_5way_tiebreak])
df_tiebreak

Unnamed: 0,image_name,roi_id,agg_grouping,stain,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,filename,analysis_date,rat_n,react,treatment,dox,grouping,implied_grouping,len,updated_grouping
0,RAM-12_11_PFC_3.5_A,0-01c-00028_EGFP,"(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",EGFP,489.90,262.61,176.0,431.2980,1104.5052,1143.469,...,RAM-12_11_PFC_3.5_A_3.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",3,"(0-01c-00028_EGFP, 0-002-00000_mKate)"
1,RAM-12_11_PFC_3.5_A,0-002-00000_mKate,"(0-01c-00028_EGFP, 0-002-00000_mKate)",mKate,490.69,263.66,91.0,54.2523,130.8234,133.4745,...,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",3,"(0-01c-00028_EGFP, 0-002-00000_mKate)"
2,RAM-12_11_PFC_3.5_A,0-FFF-00024_cFos,"(0-FFF-00024_cFos, 0-01c-00028_EGFP)",cFos,489.67,266.64,35.0,518.5723,541.0287,536.5906,...,RAM-12_11_PFC_3.5_A_2.tif,Thu Feb 01 11:15:18 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-FFF-00024_cFos,)","(0-FFF-00024_cFos, 0-01c-00028_EGFP, 0-002-000...",3,"(0-FFF-00024_cFos,)"
3,RAM-12_11_PFC_3.7_B,0-01c-00001_EGFP,"(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",EGFP,375.84,304.48,198.0,641.2907,1425.8392,1399.5972,...,RAM-12_11_PFC_3.7_B_3.tif,Thu Feb 01 11:16:34 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00001_EGFP, 0-002-00014_mKate)","(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",3,"(0-01c-00001_EGFP, 0-002-00014_mKate)"
4,RAM-12_11_PFC_3.7_B,0-002-00014_mKate,"(0-01c-00001_EGFP, 0-002-00014_mKate)",mKate,376.27,305.64,75.0,49.4218,93.7627,93.9137,...,RAM-12_11_PFC_3.7_B_4.tif,Thu Feb 01 11:16:23 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-002-00014_mKate,)","(0-FFF-00029_cFos, 0-01c-00001_EGFP, 0-002-000...",3,"(0-01c-00001_EGFP, 0-002-00014_mKate)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449,RAM-14a_3_PFC_3.2_C,0-01c-00034_EGFP,"(0-FFF-00054_cFos, 0-01c-00034_EGFP, 0-002-000...",EGFP,331.69,369.69,239.0,814.9801,1596.3589,1578.047,...,RAM-14a_3_PFC_3.2_C_3.tif,Thu Feb 01 12:56:15 PST 2024,RAM-14a_3,VR5,VEH,ON_DOX,"(0-FFF-00054_cFos, 0-01c-00034_EGFP)","(0-FFF-00054_cFos, 0-01c-00034_EGFP, 0-01c-000...",5,"(0-01c-00034_EGFP, 0-002-00094_mKate)"
1450,RAM-14a_3_PFC_3.2_C,0-002-00094_mKate,"(0-01c-00034_EGFP, 0-002-00094_mKate)",mKate,333.06,369.05,190.0,259.2381,563.1533,497.0677,...,RAM-14a_3_PFC_3.2_C_4.tif,Thu Feb 01 12:56:38 PST 2024,RAM-14a_3,VR5,VEH,ON_DOX,"(0-01c-00034_EGFP, 0-002-00094_mKate)","(0-FFF-00054_cFos, 0-01c-00034_EGFP, 0-01c-000...",5,"(0-01c-00034_EGFP, 0-002-00094_mKate)"
1451,RAM-14a_3_PFC_3.2_C,0-01c-00044_EGFP,"(0-FFF-00054_cFos, 0-01c-00044_EGFP, 0-FFF-002...",EGFP,327.68,375.31,380.0,814.9801,1713.6154,1717.4448,...,RAM-14a_3_PFC_3.2_C_3.tif,Thu Feb 01 12:56:15 PST 2024,RAM-14a_3,VR5,VEH,ON_DOX,"(0-FFF-00054_cFos, 0-01c-00044_EGFP)","(0-FFF-00054_cFos, 0-01c-00034_EGFP, 0-01c-000...",5,"(0-01c-00044_EGFP, 0-FFF-00227_mKate)"
1452,RAM-14a_3_PFC_3.2_C,0-FFF-00227_mKate,"(0-01c-00044_EGFP, 0-FFF-00227_mKate)",mKate,325.55,380.25,144.0,259.2381,532.9109,549.9702,...,RAM-14a_3_PFC_3.2_C_4.tif,Thu Feb 01 12:56:38 PST 2024,RAM-14a_3,VR5,VEH,ON_DOX,"(0-01c-00044_EGFP, 0-FFF-00227_mKate)","(0-FFF-00054_cFos, 0-01c-00034_EGFP, 0-01c-000...",5,"(0-01c-00044_EGFP, 0-FFF-00227_mKate)"


### update groupings with our new true groupings

In [18]:
# build a combined image_name + roi_id key in both frames to merge on
df_tiebreak['iid_rid'] = df_tiebreak[['image_name', 'roi_id']].agg('_'.join, axis=1)
df_double_agg_groupings['iid_rid'] = df_double_agg_groupings[['image_name', 'roi_id']].agg('_'.join, axis=1)

# left merge: only rois that went through tie-breaking receive an updated_grouping;
# every other roi falls back to its aggregated grouping
df_dbl_true = df_double_agg_groupings.merge(df_tiebreak[['iid_rid', 'updated_grouping']].copy(), how='left', on='iid_rid')
df_dbl_true['true_grouping'] = df_dbl_true.updated_grouping.fillna(df_dbl_true.agg_grouping)
df_dbl_true

Unnamed: 0,image_name,roi_id,agg_grouping,stain,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,filename,analysis_date,rat_n,react,treatment,dox,grouping,iid_rid,updated_grouping,true_grouping
0,RAM-12_11_PFC_3.5_A,0-002-00000_mKate,"(0-01c-00028_EGFP, 0-002-00000_mKate)",mKate,490.69,263.66,91.0,54.2523,130.8234,133.4745,...,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)",RAM-12_11_PFC_3.5_A_0-002-00000_mKate,"(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-01c-00028_EGFP, 0-002-00000_mKate)"
1,RAM-12_11_PFC_3.5_A,0-002-00001_mKate,"(0-002-00001_mKate,)",mKate,359.15,417.22,77.0,54.2523,252.4054,263.5753,...,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-002-00001_mKate,)",RAM-12_11_PFC_3.5_A_0-002-00001_mKate,,"(0-002-00001_mKate,)"
2,RAM-12_11_PFC_3.5_A,0-002-00002_mKate,"(0-01c-00047_EGFP, 0-002-00002_mKate)",mKate,149.95,354.17,125.0,54.2523,910.3442,837.9077,...,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00047_EGFP, 0-002-00002_mKate)",RAM-12_11_PFC_3.5_A_0-002-00002_mKate,,"(0-01c-00047_EGFP, 0-002-00002_mKate)"
3,RAM-12_11_PFC_3.5_A,0-002-00003_mKate,"(0-01c-00128_EGFP, 0-002-00003_mKate)",mKate,193.05,160.84,80.0,54.2523,280.5944,262.2604,...,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00128_EGFP, 0-002-00003_mKate)",RAM-12_11_PFC_3.5_A_0-002-00003_mKate,,"(0-01c-00128_EGFP, 0-002-00003_mKate)"
4,RAM-12_11_PFC_3.5_A,0-002-00004_mKate,"(0-FFF-00238_EGFP, 0-002-00004_mKate)",mKate,221.09,254.73,90.0,54.2523,237.5106,230.533,...,RAM-12_11_PFC_3.5_A_4.tif,Thu Feb 01 11:15:38 PST 2024,RAM-12_11,VR5,ABC,OFF_DOX,"(0-FFF-00238_EGFP, 0-002-00004_mKate)",RAM-12_11_PFC_3.5_A_0-002-00004_mKate,,"(0-FFF-00238_EGFP, 0-002-00004_mKate)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41248,RAM-14b_8_PFC_3.9_B,0-FFF-00229_cFos,"(0-FFF-00229_cFos,)",cFos,31.40,410.30,75.0,163.3016,437.2036,495.5051,...,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00229_cFos,)",RAM-14b_8_PFC_3.9_B_0-FFF-00229_cFos,,"(0-FFF-00229_cFos,)"
41249,RAM-14b_8_PFC_3.9_B,0-FFF-00230_cFos,"(0-FFF-00230_cFos, 0-FFF-00067_EGFP)",cFos,23.08,426.77,80.0,163.3016,311.7118,323.7442,...,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00230_cFos,)",RAM-14b_8_PFC_3.9_B_0-FFF-00230_cFos,,"(0-FFF-00230_cFos, 0-FFF-00067_EGFP)"
41250,RAM-14b_8_PFC_3.9_B,0-FFF-00231_cFos,"(0-FFF-00231_cFos,)",cFos,54.17,439.03,77.0,163.3016,646.9208,672.5289,...,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00231_cFos,)",RAM-14b_8_PFC_3.9_B_0-FFF-00231_cFos,,"(0-FFF-00231_cFos,)"
41251,RAM-14b_8_PFC_3.9_B,0-FFF-00232_cFos,"(0-FFF-00232_cFos,)",cFos,30.05,470.72,75.0,163.3016,459.0720,463.2822,...,RAM-14b_8_PFC_3.9_B_2.tif,Thu Feb 01 15:01:48 PST 2024,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00232_cFos,)",RAM-14b_8_PFC_3.9_B_0-FFF-00232_cFos,,"(0-FFF-00232_cFos,)"


In [19]:
def get_dummies(x):
    """Return (has_cFos, has_EGFP, has_mKate) flags for a grouping.

    x : iterable of roi_ids; the stain is the text after the last underscore.
    """
    stains_present = {rid.split('_')[-1] for rid in x}
    return tuple(stain in stains_present for stain in ('cFos', 'EGFP', 'mKate'))

# flag which stains are present in each roi's true grouping, then unpack the
# 3-tuples into one boolean column per stain
df_dbl_true['dummy'] = df_dbl_true.true_grouping.apply(get_dummies)
df_dbl_true['dummy_cFos'], df_dbl_true['dummy_EGFP'], df_dbl_true['dummy_mKate'] = zip(*df_dbl_true['dummy'])

# reorder cols (this also drops every column not listed here)
df_dbl_true = df_dbl_true['iid_rid dummy_cFos dummy_EGFP dummy_mKate image_name roi_id stain CoM_x CoM_y background mean_intensity filename rat_n react treatment dox grouping agg_grouping updated_grouping true_grouping'.split()]
df_dbl_true

Unnamed: 0,iid_rid,dummy_cFos,dummy_EGFP,dummy_mKate,image_name,roi_id,stain,CoM_x,CoM_y,background,mean_intensity,filename,rat_n,react,treatment,dox,grouping,agg_grouping,updated_grouping,true_grouping
0,RAM-12_11_PFC_3.5_A_0-002-00000_mKate,False,True,True,RAM-12_11_PFC_3.5_A,0-002-00000_mKate,mKate,490.69,263.66,54.2523,130.8234,RAM-12_11_PFC_3.5_A_4.tif,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-01c-00028_EGFP, 0-002-00000_mKate)","(0-01c-00028_EGFP, 0-002-00000_mKate)"
1,RAM-12_11_PFC_3.5_A_0-002-00001_mKate,False,False,True,RAM-12_11_PFC_3.5_A,0-002-00001_mKate,mKate,359.15,417.22,54.2523,252.4054,RAM-12_11_PFC_3.5_A_4.tif,RAM-12_11,VR5,ABC,OFF_DOX,"(0-002-00001_mKate,)","(0-002-00001_mKate,)",,"(0-002-00001_mKate,)"
2,RAM-12_11_PFC_3.5_A_0-002-00002_mKate,False,True,True,RAM-12_11_PFC_3.5_A,0-002-00002_mKate,mKate,149.95,354.17,54.2523,910.3442,RAM-12_11_PFC_3.5_A_4.tif,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00047_EGFP, 0-002-00002_mKate)","(0-01c-00047_EGFP, 0-002-00002_mKate)",,"(0-01c-00047_EGFP, 0-002-00002_mKate)"
3,RAM-12_11_PFC_3.5_A_0-002-00003_mKate,False,True,True,RAM-12_11_PFC_3.5_A,0-002-00003_mKate,mKate,193.05,160.84,54.2523,280.5944,RAM-12_11_PFC_3.5_A_4.tif,RAM-12_11,VR5,ABC,OFF_DOX,"(0-01c-00128_EGFP, 0-002-00003_mKate)","(0-01c-00128_EGFP, 0-002-00003_mKate)",,"(0-01c-00128_EGFP, 0-002-00003_mKate)"
4,RAM-12_11_PFC_3.5_A_0-002-00004_mKate,False,True,True,RAM-12_11_PFC_3.5_A,0-002-00004_mKate,mKate,221.09,254.73,54.2523,237.5106,RAM-12_11_PFC_3.5_A_4.tif,RAM-12_11,VR5,ABC,OFF_DOX,"(0-FFF-00238_EGFP, 0-002-00004_mKate)","(0-FFF-00238_EGFP, 0-002-00004_mKate)",,"(0-FFF-00238_EGFP, 0-002-00004_mKate)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41248,RAM-14b_8_PFC_3.9_B_0-FFF-00229_cFos,True,False,False,RAM-14b_8_PFC_3.9_B,0-FFF-00229_cFos,cFos,31.40,410.30,163.3016,437.2036,RAM-14b_8_PFC_3.9_B_2.tif,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00229_cFos,)","(0-FFF-00229_cFos,)",,"(0-FFF-00229_cFos,)"
41249,RAM-14b_8_PFC_3.9_B_0-FFF-00230_cFos,True,True,False,RAM-14b_8_PFC_3.9_B,0-FFF-00230_cFos,cFos,23.08,426.77,163.3016,311.7118,RAM-14b_8_PFC_3.9_B_2.tif,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00230_cFos,)","(0-FFF-00230_cFos, 0-FFF-00067_EGFP)",,"(0-FFF-00230_cFos, 0-FFF-00067_EGFP)"
41250,RAM-14b_8_PFC_3.9_B_0-FFF-00231_cFos,True,False,False,RAM-14b_8_PFC_3.9_B,0-FFF-00231_cFos,cFos,54.17,439.03,163.3016,646.9208,RAM-14b_8_PFC_3.9_B_2.tif,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00231_cFos,)","(0-FFF-00231_cFos,)",,"(0-FFF-00231_cFos,)"
41251,RAM-14b_8_PFC_3.9_B_0-FFF-00232_cFos,True,False,False,RAM-14b_8_PFC_3.9_B,0-FFF-00232_cFos,cFos,30.05,470.72,163.3016,459.0720,RAM-14b_8_PFC_3.9_B_2.tif,RAM-14b_8,FR1,ABC,OFF_DOX,"(0-FFF-00232_cFos,)","(0-FFF-00232_cFos,)",,"(0-FFF-00232_cFos,)"


In [20]:
# do our doubles agree?
def check_double_diff(df_true):
    """Print, for each pair of stains, how far the two sides of the pairing disagree.

    For a pair (x, y) this counts the rows flagged with both dummy_x and dummy_y
    whose own stain is x, and the rows whose own stain is y, and prints the
    difference (x count minus y count).
    """
    print('double labeled differences: ')
    stains = ['cFos', 'EGFP', 'mKate']
    for stain_x, stain_y in itertools.combinations(stains, r=2):
        n_x = len(df_true.query(f'dummy_{stain_x} == True and dummy_{stain_y} == True and stain == "{stain_x}"'))
        n_y = len(df_true.query(f'dummy_{stain_x} == True and dummy_{stain_y} == True and stain == "{stain_y}"'))
        diff = n_x - n_y
        print(f'{stain_x}, {stain_y}:    {diff}')

# do our triples agree?
def check_triple_ns(comb, df_true):
    """Print how many triple-labeled rows there are for each of the three stains.

    comb    : three stain names.
    df_true : frame with one dummy_<stain> column per stain and a `stain` column.
    """
    stain_x, stain_y, stain_z = comb

    # rows flagged for all three stains whose own stain is one of the three
    triples = df_true.query(
        f'dummy_{stain_x} == True and dummy_{stain_y} == True and dummy_{stain_z} == True and\
         (stain == "{stain_x}" or stain == "{stain_y}" or stain == "{stain_z}")'
    )

    # count per stain before printing anything
    counts = [
        (stain, len(triples.query(f'stain == "{stain}"')))
        for stain in (stain_x, stain_y, stain_z)
    ]

    print(f'\ntriple {stain_x},{stain_y},{stain_z} ns:')
    for stain, n in counts:
        print(stain, ' :', n)

# print the pairwise count differences for the double labeled set
check_double_diff(df_dbl_true)

# print the per-stain triple counts (there is only one combination of all three stains)
for comb in itertools.combinations(['cFos', 'EGFP', 'mKate'], r=3):
    check_triple_ns(comb, df_dbl_true)

### GREAT. now we can write this to disk and proceed with our analyses
df_dbl_true.to_csv('RAM-12,13,14_FINAL.csv')

double labeled differences: 
cFos, EGFP:    0
cFos, mKate:    0
EGFP, mKate:    0

triple cFos,EGFP,mKate ns:
cFos  : 1006
EGFP  : 1006
mKate  : 1006


# For completeness I will repeat this entire pipeline on the triple labeled set
The triple labeled data can only give us useful information about triple labeled cells, unlike the double labeled set from which we can effectively reconstruct all n-ways of colocalization (we consider that any given stains are colocalized if they form a complete subgraph; that is, they all point to each other).

This is just an exercise to compare the consistency across our two datasets. Do we need the triple labeled set or is inferring all higher orders of colocalization from the double labeled set alone sufficient?

In [21]:
# build the adjacency table for each frame in df_triple_relabel and stack them
df_triple_adj = [get_adjacency(df, coloc_type='triple') for df in df_triple_relabel]
df_triple_adj_test = pd.concat(df_triple_adj)
df_triple_adj_test

# per (image, roi): sum the `grouping` values over all rows, de-duplicate the
# members, and order them by stain via sort_order
agg_groupings = df_triple_adj_test.groupby(['image_name', 'roi_id']).grouping.sum()\
    .apply(lambda x: tuple(sorted(sorted(list(set(x))), key=lambda y: sort_order[y.split('_')[-1]])))\
    .reset_index().reset_index().drop('index', axis=1)\
    .rename(columns={'grouping':'agg_grouping'})

# attach the remaining columns, keeping one row per (image, roi)
df_triple_agg_groupings = agg_groupings\
    .merge(df_triple_adj_test.drop_duplicates(subset=['image_name', 'roi_id']), on=['image_name', 'roi_id'], how='left')

# set types
df_triple_agg_groupings['CoM_x'] = df_triple_agg_groupings.CoM_x.astype(float)
df_triple_agg_groupings['CoM_y'] = df_triple_agg_groupings.CoM_y.astype(float)
df_triple_agg_groupings['background'] = df_triple_agg_groupings.background.astype(float)
df_triple_agg_groupings['mean_intensity'] = df_triple_agg_groupings.mean_intensity.astype(float)

# how many rois per image share each aggregated grouping (counts) versus how
# many members that grouping names (len)
df_grouped_counts = df_triple_agg_groupings.groupby(['image_name', 'agg_grouping'])['agg_grouping']\
    .count().rename('counts').to_frame()\
    .reset_index().reset_index().drop('index', axis=1)
df_grouped_counts['len'] = df_grouped_counts.agg_grouping.apply(lambda x: len(x))

# duplicates were already dropped so we expect this length (counts > len) to be exactly 0
assert len(df_grouped_counts[df_grouped_counts.counts > df_grouped_counts.len]) == 0

# lets examine only cases of incomplete subgraphs
df_mismatched = df_grouped_counts[df_grouped_counts.counts < df_grouped_counts.len]

# one row per roi named in a mismatched grouping, with its full record attached
df_coloc_mismatch = df_mismatched.explode('agg_grouping')[['image_name', 'agg_grouping']]\
    .drop_duplicates().rename(columns={'agg_grouping': 'roi_id'})\
    .merge(df_triple_agg_groupings, how='left', on=['image_name', 'roi_id'])

df_coloc_mismatch['implied_grouping'] = df_coloc_mismatch\
    .apply(lambda x: implied_grouping(df_triple_agg_groupings, x.image_name, x.roi_id), axis=1)

df_coloc_mismatch['len'] = df_coloc_mismatch.implied_grouping.apply(lambda x: len(x))
print(df_coloc_mismatch.len.value_counts(),'\n')

# split by the size of the implied grouping
df_coloc_mismatch_3way = df_coloc_mismatch.query('len == 3')
df_coloc_mismatch_4way = df_coloc_mismatch.query('len == 4')
df_coloc_mismatch_5way = df_coloc_mismatch.query('len == 5')

# check that the number of instances of each erroneous implied grouping is equal 
# to the size of that grouping (i.e., a mismatched implied grouping of size 4 should
# appear exactly 4 times, once for each of the roi_id's in the grouping)
assert df_coloc_mismatch_4way.groupby('implied_grouping').implied_grouping\
    .apply(lambda x: len(x)).unique().item() == 4
assert df_coloc_mismatch_5way.groupby('implied_grouping').implied_grouping\
    .apply(lambda x: len(x)).unique().item() == 5

# mismatch handling
# 3way case -- no 3way mismatches were observed in the triple labeled set, so
# this list may be empty; pd.concat raises on an empty list, hence the guard.
# (Previously this step concatenated the stale tie_broken_4ways list left over
# from the double labeled set and discarded the 3way results.)
mismatched_3ways = df_coloc_mismatch_3way.implied_grouping.unique()
tie_broken_3ways = [tie_breaker_3way(df_coloc_mismatch_3way, grp) for grp in mismatched_3ways]
df_3way_tiebreak = pd.concat(tie_broken_3ways) if tie_broken_3ways else pd.DataFrame()

# 4way case
mismatched_4ways = df_coloc_mismatch_4way.implied_grouping.unique()
tie_broken_4ways = [tie_breaker_4way(df_coloc_mismatch_4way, grp) for grp in mismatched_4ways]
df_4way_tiebreak = pd.concat(tie_broken_4ways)

# 5way case
mismatched_5ways = df_coloc_mismatch_5way.implied_grouping.unique()
tie_broken_5ways = [tie_breaker_5way(df_coloc_mismatch_5way, grp) for grp in mismatched_5ways]
df_5way_tiebreak = pd.concat(tie_broken_5ways)

# final concatenation; 3way results are included whenever any exist, and empty
# frames are skipped so they cannot affect the dtypes of the result
df_tiebreak = pd.concat(
    [df for df in (df_3way_tiebreak, df_4way_tiebreak, df_5way_tiebreak) if not df.empty]
)

# build new id col to merge on
df_tiebreak['iid_rid'] = df_tiebreak[['image_name', 'roi_id']].agg('_'.join, axis=1)
df_triple_agg_groupings['iid_rid'] = df_triple_agg_groupings[['image_name', 'roi_id']].agg('_'.join, axis=1)

# merge updates (left merge: rois that were not tie-broken get no updated_grouping
# and fall back to their aggregated grouping)
df_trpl_true = df_triple_agg_groupings.merge(df_tiebreak[['iid_rid', 'updated_grouping']].copy(), how='left', on='iid_rid')
df_trpl_true['true_grouping'] = df_trpl_true.updated_grouping.fillna(df_trpl_true.agg_grouping)

# get dummies
df_trpl_true['dummy'] = df_trpl_true.true_grouping.apply(get_dummies)
df_trpl_true['dummy_cFos'], df_trpl_true['dummy_EGFP'], df_trpl_true['dummy_mKate'] = zip(*df_trpl_true['dummy'])

# reorder (this also drops every column not listed here)
df_trpl_true = df_trpl_true['iid_rid dummy_cFos dummy_EGFP dummy_mKate image_name roi_id stain CoM_x CoM_y background mean_intensity filename rat_n react treatment dox grouping agg_grouping updated_grouping true_grouping'.split()]
df_trpl_true

# check if our cell counts agree
print()
check_double_diff(df_trpl_true)

for comb in itertools.combinations(['cFos', 'EGFP', 'mKate'], r=3):
    check_triple_ns(comb, df_trpl_true)

# write the table derived from the triple labeled set to disk
df_trpl_true.to_csv('RAM-12,13,14_FINAL_from_triples.csv')

len
5    5
4    4
Name: count, dtype: int64 

0-01c-00102_EGFP and 0-01c-00145_EGFP cannot be colocalized; skipping distance computation for this pair
0-FFF-00315_cFos and 0-FFF-00316_cFos cannot be colocalized; skipping distance computation for this pair
0-FFF-00247_EGFP and 0-01c-00051_EGFP cannot be colocalized; skipping distance computation for this pair

double labeled differences: 
cFos, EGFP:    0
cFos, mKate:    0
EGFP, mKate:    0

triple cFos,EGFP,mKate ns:
cFos  : 1162
EGFP  : 1162
mKate  : 1162
