In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import seaborn as sns

# loading some functions we wrote before
sys.path.append("/Users/jonathanramos/Desktop/LRI/Image ROI Data Wrangling/")
from clean import *
from norm import *
from count import *

In [2]:
def add_path(df, f):
    df['path'] = f

    return df

def parse_path(df):
    p = df.path.unique()[0]
    p = p.split('{')[-1].split('}')[0].replace('c-Fos', 'cFos')\
        .replace('Somatic', 'mKate').replace('with','w')
    df['coloc'] = p

    return df

def get_stain(df):
    fname = df.FileName.unique()[0]
    suffix = fname.split('_')[-1]

    if '2' in suffix:
        stain = 'cFos'
    elif '3' in suffix:
        stain = 'EGFP'
    elif '4' in suffix:
        stain = 'mKate'
    df['Stain'] = stain

    return df

def get_image_name(df):
    fname = df.FileName.unique()[0]
    image_name = '_'.join(fname.split('_')[:-1])
    df['image_name'] = image_name

    return df

def get_coord(df):
    df['coord'] = list(zip(df.XM, df.YM))

    return df

def build_roi_id(df):
    df['CellNumber'] = df['CellNumber'].astype(str)
    df['roi_id'] = df[['image_name', 'Stain', 'CellNumber']].agg('_'.join, axis=1)

    return df

single = [(pd.read_csv(f).dropna(subset=['XM', 'YM']).drop_duplicates(subset=['Mean', 'XM', 'YM']), f) for f in glob.glob('RAM-RE_DO/single-label_results/*.csv')]
double = [(pd.read_csv(f).dropna(subset=['XM', 'YM']).drop_duplicates(subset=['Mean', 'XM', 'YM']), f) for f in glob.glob('RAM-RE_DO/double-label_results/*.csv')]
triple = [(pd.read_csv(f).dropna(subset=['XM', 'YM']).drop_duplicates(subset=['Mean', 'XM', 'YM']), f) for f in glob.glob('RAM-RE_DO/triple-label_results/*.csv')]

single = [parse_path(add_path(df, f)) for df, f in single]
double = [parse_path(add_path(df, f)) for df, f in double]
triple = [parse_path(add_path(df, f)) for df, f in triple]

# only build roi_ids for singles, then map doubles/triples to single by (x,y) coords
single = [build_roi_id(get_coord(get_image_name(get_stain(df)))) for df in single]
double = [get_coord(get_image_name(get_stain(df))) for df in double]
triple = [get_coord(get_image_name(get_stain(df))) for df in triple]

In [3]:
def get_df(iid, stain):
    assert iid in set(['RAM-12_14_PFC_4.0_B', 'RAM-14b_3_PFC_3.7_B', 'RAM-14b_4_PFC_3.0_D', 'RAM-12_16_PFC_3.7_C'])
    assert stain in set(['cFos', 'EGFP', 'mKate'])

    target = [df for df in single if iid in set(df.image_name) and stain in set(df.Stain)]
    assert len(target) == 1

    return target[0]

def iid_stain(df):
    iid = df.image_name.unique()
    stain = df.Stain.unique()
    assert len(iid) == 1
    assert len(stain) == 1

    return iid[0], stain[0]

def map_rid(df):
    iid, stain = iid_stain(df)
    df_target = get_df(iid, stain)
    df = df.merge(df_target[['coord', 'roi_id']].copy(), how='left', on='coord').dropna(subset=['Mean', 'XM', 'YM'])
    assert df.roi_id.isnull().sum() == 0

    return df

double = [map_rid(df) for df in double]
triple = [map_rid(df) for df in triple]

# Pairing up triples

In [4]:
df_triple = pd.concat(triple)
imgs = df_triple.image_name.unique()

paired_triples = []
for img in imgs:
    df = df_triple[df_triple.image_name == img].copy()
    df.Stain.unique()

    df_X = df[df.Stain == 'cFos'].copy()
    df_Y = df[df.Stain == 'EGFP'].copy()
    df_Z = df[df.Stain == 'mKate'].copy()


    # toss coords into array
    curr_coords = np.array(list(df_X.coord))
    target_coords = np.array(list(df_Y.coord))

    # compute loss matrix for optimization
    # this is a square matrix of distances (every point compared with ever other point)
    C = cdist(curr_coords, target_coords)

    # optimize: what combination of pairs yields the smallest sum of distances?
    row_ind, col_ind = linear_sum_assignment(C)

    # toss zip up winning pairs
    pairs = [(tuple(curr_coords[r_i]), tuple(target_coords[c_i])) for r_i, c_i in zip(row_ind, col_ind)]

    # map coords and roi_ids of paired cells to our original df_X
    d_pairs = dict(pairs)
    df_X['target_coord_1'] = df_X.coord.apply(lambda x: d_pairs[x])

    d_rid = dict(zip(df_Y.coord, df_Y.roi_id))
    df_X['target_roi_id_1'] = df_X.target_coord_1.apply(lambda x: d_rid[x])

    ###### REPEAT FOR SECOND STAIN 
    # for brevity we will assume that if x is coloc to y and x is coloc to z, then y is coloc to z
    # toss coords into array
    curr_coords = np.array(list(df_X.coord))
    target_coords = np.array(list(df_Z.coord))

    # compute loss matrix for optimization
    # this is a square matrix of distances (every point compared with ever other point)
    C = cdist(curr_coords, target_coords)

    # optimize: what combination of pairs yields the smallest sum of distances?
    row_ind, col_ind = linear_sum_assignment(C)

    # toss zip up winning pairs
    pairs = [(tuple(curr_coords[r_i]), tuple(target_coords[c_i])) for r_i, c_i in zip(row_ind, col_ind)]

    # map coords and roi_ids of paired cells to our original df_X
    d_pairs = dict(pairs)
    df_X['target_coord_2'] = df_X.coord.apply(lambda x: d_pairs[x] if x in d_pairs else np.nan)

    d_rid = dict(zip(df_Z.coord, df_Z.roi_id))
    df_X['target_roi_id_2'] = df_X.target_coord_2.apply(lambda x: d_rid[x] if x in d_rid else np.nan)

    # in this case of unequal len, remove unpaired cells; these are not true triple labeled cells.
    df_X = df_X.dropna(subset=['target_coord_1', 'target_coord_2']).copy()

    # build grouping col
    df_X['grouping'] = df_X.apply(lambda x: (x.roi_id, x.target_roi_id_1, x.target_roi_id_2), axis=1)

    grouping = df_X.grouping.values

    def map_groupings(rid, groups):
        match = []
        for g in groups:
            if rid in g:
                match.append(g)

        if len(match) > 0:
            assert len(match) == 1
            return match[0]
        
        else:
            return np.nan


    df['grouping'] = df.apply(lambda x: map_groupings(x.roi_id, grouping), axis=1)

    paired_triples.append(df)

df_triple_adj = pd.concat(paired_triples)

# missing groupings indicate that grouping was incomplete
# let's only keep complete groupings
df_triple_adj = df_triple_adj.dropna(subset='grouping')
df_triple_adj = df_triple_adj.drop(columns=['coloc'])

print(df_triple_adj.shape)
df_triple_adj.head()

(87, 28)


Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,XM,YM,Circ.,AR,...,Experiment,Subject,Indi_var_group,FileName,AnalysisDate,path,image_name,coord,roi_id,grouping
0,1,49.802,568.279,277.201,142.351,1029.517,221.609,203.01,0.819,1.083,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/triple-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(221.609, 203.01)",RAM-14b_4_PFC_3.0_D_cFos_3,"(RAM-14b_4_PFC_3.0_D_cFos_3, RAM-14b_4_PFC_3.0..."
1,2,66.637,662.647,326.518,146.344,1305.305,202.968,275.929,0.996,1.169,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/triple-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(202.968, 275.929)",RAM-14b_4_PFC_3.0_D_cFos_4,"(RAM-14b_4_PFC_3.0_D_cFos_4, RAM-14b_4_PFC_3.0..."
2,3,63.831,589.014,325.642,146.243,1275.614,265.164,316.104,0.958,1.0,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/triple-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(265.164, 316.104)",RAM-14b_4_PFC_3.0_D_cFos_147,"(RAM-14b_4_PFC_3.0_D_cFos_147, RAM-14b_4_PFC_3..."
3,5,34.371,538.963,290.646,152.435,969.692,225.799,43.66,0.565,1.083,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/triple-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(225.799, 43.66)",RAM-14b_4_PFC_3.0_D_cFos_182,"(RAM-14b_4_PFC_3.0_D_cFos_182, RAM-14b_4_PFC_3..."
4,6,56.817,305.969,118.564,147.07,608.11,200.064,121.803,0.934,1.083,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/triple-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(200.064, 121.803)",RAM-14b_4_PFC_3.0_D_cFos_207,"(RAM-14b_4_PFC_3.0_D_cFos_207, RAM-14b_4_PFC_3..."


# Pairing up doubles

In [5]:
def flip_coloc(df):
    iid = df.image_name.unique()
    coloc = df.coloc.unique()
    assert len(iid) == 1
    assert len(coloc) == 1

    iid, coloc = iid[0], coloc[0]
    X, Y = tuple(coloc.split('_coloc_w_'))
    flip = f'{Y}_coloc_w_{X}'

    return iid, flip

def get_inverse(iid, coloc):
    assert iid in set(['RAM-12_14_PFC_4.0_B', 'RAM-14b_3_PFC_3.7_B', 'RAM-14b_4_PFC_3.0_D', 'RAM-12_16_PFC_3.7_C'])
    assert coloc in set(['cFos_coloc_w_EGFP', 'cFos_coloc_w_mKate', 'EGFP_coloc_w_mKate',\
       'mKate_coloc_w_cFos', 'EGFP_coloc_w_cFos', 'mKate_coloc_w_EGFP'])

    target = [df for df in double if iid in set(df.image_name) and coloc in set(df.coloc)]
    assert len(target) == 1

    return target[0]

def dist(p1, p2):
    x1, y1 = p1
    x2, y2 = p2
    return np.sqrt((x2 - x1)**2 + (y2 - y1)**2)

def map_doubles(df, plot=True):
    # get inverse df of corresponding double label 
    # that is, if we are examining mKate on cFos, fetch cFos on mKate
    iid, f = flip_coloc(df)
    df_inverse = get_inverse(iid, f)

    # check that a given df and it's inverse are fully paired
    if not len(df) == len(df_inverse):
        print(iid, f)
        print(len(df) - len(df_inverse))

    # toss coords into array
    curr_coords = np.array(list(df.coord))
    target_coords = np.array(list(df_inverse.coord))

    # compute loss matrix for optimization
    # this is a square matrix of distances (every point compared with ever other point)
    C = cdist(curr_coords, target_coords)

    # optimize: what combination of pairs yields the smallest sum of distances?
    row_ind, col_ind = linear_sum_assignment(C)

    # toss zip up winning pairs
    pairs = [(tuple(curr_coords[r_i]), tuple(target_coords[c_i])) for r_i, c_i in zip(row_ind, col_ind)]

    # map coords and roi_ids of paired cells to our original df
    d_pairs = dict(pairs)
    df['target_coord'] = df.coord.apply(lambda x: d_pairs[x])

    d_rid = dict(zip(df_inverse.coord, df_inverse.roi_id))
    df['target_roi_id'] = df.target_coord.apply(lambda x: d_rid[x])

    # optional visualization, try checking plots for visual confirmation
    if plot == True:
        sns.set_theme()
        cmap = {'cFos':'bo', 'EGFP':'go', 'mKate':'ro'}
        target = f.split('_')[0]
        current = f.split('_')[-1]
        plt.plot([i[0] for i in curr_coords], [i[1] for i in curr_coords],cmap[current], markersize = 6, alpha=0.3, label=current)
        plt.plot([i[0] for i in target_coords], [i[1] for i in target_coords],cmap[target],  markersize = 6, alpha=0.3, label=target)
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
        plt.title(f'{df.path[0].split("/")[-1]}')
        for row_i, col_i in zip(row_ind, col_ind):
            plt.plot([curr_coords[row_i,0], target_coords[col_i,0]], [curr_coords[row_i,1], target_coords[col_i,1]], 'k', linewidth=2.0)

        ax = plt.gca()
        ax.set_xlim([0, 450])
        ax.set_ylim([0, 450])
        ax.set_aspect('equal')

        plt.savefig(f'{df.path[0].split("/")[-1]}'.replace('.csv', '.png'), dpi=700, bbox_inches='tight')
        plt.show()

    return df

mapped_doubles = [map_doubles(df, plot=False) for i, df in enumerate(double)] # enumeration is for debugging

In [6]:
df = mapped_doubles[0]

def coloc_cols(df):
    self_stain = df.Stain.unique()[0]
    target_stain = df.coloc.apply(lambda x: x.split('_')[-1]).unique()[0]

    df[f'coloc_w_{self_stain}'] = df.roi_id
    df[f'coloc_w_{target_stain}'] = df.target_roi_id

    return df

coloc_doubles = [coloc_cols(df) for df in mapped_doubles]

coloc_doubles[0].head()

Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,XM,YM,Circ.,AR,...,AnalysisDate,path,coloc,image_name,coord,roi_id,target_coord,target_roi_id,coloc_w_cFos,coloc_w_EGFP
0,1,49.802,568.279,277.201,142.351,1029.517,221.609,203.01,0.819,1.083,...,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,cFos_coloc_w_EGFP,RAM-14b_4_PFC_3.0_D,"(221.609, 203.01)",RAM-14b_4_PFC_3.0_D_cFos_3,"(221.929, 202.624)",RAM-14b_4_PFC_3.0_D_EGFP_15,RAM-14b_4_PFC_3.0_D_cFos_3,RAM-14b_4_PFC_3.0_D_EGFP_15
1,2,66.637,662.647,326.518,146.344,1305.305,202.968,275.929,0.996,1.169,...,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,cFos_coloc_w_EGFP,RAM-14b_4_PFC_3.0_D,"(202.968, 275.929)",RAM-14b_4_PFC_3.0_D_cFos_4,"(203.116, 275.689)",RAM-14b_4_PFC_3.0_D_EGFP_58,RAM-14b_4_PFC_3.0_D_cFos_4,RAM-14b_4_PFC_3.0_D_EGFP_58
2,3,43.489,587.897,308.04,141.933,1118.95,128.434,84.142,0.65,1.169,...,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,cFos_coloc_w_EGFP,RAM-14b_4_PFC_3.0_D,"(128.434, 84.142)",RAM-14b_4_PFC_3.0_D_cFos_15,"(126.186, 80.878)",RAM-14b_4_PFC_3.0_D_EGFP_132,RAM-14b_4_PFC_3.0_D_cFos_15,RAM-14b_4_PFC_3.0_D_EGFP_132
3,4,54.712,1068.078,488.961,150.64,1906.323,242.107,44.713,0.993,1.0,...,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,cFos_coloc_w_EGFP,RAM-14b_4_PFC_3.0_D,"(242.107, 44.713)",RAM-14b_4_PFC_3.0_D_cFos_44,"(242.126, 44.83)",RAM-14b_4_PFC_3.0_D_EGFP_123,RAM-14b_4_PFC_3.0_D_cFos_44,RAM-14b_4_PFC_3.0_D_EGFP_123
4,5,65.935,711.327,437.075,142.152,1438.77,126.99,176.164,0.904,1.134,...,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,cFos_coloc_w_EGFP,RAM-14b_4_PFC_3.0_D,"(126.99, 176.164)",RAM-14b_4_PFC_3.0_D_cFos_57,"(127.394, 176.967)",RAM-14b_4_PFC_3.0_D_EGFP_94,RAM-14b_4_PFC_3.0_D_cFos_57,RAM-14b_4_PFC_3.0_D_EGFP_94


In [7]:
sort_order = {'cFos':0, 'EGFP':1, 'mKate':2}
def get_adjacency(df, sort_order):
    df['grouping'] = df.apply(\
        lambda x: tuple([y for y in (x.loc[[col for col in df.columns if 'coloc_w' in col]]) \
        if not pd.isnull(y)]), axis=1)

    df['grouping'] = df.grouping.apply(lambda x: tuple(sorted([y for y in x],\
        key=lambda z: sort_order[z.split('_')[-2]])))

    return df.drop(columns=[col for col in df.columns if 'coloc' in col], axis=1)


df_double_adj = [get_adjacency(df, sort_order) for df in coloc_doubles]
df_double_adj = pd.concat(df_double_adj)
df_double_adj = df_double_adj.drop(columns=['target_coord', 'target_roi_id'])

print(df_double_adj.shape)
df_double_adj.head()

(812, 28)


Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,XM,YM,Circ.,AR,...,Experiment,Subject,Indi_var_group,FileName,AnalysisDate,path,image_name,coord,roi_id,grouping
0,1,49.802,568.279,277.201,142.351,1029.517,221.609,203.01,0.819,1.083,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(221.609, 203.01)",RAM-14b_4_PFC_3.0_D_cFos_3,"(RAM-14b_4_PFC_3.0_D_cFos_3, RAM-14b_4_PFC_3.0..."
1,2,66.637,662.647,326.518,146.344,1305.305,202.968,275.929,0.996,1.169,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(202.968, 275.929)",RAM-14b_4_PFC_3.0_D_cFos_4,"(RAM-14b_4_PFC_3.0_D_cFos_4, RAM-14b_4_PFC_3.0..."
2,3,43.489,587.897,308.04,141.933,1118.95,128.434,84.142,0.65,1.169,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(128.434, 84.142)",RAM-14b_4_PFC_3.0_D_cFos_15,"(RAM-14b_4_PFC_3.0_D_cFos_15, RAM-14b_4_PFC_3...."
3,4,54.712,1068.078,488.961,150.64,1906.323,242.107,44.713,0.993,1.0,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(242.107, 44.713)",RAM-14b_4_PFC_3.0_D_cFos_44,"(RAM-14b_4_PFC_3.0_D_cFos_44, RAM-14b_4_PFC_3...."
4,5,65.935,711.327,437.075,142.152,1438.77,126.99,176.164,0.904,1.134,...,RAM-14b,4D,,RAM-14b_4_PFC_3.0_D_2.tif,_2_7_2024_at_14-14,RAM-RE_DO/double-label_results/RAM-14b_4_PFC_3...,RAM-14b_4_PFC_3.0_D,"(126.99, 176.164)",RAM-14b_4_PFC_3.0_D_cFos_57,"(RAM-14b_4_PFC_3.0_D_cFos_57, RAM-14b_4_PFC_3...."


### aggregate across double images

In [8]:
df_double_agg = df_double_adj.groupby('roi_id')['grouping'].sum()\
    .apply(lambda x: tuple(sorted(list(np.unique(x)), key=lambda y: sort_order[y.split('_')[-2]])))\
    .reset_index().rename(columns={'grouping': 'agg_grouping'})

df_double_agg

Unnamed: 0,roi_id,agg_grouping
0,RAM-12_14_PFC_4.0_B_EGFP_10,"(RAM-12_14_PFC_4.0_B_EGFP_10, RAM-12_14_PFC_4...."
1,RAM-12_14_PFC_4.0_B_EGFP_101,"(RAM-12_14_PFC_4.0_B_cFos_92, RAM-12_14_PFC_4...."
2,RAM-12_14_PFC_4.0_B_EGFP_106,"(RAM-12_14_PFC_4.0_B_EGFP_106, RAM-12_14_PFC_4..."
3,RAM-12_14_PFC_4.0_B_EGFP_11,"(RAM-12_14_PFC_4.0_B_EGFP_11, RAM-12_14_PFC_4...."
4,RAM-12_14_PFC_4.0_B_EGFP_111,"(RAM-12_14_PFC_4.0_B_cFos_83, RAM-12_14_PFC_4...."
...,...,...
704,RAM-14b_4_PFC_3.0_D_mKate_88,"(RAM-14b_4_PFC_3.0_D_cFos_99, RAM-14b_4_PFC_3...."
705,RAM-14b_4_PFC_3.0_D_mKate_91,"(RAM-14b_4_PFC_3.0_D_cFos_238, RAM-14b_4_PFC_3..."
706,RAM-14b_4_PFC_3.0_D_mKate_94,"(RAM-14b_4_PFC_3.0_D_cFos_166, RAM-14b_4_PFC_3..."
707,RAM-14b_4_PFC_3.0_D_mKate_97,"(RAM-14b_4_PFC_3.0_D_EGFP_29, RAM-14b_4_PFC_3...."


In [9]:
# check that all the triple labeled cells are in the double labeled set
assert set(df_triple_adj.roi_id).issubset(set(df_double_agg.roi_id))

# if roi_id is actually triple, replace double grouping with triple grouping
d_triple = dict(zip(df_triple_adj.roi_id, df_triple_adj.grouping))
df_double_agg['updated_grouping'] = df_double_agg.apply(lambda x: d_triple[x.roi_id] if x.roi_id in d_triple else x.agg_grouping, axis=1)
df_double_agg

np.unique(df_double_agg.updated_grouping.apply(len), return_counts=True)

(array([2, 3]), array([594, 115]))

### map agg groupings back into single set

In [10]:
df_single = pd.concat(single)

# build dict to map agg groupings
d_double_agg = dict(zip(df_double_agg.roi_id.values, df_double_agg.updated_grouping))

# if roi_id is in d_double_agg, then fetch the agg_grouping,
# build new grouping tuple consiting of only (roi_id,) otherwise
df_single['agg_grouping'] = df_single.apply(lambda x: d_double_agg[x.roi_id] if x.roi_id in d_double_agg else (x.roi_id,), axis=1)
df_agg = df_single

print(np.unique(df_agg.agg_grouping.apply(len), return_counts=True))
print(df_agg.shape)
df_agg.head()

(array([1, 2, 3]), array([1521,  594,  115]))
(2230, 29)


Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,XM,YM,Circ.,AR,...,Subject,Ind_var_group,FileName,AnalysisDate,path,coloc,image_name,coord,roi_id,agg_grouping
0,1,54.011,308.266,108.432,83.029,495.232,396.394,284.255,0.578,1.144,...,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(396.394, 284.255)",RAM-12_14_PFC_4.0_B_cFos_1,"(RAM-12_14_PFC_4.0_B_cFos_1,)"
1,2,44.191,346.999,137.648,97.663,617.302,324.225,270.936,0.802,1.0,...,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(324.225, 270.936)",RAM-12_14_PFC_4.0_B_cFos_2,"(RAM-12_14_PFC_4.0_B_cFos_2, RAM-12_14_PFC_4.0..."
2,3,82.77,476.138,244.242,100.101,1151.172,267.948,242.179,0.943,1.487,...,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(267.948, 242.179)",RAM-12_14_PFC_4.0_B_cFos_3,"(RAM-12_14_PFC_4.0_B_cFos_3, RAM-12_14_PFC_4.0..."
3,4,44.892,235.868,101.689,80.612,437.505,221.637,391.785,0.738,1.083,...,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(221.637, 391.785)",RAM-12_14_PFC_4.0_B_cFos_4,"(RAM-12_14_PFC_4.0_B_cFos_4,)"
4,5,42.086,243.73,107.312,84.139,466.085,149.162,198.285,0.937,1.302,...,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(149.162, 198.285)",RAM-12_14_PFC_4.0_B_cFos_5,"(RAM-12_14_PFC_4.0_B_cFos_5, RAM-12_14_PFC_4.0..."


# Enforcing complete subgraph colocalization

In [11]:
df_grouped_counts = df_agg.groupby(['image_name', 'agg_grouping'])['agg_grouping']\
    .count().rename('counts').to_frame()\
    .reset_index().reset_index().drop('index', axis=1)
df_grouped_counts['len'] = df_grouped_counts.agg_grouping.apply(lambda x: len(x))

# if a grouping's length (the number of roi ids listed in the tuple) is equal to
# the number of times it appears in a given image, that grouping is plausible
# that is, if some mKate cell points to some cFos cell, and that cFos cell points
# to the same mKate cell, then that adjacency tuple must appear exactly twice in
# the given image (qualify per image here since roi_ids start from 0 for each image)

# now lets consider the case wher the counts and the lengths do not match. This 
# mismatch means that either a row was duplicated (counts > len) or that the subgraph
# defined by its adjacency tuple is not complete (counts < len); i.e. some mKate cell
# points to some cFos cell, but that cFos cell says it's single labeled. 

# duplicates where already dropped so we expect this length (counts > len) to be exactly 0
assert len(df_grouped_counts[df_grouped_counts.counts > df_grouped_counts.len]) == 0

# lets examine only cases of incomplete subgraphs
df_mismatched = df_grouped_counts[df_grouped_counts.counts < df_grouped_counts.len]

print(np.unique(df_mismatched.len, return_counts=True))
print(df_mismatched.shape)
df_mismatched.head()

(array([2, 3]), array([14,  7]))
(21, 4)


Unnamed: 0,image_name,agg_grouping,counts,len
68,RAM-12_14_PFC_4.0_B,"(RAM-12_14_PFC_4.0_B_EGFP_173, RAM-12_14_PFC_4...",1,2
349,RAM-12_14_PFC_4.0_B,"(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...",1,3
350,RAM-12_14_PFC_4.0_B,"(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...",1,2
650,RAM-12_16_PFC_3.7_C,"(RAM-12_16_PFC_3.7_C_EGFP_63, RAM-12_16_PFC_3....",1,2
692,RAM-12_16_PFC_3.7_C,"(RAM-12_16_PFC_3.7_C_cFos_13, RAM-12_16_PFC_3....",1,2


### explode out all roi_ids contained in adjacency grouping tuples, merge with other data cols

In [12]:
df_coloc_mismatch = df_mismatched.explode('agg_grouping')[['image_name', 'agg_grouping']]\
    .drop_duplicates().rename(columns={'agg_grouping': 'roi_id'})\
    .merge(df_agg, how='left', on=['image_name', 'roi_id'])
    # .dropna()

print(df_coloc_mismatch.shape)
df_coloc_mismatch.head()

(21, 29)


Unnamed: 0,image_name,roi_id,Unnamed: 3,Area,Mean,StdDev,Min,Max,XM,YM,...,roiName,Experiment,Subject,Ind_var_group,FileName,AnalysisDate,path,coloc,coord,agg_grouping
0,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_EGFP_173,173,44.892,125.016,53.772,35.615,232.081,284.334,149.083,...,0b-00173,RAM-12,14b,,RAM-12_14_PFC_4.0_B_3,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,EGFP,"(284.334, 149.083)","(RAM-12_14_PFC_4.0_B_EGFP_173, RAM-12_14_PFC_4..."
1,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_mKate_43,43,37.176,91.7,45.727,21.07,251.984,286.474,149.1,...,0c-00043,RAM-12,14b,,RAM-12_14_PFC_4.0_B_4,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,mKate,"(286.474, 149.1)","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0..."
2,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_7,7,34.371,762.828,497.563,89.197,1646.951,287.023,149.117,...,0a-00007,RAM-12,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,"(287.023, 149.117)","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0..."
3,RAM-12_16_PFC_3.7_C,RAM-12_16_PFC_3.7_C_EGFP_63,63,86.979,1025.68,329.847,476.699,1774.115,68.261,326.416,...,0b-00063,RAM-12,16C,,RAM-12_16_PFC_3.7_C_3,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_16_PFC_3...,EGFP,"(68.261, 326.416)","(RAM-12_16_PFC_3.7_C_EGFP_63, RAM-12_16_PFC_3...."
4,RAM-12_16_PFC_3.7_C,RAM-12_16_PFC_3.7_C_mKate_95,95,91.187,1236.296,982.475,36.076,2860.279,64.557,325.333,...,0c-00095,RAM-12,16C,,RAM-12_16_PFC_3.7_C_4,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_16_PFC_3...,mKate,"(64.557, 325.333)","(RAM-12_16_PFC_3.7_C_cFos_99, RAM-12_16_PFC_3...."


### Get implied groupings for mismatched subgraphs

In [13]:
def implied_grouping(df, im, rid):
    implied_adj = [rid]
    updated_adj = []

    while set(implied_adj) != set(updated_adj):
        implied_adj += updated_adj

        for r in implied_adj:
            q = f"image_name == '{im}' and roi_id == '{r}'"
            neighbors = df.query(q)['agg_grouping']

            try: 
                neighbors = neighbors.item() 
            except: 
                print(q)
                print(neighbors,'\n')
                return('network search failed')

            for n in neighbors:
                updated_adj.append(n)

    return tuple(sorted(sorted(list(set(implied_adj))), key=lambda x: sort_order[x.split('_')[-2]]))

df_coloc_mismatch['implied_grouping'] = df_coloc_mismatch\
    .apply(lambda x: implied_grouping(df_agg, x.image_name, x.roi_id), axis=1)

### Consider differently sized groups of mismatched roi_ids seperately

In [14]:
df_coloc_mismatch['len'] = df_coloc_mismatch.implied_grouping.apply(lambda x: len(x))
print(df_coloc_mismatch.len.value_counts())

df_coloc_mismatch_3way = df_coloc_mismatch.query('len == 3')
df_coloc_mismatch_4way = df_coloc_mismatch.query('len == 4')
df_coloc_mismatch_5way = df_coloc_mismatch.query('len == 5')

# check that the number of instances of each erronous implied grouping is equal 
# to the size of that grouping (i.e., an mismatched implied grouping of size 4 should
# appear exactly 4 times, once for each of the roi_id's in the grouping)
assert df_coloc_mismatch_3way.groupby('implied_grouping').implied_grouping\
    .apply(lambda x: len(x)).unique().item() == 3

### luckily we did not see groups of 4 or 5 in this case 
### so we only have to consider the 3way case
# assert df_coloc_mismatch_4way.groupby('implied_grouping').implied_grouping\
#     .apply(lambda x: len(x)).unique().item() == 4
# assert df_coloc_mismatch_5way.groupby('implied_grouping').implied_grouping\
#     .apply(lambda x: len(x)).unique().item() == 5

df_coloc_mismatch_3way.head()

len
3    21
Name: count, dtype: int64


Unnamed: 0,image_name,roi_id,Unnamed: 3,Area,Mean,StdDev,Min,Max,XM,YM,...,Subject,Ind_var_group,FileName,AnalysisDate,path,coloc,coord,agg_grouping,implied_grouping,len
0,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_EGFP_173,173,44.892,125.016,53.772,35.615,232.081,284.334,149.083,...,14b,,RAM-12_14_PFC_4.0_B_3,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,EGFP,"(284.334, 149.083)","(RAM-12_14_PFC_4.0_B_EGFP_173, RAM-12_14_PFC_4...","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...",3
1,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_mKate_43,43,37.176,91.7,45.727,21.07,251.984,286.474,149.1,...,14b,,RAM-12_14_PFC_4.0_B_4,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,mKate,"(286.474, 149.1)","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...",3
2,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_7,7,34.371,762.828,497.563,89.197,1646.951,287.023,149.117,...,14b,,RAM-12_14_PFC_4.0_B_2,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,"(287.023, 149.117)","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...","(RAM-12_14_PFC_4.0_B_cFos_7, RAM-12_14_PFC_4.0...",3
3,RAM-12_16_PFC_3.7_C,RAM-12_16_PFC_3.7_C_EGFP_63,63,86.979,1025.68,329.847,476.699,1774.115,68.261,326.416,...,16C,,RAM-12_16_PFC_3.7_C_3,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_16_PFC_3...,EGFP,"(68.261, 326.416)","(RAM-12_16_PFC_3.7_C_EGFP_63, RAM-12_16_PFC_3....","(RAM-12_16_PFC_3.7_C_cFos_99, RAM-12_16_PFC_3....",3
4,RAM-12_16_PFC_3.7_C,RAM-12_16_PFC_3.7_C_mKate_95,95,91.187,1236.296,982.475,36.076,2860.279,64.557,325.333,...,16C,,RAM-12_16_PFC_3.7_C_4,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_16_PFC_3...,mKate,"(64.557, 325.333)","(RAM-12_16_PFC_3.7_C_cFos_99, RAM-12_16_PFC_3....","(RAM-12_16_PFC_3.7_C_cFos_99, RAM-12_16_PFC_3....",3


### the 3way case

In [15]:
import itertools

def dist(p1, p2):
    x1, y1 = p1
    x2, y2 = p2
    return np.sqrt((x2 - x1)**2 + (y2 - y1)**2)

def tie_breaker_3way(df_3way, current_grp):
    grp = df_3way[df_3way['implied_grouping'] == current_grp].copy(deep=True)
    coords = dict(zip(grp.roi_id,list(zip(grp.XM.values, grp.YM.values))))

    distances = []
    for p1, p2 in itertools.combinations(coords.keys(), 2):
        stain_type1 = p1.split('_')[-2]
        stain_type2 = p2.split('_')[-2]
        if not stain_type1 == stain_type2:
            distances.append(((p1, p2), dist(coords[p1], coords[p2])))
        else:
            print(f'{p1} and {p2} cannot be colocalized; skipping distance computation for this pair')

    d = dict(distances)
    winner = set(min(d, key=d.get))
    leftover = set(current_grp) - winner

    winner = tuple(sorted([rid for rid in winner], key=lambda x: sort_order[x.split('_')[-2]]))
    leftover = tuple(leftover)

    # update groupings
    grp['updated_grouping'] = grp.apply(lambda x: winner if x.roi_id in winner else leftover, axis=1)

    return grp

mismatched_3ways = df_coloc_mismatch_3way.implied_grouping.unique()
tie_broken_3ways = [tie_breaker_3way(df_coloc_mismatch_3way, grp) for grp in mismatched_3ways]
df_3way_tiebreak = pd.concat(tie_broken_3ways)

### updated groupiongs with our new true groupings

In [16]:
df_tiebreak = df_3way_tiebreak
df_tiebreak['iid_rid'] = df_tiebreak[['image_name', 'roi_id']].agg('_'.join, axis=1)
df_agg['iid_rid'] = df_agg[['image_name', 'roi_id']].agg('_'.join, axis=1)

df_true = df_agg.merge(df_tiebreak[['iid_rid', 'updated_grouping']].copy(), how='left', on='iid_rid')
df_true['true_grouping'] = df_true.updated_grouping.fillna(df_true.agg_grouping)

print(df_true.shape)
df_true.head()

(2230, 32)


Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,XM,YM,Circ.,AR,...,AnalysisDate,path,coloc,image_name,coord,roi_id,agg_grouping,iid_rid,updated_grouping,true_grouping
0,1,54.011,308.266,108.432,83.029,495.232,396.394,284.255,0.578,1.144,...,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(396.394, 284.255)",RAM-12_14_PFC_4.0_B_cFos_1,"(RAM-12_14_PFC_4.0_B_cFos_1,)",RAM-12_14_PFC_4.0_B_RAM-12_14_PFC_4.0_B_cFos_1,,"(RAM-12_14_PFC_4.0_B_cFos_1,)"
1,2,44.191,346.999,137.648,97.663,617.302,324.225,270.936,0.802,1.0,...,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(324.225, 270.936)",RAM-12_14_PFC_4.0_B_cFos_2,"(RAM-12_14_PFC_4.0_B_cFos_2, RAM-12_14_PFC_4.0...",RAM-12_14_PFC_4.0_B_RAM-12_14_PFC_4.0_B_cFos_2,,"(RAM-12_14_PFC_4.0_B_cFos_2, RAM-12_14_PFC_4.0..."
2,3,82.77,476.138,244.242,100.101,1151.172,267.948,242.179,0.943,1.487,...,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(267.948, 242.179)",RAM-12_14_PFC_4.0_B_cFos_3,"(RAM-12_14_PFC_4.0_B_cFos_3, RAM-12_14_PFC_4.0...",RAM-12_14_PFC_4.0_B_RAM-12_14_PFC_4.0_B_cFos_3,,"(RAM-12_14_PFC_4.0_B_cFos_3, RAM-12_14_PFC_4.0..."
3,4,44.892,235.868,101.689,80.612,437.505,221.637,391.785,0.738,1.083,...,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(221.637, 391.785)",RAM-12_14_PFC_4.0_B_cFos_4,"(RAM-12_14_PFC_4.0_B_cFos_4,)",RAM-12_14_PFC_4.0_B_RAM-12_14_PFC_4.0_B_cFos_4,,"(RAM-12_14_PFC_4.0_B_cFos_4,)"
4,5,42.086,243.73,107.312,84.139,466.085,149.162,198.285,0.937,1.302,...,_2_7_2024_at_14-14,RAM-RE_DO/single-label_results/RAM-12_14_PFC_4...,cFos,RAM-12_14_PFC_4.0_B,"(149.162, 198.285)",RAM-12_14_PFC_4.0_B_cFos_5,"(RAM-12_14_PFC_4.0_B_cFos_5, RAM-12_14_PFC_4.0...",RAM-12_14_PFC_4.0_B_RAM-12_14_PFC_4.0_B_cFos_5,,"(RAM-12_14_PFC_4.0_B_cFos_5, RAM-12_14_PFC_4.0..."


In [17]:
def get_dummies(x):
    groupings = [rid.split('_')[-2] for rid in x]

    dummy_cFos = False
    dummy_EGFP = False
    dummy_mKate = False

    if 'cFos' in groupings:
        dummy_cFos = True
    if 'EGFP' in groupings:
        dummy_EGFP = True
    if 'mKate' in groupings:
        dummy_mKate = True

    return dummy_cFos, dummy_EGFP, dummy_mKate

df_true.columns = [col.lower() for col in df_true.columns]
df_true['dummy'] = df_true.true_grouping.apply(get_dummies)
df_true['dummy_cFos'], df_true['dummy_EGFP'], df_true['dummy_mKate'] = zip(*df_true['dummy'])


# reorder cols
#['stain', 'CoM_x', 'CoM_y', 'background', 'mean_intensity', 'filename', 'rat_n', 'react', 'treatment', 'dox', 'grouping']
df_true = df_true['roi_id dummy_cFos dummy_EGFP dummy_mKate image_name roi_id stain xm ym background mean filename agg_grouping updated_grouping true_grouping'.split()]
df_true

Unnamed: 0,roi_id,dummy_cFos,dummy_EGFP,dummy_mKate,image_name,roi_id.1,stain,xm,ym,background,mean,filename,agg_grouping,updated_grouping,true_grouping
0,RAM-12_14_PFC_4.0_B_cFos_1,True,False,False,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_1,cFos,396.394,284.255,80.520,308.266,RAM-12_14_PFC_4.0_B_2,"(RAM-12_14_PFC_4.0_B_cFos_1,)",,"(RAM-12_14_PFC_4.0_B_cFos_1,)"
1,RAM-12_14_PFC_4.0_B_cFos_2,True,True,False,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_2,cFos,324.225,270.936,80.520,346.999,RAM-12_14_PFC_4.0_B_2,"(RAM-12_14_PFC_4.0_B_cFos_2, RAM-12_14_PFC_4.0...",,"(RAM-12_14_PFC_4.0_B_cFos_2, RAM-12_14_PFC_4.0..."
2,RAM-12_14_PFC_4.0_B_cFos_3,True,True,False,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_3,cFos,267.948,242.179,80.520,476.138,RAM-12_14_PFC_4.0_B_2,"(RAM-12_14_PFC_4.0_B_cFos_3, RAM-12_14_PFC_4.0...",,"(RAM-12_14_PFC_4.0_B_cFos_3, RAM-12_14_PFC_4.0..."
3,RAM-12_14_PFC_4.0_B_cFos_4,True,False,False,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_4,cFos,221.637,391.785,80.520,235.868,RAM-12_14_PFC_4.0_B_2,"(RAM-12_14_PFC_4.0_B_cFos_4,)",,"(RAM-12_14_PFC_4.0_B_cFos_4,)"
4,RAM-12_14_PFC_4.0_B_cFos_5,True,True,False,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_cFos_5,cFos,149.162,198.285,80.520,243.730,RAM-12_14_PFC_4.0_B_2,"(RAM-12_14_PFC_4.0_B_cFos_5, RAM-12_14_PFC_4.0...",,"(RAM-12_14_PFC_4.0_B_cFos_5, RAM-12_14_PFC_4.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2225,RAM-12_14_PFC_4.0_B_mKate_45,False,False,True,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_mKate_45,mKate,126.038,396.117,19.531,221.307,RAM-12_14_PFC_4.0_B_4,"(RAM-12_14_PFC_4.0_B_mKate_45,)",,"(RAM-12_14_PFC_4.0_B_mKate_45,)"
2226,RAM-12_14_PFC_4.0_B_mKate_46,False,True,True,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_mKate_46,mKate,20.795,385.174,19.531,513.845,RAM-12_14_PFC_4.0_B_4,"(RAM-12_14_PFC_4.0_B_EGFP_300, RAM-12_14_PFC_4...",,"(RAM-12_14_PFC_4.0_B_EGFP_300, RAM-12_14_PFC_4..."
2227,RAM-12_14_PFC_4.0_B_mKate_47,False,True,True,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_mKate_47,mKate,48.338,409.231,19.531,82.007,RAM-12_14_PFC_4.0_B_4,"(RAM-12_14_PFC_4.0_B_EGFP_299, RAM-12_14_PFC_4...",,"(RAM-12_14_PFC_4.0_B_EGFP_299, RAM-12_14_PFC_4..."
2228,RAM-12_14_PFC_4.0_B_mKate_48,False,False,True,RAM-12_14_PFC_4.0_B,RAM-12_14_PFC_4.0_B_mKate_48,mKate,79.833,159.605,19.531,71.338,RAM-12_14_PFC_4.0_B_4,"(RAM-12_14_PFC_4.0_B_mKate_48,)",,"(RAM-12_14_PFC_4.0_B_mKate_48,)"


In [18]:
# do our doubles agree?
def check_double_diff(df_true):
    print('double labeled differences: ')
    for stain_x, stain_y in itertools.combinations(['cFos', 'EGFP', 'mKate'], r=2):
        x_on_y = df_true.query(f'dummy_{stain_x} == True and dummy_{stain_y} == True and stain == "{stain_x}"')
        y_on_x = df_true.query(f'dummy_{stain_x} == True and dummy_{stain_y} == True and stain == "{stain_y}"')
        diff = x_on_y.__len__() - y_on_x.__len__()
        print(f'{stain_x}, {stain_y}:    difference: {diff};    count: {x_on_y.__len__()}')

# do our triples agree?
def check_triple_ns(comb, df_true):
    stain_x, stain_y, stain_z = comb
    q = df_true.query(
        f'dummy_{stain_x} == True and dummy_{stain_y} == True and dummy_{stain_z} == True and\
         (stain == "{stain_x}" or stain == "{stain_y}" or stain == "{stain_z}")'
    )

    q_x = q.query(f'stain == "{stain_x}"')
    q_y = q.query(f'stain == "{stain_y}"')
    q_z = q.query(f'stain == "{stain_z}"')

    print(f'\ntriple {stain_x},{stain_y},{stain_z} ns:')
    print(stain_x, ' :', q_x.__len__())
    print(stain_y, ' :', q_y.__len__())
    print(stain_z, ' :', q_z.__len__())

check_double_diff(df_true)

for comb in itertools.combinations(['cFos', 'EGFP', 'mKate'], r=3):
    check_triple_ns(comb, df_true)

double labeled differences: 
cFos, EGFP:    difference: 0;    count: 166
cFos, mKate:    difference: 0;    count: 93
EGFP, mKate:    difference: 0;    count: 146

triple cFos,EGFP,mKate ns:
cFos  : 36
EGFP  : 36
mKate  : 36


In [19]:
df_true.to_csv('RAM-REDO_FINAL.csv')