# RAM-8,9 - COLOCALIZED DATA ONLY
#### Jonathan Ramos 2/7/2024

There were some new images from RAM-9 added to the existing set I previously analzyed. Since the immunohistochemistry was done in the same batch, I can parse out these new data, add them to the existing *raw* set, and then normalize/count cells as usual. 

We have the following file naming scheme:
- _1.tif : cFos
- _2.tif : 8-oxo-DG --> EGFP
- _3.tif : mKate2

In [79]:
import numpy as np
import pandas as pd
import glob
import sys

# loading some functions we wrote before
sys.path.append("/Users/jonathanramos/Desktop/LRI/Image ROI Data Wrangling/")
from clean import *
from norm import *
from count import *


# Cleaning, Wrangling Data
## loading data, stitching sets together
### loading our new images to add to the existing set

In [80]:
# loading the NEW data sets
df_new = [pd.read_csv(f) for f in glob.glob('NEW colocalized data/*.csv')]

# some preprocessing steps
df_new_coloc = preprocessing(df_new)

# separating out colocalization types (double/triple) by the number of cols
print(np.unique([len(df.columns) for df in df_new_coloc]))
double_new = [df for df in df_new_coloc if len(df.columns) == 30]
triple_new = [df for df in df_new_coloc if len(df.columns) == 31]

# data is distributed across hundreds of small csvs; let's take a look at one of them
double_new[0]

[30 31]


Unnamed: 0,stain,colocw/hand-drawn,colocw/hand-drawn.1,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,feret_angle,feret_min,circularity,aspect_ratio,roundness,solidity,skewness,kurtosis,filename,analysis_date
0,c-Fos hand-drawn,,0-01c-00032,0-FFF-00026,188.10,192.08,46.0,42.575,44.7391,44.5000,...,0.0,8.0,0.7186,1.3018,0.5521,0.6970,0.3619,-0.8104,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
1,c-Fos hand-drawn,,0-01c-00014,0-FFF-00020,164.63,332.95,57.0,42.575,45.6667,44.0000,...,0.0,10.0,0.5973,1.1692,0.5078,0.5816,0.5207,-0.2471,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
2,c-Fos hand-drawn,,0-01c-00010,0-FFF-00018,370.16,263.66,62.0,42.575,58.0968,58.0000,...,0.0,10.0,0.7894,1.0000,0.7750,0.7381,0.2875,-0.4652,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
3,c-Fos hand-drawn,,0-01c-00034,0-FFF-00016,184.01,213.87,69.0,42.575,45.2029,44.0000,...,0.0,10.0,0.7231,1.1692,0.6147,0.7041,0.8434,-0.0402,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
4,c-Fos hand-drawn,,0-01c-00015,0-FFF-00014,127.24,206.22,74.0,42.575,54.5405,55.0000,...,0.0,10.0,0.7755,1.1692,0.6593,0.7551,0.2372,-0.7736,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,Microglia Cortex,-,-,0-01c-00000,322.15,265.55,66.0,303.667,699.8005,757.3977,...,0.0,8.0,0.8237,1.5163,0.5441,0.8684,-0.7040,-0.4629,RAM-9_4B_D_2.tif,Fri Feb 09 11:53:59 PST 2024
224,Microglia Cortex hand-drawn,0-005-00007,,0-FFF-00119,418.27,40.88,64.0,303.667,503.8918,473.0583,...,0.0,8.0,0.9998,1.3018,0.7682,0.9697,1.5840,4.1261,RAM-9_4B_D_2.tif,Fri Feb 09 11:53:59 PST 2024
225,Microglia Cortex hand-drawn,0-005-00004,,0-FFF-00098,219.27,92.01,106.0,303.667,416.1598,414.9389,...,0.0,12.0,0.9372,1.0000,0.9464,0.9464,0.3297,-0.5688,RAM-9_4B_D_2.tif,Fri Feb 09 11:53:59 PST 2024
226,Microglia Cortex hand-drawn,0-005-00001,,0-FFF-00137,490.92,196.27,136.0,303.667,341.8993,345.1563,...,90.0,12.0,0.8745,1.3192,0.6966,0.9067,0.1549,-0.7200,RAM-9_4B_D_2.tif,Fri Feb 09 11:53:59 PST 2024


### Pulling out coloc data

In [81]:
# we don't wont rows where all coloc cols are '-' 
# (indicates this cell was not colocalized with any of the other given stain types)
def drop_dash(df, coloc_type):
    '''
    takes a dataframe and drops all rows where all coloc rows
    are '-'. 
    args:
        pd.core.frame.DataFrame(N,M), N: the number of rows, M: the number of cols;
            coloc cols must contain 'coloc' in col name.
    '''
    if not coloc_type in {'double', 'triple', 'quad'}:
        raise ValueError('coloc_type must be either "double", "triple", or "quad"')
    
    cols = df.columns
    n_coloc_cols = np.array(['coloc' in f for f in list(cols)]).sum()

    # check that cols 1 and 2 are coloc cols
    assert 'coloc' in cols[1]
    assert 'coloc' in cols[2]
    if coloc_type == 'double':
        # check that the number of coloc cols is exactly 2
        assert n_coloc_cols == 2

        df_dropped = df.replace('-', np.NaN)\
            .dropna(subset=[cols[1], cols[2]], how='all')
        
    if coloc_type == 'triple':
        # check that col 3 is coloc col
        assert 'coloc' in cols[3]

        # check that there are exactly 3 coloc cols
        assert n_coloc_cols == 3

        df_dropped = df.replace('-', np.NaN)\
            .dropna(subset=[cols[1], cols[2], cols[3]], how='all')    
    
    if coloc_type == 'quad':
        # check that cols 3 and 4 are coloc cols
        assert 'coloc' in cols[3]
        assert 'coloc' in cols[4]

        # check that there are exactly 4 coloc cols
        assert n_coloc_cols == 4

        df_dropped = df.replace('-', np.NaN)\
            .dropna(subset=[cols[1], cols[2], cols[3], cols[4]], how='all')
    
    return df_dropped.reset_index()

double_new = [drop_dash(df, coloc_type='double') for df in double_new]
triple_new = [drop_dash(df, coloc_type='triple') for df in triple_new]

# let's check 
double_new[0]

Unnamed: 0,index,stain,colocw/hand-drawn,colocw/hand-drawn.1,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,feret_angle,feret_min,circularity,aspect_ratio,roundness,solidity,skewness,kurtosis,filename,analysis_date
0,0,c-Fos hand-drawn,,0-01c-00032,0-FFF-00026,188.1,192.08,46.0,42.575,44.7391,...,0.0,8.0,0.7186,1.3018,0.5521,0.697,0.3619,-0.8104,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
1,1,c-Fos hand-drawn,,0-01c-00014,0-FFF-00020,164.63,332.95,57.0,42.575,45.6667,...,0.0,10.0,0.5973,1.1692,0.5078,0.5816,0.5207,-0.2471,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
2,2,c-Fos hand-drawn,,0-01c-00010,0-FFF-00018,370.16,263.66,62.0,42.575,58.0968,...,0.0,10.0,0.7894,1.0,0.775,0.7381,0.2875,-0.4652,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
3,3,c-Fos hand-drawn,,0-01c-00034,0-FFF-00016,184.01,213.87,69.0,42.575,45.2029,...,0.0,10.0,0.7231,1.1692,0.6147,0.7041,0.8434,-0.0402,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
4,4,c-Fos hand-drawn,,0-01c-00015,0-FFF-00014,127.24,206.22,74.0,42.575,54.5405,...,0.0,10.0,0.7755,1.1692,0.6593,0.7551,0.2372,-0.7736,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
5,5,c-Fos hand-drawn,,0-01c-00001,0-FFF-00013,160.17,93.08,36.0,42.575,51.4722,...,0.0,6.0,0.6942,1.5492,0.4841,0.72,0.2186,-0.3965,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
6,6,c-Fos hand-drawn,,0-01c-00044,0-FFF-00012,347.42,141.23,51.0,42.575,52.1176,...,0.0,8.0,0.7967,1.3018,0.6121,0.7727,0.1103,-0.7636,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
7,7,c-Fos hand-drawn,,0-01c-00067,0-FFF-00009,340.27,60.06,70.0,42.575,58.3,...,0.0,10.0,0.8913,1.0,0.875,0.8333,0.1222,-1.1012,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
8,8,c-Fos,,0-FFF-00137,0-005-00001,491.49,198.76,78.0,42.575,69.1795,...,90.0,10.0,0.8174,1.1692,0.6949,0.7959,0.431,-0.3694,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024
9,9,c-Fos,,0-FFF-00098,0-005-00004,217.75,91.31,55.0,42.575,76.8182,...,90.0,8.0,0.8592,1.3018,0.6601,0.8333,0.2737,-0.7866,RAM-9_4B_D_1.tif,Fri Feb 09 11:53:59 PST 2024


## Building the necessary cols

In particular we will need a rat_n (sid) col, stain_type col, and a treatment col. the filename col functions as the image name (iid) col.

We need the following cols
- rat_n (sid)
- treatment
- filename (fid)
- imagename (iid)
- stain_type
- CoM_x
- CoM_y
- mean_intensity
- background

In [82]:
def get_ratn(df):
    '''
    takes a dataframe with a col called "filename" and builds a new col,
    "rat_n" by parsing filename labels
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must containg col called "filename"
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "rat_n 
    '''
    df['rat_n'] =   df.filename.apply(lambda x: '-'.join(x.split('_')[:2]))\
        .replace({' ': ''}, regex=True) # for some reason, we have more leading whitespace chars

    # some checks. we want be sure that the structure of all our rat_n labels is consistent
    # in particular, we expect something of the form 'RAM-9-4B', that is we have exactly
    # two dashes '-' separating some letters, followed by a two numbers (ending in alpha)
    assert df.rat_n.apply(lambda x: x.split('-')[0].isalpha()).sum() == len(df)
    assert df.rat_n.apply(lambda x: x.split('-')[1].isnumeric()).sum() == len(df)
    assert df.rat_n.apply(lambda x: x.split('-')[2].isalnum()).sum() == len(df)

    return df

def get_treatment(df, treatment):
    '''
    takes a dictionary of treatments (built from cohort key) and maps rat_ns to
    treatment. Builds new col called "treatment"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must containg col called "rat_n"
        treatment: dict, key:val pairs map rat_n (str) to treatment(str)
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "treatment"
    '''
    # creating new treatment col by mapping from cohort key dict
    df['treatment'] = df.rat_n.map(treatment)

    # check that all rat_ns were accounted for
    assert df.treatment.isna().sum() == 0

    return df 

def get_react(df, react):
    '''
    takes a dictionary of treatments (built from cohort key) and maps rat_ns to
    reactivation type. Builds new col called "react"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must containg col called "rat_n"
        treatment: dict, key:val pairs map rat_n (str) to react(str)
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "react"
    '''
    # creating new treatment col by mapping from cohort key dict
    df['react'] = df.rat_n.map(react)

    # check that all rat_ns were accounted for
    assert df.react.isna().sum() == 0

    return df 


def get_staintype(df, stains):
    '''
    takes a dictionary of treatments (built from cohort key) and maps filenames to
    stain types. Builds new col called "stain_type"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must containg col called "filename"
        treatment: dict, key:val pairs map rat_n (str) to treatment(str)
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "stain_type"
    '''
    # creating new stain_type col from filename
    df['stain_type'] = df.filename.replace(stains, regex=True)
    
    # check that all filenames were accounted for
    assert df.stain_type.isna().sum() == 0

    return df 

def get_imagename(df):
    '''
    takes a dataframe with a col called "filename" and builds a new col,
    "image_name" by parsing filename labels
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must containg col called "filename"
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "image_name 
    '''
    df['image_name'] = df.filename.replace({'_[0-9]\.tif': ''}, regex=True)

    return df

def get_sex(df, sex):
    '''
    takes a dictionary of sexes (built from cohort key) and maps rat_ns to
    sex (either "M" or "F"). Builds new col called "sex"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must containg col called "rat_n"
        treatment: dict, key:val pairs map rat_n (str) to treatment(str)
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "sex"
    '''
    df['sex'] = df.rat_n.replace(sex, regex=True)
    
    # check that all filenames were accounted for
    assert df.sex.isna().sum() == 0

    return df 


def col_wrapper(df, treatment, react, stains, sex):
    '''
    wrapper function for pipeline to build cols
    '''

    df_ratn = get_ratn(df)
    df_treatment = get_treatment(df_ratn, treatment)
    df_react = get_react(df_treatment, react)
    df_staintype = get_staintype(df_react, stains)
    df_imgname = get_imagename(df_staintype)
    df_sex = get_sex(df_imgname, sex)

    return df_sex

### building coloc stain type col from single stain type col

In [83]:
def coloc_staintype(df, coloc, max='quad'):
    '''
    '''
    stains = df.stain_type.unique()
    set_stains = set(stains)

    # for any n number stains, we need any given stain and its complement (all others except that one)
    stain_combinations = [(stain, tuple(sorted(list(set_stains.difference({stain}))))) for stain in stains]

    # construct string, build dict
    # double label
    if coloc == 'double':
        # check that we have exactly two stains
        if len(stains) == 2:
            # build dictionary for replace
            coloc_comb = dict([(stain, f'{stain}_coloc_w_{comb[0]}') for stain, comb in stain_combinations])
        
        elif len(stains) < 2:
            coloc_comb = dict([(stain, f'lonely_{stain}') for stain in stains])
                

    # triple label    
    if coloc == 'triple':
        # check that we have exactly three stains
        if len(stains) == 3:
            # build dictionary for replace
            # if we have quad label, we must denote which group of 3 are triple-labeled
            if max == 'quad':
                coloc_comb = dict([(stain, f'{stain}_coloc_w_{comb[0]},{comb[1]}') for stain, comb in stain_combinations])

            # if we have only triple label, we can just call it triple
            elif max == 'triple':
                coloc_comb = dict([(stain, f'triple_{stain}') for stain, comb in stain_combinations])

        elif len(stains) < 3:
            coloc_comb = dict([(stain, f'lonely_{stain}') for stain in stains])

    # quad    
    if coloc == 'quad':
        # check that we have exatly four stains
        if len(stains) == 4:
            # build dictionary for replace
            coloc_comb = dict([(stain, f'quad_{stain}') for stain in stains])

        elif len(stains) < 4:
            coloc_comb = dict([(stain, f'lonely_{stain}') for stain in stains])

    # toss coloc stain type strings into col
    df['coloc_stain_type'] = df.stain_type.replace(coloc_comb)

    return df

In [84]:
treatment = {
    'RAM-9-4B' : 'OFF_DOX',
    'RAM-9-8B' : 'OFF_DOX',
    'RAM-9-13B' : 'OFF_DOX',
    'RAM-9-10B' : 'OFF_DOX'
}

react = {
    'RAM-9-4B' : 'NR',
    'RAM-9-8B' : 'FR1',
    'RAM-9-13B' : 'NR',
    'RAM-9-10B' : 'NR'  
}

stains = {
    '.*_1.tif$' : 'cFos',
    '.*_2.tif$' : 'EGFP',
    '.*_3.tif$' : 'mKate2'
}

sex = {
    'RAM-9-4B' : 'M',
    'RAM-9-8B' : 'F',
    'RAM-9-13B' : 'F',
    'RAM-9-10B' : 'F'
}

# let's run it
double_newcols = [coloc_staintype(col_wrapper(df, treatment, react, stains, sex), coloc='double', max='triple') for df in double_new]
triple_newcols = [coloc_staintype(col_wrapper(df, treatment, react, stains, sex), coloc='triple', max='triple') for df in triple_new]

# spot check a df with new cols
triple_newcols[0].head()

Unnamed: 0,index,stain,colocw/hand-drawn,colocw/hand-drawn.1,colocw/8-oxo-dG,roi_id,CoM_x,CoM_y,pixel_area,background,...,kurtosis,filename,analysis_date,rat_n,treatment,react,stain_type,image_name,sex,coloc_stain_type
0,0,c-Fos hand-drawn,,0-FFF-00321,0-002-00042,0-FFF-00036,392.42,343.92,37.0,146.6419,...,-1.4305,RAM-9_13B_A_1.tif,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos
1,1,c-Fos hand-drawn,,0-FFF-00288,0-002-00013,0-FFF-00033,155.96,269.51,39.0,146.6419,...,-0.5645,RAM-9_13B_A_1.tif,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos
2,2,c-Fos hand-drawn,,0-01c-00136,0-002-00019,0-FFF-00029,49.53,249.3,44.0,146.6419,...,0.2419,RAM-9_13B_A_1.tif,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos
3,3,c-Fos hand-drawn,,0-01c-00151,0-002-00028,0-FFF-00019,224.93,188.82,44.0,146.6419,...,0.0359,RAM-9_13B_A_1.tif,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos
4,4,c-Fos hand-drawn,,0-FFF-00250,0-002-00018,0-FFF-00015,327.15,174.65,66.0,146.6419,...,2.3823,RAM-9_13B_A_1.tif,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos


## Retaining only full coloc cell groupings
The SNR threshold may have caused some colocalized cells to become "incomplete." That is, say we had a PV cell that was colocalized to a c-Fos cell, resulting in two entries in our double labeled set: a PV_coloc_w/c-Fos cell, and a c-Fos_coloc_w/_PV cell. It may be the case that this particular PV cell was removed by the SNR threshold, but the c-Fos cell was not. If the PV cell was removed, we would still have that left over c-Fos_coloc_w/_PV cell in our set. 

The kernel of the issue is that we set a threshold based on one stain type, but we want to remove a cell of a different stain type; that is, we want to maintain full coloc groupings among our remaining colocalized cells.

My approach here is to create groupings (sets) of coloc cells roi ids and remove cells who do not meat the following criteria:
- each set must contain exactly the number of roi ids required to create the coloc stain type combinations (2 for double, 3 for triple, and 4 for quad)
- within each image, each unique set must appear the appropriate number of times (2 for double, 3 for triple, and 4 for quad)

However, before creating groupings the following issues must be addressed:
- colocw/ titles are not consistent or accurately named
- roi id strings are not unique across stain types or across images

NOTE because not only are roi id strings reused across stain types, they are reset for each unique image, we must carry out this process on an image by image basis. We may only concat our distributed dfs after subset selection and data replacement (when cols are finally standardized across images and stain type combinations).

In [85]:
def rename_colocw_cols(df):
    '''
    '''
    coloc_cols = [col for col in df.columns if 'colocw/' in col]
    coloc_rename = []
    for col in coloc_cols:
        # for a given colocw/ col, get unique (single) stain_type (there should be exactly 1)
        single_stain_type = df[df[col].isna()].stain_type.unique()
        assert len(single_stain_type) == 1

        # append in (key, val) format
        coloc_rename.append((col, f'coloc_w/_{single_stain_type.item()}'))

    d_coloc_rename = dict(coloc_rename)
    df = df.rename(columns=d_coloc_rename)

    return df

def rename_roi_ids(df):
    '''
    '''
    coloc_cols = [col for col in df.columns if 'coloc_w/' in col]
    for col in coloc_cols:        
        stain = col.split('_')[-1]
        # add stain string to end of roi id if not nan
        df[col] = df[col].apply(lambda x: x+f'_{stain}' if pd.notnull(x) else x)

    df['roi_id'] = df.apply(lambda x: x.roi_id + '_' + x.stain_type, axis=1)

    return df

def count_coloc_groupings(df):
    '''
    '''
    roi_id_cols = [col for col in df.columns if 'coloc_w/_' in col] + ['roi_id']
    groups = df.apply(lambda x: tuple(sorted([y for y in list(x[roi_id_cols]) if pd.notnull(y)])), axis=1)
    df['coloc_roi_id_grouping'] = groups
    groups, counts = np.unique(df.coloc_roi_id_grouping, return_counts=True)
    d_group_counts = dict(zip(groups, counts))

    df['roi_id_grouping_counts'] = df.coloc_roi_id_grouping.map(d_group_counts)

    return df

def paired(df, coloc_type):
    '''
    '''
    # remove unnecesary index col
    if 'index' in df.columns:
        df = df.drop('index', axis=1)

    # enforce coloc_type
    if not coloc_type in {'double', 'triple', 'quad'}:
        raise ValueError('coloc_type must be either "double", "triple", or "quad"')
    
    cols = df.columns
    n_coloc_cols = np.array(['colocw/' in f for f in list(cols)]).sum()

    if coloc_type == 'double':
        # check that the number of coloc cols is exactly 2, and 
        # that there are exactly 2 unique (single) stain_types in df
        assert n_coloc_cols == 2
        assert len(df.stain_type.unique()) == 2

        # rename coloc_w/ cols; rename roi_id strings
        df_rename = rename_roi_ids(rename_colocw_cols(df))

        # create roi id coloc groupings, count groupings
        df_grouped = count_coloc_groupings(df_rename)

        # only return rows that were fully paired; for double we expect exactly 2
        df_paired = df_grouped[df_grouped.roi_id_grouping_counts == 2]

    if coloc_type == 'triple':
        # check that there are exactly 3 coloc cols, and that
        # there are exactly 3 unique (single) stain_types in df
        assert n_coloc_cols == 3
        assert len(df.stain_type.unique()) == 3

        # rename coloc_w/ cols; rename roi_id strings
        df_rename = rename_roi_ids(rename_colocw_cols(df))

        # create roi id coloc groupings, count groupings
        df_grouped = count_coloc_groupings(df_rename)

        # only return rows that were fully paired; for triple we expect exactly 3
        df_paired = df_grouped[df_grouped.roi_id_grouping_counts == 3]
    
    if coloc_type == 'quad':
        # check that there are exactly 4 coloc cols, and that
        # there are exactly 3 unique (single) stain_types in df
        assert n_coloc_cols == 4
        assert len(df.stain_type.unique()) == 4

        # rename coloc_w/ cols; rename roi_id strings
        df_rename = rename_roi_ids(rename_colocw_cols(df))

        # create roi id coloc groupings, count groupings
        df_grouped = count_coloc_groupings(df_rename)

        # only return rows that were fully paired; for quad we expect exactly 4
        df_paired = df_grouped[df_grouped.roi_id_grouping_counts == 4]
    
    return df_paired

# let's run it
double_paired = [paired(df, coloc_type='double') for df in double_newcols if len(df.stain_type.unique()) >= 2]
triple_paired = [paired(df, coloc_type='triple') for df in triple_newcols if len(df.stain_type.unique()) >= 3]

print(triple_paired[0].shape)
triple_paired[0].head()

(33, 40)


Unnamed: 0,stain,coloc_w/_cFos,coloc_w/_EGFP,coloc_w/_mKate2,roi_id,CoM_x,CoM_y,pixel_area,background,mean_intensity,...,analysis_date,rat_n,treatment,react,stain_type,image_name,sex,coloc_stain_type,coloc_roi_id_grouping,roi_id_grouping_counts
0,c-Fos hand-drawn,,0-FFF-00321_EGFP,0-002-00042_mKate2,0-FFF-00036_cFos,392.42,343.92,37.0,146.6419,302.1833,...,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos,"(0-002-00042_mKate2, 0-FFF-00036_cFos, 0-FFF-0...",3
1,c-Fos hand-drawn,,0-FFF-00288_EGFP,0-002-00013_mKate2,0-FFF-00033_cFos,155.96,269.51,39.0,146.6419,154.0976,...,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos,"(0-002-00013_mKate2, 0-FFF-00033_cFos, 0-FFF-0...",3
2,c-Fos hand-drawn,,0-01c-00136_EGFP,0-002-00019_mKate2,0-FFF-00029_cFos,49.53,249.3,44.0,146.6419,154.4059,...,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos,"(0-002-00019_mKate2, 0-01c-00136_EGFP, 0-FFF-0...",3
3,c-Fos hand-drawn,,0-01c-00151_EGFP,0-002-00028_mKate2,0-FFF-00019_cFos,224.93,188.82,44.0,146.6419,174.4724,...,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos,"(0-002-00028_mKate2, 0-01c-00151_EGFP, 0-FFF-0...",3
4,c-Fos hand-drawn,,0-FFF-00250_EGFP,0-002-00018_mKate2,0-FFF-00015_cFos,327.15,174.65,66.0,146.6419,209.0991,...,Fri Feb 09 11:46:15 PST 2024,RAM-9-13B,OFF_DOX,NR,cFos,RAM-9_13B_A,F,triple_cFos,"(0-002-00018_mKate2, 0-FFF-00015_cFos, 0-FFF-0...",3


## Integrate old data
### New data subset selection

In [86]:
# taking out only the col subset we need
cols = ['rat_n', 'sex', 'treatment', 'react', 'stain_type', 'coloc_stain_type', 'filename', 'image_name', 'CoM_x', 'CoM_y', 'mean_intensity', 'background']

# rexp
df_double_subset = pd.concat([df[cols] for df in double_paired])
df_triple_subset = pd.concat([df[cols] for df in triple_paired])

df_double_subset

Unnamed: 0,rat_n,sex,treatment,react,stain_type,coloc_stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background
0,RAM-9-4B,M,OFF_DOX,NR,cFos,cFos_coloc_w_EGFP,RAM-9_4B_D_1.tif,RAM-9_4B_D,188.10,192.08,44.7391,42.575
1,RAM-9-4B,M,OFF_DOX,NR,cFos,cFos_coloc_w_EGFP,RAM-9_4B_D_1.tif,RAM-9_4B_D,164.63,332.95,45.6667,42.575
2,RAM-9-4B,M,OFF_DOX,NR,cFos,cFos_coloc_w_EGFP,RAM-9_4B_D_1.tif,RAM-9_4B_D,370.16,263.66,58.0968,42.575
3,RAM-9-4B,M,OFF_DOX,NR,cFos,cFos_coloc_w_EGFP,RAM-9_4B_D_1.tif,RAM-9_4B_D,184.01,213.87,45.2029,42.575
4,RAM-9-4B,M,OFF_DOX,NR,cFos,cFos_coloc_w_EGFP,RAM-9_4B_D_1.tif,RAM-9_4B_D,127.24,206.22,54.5405,42.575
...,...,...,...,...,...,...,...,...,...,...,...,...
21,RAM-9-10B,F,OFF_DOX,NR,EGFP,EGFP_coloc_w_cFos,RAM-9_10B_A_2.tif,RAM-9_10B_A,155.73,138.18,505.4767,458.422
22,RAM-9-10B,F,OFF_DOX,NR,EGFP,EGFP_coloc_w_cFos,RAM-9_10B_A_2.tif,RAM-9_10B_A,169.76,61.95,874.9739,458.422
23,RAM-9-10B,F,OFF_DOX,NR,EGFP,EGFP_coloc_w_cFos,RAM-9_10B_A_2.tif,RAM-9_10B_A,500.28,52.13,1344.3059,458.422
24,RAM-9-10B,F,OFF_DOX,NR,EGFP,EGFP_coloc_w_cFos,RAM-9_10B_A_2.tif,RAM-9_10B_A,373.88,390.09,533.9654,458.422


### load old data

In [87]:
# loading the OLD data set
df_old = [pd.read_csv(f) for f in sorted(glob.glob('OLD data/*.csv'))]

# upon inspecting the cols, this is the closest I could get to match the format
# of the new data. I still need to build image_name cols and do a lot of renaming
print('\nRAM-8:', df_old[0].columns)
df_RAM8 = df_old[0][['Rat_n', 'Sex', 'Dox', 'React', 'Stain', 'FileName',\
                      'XM', 'YM', 'Mean', 'Background']]\
    .reset_index().drop('index', axis=1)

print('\nRAM-9:', df_old[1].columns)
df_RAM9 = df_old[1][['Rat_n', 'Sex', 'Dox', 'React', 'Stain', 'FileName',\
                      'XM', 'YM', 'Mean', 'Background']]\
    .reset_index().drop('index', axis=1)

new_cols = {
    'Rat_n': 'rat_n',
    'Sex': 'sex',
    'Dox': 'treatment',
    'React': 'react',
    'Stain': 'stain_type',
    'FileName': 'filename',
    'XM': 'CoM_x',
    'YM': 'CoM_y',
    'Mean': 'mean_intensity',
    'Background': 'background'
}

# relabeling some things for consistency
# cleaning up RAM8
df_RAM8['Rat_n'] = df_RAM8.Rat_n.replace({'rat': 'RAM-8-'}, regex=True)
df_RAM8['FileName'] = df_RAM8.FileName.apply(lambda x: x+'.tif' if not '.tif' in x else x)
df_RAM8['FileName'] = df_RAM8.apply(lambda x: '_'.join([x.Rat_n, x.FileName]), axis=1)
df_RAM8 = df_RAM8.rename(columns = new_cols)

# cleaning up RAM9
df_RAM9['Rat_n'] = df_RAM9.Rat_n.replace({'rat_': 'RAM-9-'}, regex=True)
df_RAM9['FileName'] = df_RAM9.FileName.apply(lambda x: x+'.tif' if not '.tif' in x else x)
df_RAM9 = df_RAM9.rename(columns = new_cols)

# check that we're all matching
assert set(df_RAM8.columns) == set(df_RAM9.columns)

# build new image_name col
df_RAM89 = get_imagename(pd.concat([df_RAM8, df_RAM9]))

# let's take a look!
print(df_RAM89.shape)
print(df_RAM89.stain_type.unique())
df_RAM89

  df_old = [pd.read_csv(f) for f in sorted(glob.glob('OLD data/*.csv'))]



RAM-8: Index(['Unnamed: 0', ' ', 'AR', 'AnalysisDate', 'Area', 'Background',
       'CellNumber', 'Circ.', 'Experiment', 'FileName', 'Ind_var_group',
       'Indi_var_group', 'Integrated_Density', 'Max', 'Mean',
       'Mean-Background', 'Min', 'Round', 'Solidity', 'Stain', 'StdDev',
       'Subject', 'XM', 'YM', 'directory', 'roiName', 'Rat_n', 'Coord', 'Sex',
       'Dox', 'React'],
      dtype='object')

RAM-9: Index(['Unnamed: 0', ' ', 'AR', 'AnalysisDate', 'Area', 'Background',
       'CellNumber', 'Circ.', 'Experiment', 'FileName', 'Ind_var_group',
       'Indi_var_group', 'Integrated_Density', 'Max', 'Mean',
       'Mean-Background', 'Min', 'Round', 'Solidity', 'Stain', 'StdDev',
       'Subject', 'XM', 'YM', 'directory', 'roiName', 'Rat_n', 'Treatment',
       'Coord', 'Sex', 'Dox', 'React'],
      dtype='object')
(30855, 11)
['single_c-Fos' 'single_EGFP' 'single_mKate2' 'c-Fos_coloc_with_EGFP'
 'EGFP_coloc_with_c-Fos' 'mKate2_coloc_with_EGFP'
 'c-Fos_coloc_with_mKate2' 'mKate

Unnamed: 0,rat_n,sex,treatment,react,stain_type,filename,CoM_x,CoM_y,mean_intensity,background,image_name
0,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,345.615,79.261,141.189,56.949,RAM-8-10C_3.4_PL_A
1,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,57.237,77.429,140.347,56.949,RAM-8-10C_3.4_PL_A
2,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,13.054,117.141,255.088,56.949,RAM-8-10C_3.4_PL_A
3,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,368.777,231.472,92.041,56.949,RAM-8-10C_3.4_PL_A
4,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,107.768,29.112,194.964,56.949,RAM-8-10C_3.4_PL_A
...,...,...,...,...,...,...,...,...,...,...,...
18180,RAM-9-7B,F,DOX_ON,FR1,mKate2,RAM-9_7B_D_3.tif,260.259,207.464,12.488,5.675,RAM-9_7B_D
18181,RAM-9-7B,F,DOX_ON,FR1,mKate2,RAM-9_7B_D_3.tif,193.557,195.296,12.334,5.675,RAM-9_7B_D
18182,RAM-9-7B,F,DOX_ON,FR1,mKate2_coloc_w/_GFP,RAM-9_7B_D_3.tif,347.029,84.184,37.996,5.675,RAM-9_7B_D
18183,RAM-9-7B,F,DOX_ON,FR1,mKate2_coloc_w/_GFP,RAM-9_7B_D_3.tif,260.259,207.464,12.488,5.675,RAM-9_7B_D


### Standardizing labels
All col names were standardized in the previous cells, however there are still some inconsitencies in the specific label strings between old vs new image data.

In particular:
- treatment labels do not match ("ON_DOX" vs "DOX_ON")
- stain_type labels are inconsistent
    - "EGFP" vs "GFP"
    - "c-Fos" vs "cFos"
    - "with_" vs "w/_"
    - "Triple-labeled_" vs "triple"
    - "single_mKate2" vs "mKate2"
    - "single_c-Fox" vs "c-Fos" etc

In [88]:
# standardizing treatment labels
df_RAM89 = df_RAM89.replace({'DOX_OFF' : 'OFF_DOX', 'DOX_ON': 'ON_DOX'}, regex=True)
print(df_RAM89.treatment.unique())

# standardizing stain_type labels
corrected_staintypes = {
    'c-Fos': 'cFos',
    'single_': '',
    'GFP': 'EGFP',
    'Triple-labeled_': 'triple_',
    'with_': 'w/_'
}

df_RAM89['stain_type'] = df_RAM89.stain_type.replace(corrected_staintypes, regex=True)\
    .replace({'EEGFP': 'EGFP'}, regex=True)

df_RAM89['stain_type'] = df_RAM89.stain_type.replace({'/':''}, regex=True)

# let's take a look! we expect exactly 12 stain_type combinations 
# (3 types of single, 6 types of double, 3 types of triple)
sorted(df_RAM89.stain_type.unique())

['ON_DOX' 'OFF_DOX']


['EGFP',
 'EGFP_coloc_w_cFos',
 'EGFP_coloc_w_mKate2',
 'cFos',
 'cFos_coloc_w_EGFP',
 'cFos_coloc_w_mKate2',
 'mKate2',
 'mKate2_coloc_w_EGFP',
 'mKate2_coloc_w_cFos',
 'triple_EGFP',
 'triple_cFos',
 'triple_mKate2']

just standardizing stain_type vs coloc_stain_type cols

In [89]:
df_RAM89_coloc = df_RAM89.query('stain_type != "EGFP" and stain_type != "cFos" and stain_type != "mKate2"').copy(deep=True)
df_RAM89_coloc['coloc_stain_type'] = df_RAM89_coloc.stain_type
df_RAM89_coloc['stain_type'] = df_RAM89_coloc.stain_type.apply(lambda x: x.split('_')[0])

# reorder cols for concat
df_RAM89_coloc = df_RAM89_coloc[cols]

print(df_RAM89_coloc.shape)
df_RAM89_coloc.head()


(8455, 12)


Unnamed: 0,rat_n,sex,treatment,react,stain_type,coloc_stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background
556,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,155.84,26.785,141.465,56.949
557,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,206.78,79.099,168.818,56.949
558,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949
559,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,408.535,6.85,107.794,56.949
560,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949


### concat old and new sets

In [90]:
# check that we're all matching across new and old sets
df_new_subset = pd.concat([df_double_subset, df_triple_subset])
print(df_new_subset.shape)

assert set(df_RAM89_coloc.columns) == set(df_new_subset.columns)

# time to concat!
df_full = pd.concat([df_RAM89_coloc, df_new_subset])

# adding a new col called group to easily distinguish between RAM-8 and RAM-9
df_full['group'] = df_full.rat_n.apply(lambda x: '-'.join(x.split('-')[:2]))

# let's take a look!
print(df_full.shape)
df_full.head()


(2184, 12)
(10639, 13)


Unnamed: 0,rat_n,sex,treatment,react,stain_type,coloc_stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group
556,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,155.84,26.785,141.465,56.949,RAM-8
557,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,206.78,79.099,168.818,56.949,RAM-8
558,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949,RAM-8
559,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,408.535,6.85,107.794,56.949,RAM-8
560,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949,RAM-8


### One more thing: building a new treatment col
To make splitting by treatment a bit easier I'm going to build a new aggregate columns from the combination of columns required to each unique group

In [91]:
df_full = df_full.rename(columns= {'treatment': 'dox'})
df_full['treatment'] = df_full.apply(lambda x: '_'.join([x.dox, x.react]), axis=1)

df_full

Unnamed: 0,rat_n,sex,dox,react,stain_type,coloc_stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group,treatment
556,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,155.840,26.785,141.465,56.9490,RAM-8,ON_DOX_FR1
557,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,206.780,79.099,168.818,56.9490,RAM-8,ON_DOX_FR1
558,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.9490,RAM-8,ON_DOX_FR1
559,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,408.535,6.85,107.794,56.9490,RAM-8,ON_DOX_FR1
560,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.9490,RAM-8,ON_DOX_FR1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,RAM-9-8B,F,OFF_DOX,FR1,EGFP,triple_EGFP,RAM-9_8B_D_2.tif,RAM-9_8B_D,406.500,223.41,665.829,183.4801,RAM-9,OFF_DOX_FR1
2,RAM-9-8B,F,OFF_DOX,FR1,mKate2,triple_mKate2,RAM-9_8B_D_3.tif,RAM-9_8B_D,406.050,223.95,732.8926,609.2310,RAM-9,OFF_DOX_FR1
0,RAM-9-10B,F,OFF_DOX,NR,cFos,triple_cFos,RAM-9_10B_C_1.tif,RAM-9_10B_C,346.600,221.87,37.6667,38.3728,RAM-9,OFF_DOX_NR
1,RAM-9-10B,F,OFF_DOX,NR,EGFP,triple_EGFP,RAM-9_10B_C_2.tif,RAM-9_10B_C,346.000,222.71,1351.7299,381.0934,RAM-9,OFF_DOX_NR


## Droppings nans, duplicates

In [92]:
# which cols have nans, how many?
print('Nan per col:')
print(df_full.isna().sum())
# it looks like we have 8 nans in the mean_intensity cols. If there is no
# intensity data, there's not much we can do about that; time to drop those.
df_full = df_full.dropna()

# how many duplicated rows do we have?
print('\nTotal n of duplicated rows:')
print(df_full.duplicated().sum())

# looks like we've got 160 duplicated rows. Let's take a look
df_full[df_full.duplicated(keep=False)]

# those duplicates look real, I'm ok with getting rid of them
df_cleaned = df_full.drop_duplicates().reset_index().drop('index', axis=1)

Nan per col:
rat_n               0
sex                 0
dox                 0
react               0
stain_type          0
coloc_stain_type    0
filename            0
image_name          0
CoM_x               0
CoM_y               0
mean_intensity      0
background          0
group               0
treatment           0
dtype: int64

Total n of duplicated rows:
41


Check results once more

In [93]:
# which cols have nans, how many?
print('Nan per col:')
print(df_cleaned.isna().sum())

print('\nTotal n of duplicated rows:')
print(df_cleaned.duplicated().sum())

### looks good to me!

Nan per col:
rat_n               0
sex                 0
dox                 0
react               0
stain_type          0
coloc_stain_type    0
filename            0
image_name          0
CoM_x               0
CoM_y               0
mean_intensity      0
background          0
group               0
treatment           0
dtype: int64

Total n of duplicated rows:
0


# Computing mean - background

In [94]:
df_cleaned['mean-background'] = df_cleaned.mean_intensity.astype('f') - df_cleaned.background.astype('f')

# print out some descriptive statistics for intensity
for stain in df_cleaned.coloc_stain_type.unique():
    df_stain = df_cleaned.query(f'coloc_stain_type == "{stain}"')
    print(f'\n===== {stain} =====')
    print(df_stain['mean-background'].describe())

df_cleaned.to_csv('RAM-8,9_cleaned,rawdata_COLOC.csv')


===== cFos_coloc_w_EGFP =====
count    2117.000000
mean       86.026947
std        78.008156
min        -3.798309
25%        37.656006
50%        64.727005
75%       108.247009
max       658.367004
Name: mean-background, dtype: float64

===== EGFP_coloc_w_cFos =====
count    1778.000000
mean      345.894379
std       299.372253
min       -82.836487
25%       143.864006
50%       244.955505
75%       444.401756
max      1897.868042
Name: mean-background, dtype: float64

===== mKate2_coloc_w_EGFP =====
count    1991.000000
mean      246.570541
std       237.578461
min       -45.468506
25%        76.408001
50%       166.722000
75%       345.655853
max      1426.952026
Name: mean-background, dtype: float64

===== cFos_coloc_w_mKate2 =====
count    1043.000000
mean       94.521988
std        88.392181
min        -3.641296
25%        35.054501
50%        70.227997
75%       127.016499
max       545.901978
Name: mean-background, dtype: float64

===== mKate2_coloc_w_cFos =====
count    1043.0

# Normalizing Intensities, Counting Mean Cell Ns
## Normalize intensity, write to disk
for these data I think we're mostly concerned with percent of cells colocalized either as a percent of EGPF or a percent of c-fos/mkate2. I also know that I had previously divded up the data down a median split of the raw intensity and computed the percent of c-Fos that had mKate2 (please see older notebooks, circa 2022).

In [96]:
group = 'RAM-8,9'
for stain in df_cleaned.coloc_stain_type.unique():

    # split by stain
    df_stain = df_cleaned.query(f'coloc_stain_type == "{stain}"')

    # split by group, normalize RAM-8 and RAM-9 independently then concat
    df_RAM8 = df_stain.query(f'group == "RAM-8"')
    df_RAM9 = df_stain.query(f'group == "RAM-9"')

    if len(df_RAM8) > 0 and len(df_RAM9) > 0:
        # normalize to ON_DOX_FR1
        df_norm_RAM8 = normalize_intensity(df_RAM8, norm_condition='ON_DOX_FR1', col='mean-background')
        df_norm_RAM9 = normalize_intensity(df_RAM9, norm_condition='ON_DOX_FR1', col='mean-background')
        df_norm = pd.concat([df_norm_RAM8, df_norm_RAM9])

    elif len(df_RAM8) <= 0 and len(df_RAM9) > 0:
        print(f'{stain} not found in RAM8')
        df_norm = normalize_intensity(df_RAM9, norm_condition='ON_DOX_FR1', col='mean-background')

    elif len(df_RAM8) > 0 and len(df_RAM9) <= 0:
        print(f'{stain} not found in RAM9')
        df_norm = normalize_intensity(df_RAM8, norm_condition='ON_DOX_FR1', col='mean-background')

    else:
        print(f'{stain} not found in RAM8 or RAM9')
        continue
        
    df_norm.to_csv(f'{group}_{stain}_NORM.csv')

    # reorganize into cols for prism
    df_prism = prism_reorg(df_norm, col='norm_mean-background')
    df_prism.to_csv(f'{group}_{stain}_PRISM.csv')
    
# let's take a look at one of our final output dataframes, organized for entry into prism
print(stain)
df_prism

EGFP_coloc_w_mKate2 not found in RAM8
EGFP_coloc_w_mKate2


Unnamed: 0,OFF_DOX_FR1,OFF_DOX_NR,OFF_DOX_VR5,ON_DOX_FR1
0,0.306090,0.425651,0.872179,0.797914
1,0.569203,0.842586,0.919135,1.607490
2,1.171423,0.994478,1.876442,0.285391
3,0.505221,0.238469,3.060362,0.394978
4,0.694743,2.311794,1.091533,0.586360
...,...,...,...,...
812,,,0.797983,
813,,,1.596812,
814,,,1.216580,
815,,,0.225317,


In [97]:
df_cleaned.coloc_stain_type.unique()

array(['cFos_coloc_w_EGFP', 'EGFP_coloc_w_cFos', 'mKate2_coloc_w_EGFP',
       'cFos_coloc_w_mKate2', 'mKate2_coloc_w_cFos', 'triple_cFos',
       'triple_EGFP', 'triple_mKate2', 'EGFP_coloc_w_mKate2'],
      dtype=object)

## Count mean cell ns, write to disk

In [98]:
# count n of unique image names per subject
sid = 'rat_n'
iid = 'image_name'
cols = ['treatment', 'coloc_stain_type', sid, iid]
group = 'RAM-8,9'

# wrapper fn calls
for stain in df_cleaned.coloc_stain_type.unique():
    
    # split by stain type
    df_stain = df_cleaned[df_cleaned.coloc_stain_type == stain]

    # compute mean cell ns
    df_means = mean_cell_n(df_stain, df_cleaned, cols, sid, iid)

    # write to disk
    df_means.to_csv(f'{group}_{stain}_mean_cell_ns.csv')

# let's take a look at one of our final output dataframes
print(stain)
df_means

EGFP_coloc_w_mKate2


Unnamed: 0,rat_n,treatment,coloc_stain_type,cell_count_sums,image_n,mean_cell_n
0,RAM-9-11B,OFF_DOX_FR1,EGFP_coloc_w_mKate2,128,3,42.666667
1,RAM-9-3B,OFF_DOX_FR1,EGFP_coloc_w_mKate2,81,4,20.25
2,RAM-9-8B,OFF_DOX_FR1,EGFP_coloc_w_mKate2,31,4,7.75
3,RAM-9-10B,OFF_DOX_NR,EGFP_coloc_w_mKate2,45,3,15.0
4,RAM-9-13B,OFF_DOX_NR,EGFP_coloc_w_mKate2,252,6,42.0
5,RAM-9-4B,OFF_DOX_NR,EGFP_coloc_w_mKate2,229,6,38.166667
6,RAM-9-12B,OFF_DOX_VR5,EGFP_coloc_w_mKate2,219,4,54.75
7,RAM-9-14B,OFF_DOX_VR5,EGFP_coloc_w_mKate2,297,4,74.25
8,RAM-9-2B,OFF_DOX_VR5,EGFP_coloc_w_mKate2,153,4,38.25
9,RAM-9-5B,OFF_DOX_VR5,EGFP_coloc_w_mKate2,148,4,37.0


# Binning High vs Low cFos
cFos intensity will be split and binned by the median intensity across all groups. I will add a new col called "cfos_bin" containing the labels 'cfos_hi' or 'cfos_lo' denoting whether that cell was either above or below the median mean-background intensity respectively. 

In [99]:
df_cfos = df_cleaned.query(f'stain_type == "cFos"').reset_index().drop('index', axis=1)
cfos_median = df_cfos['mean-background'].median()
df_cfos['cfos_bin'] = df_cfos.apply(lambda x: 'cfos_hi' if x['mean-background'] >= cfos_median else 'cfos_lo', axis=1)

print(f'median cfos mean-background: {cfos_median}')
df_cfos.head()

median cfos mean-background: 65.31959533691406


Unnamed: 0,rat_n,sex,dox,react,stain_type,coloc_stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group,treatment,mean-background,cfos_bin
0,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,155.84,26.785,141.465,56.949,RAM-8,ON_DOX_FR1,84.515991,cfos_hi
1,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,206.78,79.099,168.818,56.949,RAM-8,ON_DOX_FR1,111.868988,cfos_hi
2,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949,RAM-8,ON_DOX_FR1,35.091999,cfos_lo
3,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,408.535,6.85,107.794,56.949,RAM-8,ON_DOX_FR1,50.844997,cfos_lo
4,RAM-8-10C,F,ON_DOX,FR1,cFos,cFos_coloc_w_EGFP,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949,RAM-8,ON_DOX_FR1,198.138992,cfos_hi


## Counting High/Low cFos

In [100]:
# count n of unique image names per subject
sid = 'rat_n'
iid = 'image_name'
cols = ['treatment', 'cfos_bin', sid, iid]
group = 'RAM-8,9'

# wrapper fn calls
for stain in df_cfos.cfos_bin.unique():
    
    # split by stain type
    df_stain = df_cfos[df_cfos.cfos_bin == stain]

    # compute mean cell ns
    df_means = mean_cell_n(df_stain, df_cfos, cols, sid, iid)

    # write to disk
    df_means.to_csv(f'{group}_{stain}_mean_cell_ns.csv')

# let's take a look at one of our final output dataframes
print(stain)
df_means

cfos_lo


Unnamed: 0,rat_n,treatment,cfos_bin,cell_count_sums,image_n,mean_cell_n
0,RAM-8-15C,OFF_DOX_FR1,cfos_lo,58,6,9.666667
1,RAM-8-4C,OFF_DOX_FR1,cfos_lo,94,3,31.333333
2,RAM-8-7C,OFF_DOX_FR1,cfos_lo,10,1,10.0
3,RAM-8-8C,OFF_DOX_FR1,cfos_lo,61,3,20.333333
4,RAM-9-11B,OFF_DOX_FR1,cfos_lo,42,3,14.0
5,RAM-9-3B,OFF_DOX_FR1,cfos_lo,85,4,21.25
6,RAM-9-8B,OFF_DOX_FR1,cfos_lo,34,4,8.5
7,RAM-8-1C,OFF_DOX_NR,cfos_lo,20,6,3.333333
8,RAM-9-10B,OFF_DOX_NR,cfos_lo,44,3,14.666667
9,RAM-9-13B,OFF_DOX_NR,cfos_lo,185,6,30.833333


In [135]:
# count n of unique image names per subject
sid = 'rat_n'
iid = 'image_name'
cols = ['treatment', 'coloc_stain_type', 'cfos_bin', sid, iid]
group = 'RAM-8,9'

# wrapper fn calls
for stain in df_cfos.cfos_bin.unique():
    
    # split by stain type
    target_stain = 'cFos_coloc_w_mKate2'
    df_bin = df_cfos[(df_cfos.cfos_bin == stain) & (df_cfos.coloc_stain_type == target_stain)]

    # compute mean cell ns
    df_means = mean_cell_n(df_bin, df_cfos, cols, sid, iid)

    # for rats who did not have any counts of given stain type, construct new null df and concat
    if set(df_means.rat_n) != set(df_cleaned.rat_n):
        diff = set(df_cleaned.rat_n.unique()).difference(set(df_means.rat_n.unique()))
        key = df_cleaned[['rat_n', 'treatment']].drop_duplicates()
        df_missing = key[key.rat_n.isin(diff)].reset_index().drop('index', axis=1)
        df_missing['coloc_stain_type'] = target_stain
        df_missing['cfos_bin'] = stain
        df_missing['cell_count_sums'] = 0
        df_missing['image_n'] = np.nan
        df_missing['mean_cell_n'] = 0

        df_means = pd.concat([df_means, df_missing]).sort_values(by=['treatment', 'rat_n'])

    # write to disk
    df_means.to_csv(f'{group}_{stain}_mean_cell_ns.csv')

# let's take a look at one of our final output dataframes
print(stain)
df_means

cfos_lo


Unnamed: 0,rat_n,treatment,coloc_stain_type,cfos_bin,cell_count_sums,image_n,mean_cell_n
0,RAM-8-15C,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,9,6.0,1.5
1,RAM-8-4C,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,32,3.0,10.666667
4,RAM-8-7C,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,0,,0.0
2,RAM-8-8C,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,15,3.0,5.0
3,RAM-9-11B,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,22,3.0,7.333333
4,RAM-9-3B,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,35,4.0,8.75
5,RAM-9-8B,OFF_DOX_FR1,cFos_coloc_w_mKate2,cfos_lo,2,4.0,0.5
6,RAM-8-1C,OFF_DOX_NR,cFos_coloc_w_mKate2,cfos_lo,3,6.0,0.5
7,RAM-9-10B,OFF_DOX_NR,cFos_coloc_w_mKate2,cfos_lo,6,3.0,2.0
8,RAM-9-13B,OFF_DOX_NR,cFos_coloc_w_mKate2,cfos_lo,43,6.0,7.166667


In [106]:
set(df_means.rat_n.unique())

{'RAM-8-13C',
 'RAM-8-15C',
 'RAM-8-1C',
 'RAM-8-3C',
 'RAM-8-4C',
 'RAM-8-8C',
 'RAM-9-10B',
 'RAM-9-11B',
 'RAM-9-12B',
 'RAM-9-13B',
 'RAM-9-14B',
 'RAM-9-1B',
 'RAM-9-2B',
 'RAM-9-3B',
 'RAM-9-4B',
 'RAM-9-5B',
 'RAM-9-7B',
 'RAM-9-8B'}

In [130]:
diff = set(df_cleaned.rat_n.unique()).difference(set(df_means.rat_n.unique()))
key = df_cleaned[['rat_n', 'treatment']].drop_duplicates()
missing = key[key.rat_n.isin(diff)]

In [119]:
df_cleaned.groupby(['rat_n', 'treatment']).value_counts().reset_index()

Unnamed: 0,rat_n,treatment,sex,dox,react,stain_type,coloc_stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group,mean-background,count
0,RAM-8-10C,ON_DOX_FR1,F,ON_DOX,FR1,EGFP,EGFP_coloc_w_cFos,RAM-8-10C_3.4_PL_A_3.tif,RAM-8-10C_3.4_PL_A,13.937,117.486,752.815,28.5070,RAM-8,724.307983,1
1,RAM-8-10C,ON_DOX_FR1,F,ON_DOX,FR1,EGFP,EGFP_coloc_w_cFos,RAM-8-10C_3.4_PL_A_3.tif,RAM-8-10C_3.4_PL_A,108.588,28.583,199.784,28.5070,RAM-8,171.276993,1
2,RAM-8-10C,ON_DOX_FR1,F,ON_DOX,FR1,EGFP,EGFP_coloc_w_cFos,RAM-8-10C_3.4_PL_A_3.tif,RAM-8-10C_3.4_PL_A,113.590,69.766,252.306,28.5070,RAM-8,223.798996,1
3,RAM-8-10C,ON_DOX_FR1,F,ON_DOX,FR1,EGFP,EGFP_coloc_w_cFos,RAM-8-10C_3.4_PL_A_3.tif,RAM-8-10C_3.4_PL_A,125.398,79.062,263.86,28.5070,RAM-8,235.352982,1
4,RAM-8-10C,ON_DOX_FR1,F,ON_DOX,FR1,EGFP,EGFP_coloc_w_cFos,RAM-8-10C_3.4_PL_A_3.tif,RAM-8-10C_3.4_PL_A,156.326,26.634,264.535,28.5070,RAM-8,236.028000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10593,RAM-9-8B,OFF_DOX_FR1,F,OFF_DOX,FR1,mKate2,mKate2_coloc_w_cFos,RAM-9_8B_A_3.tif,RAM-9_8B_A,66.930,439.46,844.679,832.1098,RAM-9,12.569214,1
10594,RAM-9-8B,OFF_DOX_FR1,F,OFF_DOX,FR1,mKate2,mKate2_coloc_w_cFos,RAM-9_8B_A_3.tif,RAM-9_8B_A,356.870,301.27,929.9109,832.1098,RAM-9,97.801086,1
10595,RAM-9-8B,OFF_DOX_FR1,F,OFF_DOX,FR1,mKate2,mKate2_coloc_w_cFos,RAM-9_8B_B_3.tif,RAM-9_8B_B,233.290,133.93,503.2779,469.3472,RAM-9,33.930695,1
10596,RAM-9-8B,OFF_DOX_FR1,F,OFF_DOX,FR1,mKate2,mKate2_coloc_w_cFos,RAM-9_8B_D_3.tif,RAM-9_8B_D,406.050,223.95,732.8926,609.2310,RAM-9,123.661560,1
