# RAM-8,9 - SINGLE LABELED DATA ONLY
#### Jonathan Ramos 2/7/2024

There were some new images from RAM-9 added to the existing set I previously analyzed. Since the immunohistochemistry was done in the same batch, I can parse out these new data, add them to the existing *raw* set, and then normalize/count cells as usual. 

We have the following file naming scheme:
- _1.tif : cFos
- _2.tif : 8-oxo-DG --> EGFP
- _3.tif : mKate2

In [1]:
import numpy as np
import pandas as pd
import glob
import sys

# loading some functions we wrote before
sys.path.append("/Users/jonathanramos/Desktop/LRI/Image ROI Data Wrangling/")
from clean import *
from norm import *
from count import *


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Cleaning, Wrangling Data
## loading data, stitching sets together

In [2]:
# read every csv found exactly one directory level below "NEW data/" into its
# own dataframe.
# NOTE(review): glob.glob() is not wrapped in sorted() here (the old-data cell
# does use sorted()), so the row order of the stacked frame follows whatever
# order glob returns the paths in.
df_new = [pd.read_csv(f) for f in glob.glob('NEW data/*/*.csv')]

# some quick cleaning: removing leading whitespaces, and dropping 
# rows with no intensity measurement 
# (preprocessing is presumably provided by one of the star imports above and
# is expected to hand back an iterable of dataframes for pd.concat -- TODO confirm)
df_new = pd.concat(preprocessing(df_new))

# quick look at the stacked result
print(df_new.shape)
df_new.head()


(6962, 30)


Unnamed: 0,cell_number,roi_id,roi_source,roi_type,CoM_x,CoM_y,pixel_area,background,mean_intensity,median_intensity,...,feret_angle,feret_min,circularity,aspect_ratio,roundness,solidity,skewness,kurtosis,filename,analysis_date
0,1,01c-00000,Microglia Cortex,OVAL,254.46,164.11,178.0,235.848,785.211,602.2727,...,90.0,14.0,0.8784,1.2795,0.6956,0.9175,0.4643,-1.277,RAM-9_4B_B_2.tif,Mon Jan 29 17:57:47 PST 2024
1,2,01c-00001,Microglia Cortex,OVAL,377.47,376.0,165.0,235.848,782.5741,806.5072,...,90.0,14.0,0.9316,1.1195,0.8374,0.9375,-0.0291,-0.9945,RAM-9_4B_B_2.tif,Mon Jan 29 17:57:47 PST 2024
2,3,01c-00002,Microglia Cortex,OVAL,289.3,162.01,90.0,235.848,555.8387,596.6047,...,90.0,10.0,0.9431,1.1692,0.8018,0.9184,-0.1012,-1.2252,RAM-9_4B_B_2.tif,Mon Jan 29 17:57:47 PST 2024
3,4,01c-00003,Microglia Cortex,OVAL,473.18,500.49,62.0,235.848,633.6916,642.472,...,90.0,8.0,0.9686,1.3018,0.7442,0.9394,-0.3937,-0.3676,RAM-9_4B_B_2.tif,Mon Jan 29 17:57:47 PST 2024
4,5,01c-00004,Microglia Cortex,OVAL,409.21,368.65,155.0,235.848,616.111,585.8311,...,0.0,14.0,1.0,1.0,0.9936,1.0197,0.1089,-1.08,RAM-9_4B_B_2.tif,Mon Jan 29 17:57:47 PST 2024


## Building the necessary cols
In particular we will need a rat_n (sid) col, stain_type col, and a treatment col. the filename col functions as the image name (iid) col.

We need the following cols
- rat_n (sid)
- treatment
- filename (fid)
- imagename (iid)
- stain_type
- CoM_x
- CoM_y
- mean_intensity
- background

In [3]:
def get_ratn(df):
    '''
    derives a subject label for every row from its "filename" entry and stores
    it in a new col called "rat_n" (e.g. "RAM-9_4B_B_2.tif" -> "RAM-9-4B")
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must contain a col called "filename"
    return:
        df: the same dataframe object that was passed in, now carrying the
            new col "rat_n"
    '''
    def _first_two_fields(fname):
        # keep only the first two underscore-delimited fields, joined by a dash
        fields = fname.split('_')
        return '-'.join(fields[:2])

    raw_labels = df.filename.apply(_first_two_fields)

    # some filenames still carry stray space chars; remove them from the label
    df['rat_n'] = raw_labels.replace({' ': ''}, regex=True)

    # sanity checks on the label structure. We expect something of the form
    # 'RAM-9-4B': dash-delimited fields where the first is purely alphabetic,
    # the second purely numeric and the third alphanumeric. Each check must
    # hold for every single row.
    n_rows = len(df)
    field_checks = (str.isalpha, str.isnumeric, str.isalnum)
    for position, predicate in enumerate(field_checks):
        passing = df.rat_n.apply(lambda label: predicate(label.split('-')[position]))
        assert passing.sum() == n_rows

    return df

def get_treatment(df, treatment):
    '''
    looks up the treatment of every row from its "rat_n" label and stores the
    result in a new col called "treatment"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must contain a col called "rat_n"
        treatment: dict, key:val pairs map rat_n (str) to treatment (str);
            built from the cohort key
    return:
        df: the same dataframe object that was passed in, now carrying the
            new col "treatment"
    '''
    df['treatment'] = df['rat_n'].map(treatment)

    # a rat_n that is missing from the dict comes out of the lookup as NaN;
    # every row must have resolved to a treatment
    unmapped = df['treatment'].isna()
    assert unmapped.sum() == 0

    return df

def get_react(df, react):
    '''
    looks up the reactivation type of every row from its "rat_n" label and
    stores the result in a new col called "react"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must contain a col called "rat_n"
        react: dict, key:val pairs map rat_n (str) to react (str);
            built from the cohort key
    return:
        df: the same dataframe object that was passed in, now carrying the
            new col "react"
    '''
    df['react'] = df['rat_n'].map(react)

    # a rat_n that is missing from the dict comes out of the lookup as NaN;
    # every row must have resolved to a reactivation type
    unmapped = df['react'].isna()
    assert unmapped.sum() == 0

    return df


def get_staintype(df, stains):
    '''
    takes a dictionary of filename patterns and maps filenames to stain types.
    Builds new col called "stain_type"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must contain a col called "filename"
        stains: dict, key:val pairs map a regex pattern (str) that matches the
            whole filename to the stain type label (str) it stands for
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "stain_type"
    raises:
        AssertionError: if any filename did not resolve to one of the labels
            given in stains
    '''
    # creating new stain_type col from filename
    df['stain_type'] = df.filename.replace(stains, regex=True)

    # check that all filenames were accounted for. A regex replace leaves a
    # filename that matches no pattern unchanged (it does NOT become NaN), so
    # the result has to be compared against the set of known labels; NaN is
    # not a known label either, so missing values are caught as well
    known_labels = set(stains.values())
    unresolved = sorted({
        str(fname)
        for fname, label in zip(df.filename, df.stain_type)
        if label not in known_labels
    })
    if unresolved:
        raise AssertionError(
            f'no stain_type found for {len(unresolved)} filename(s), e.g. {unresolved[:5]}'
        )

    return df

def get_imagename(df):
    '''
    takes a dataframe with a col called "filename" and builds a new col,
    "image_name" by stripping the channel suffix from the filename
    (e.g. "RAM-9_4B_B_2.tif" -> "RAM-9_4B_B")
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must contain a col called "filename"
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "image_name"
    '''
    # raw string: "\." is not a valid escape sequence in a normal string
    # literal; the pattern itself (underscore, one digit, literal ".tif") is
    # the same as before
    channel_suffix = r'_[0-9]\.tif'
    df['image_name'] = df.filename.replace({channel_suffix: ''}, regex=True)

    return df

def get_sex(df, sex):
    '''
    takes a dictionary of sexes (built from cohort key) and maps rat_ns to
    sex (either "M" or "F"). Builds new col called "sex"
    args:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            must contain a col called "rat_n"
        sex: dict, key:val pairs map rat_n (str, applied as a regex pattern)
            to sex (str)
    return:
        df: pd.core.frame.DataFrame(n:m), n: the number of rows, m: the number of cols;
            contains new col called "sex"
    raises:
        AssertionError: if any rat_n did not resolve to one of the labels
            given in sex
    '''
    df['sex'] = df.rat_n.replace(sex, regex=True)

    # check that all rat_ns were accounted for. A regex replace leaves a
    # rat_n that matches no key unchanged (it does NOT become NaN), so the
    # result has to be compared against the set of known labels; NaN is not
    # a known label either, so missing values are caught as well
    known_labels = set(sex.values())
    unresolved = sorted({
        str(ratn)
        for ratn, label in zip(df.rat_n, df.sex)
        if label not in known_labels
    })
    if unresolved:
        raise AssertionError(f'no sex label found for rat_n(s): {unresolved}')

    return df


def col_wrapper(df, treatment, react, stains, sex):
    '''
    runs the whole col-building pipeline on df, one step after the other:
    rat_n -> treatment -> react -> stain_type -> image_name -> sex
    args:
        df: pd.core.frame.DataFrame; must contain a col called "filename"
        treatment: dict, handed to get_treatment
        react: dict, handed to get_react
        stains: dict, handed to get_staintype
        sex: dict, handed to get_sex
    return:
        the dataframe returned by the last step (get_sex)
    '''
    # each step receives the output of the previous one; rat_n has to come
    # first because the treatment, react and sex lookups all read it
    df = get_ratn(df)
    df = get_treatment(df, treatment)
    df = get_react(df, react)
    df = get_staintype(df, stains)
    df = get_imagename(df)
    df = get_sex(df, sex)

    return df

In [4]:
# lookup tables for the new RAM-9 images, keyed by rat_n

# dox condition per rat
treatment = {
    'RAM-9-4B' : 'OFF_DOX',
    'RAM-9-8B' : 'OFF_DOX',
    'RAM-9-13B' : 'OFF_DOX',
    'RAM-9-10B' : 'OFF_DOX'
}

# reactivation type per rat
react = {
    'RAM-9-4B' : 'NR',
    'RAM-9-8B' : 'FR1',
    'RAM-9-13B' : 'NR',
    'RAM-9-10B' : 'NR'
}

# regex pattern on the whole filename -> stain type (see the file naming
# scheme at the top of the notebook). Raw strings with an escaped dot so that
# only a literal ".tif" extension matches, instead of "any char" + "tif"
stains = {
    r'.*_1\.tif$' : 'cFos',
    r'.*_2\.tif$' : 'EGFP',
    r'.*_3\.tif$' : 'mKate2'
}

# sex per rat
sex = {
    'RAM-9-4B' : 'M',
    'RAM-9-8B' : 'F',
    'RAM-9-13B' : 'F',
    'RAM-9-10B' : 'F'
}

# build rat_n, treatment, react, stain_type, image_name and sex cols
df_new_cols = col_wrapper(df_new, treatment, react, stains, sex)

## Integrate old data
### subset selection

In [5]:
# the cols needed downstream, in the order used for the rest of the notebook
cols = [
    'rat_n',
    'sex',
    'treatment',
    'react',
    'stain_type',
    'filename',
    'image_name',
    'CoM_x',
    'CoM_y',
    'mean_intensity',
    'background',
]

# keep only those cols of the new data
df_new_subset = df_new_cols[cols]

print(df_new_subset.shape)
df_new_subset.head()

(6962, 11)


Unnamed: 0,rat_n,sex,treatment,react,stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background
0,RAM-9-4B,M,OFF_DOX,NR,EGFP,RAM-9_4B_B_2.tif,RAM-9_4B_B,254.46,164.11,785.211,235.848
1,RAM-9-4B,M,OFF_DOX,NR,EGFP,RAM-9_4B_B_2.tif,RAM-9_4B_B,377.47,376.0,782.5741,235.848
2,RAM-9-4B,M,OFF_DOX,NR,EGFP,RAM-9_4B_B_2.tif,RAM-9_4B_B,289.3,162.01,555.8387,235.848
3,RAM-9-4B,M,OFF_DOX,NR,EGFP,RAM-9_4B_B_2.tif,RAM-9_4B_B,473.18,500.49,633.6916,235.848
4,RAM-9-4B,M,OFF_DOX,NR,EGFP,RAM-9_4B_B_2.tif,RAM-9_4B_B,409.21,368.65,616.111,235.848


### load old data, match cols for concat

In [6]:
# load every csv directly inside "OLD data/"; sorted() fixes the load order so
# that the positional indexing below (df_old[0], df_old[1]) is stable.
# NOTE(review): this assumes the sorted order puts the RAM-8 file first and
# the RAM-9 file second -- confirm against the folder contents.
df_old = [pd.read_csv(f) for f in sorted(glob.glob('OLD data/*.csv'))]

# upon inspecting the cols, this is the closest I could get to match the format
# of the new data. I still need to build image_name cols and do a lot of renaming
print('\nRAM-8:', df_old[0].columns)
# keep only the ten cols that have a counterpart in the new data, and replace
# the index with a fresh 0..n-1 one
df_RAM8 = df_old[0][['Rat_n', 'Sex', 'Dox', 'React', 'Stain', 'FileName',\
                      'XM', 'YM', 'Mean', 'Background']]\
    .reset_index().drop('index', axis=1)

print('\nRAM-9:', df_old[1].columns)
# same col selection for the second file
df_RAM9 = df_old[1][['Rat_n', 'Sex', 'Dox', 'React', 'Stain', 'FileName',\
                      'XM', 'YM', 'Mean', 'Background']]\
    .reset_index().drop('index', axis=1)

# old col name -> new col name (the names used by df_new_subset)
new_cols = {
    'Rat_n': 'rat_n',
    'Sex': 'sex',
    'Dox': 'treatment',
    'React': 'react',
    'Stain': 'stain_type',
    'FileName': 'filename',
    'XM': 'CoM_x',
    'YM': 'CoM_y',
    'Mean': 'mean_intensity',
    'Background': 'background'
}

# relabeling some things for consistency
# cleaning up RAM8
# every occurrence of "rat" inside Rat_n becomes "RAM-8-"
df_RAM8['Rat_n'] = df_RAM8.Rat_n.replace({'rat': 'RAM-8-'}, regex=True)
# make sure every filename carries a ".tif" extension
df_RAM8['FileName'] = df_RAM8.FileName.apply(lambda x: x+'.tif' if not '.tif' in x else x)
# RAM-8 filenames get the (already relabeled) Rat_n prepended, joined by "_"
df_RAM8['FileName'] = df_RAM8.apply(lambda x: '_'.join([x.Rat_n, x.FileName]), axis=1)
df_RAM8 = df_RAM8.rename(columns = new_cols)

# cleaning up RAM9
# every occurrence of "rat_" inside Rat_n becomes "RAM-9-"
df_RAM9['Rat_n'] = df_RAM9.Rat_n.replace({'rat_': 'RAM-9-'}, regex=True)
# make sure every filename carries a ".tif" extension; unlike RAM-8, the
# Rat_n is not prepended to the RAM-9 filenames
df_RAM9['FileName'] = df_RAM9.FileName.apply(lambda x: x+'.tif' if not '.tif' in x else x)
df_RAM9 = df_RAM9.rename(columns = new_cols)

# check that we're all matching
assert set(df_RAM8.columns) == set(df_RAM9.columns)

# build new image_name col
# (RAM-8 rows are stacked first, RAM-9 rows second; the index is not reset)
df_RAM89 = get_imagename(pd.concat([df_RAM8, df_RAM9]))

# reorder to match df_new_subset
df_RAM89 = df_RAM89[cols]

# let's take a look!
print(df_RAM89.shape)
df_RAM89.head()

  df_old = [pd.read_csv(f) for f in sorted(glob.glob('OLD data/*.csv'))]



RAM-8: Index(['Unnamed: 0', ' ', 'AR', 'AnalysisDate', 'Area', 'Background',
       'CellNumber', 'Circ.', 'Experiment', 'FileName', 'Ind_var_group',
       'Indi_var_group', 'Integrated_Density', 'Max', 'Mean',
       'Mean-Background', 'Min', 'Round', 'Solidity', 'Stain', 'StdDev',
       'Subject', 'XM', 'YM', 'directory', 'roiName', 'Rat_n', 'Coord', 'Sex',
       'Dox', 'React'],
      dtype='object')

RAM-9: Index(['Unnamed: 0', ' ', 'AR', 'AnalysisDate', 'Area', 'Background',
       'CellNumber', 'Circ.', 'Experiment', 'FileName', 'Ind_var_group',
       'Indi_var_group', 'Integrated_Density', 'Max', 'Mean',
       'Mean-Background', 'Min', 'Round', 'Solidity', 'Stain', 'StdDev',
       'Subject', 'XM', 'YM', 'directory', 'roiName', 'Rat_n', 'Treatment',
       'Coord', 'Sex', 'Dox', 'React'],
      dtype='object')
(30855, 11)


Unnamed: 0,rat_n,sex,treatment,react,stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background
0,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,345.615,79.261,141.189,56.949
1,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,57.237,77.429,140.347,56.949
2,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949
3,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949
4,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,107.768,29.112,194.964,56.949


### concat old and new sets

In [7]:
# the old and the new set must expose exactly the same cols before stacking
assert set(df_new_subset.columns) == set(df_RAM89.columns)

# stack them: old rows first, new rows after
df_full = pd.concat([df_RAM89, df_new_subset])

# "group" is the cohort prefix of rat_n, i.e. its first two dash-delimited
# fields ("RAM-8" or "RAM-9"); handy for telling the two cohorts apart
df_full['group'] = df_full.rat_n.apply(
    lambda label: '-'.join(label.split('-')[:2])
)

# let's take a look!
print(df_full.shape)
df_full.head()


(37817, 12)


Unnamed: 0,rat_n,sex,treatment,react,stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group
0,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,345.615,79.261,141.189,56.949,RAM-8
1,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,57.237,77.429,140.347,56.949,RAM-8
2,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949,RAM-8
3,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949,RAM-8
4,RAM-8-10C,F,DOX_ON,FR1,single_c-Fos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,107.768,29.112,194.964,56.949,RAM-8


### Standardizing labels
All col names were standardized in the previous cells; however, there are still some inconsistencies in the specific label strings between old vs new image data.

In particular:
- treatment labels do not match ("ON_DOX" vs "DOX_ON")
- stain_type labels are inconsistent
    - "EGFP" vs "GFP"
    - "c-Fos" vs "cFos"
    - "with_" vs "w/_"
    - "Triple-labeled_" vs "triple"
    - "single_mKate2" vs "mKate2"
    - "single_c-Fos" vs "cFos" etc


In [8]:
# standardizing treatment labels. Only the treatment col is rewritten, so the
# pattern cannot alter any other string col (filename, image_name, ...) that
# happens to contain the same text
df_full['treatment'] = df_full.treatment.replace(
    {'DOX_OFF' : 'OFF_DOX', 'DOX_ON': 'ON_DOX'}, regex=True
)
print(df_full.treatment.unique())

# standardizing stain_type labels (substring replacements)
corrected_staintypes = {
    'c-Fos': 'cFos',
    'single_': '',
    'GFP': 'EGFP',
    'Triple-labeled_': 'triple_',
    'with_': 'w/_'
}

# the 'GFP' -> 'EGFP' rule also hits labels that already read 'EGFP' and turns
# them into 'EEGFP'; the second replace undoes that
df_full['stain_type'] = df_full.stain_type.replace(corrected_staintypes, regex=True)\
    .replace({'EEGFP': 'EGFP'}, regex=True)

# let's take a look! we expect exactly 12 stain_type combinations 
# (3 types of single, 6 types of double, 3 types of triple)
sorted(df_full.stain_type.unique())

['ON_DOX' 'OFF_DOX']


['EGFP',
 'EGFP_coloc_w/_cFos',
 'EGFP_coloc_w/_mKate2',
 'cFos',
 'cFos_coloc_w/_EGFP',
 'cFos_coloc_w/_mKate2',
 'mKate2',
 'mKate2_coloc_w/_EGFP',
 'mKate2_coloc_w/_cFos',
 'triple_EGFP',
 'triple_cFos',
 'triple_mKate2']

### One more thing: building a new treatment col
To make splitting by treatment a bit easier, I'm going to build a new aggregate column from the combination of columns required to identify each unique group.

In [9]:
# the old "treatment" col only holds the dox condition, so call it that
df_full = df_full.rename(columns={'treatment': 'dox'})

# the new "treatment" col is "<dox>_<react>", e.g. "ON_DOX_FR1"
df_full['treatment'] = df_full.apply(
    lambda row: '_'.join((row.dox, row.react)),
    axis=1,
)

df_full

Unnamed: 0,rat_n,sex,dox,react,stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group,treatment
0,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,345.615,79.261,141.189,56.9490,RAM-8,ON_DOX_FR1
1,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,57.237,77.429,140.347,56.9490,RAM-8,ON_DOX_FR1
2,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.9490,RAM-8,ON_DOX_FR1
3,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.9490,RAM-8,ON_DOX_FR1
4,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,107.768,29.112,194.964,56.9490,RAM-8,ON_DOX_FR1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,RAM-9-13B,F,OFF_DOX,NR,mKate2,RAM-9_13B_C_3.tif,RAM-9_13B_C,82.41,411.25,29.7811,11.6306,RAM-9,OFF_DOX_NR
32,RAM-9-13B,F,OFF_DOX,NR,mKate2,RAM-9_13B_C_3.tif,RAM-9_13B_C,11.05,324.87,32.5196,11.6306,RAM-9,OFF_DOX_NR
33,RAM-9-13B,F,OFF_DOX,NR,mKate2,RAM-9_13B_C_3.tif,RAM-9_13B_C,170.39,415.47,24.5219,11.6306,RAM-9,OFF_DOX_NR
34,RAM-9-13B,F,OFF_DOX,NR,mKate2,RAM-9_13B_C_3.tif,RAM-9_13B_C,22.81,474.62,19.6235,11.6306,RAM-9,OFF_DOX_NR


### Removing Coloc Stain Types
This coloc set is incomplete (still waiting on coloc results for new RAM-9 images), and so in this notebook I plan to analyze only single stain types

In [10]:
# keep only the rows of the three single-label stain types and give the
# result a fresh 0..n-1 index
df_single = (
    df_full
    .query('stain_type in ["EGFP", "cFos", "mKate2"]')
    .reset_index(drop=True)
)

# let's take a look!
print(df_single.shape)
df_single.head()

(29362, 13)


Unnamed: 0,rat_n,sex,dox,react,stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group,treatment
0,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,345.615,79.261,141.189,56.949,RAM-8,ON_DOX_FR1
1,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,57.237,77.429,140.347,56.949,RAM-8,ON_DOX_FR1
2,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949,RAM-8,ON_DOX_FR1
3,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949,RAM-8,ON_DOX_FR1
4,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,107.768,29.112,194.964,56.949,RAM-8,ON_DOX_FR1


## Dropping NaNs, duplicates

In [11]:
# NaN tally for every col
print('Nan per col:', df_single.isna().sum(), sep='\n')

# 8 rows are missing their measurements (CoM_x, CoM_y, mean_intensity). With
# no intensity data there is nothing to analyze, so those rows are dropped.
df_single = df_single.dropna()

# tally of rows that are exact repeats of an earlier row
print('\nTotal n of duplicated rows:', df_single.duplicated().sum(), sep='\n')

# 160 duplicated rows. This pulls up every member of each duplicate set for
# inspection (only displayed when run as the last expression of a cell)
df_single[df_single.duplicated(keep=False)]

# those duplicates look real, so only the first of each set is kept
df_cleaned = df_single.drop_duplicates()

Nan per col:
rat_n             0
sex               0
dox               0
react             0
stain_type        0
filename          0
image_name        0
CoM_x             8
CoM_y             8
mean_intensity    8
background        0
group             0
treatment         0
dtype: int64

Total n of duplicated rows:
160


Check results once more

In [12]:
# same two tallies as before, this time on the cleaned frame:
# NaNs per col ...
print('Nan per col:', df_cleaned.isna().sum(), sep='\n')

# ... and rows that are exact repeats of an earlier row
print('\nTotal n of duplicated rows:', df_cleaned.duplicated().sum(), sep='\n')

### both should come out as zero

Nan per col:
rat_n             0
sex               0
dox               0
react             0
stain_type        0
filename          0
image_name        0
CoM_x             0
CoM_y             0
mean_intensity    0
background        0
group             0
treatment         0
dtype: int64

Total n of duplicated rows:
0


# Computing mean - background

In [13]:
# df_cleaned was produced by filtering rows out of another frame, and pandas
# emits a SettingWithCopyWarning when a col is added to such a frame directly.
# Taking an explicit copy first makes the assignment unambiguous.
df_cleaned = df_cleaned.copy()

# background-subtracted intensity; astype('f') casts both cols to NumPy's
# single-precision float before the subtraction
df_cleaned['mean-background'] = df_cleaned.mean_intensity.astype('f') - df_cleaned.background.astype('f')

# print out some descriptive statistics for intensity, one stain at a time
for stain in df_cleaned.stain_type.unique():
    df_stain = df_cleaned.query(f'stain_type == "{stain}"')
    print(f'\n===== {stain} =====')
    print(df_stain['mean-background'].describe())

# write the cleaned, not yet normalized, data to disk
df_cleaned.to_csv('RAM-8,9_cleaned,rawdata.csv')


===== cFos =====
count    6032.000000
mean       87.215027
std        83.764038
min       -34.805908
25%        34.011045
50%        63.659508
75%       111.345505
max       727.375000
Name: mean-background, dtype: float64

===== EGFP =====
count    17929.000000
mean       252.988205
std        221.403397
min       -140.956604
25%        104.195000
50%        189.037994
75%        332.751038
max       1933.448853
Name: mean-background, dtype: float64

===== mKate2 =====
count    5231.000000
mean      217.504929
std       223.139175
min      -126.713684
25%        61.590298
50%       136.454590
75%       297.742004
max      1482.285034
Name: mean-background, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['mean-background'] = df_cleaned.mean_intensity.astype('f') - df_cleaned.background.astype('f')


# Normalizing Intensities, Counting Mean Cell Ns
## Normalize intensity, write to disk
For these data I think we're mostly concerned with the percent of cells colocalized, either as a percent of EGFP or as a percent of c-Fos/mKate2. I also know that I had previously divided up the data down a median split of the raw intensity and computed the percent of c-Fos that had mKate2 (please see older notebooks, circa 2022).

In [14]:
# prefix shared by every output file written by this cell
group = 'RAM-8,9'
for stain in df_cleaned.stain_type.unique():

    # split by stain
    df_stain = df_cleaned.query(f'stain_type == "{stain}"')

    # split by group, normalize RAM-8 and RAM-9 independently then concat
    # (note: df_RAM8 / df_RAM9 overwrite the frames of the same name that
    # were built while loading the old data)
    df_RAM8 = df_stain.query(f'group == "RAM-8"')
    df_RAM9 = df_stain.query(f'group == "RAM-9"')

    # normalize to ON_DOX_FR1
    # (normalize_intensity is presumably provided by the star import from
    # norm; judging by the col name read below it is expected to add a
    # "norm_mean-background" col -- TODO confirm)
    df_norm_RAM8 = normalize_intensity(df_RAM8, norm_condition='ON_DOX_FR1', col='mean-background')
    df_norm_RAM9 = normalize_intensity(df_RAM9, norm_condition='ON_DOX_FR1', col='mean-background')
    df_norm = pd.concat([df_norm_RAM8, df_norm_RAM9])
    
    # one csv of normalized data per stain
    df_norm.to_csv(f'{group}_{stain}_NORM.csv')

    # reorganize into cols for prism
    # (prism_reorg is not defined in this notebook; the displayed result has
    # one col per treatment -- exact behavior to be confirmed in its source)
    df_prism = prism_reorg(df_norm, col='norm_mean-background')
    df_prism.to_csv(f'{group}_{stain}_PRISM.csv')
    
# let's take a look at one of our final output dataframes, organized for entry into prism
# (stain and df_prism still hold the values from the last loop iteration)
print(stain)
df_prism

mKate2


Unnamed: 0,OFF_DOX_FR1,OFF_DOX_NR,OFF_DOX_VR5,ON_DOX_FR1,ON_DOX_VR5
0,77.514603,6.745389,8.939567,0.586939,0.204462
1,25.048449,69.743958,87.559837,4.669683,0.353710
2,33.844952,53.031898,133.856094,0.248386,1.222596
3,36.498318,29.856552,52.404133,0.224104,1.021846
4,23.692842,16.358969,33.524494,0.372579,1.062212
...,...,...,...,...,...
2589,,,74.836502,,
2590,,,61.132721,,
2591,,,18.610285,,
2592,,,80.763306,,


## Count mean cell ns, write to disk

In [15]:
# count n of unique image names per subject
sid = 'rat_n'          # col holding the subject id
iid = 'image_name'     # col holding the image id
# note: this rebinds the notebook-level name "cols" that earlier held the
# col order used for subsetting
cols = ['treatment', 'stain_type', sid, iid]
# prefix shared by every output file written by this cell
group = 'RAM-8,9'

# wrapper fn calls
for stain in df_cleaned.stain_type.unique():
    
    # split by stain type
    df_stain = df_cleaned[df_cleaned.stain_type == stain]

    # compute mean cell ns
    # (mean_cell_n is presumably provided by the star import from count; it
    # receives both the per-stain rows and the full cleaned frame -- how the
    # second frame is used should be confirmed in its source)
    df_means = mean_cell_n(df_stain, df_cleaned, cols, sid, iid)

    # write to disk
    df_means.to_csv(f'{group}_{stain}_mean_cell_ns.csv')

# let's take a look at one of our final output dataframes
# (stain and df_means still hold the values from the last loop iteration)
print(stain)
df_means

mKate2


Unnamed: 0,rat_n,treatment,stain_type,cell_count_sums,image_n,mean_cell_n
0,RAM-8-15C,OFF_DOX_FR1,mKate2,120,6,20.0
1,RAM-8-4C,OFF_DOX_FR1,mKate2,135,3,45.0
2,RAM-8-7C,OFF_DOX_FR1,mKate2,15,1,15.0
3,RAM-8-8C,OFF_DOX_FR1,mKate2,91,3,30.333333
4,RAM-9-11B,OFF_DOX_FR1,mKate2,340,3,113.333333
5,RAM-9-3B,OFF_DOX_FR1,mKate2,226,4,56.5
6,RAM-9-8B,OFF_DOX_FR1,mKate2,247,4,61.75
7,RAM-8-1C,OFF_DOX_NR,mKate2,48,6,8.0
8,RAM-9-10B,OFF_DOX_NR,mKate2,113,3,37.666667
9,RAM-9-13B,OFF_DOX_NR,mKate2,580,6,96.666667


# Binning High vs Low cFos
cFos intensity will be split and binned by the median intensity across all groups. I will add a new col called "cfos_bin" containing the labels 'cfos_hi' or 'cfos_lo' denoting whether that cell was either above or below the median mean-background intensity respectively. 

In [16]:
# cFos rows only, with a fresh 0..n-1 index
df_cfos = df_cleaned.query('stain_type == "cFos"').reset_index(drop=True)

# the split point: median background-subtracted intensity over all cFos cells
cfos_median = df_cfos['mean-background'].median()

# cells at or above the median are labeled 'cfos_hi', all others 'cfos_lo'
df_cfos['cfos_bin'] = df_cfos.apply(
    lambda row: 'cfos_hi' if row['mean-background'] >= cfos_median else 'cfos_lo',
    axis=1,
)

print(f'median cfos mean-background: {cfos_median}')
df_cfos.head()

median cfos mean-background: 63.659507751464844


Unnamed: 0,rat_n,sex,dox,react,stain_type,filename,image_name,CoM_x,CoM_y,mean_intensity,background,group,treatment,mean-background,cfos_bin
0,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,345.615,79.261,141.189,56.949,RAM-8,ON_DOX_FR1,84.23999,cfos_hi
1,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,57.237,77.429,140.347,56.949,RAM-8,ON_DOX_FR1,83.397995,cfos_hi
2,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,13.054,117.141,255.088,56.949,RAM-8,ON_DOX_FR1,198.138992,cfos_hi
3,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,368.777,231.472,92.041,56.949,RAM-8,ON_DOX_FR1,35.091999,cfos_lo
4,RAM-8-10C,F,ON_DOX,FR1,cFos,RAM-8-10C_3.4_PL_A_2.tif,RAM-8-10C_3.4_PL_A,107.768,29.112,194.964,56.949,RAM-8,ON_DOX_FR1,138.014999,cfos_hi


## Counting High/Low cFos

In [17]:
# count n of unique image names per subject
sid = 'rat_n'          # col holding the subject id
iid = 'image_name'     # col holding the image id
# same grouping as the per-stain counts, but with cfos_bin in place of stain_type
cols = ['treatment', 'cfos_bin', sid, iid]
# prefix shared by every output file written by this cell
group = 'RAM-8,9'

# wrapper fn calls
# (the loop variable is still called "stain", but here it takes the cfos_bin
# labels 'cfos_hi' / 'cfos_lo')
for stain in df_cfos.cfos_bin.unique():
    
    # split by stain type
    df_stain = df_cfos[df_cfos.cfos_bin == stain]

    # compute mean cell ns
    # (mean_cell_n is presumably provided by the star import from count; it
    # receives both the per-bin rows and the full cFos frame -- how the
    # second frame is used should be confirmed in its source)
    df_means = mean_cell_n(df_stain, df_cfos, cols, sid, iid)

    # write to disk
    df_means.to_csv(f'{group}_{stain}_mean_cell_ns.csv')

# let's take a look at one of our final output dataframes
# (stain and df_means still hold the values from the last loop iteration)
print(stain)
df_means

cfos_lo


Unnamed: 0,rat_n,treatment,cfos_bin,cell_count_sums,image_n,mean_cell_n
0,RAM-8-15C,OFF_DOX_FR1,cfos_lo,63,6,10.5
1,RAM-8-4C,OFF_DOX_FR1,cfos_lo,188,3,62.666667
2,RAM-8-7C,OFF_DOX_FR1,cfos_lo,20,1,20.0
3,RAM-8-8C,OFF_DOX_FR1,cfos_lo,122,3,40.666667
4,RAM-9-11B,OFF_DOX_FR1,cfos_lo,53,3,17.666667
5,RAM-9-3B,OFF_DOX_FR1,cfos_lo,223,4,55.75
6,RAM-9-8B,OFF_DOX_FR1,cfos_lo,142,4,35.5
7,RAM-8-1C,OFF_DOX_NR,cfos_lo,67,6,11.166667
8,RAM-9-10B,OFF_DOX_NR,cfos_lo,102,3,34.0
9,RAM-9-13B,OFF_DOX_NR,cfos_lo,165,6,27.5
