# Sleep Dep full db, CELL COUNT DATA ONLY
##### Jonathan Ramos 1/23/2024
It looks like I can just get rid of the majority of these uninformative cols at the right end of the set. I will only take the following:
- subject_number
- image_name
- stain_type
- xm
- ym
- ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction
- cohort
- ZT
- treatment
- sex
- magnification_factor

Since I do not yet know the intensity normalization scheme, I will only handle cell count data per stain type combination for now. 

1/24/2024:
completed pipeline, added magnification factor flag in wrapper function to handle rescaling counts across images of unequal magnification/zoom factors. (all images scaled to highest level of magnification to prioritize cutting out data rather than extrapolating) -JR

1/25/2024:
added "xm" and "ym" columns to identify duplicate rows. checked for duplciate values; dropped 918 duplicate rows before counting mean cell ns -JR

in total, 249 rows with missing data were dropped, 918 duplicated rows were dropped, resulting in a final cleaned dataframe with 881727 rows.

## Loading data, dropping nan

In [1]:
import numpy as np
import pandas as pd

# load data
df_fulldb = pd.read_csv('FullDB_ROIsPhenotypesValidated.csv')

# check cols
print(df_fulldb.columns)

# pull out cols that we need
df_subset = df_fulldb[['subject_number', 'image_name', 'stain_type', 'xm', 'ym', 'ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction', 'cohort', 'ZT', 'treatment', 'sex', 'magnification_factor']]

# check for nans
df_subset.count() - len(df_subset)

# it looks like the mean_intensity col is missing 249 values. let's just remove those rows for now.
df_dropna = df_subset[~df_subset.ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction.isna()]

# let's take a look
df_dropna.head()

  df_fulldb = pd.read_csv('FullDB_ROIsPhenotypesValidated.csv')


Index(['subject_number', 'image_name', 'stain_type', 'cell_number',
       'area_minus_background_micronsquared',
       'ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction',
       'mean_intensity_beforebackgroundsubtraction', 'ROI_intensity_stdev',
       'ROI_minimum_intensity', 'ROI_maximum_intensity',
       'ROI_intensity_normalizedtoZT0_percohort_perstain',
       'magnification_factor', 'xm', 'ym', 'image_background_intensity',
       'cohort', 'ZT', 'location_wrt_bregma', 'treatment', 'sex',
       'exact_sac_time', 'weight_at_sac', 'analysisdate',
       'magnification_adjusted_area', 'which_stain', 'phenotypeGlobal',
       'phenotypeOxo', 'phenotypePV', 'phenotypeWFA', 'Is_Duplicate',
       'redundant_duplicate', 'confirmed_single', 'confirmed_double',
       'confirmed_triple', 'confirmed_AMBIGUOUS', 'unique_Cell_id',
       'empiricalPhenotype', 'Proximal_Coordinates', 'Valid_StainType',
       'sumConfirmed', 'real_stain_type', 'phenotypeWfa', 'valid_phenotype'

Unnamed: 0,subject_number,image_name,stain_type,xm,ym,ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction,cohort,ZT,treatment,sex,magnification_factor
137,RF001,RF001_2.52_L,8-oxo-dG,202.8106,420.7033,15.4825,3.1,18,SD,F,1.633
138,RF001,RF001_2.52_L,Triple-labeled_8-oxo-dG,303.1551,252.7436,48.1643,3.1,18,SD,F,1.633
139,RF001,RF001_2.52_L,Double-labeled_8-oxo_co-occurring_with_WFA,303.1551,252.7436,48.1643,3.1,18,SD,F,1.633
140,RF001,RF001_2.52_L,8-oxo-dG,303.1551,252.7436,48.1643,3.1,18,SD,F,1.633
141,RF001,RF001_2.52_L,Double-labeled_8-oxo_co-occurring_with_WFA,406.0875,203.2267,29.5644,3.1,18,SD,F,1.633


## Checking for duplicated rows

In [2]:
# check duplicates
print(f'number of duplicated rows: {df_dropna.duplicated().sum()}')

# drop duplicate rows
df_clean = df_dropna[~df_dropna.duplicated()]
print(f'number of duplicate rows removed: {len(df_dropna) - len(df_clean)}')

# investigating duplicates
print('\nduplicates by stain:')
df_duplicated = df_dropna[df_dropna.duplicated()]
stains, counts = np.unique((df_duplicated.stain_type), return_counts=True)

# it looks like most duplicates occur in rows with stain_type == 8-oxo-dG 
for key, val in dict(zip(stains, counts)).items():
    print(f'{key}: {val}')

# if these rows were really duplicated i would expect this resulting 
# dataframe to have a total of 8 rows, containing exactly 4 unique rows
print('\nchecking duplicates by visual inspection:')
pd.concat([
    df_dropna.query(f'xm == {df_duplicated.xm.iloc[0]} and\
                      ym == {df_duplicated.ym.iloc[0]} and\
                      stain_type == "{df_duplicated.stain_type.iloc[0]}"'),

    df_dropna.query(f'xm == {df_duplicated.xm.iloc[123]} and\
                      ym == {df_duplicated.ym.iloc[123]} and\
                      stain_type == "{df_duplicated.stain_type.iloc[123]}"'),

    df_dropna.query(f'xm == {df_duplicated.xm.iloc[456]} and\
                      ym == {df_duplicated.ym.iloc[456]} and\
                      stain_type == "{df_duplicated.stain_type.iloc[456]}"'),

    df_dropna.query(f'xm == {df_duplicated.xm.iloc[789]} and\
                      ym == {df_duplicated.ym.iloc[789]} and\
                      stain_type == "{df_duplicated.stain_type.iloc[789]}"')
])


number of duplicated rows: 918
number of duplicate rows removed: 918

duplicates by stain:
8-oxo-dG: 794
Double-labeled_8-oxo_co-occurring_with_PV: 23
Double-labeled_8-oxo_co-occurring_with_WFA: 9
Double-labeled_PV_co-occurring_with_8-oxo: 16
Double-labeled_PV_co-occurring_with_WFA: 1
Double-labeled_WFA_co-occurring_with_8-oxo: 11
Double-labeled_WFA_co-occurring_with_PV: 3
Parvalbumin: 13
Triple-labeled_8-oxo-dG: 2
Triple-labeled_WFA: 1
WFA: 45

checking duplicates by visual inspection:


Unnamed: 0,subject_number,image_name,stain_type,xm,ym,ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction,cohort,ZT,treatment,sex,magnification_factor
205,RF001,RF001_2.52_L,8-oxo-dG,380.4925,36.4539,37.5291,3.1,18,SD,F,1.633
206,RF001,RF001_2.52_L,8-oxo-dG,380.4925,36.4539,37.5291,3.1,18,SD,F,1.633
159602,RF045,RF045_2.52_R,Double-labeled_8-oxo_co-occurring_with_PV,164.247,70.055,738.541,1.2,6,TOD_Control,F,1.633
159603,RF045,RF045_2.52_R,Double-labeled_8-oxo_co-occurring_with_PV,164.247,70.055,738.541,1.2,6,TOD_Control,F,1.633
451563,RM055,RM055_4.2_L,8-oxo-dG,343.726,170.528,1043.84,3.2,15,TOD_Control,M,1.633
451564,RM055,RM055_4.2_L,8-oxo-dG,343.726,170.528,1043.84,3.2,15,TOD_Control,M,1.633
592435,RM086,RM086_4.2_L,8-oxo-dG,32.416,77.776,595.339,4.2,3,TOD_Control,M,1.633
592436,RM086,RM086_4.2_L,8-oxo-dG,32.416,77.776,595.339,4.2,3,TOD_Control,M,1.633


## Counting mean cell ns per subject
These parameterized functions will go into a separate module for future use

In [3]:
def count_imgs(df, sid_col, iid_col):
    '''
    takes a dataframe and counts the number of unique strings that occur in the 
    "image_name" col for each rat in "rat_n" col
    args:
        df: pd.core.frame.DataFrame(n, m) 
            n: the number of rows, 
            m: the number of features
        sid_col: str, denoting the name of the col containing unique subject ids
        iid_col: str, denoting the name of the col containing unique image ids
    return:
        df_imgn: pd.core.frame.DataFrame(n=|sid_col|), m=2)
            n: the number of rows, equal to the cardinality of the sid_col set
            (the number of unique ID strings in sid_col)
            this df contains 2 cols: a sid col, and an iid col containing counts
    '''
    assert iid_col in df.columns

    df_imgn = df.groupby([sid_col])\
        .apply(lambda x: len(np.unique(x.image_name)))\
        .reset_index(name='image_n')
    
    return df_imgn

def count_cells(df, cols):
    '''
    takes a df and counts the number of instances each distinct row 
    (created by unique combinations of labels from columns indicated
    by cols arg); counts are reported in a new col called "cell_counts"
    args:
        df: pd.core.frame.DataFrame(N, M); N: the number of rows, M: the
            number of cols (assumed to have already been split by stain_type)
        cols: list(n), n: the number of cols over which to count distinct rows
    return:
        df_counts: pd.core.frame.DataFrame(N,M+1)
    '''
    df_counts = df.value_counts(cols)\
        .reset_index(name='cell_counts')\
        .sort_values(by=cols)
    
    return df_counts

def sum_cells(df, cols, iid_col):
    '''
    takes cell count df, groups by cols denoted in cols list and computes sum
    of cell_counts col for each group. Adds new column "cell_count_sums"
    containing sums.
    args:
        df: pd.core.frame.DataFrame(N, M), N: the number of rows (N=|id_col|),
            M: the number of cols, must contain col called "cell_counts"
        cols: list(M-2), list containing col name strings that define each group 
            for group by and reduction (in this case summing)
        iid_col: str, denotes 
    '''
    # remove image id col (we want to sum counts across all images per rat)
    reduce_cols = list(filter(lambda x: x != iid_col, cols))

    if 'scaled_counts' in df.columns:
            # group by, reduce 
        df_sums = df.groupby(by=reduce_cols)\
            .apply(lambda x: np.sum(x.scaled_counts))\
            .reset_index(name='cell_count_sums')
    
    else:
        # group by, reduce 
        df_sums = df.groupby(by=reduce_cols)\
            .apply(lambda x: np.sum(x.cell_counts))\
            .reset_index(name='cell_count_sums')
    
    return df_sums

def average_counts(df_sums, df_ns, cols, sid_col, iid_col):
    '''
    takes df of cell count sums and df of image ns, and computes the mean cell 
    n (divides cell count sums by the number of images) for each subject.
    args:
        df_sums: pd.core.frame.DataFrame(ni, mi), ni: the number of rows
            (ni=|sid|), mi: the number of cols (mi = |cols|); must 
            contain a col "cell_count_sums". 
        df_ns: pd.core.frame.DataFrame(nj, mj), nj: the number of rows 
            (nj=|sid|), mj: the number of cols (mj=2); must contain a col
            "image_n" 
        cols: list(n), n: the number of cols (contains all cols necessary to 
            create every unique group combination)
        sid_col: str, denoting the name of the col containing unique subject ids
        iid_col: str, denoting the name of the col containing unique image ids
    return:
        mean_cell_ns: pd.core.frame.DataFrame(N,M), N: the number of rows (N=
        |sid|), M: the number of cols (M=|cols|+2)
        
    '''
    # list of cols with out image id, since it was removed during the reduction step
    reduce_cols = list(filter(lambda x: x != iid_col, cols))

    # compute mean cell n
    mean_cell_ns = df_sums.join(df_ns.set_index(sid_col), on=sid_col, how='inner')\
        .sort_values(by=reduce_cols)
    mean_cell_ns['mean_cell_n'] = mean_cell_ns.cell_count_sums / mean_cell_ns.image_n

    # reorder so that subject id is the first col
    col_reorder = [sid_col] + list(filter(lambda x: x != sid_col, list(mean_cell_ns.columns)))
    mean_cell_ns = mean_cell_ns[col_reorder]

    return mean_cell_ns

def mean_cell_n(df_stain, df_full, cols, sid, iid, return_counts=False):
    '''
    wrapper function to compute mean cell ns; magnification/zoom factor 
    is assuemd to be equal across all images
    args:
        df_stain: pd.core.frame.DataFrame; df containing data for a given stain type
        df_full: pd.core.frame.DataFrame; df containing data for full (cleaned) set
        cols: list, contains str denoting col names for grouping
        sid: str, col name denoting col containing unique subject ids
        iid: str, col name denoting col containing unique image ids
        return_counts: bool, flag for added utility during debugging
    '''
    # count n of unique image names per subject
    img_ns = count_imgs(df_full, sid, iid)

    # count n of cells per image for each subject
    cell_counts = count_cells(df_stain, cols)

    # sum cell counts across all images for each subject
    cell_sums = sum_cells(cell_counts, cols, iid)

    # compute mean cell count per image for each subject
    mean_cell_ns = average_counts(cell_sums, img_ns, cols, sid, iid)

    if not return_counts:
        return mean_cell_ns
    
    return (cell_counts, mean_cell_ns)


def scaled_mean_cell_n(df_stain, df_full, cols, sid, iid, return_counts=False, scale_counts=False):
    '''
    wrapper function to compute mean cell ns for images of unequal zoom factor
    args:
        df_stain: pd.core.frame.DataFrame; df containing data for a given stain type
        df_full: pd.core.frame.DataFrame; df containing data for full (cleaned) set
        cols: list, contains str denoting col names for grouping
        sid: str, col name denoting col containing unique subject ids
        iid: str, col name denoting col containing unique image ids
        return_counts: bool, flag for added utility during debugging
    '''
    # count n of unique image names per subject
    img_ns = count_imgs(df_full, sid, iid)

    # count n of cells per image for each subject
    cell_counts = count_cells(df_stain, cols)

    # we must scale at the cell count per image level because it may be the case that
    # not every image from a given animal was scaled the same way
    if scale_counts:
        # get magnification_factor for each img
        mag = df_full[[sid, iid, 'magnification_factor']].drop_duplicates()
        cell_counts = cell_counts.merge(mag)

        # scale all counts by max magnification factor
        max_mag = cell_counts.magnification_factor.unique().max()
        cell_counts['scale'] = cell_counts.magnification_factor / max_mag
        cell_counts['scaled_counts'] = cell_counts.cell_counts * cell_counts.scale

    # sum cell counts across all images for each subject
    cell_sums = sum_cells(cell_counts, cols, iid)

    # compute mean cell count per image for each subject
    mean_cell_ns = average_counts(cell_sums, img_ns, cols, sid, iid)

    if not return_counts:
        return mean_cell_ns
    
    return cell_counts, mean_cell_n

## Time to run it! Write to disk and check result

In [4]:
# set up
group = 'SD_fulldb' # identifying str for writing files to disk
sid = 'subject_number'
iid = 'image_name'
cols = ['sex', 'treatment', 'ZT', 'subject_number', 'image_name']

# time to count
for stain in df_clean.stain_type.unique():
    # split by stain type
    df_stain = df_clean[df_clean.stain_type == stain]

    # compute mean count
    df_means = scaled_mean_cell_n(df_stain, df_clean, cols, sid, iid, scale_counts=True)

    # write result to disk
    df_means.to_csv(f'{group}_{stain}_SCALED_mean_cell_ns.csv')

# let's take a look at one of our final output dataframes
# we know if an image was scaled if "cell_count_sums" is not an integer
# (we expect sums of count data to be int)
print(stain)
df_means

Double-labeled_WFA_co-occurring_with_PV


Unnamed: 0,subject_number,sex,treatment,ZT,cell_count_sums,image_n,mean_cell_n
0,RF018,F,SD,0,128.0,6,21.333333
1,RF020,F,SD,0,129.0,6,21.500000
2,RF050,F,SD,0,151.0,6,25.166667
3,RF052,F,SD,0,28.0,6,4.666667
4,RF026,F,SD,3,72.0,6,12.000000
...,...,...,...,...,...,...,...
143,RM070,M,ZT0,0,163.0,6,27.166667
144,RM073,M,ZT0,0,142.0,6,23.666667
145,RM078,M,ZT0,0,114.0,6,19.000000
146,RM081,M,ZT0,0,142.0,6,23.666667
