# Sleep Dep, FULL DB
Most of the columns toward the right end of the dataset are uninformative, so I will drop them and keep only the following:
- subject_number
- image_name
- stain_type
- magnification_factor
- ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction
- cohort
- ZT
- treatment
- sex


In [122]:
import numpy as np
import pandas as pd

# read the full validated ROI/phenotype table
df_fulldb = pd.read_csv('FullDB_ROIsPhenotypesValidated.csv')

# list the available cols
print(df_fulldb.columns)

# keep only the cols needed downstream
df_subset = df_fulldb[[
    'subject_number',
    'image_name',
    'stain_type',
    'ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction',
    'cohort',
    'ZT',
    'treatment',
    'sex',
    'magnification_factor',
]]

# check for nans: non-null count minus row count per col
# (0 means complete, a negative value is -1 * the number of missing entries)
df_subset.count() - df_subset.shape[0]

# it looks like the mean_intensity col is missing 249 values; keep only the
# rows where it is present for now.
df_dropped = df_subset[df_subset.ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction.notna()]

# preview the cleaned frame
df_dropped.head()

  df_fulldb = pd.read_csv('FullDB_ROIsPhenotypesValidated.csv')


Index(['subject_number', 'image_name', 'stain_type', 'cell_number',
       'area_minus_background_micronsquared',
       'ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction',
       'mean_intensity_beforebackgroundsubtraction', 'ROI_intensity_stdev',
       'ROI_minimum_intensity', 'ROI_maximum_intensity',
       'ROI_intensity_normalizedtoZT0_percohort_perstain',
       'magnification_factor', 'xm', 'ym', 'image_background_intensity',
       'cohort', 'ZT', 'location_wrt_bregma', 'treatment', 'sex',
       'exact_sac_time', 'weight_at_sac', 'analysisdate',
       'magnification_adjusted_area', 'which_stain', 'phenotypeGlobal',
       'phenotypeOxo', 'phenotypePV', 'phenotypeWFA', 'Is_Duplicate',
       'redundant_duplicate', 'confirmed_single', 'confirmed_double',
       'confirmed_triple', 'confirmed_AMBIGUOUS', 'unique_Cell_id',
       'empiricalPhenotype', 'Proximal_Coordinates', 'Valid_StainType',
       'sumConfirmed', 'real_stain_type', 'phenotypeWfa', 'valid_phenotype'

Unnamed: 0,subject_number,image_name,stain_type,ROI_mean_intensity_from_PIPSQUEAK_withbackgroundsubtraction,cohort,ZT,treatment,sex,magnification_factor
137,RF001,RF001_2.52_L,8-oxo-dG,15.4825,3.1,18,SD,F,1.633
138,RF001,RF001_2.52_L,Triple-labeled_8-oxo-dG,48.1643,3.1,18,SD,F,1.633
139,RF001,RF001_2.52_L,Double-labeled_8-oxo_co-occurring_with_WFA,48.1643,3.1,18,SD,F,1.633
140,RF001,RF001_2.52_L,8-oxo-dG,48.1643,3.1,18,SD,F,1.633
141,RF001,RF001_2.52_L,Double-labeled_8-oxo_co-occurring_with_WFA,29.5644,3.1,18,SD,F,1.633


In [255]:
def count_imgs(df, sid_col, iid_col):
    '''
    counts the number of distinct image ids recorded for each subject
    args:
        df: pd.core.frame.DataFrame(n, m) 
            n: the number of rows, 
            m: the number of features
        sid_col: str, denoting the name of the col containing unique subject ids
        iid_col: str, denoting the name of the col containing unique image ids
    return:
        df_imgn: pd.core.frame.DataFrame(n=|sid_col|, m=2)
            n: the number of rows, equal to the cardinality of the sid_col set
            (the number of unique ID strings in sid_col)
            this df contains 2 cols: sid_col, and "image_n" holding the number
            of distinct (non-missing) iid_col values for that subject
    raises:
        KeyError: if sid_col or iid_col is not a column of df
    '''
    # explicit check instead of assert (asserts are stripped under python -O)
    missing = [col for col in (sid_col, iid_col) if col not in df.columns]
    if missing:
        raise KeyError(f'columns not found in df: {missing}')

    # count through iid_col rather than a hard-coded "image_name" attribute so
    # the function works for any image id col name; selecting the col before
    # reducing always yields a Series, so empty input also works
    df_imgn = df.groupby(sid_col)[iid_col]\
        .nunique()\
        .reset_index(name='image_n')
    
    return df_imgn

def count_cells(df, cols):
    '''
    tallies how many rows of df share each distinct combination of values in
    the cols listed in cols; the tally is stored in a col called "cell_counts"
    args:
        df: pd.core.frame.DataFrame(N, M); N: the number of rows, M: the
            number of cols (assumed to have already been split by stain_type)
        cols: list(n), n: the number of cols over which to count distinct rows
    return:
        df_counts: pd.core.frame.DataFrame with one row per distinct
            combination, holding the cols in cols plus "cell_counts", ordered
            by cols
    '''
    # one entry per distinct combination of the grouping cols
    tally = df.value_counts(cols)

    # move the grouping labels back into cols and name the count col
    df_counts = tally.reset_index(name='cell_counts')

    # order rows by the grouping cols
    df_counts = df_counts.sort_values(by=cols)

    return df_counts

def sum_cells(df, cols, iid_col):
    '''
    takes cell count df, groups by the cols in cols (minus the image id col)
    and computes the sum of the count col for each group. The sums are returned
    in a col called "cell_count_sums".
    args:
        df: pd.core.frame.DataFrame(N, M), N: the number of rows,
            M: the number of cols; must contain a col called "cell_counts",
            and may contain a col called "scaled_counts", which is summed
            instead when present
        cols: list, list containing col name strings that define each group 
            for group by and reduction (in this case summing)
        iid_col: str, denotes the name of the image id col; it is excluded
            from the grouping so that counts are summed across images
    return:
        df_sums: pd.core.frame.DataFrame with one row per group, holding the
            grouping cols plus "cell_count_sums"
    '''
    # remove image id col (we want to sum counts across all images per rat)
    reduce_cols = [col for col in cols if col != iid_col]

    # prefer the magnification-scaled counts when they have been computed
    value_col = 'scaled_counts' if 'scaled_counts' in df.columns else 'cell_counts'

    # group by, reduce; selecting the col before summing always yields a
    # Series, so this also works when df has no rows
    df_sums = df.groupby(by=reduce_cols)[value_col]\
        .sum()\
        .reset_index(name='cell_count_sums')
    
    return df_sums

def average_counts(df_sums, df_ns, cols, sid_col, iid_col):
    '''
    combines per-subject cell count sums with per-subject image counts and
    divides one by the other to give the mean cell n per image.
    args:
        df_sums: pd.core.frame.DataFrame; must contain sid_col, the grouping
            cols (cols minus iid_col) and a col "cell_count_sums"
        df_ns: pd.core.frame.DataFrame; must contain sid_col and a col
            "image_n"
        cols: list, the cols used to create every unique group combination
        sid_col: str, denoting the name of the col containing unique subject ids
        iid_col: str, denoting the name of the col containing unique image ids
    return:
        mean_cell_ns: pd.core.frame.DataFrame; the rows of df_sums whose
            subject id also occurs in df_ns, ordered by the grouping cols, with
            sid_col first and the added cols "image_n" and "mean_cell_n"
    '''
    # the image id col was dropped during the reduction step, so skip it here
    group_cols = [name for name in cols if name != iid_col]

    # look up each subject's image count by subject id
    image_counts = df_ns.set_index(sid_col)
    joined = df_sums.join(image_counts, on=sid_col, how='inner')
    mean_cell_ns = joined.sort_values(by=group_cols)

    # mean cell n = summed counts / number of images
    mean_cell_ns['mean_cell_n'] = mean_cell_ns['cell_count_sums'] / mean_cell_ns['image_n']

    # put the subject id col first, leave the rest in their current order
    trailing = [name for name in mean_cell_ns.columns if name != sid_col]
    return mean_cell_ns[[sid_col] + trailing]

def mean_cell_n(df_stain, df_full, cols, sid, iid, return_counts=False):
    '''
    wrapper that chains count_imgs -> count_cells -> sum_cells ->
    average_counts to get the mean cell n per image for each subject;
    magnification/zoom factor is assumed to be equal across all images.
    note that the number of images per subject is counted on df_full, not on
    df_stain.
    args:
        df_stain: pd.core.frame.DataFrame; df containing data for a given stain type
        df_full: pd.core.frame.DataFrame; df containing data for full (cleaned) set
        cols: list, contains str denoting col names for grouping
        sid: str, col name denoting col containing unique subject ids
        iid: str, col name denoting col containing unique image ids
        return_counts: bool, flag for added utility during debugging
    return:
        the mean cell n df, or the tuple (per-image cell counts df, mean cell
        n df) when return_counts is True
    '''
    # images per subject, taken from the full set
    images_per_subject = count_imgs(df_full, sid, iid)

    # cells per image for this stain
    per_image_counts = count_cells(df_stain, cols)

    # total cells per subject (image id col collapsed)
    per_subject_totals = sum_cells(per_image_counts, cols, iid)

    # total cells / number of images
    per_subject_means = average_counts(per_subject_totals, images_per_subject, cols, sid, iid)

    if return_counts:
        return (per_image_counts, per_subject_means)

    return per_subject_means


def scaled_mean_cell_n(df_stain, df_full, cols, sid, iid, return_counts=False, scale_counts=False):
    '''
    wrapper function to compute mean cell ns for images of unequal zoom factor
    args:
        df_stain: pd.core.frame.DataFrame; df containing data for a given stain type
        df_full: pd.core.frame.DataFrame; df containing data for full (cleaned) set;
            must contain a col "magnification_factor" when scale_counts is True
        cols: list, contains str denoting col names for grouping
        sid: str, col name denoting col containing unique subject ids
        iid: str, col name denoting col containing unique image ids
        return_counts: bool, flag for added utility during debugging
        scale_counts: bool, if True each image's cell count is multiplied by
            (its magnification_factor / the largest magnification_factor among
            the counted images) before summing; if False the raw counts are
            summed
    return:
        the mean cell n df, or the tuple (per-image cell counts df, mean cell
        n df) when return_counts is True
    '''
    # count n of unique image names per subject
    img_ns = count_imgs(df_full, sid, iid)

    # count n of cells per image for each subject
    cell_counts = count_cells(df_stain, cols)

    if scale_counts:
        # get magnification_factor for each img
        mag = df_full[[sid, iid, 'magnification_factor']].drop_duplicates()
        # no `on` given: merge joins on the cols the two frames share
        cell_counts = cell_counts.merge(mag)

        # scale all counts by max magnification factor; Series.max skips
        # missing values, so a single image without a magnification factor no
        # longer turns the reference (and with it every scaled count) into NaN
        max_mag = cell_counts.magnification_factor.max()
        cell_counts['scale'] = cell_counts.magnification_factor / max_mag
        cell_counts['scaled_counts'] = cell_counts.cell_counts * cell_counts.scale

    # sum cell counts across all images for each subject
    # (sum_cells picks up "scaled_counts" when that col is present)
    cell_sums = sum_cells(cell_counts, cols, iid)

    # compute mean cell count per image for each subject
    mean_cell_ns = average_counts(cell_sums, img_ns, cols, sid, iid)

    if not return_counts:
        return mean_cell_ns
    
    return (cell_counts, mean_cell_ns)



In [256]:
# NOTE(review): `group` is not used in the visible part of this cell —
# presumably a label for output written elsewhere; confirm
group = 'SD_fulldb'

# these do not change between stains, so set them once
sid = 'subject_number'
iid = 'image_name'
cols = ['sex', 'treatment', 'ZT', 'subject_number', 'image_name']

# keep every stain's result: `df_means` is reassigned on each pass, so on its
# own it only ever holds the last stain processed
means_by_stain = {}
for stain in df_dropped.stain_type.unique():
    df_stain = df_dropped[df_dropped.stain_type == stain]
    df_means = scaled_mean_cell_n(df_stain, df_dropped, cols, sid, iid, scale_counts=True)
    means_by_stain[stain] = df_means
    




In [264]:
# total of the cell_counts col over RF009's rows in df_means
df_means[df_means.subject_number == 'RF009'].cell_counts.sum()

150

In [263]:
# show RF009's rows in df_means
df_means[df_means.subject_number == 'RF009']

Unnamed: 0,sex,treatment,ZT,subject_number,image_name,cell_counts,magnification_factor,scale,scaled_counts,cell_count_sums,image_n,mean_cell_n
48,F,SD,6,RF009,RF009_2.52_L,30,1.0,0.61237,18.371096,91.855481,6,15.309247
49,F,SD,6,RF009,RF009_2.52_R,32,1.0,0.61237,19.595836,91.855481,6,15.309247
50,F,SD,6,RF009,RF009_3.24_L,19,1.0,0.61237,11.635028,91.855481,6,15.309247
51,F,SD,6,RF009,RF009_3.24_R,29,1.0,0.61237,17.758726,91.855481,6,15.309247
52,F,SD,6,RF009,RF009_4.68_L,24,1.0,0.61237,14.696877,91.855481,6,15.309247
53,F,SD,6,RF009,RF009_4.68_R,16,1.0,0.61237,9.797918,91.855481,6,15.309247


In [265]:
# sanity check: RF009's unscaled mean (150 cells / 6 images = 25) times the
# 1/1.633 scale should reproduce its scaled mean_cell_n
25 * (1/1.633)

15.309246785058175

In [243]:
# rows of df_means whose mean_cell_n differs from the one in `scaled`
# NOTE(review): `scaled` is not defined in the visible cells — presumably the
# scale_counts=True result; confirm. The comparison is element-wise, so both
# frames need matching indexes.
df_means[scaled.mean_cell_n != df_means.mean_cell_n]

Unnamed: 0,subject_number,sex,treatment,ZT,cell_count_sums,image_n,mean_cell_n
8,RF009,F,SD,6,150,6,25.0
9,RF011,F,SD,6,115,6,19.166667
38,RF013,F,TOD_Control,6,157,6,26.166667
39,RF015,F,TOD_Control,6,160,6,26.666667
69,RM013,M,SD,6,169,6,28.166667
70,RM016,M,SD,6,241,6,40.166667
101,RM019,M,TOD_Control,6,158,6,26.333333
102,RM021,M,TOD_Control,6,123,6,20.5
127,RM012,M,ZT0,0,183,6,30.5
128,RM015,M,ZT0,0,135,6,22.5


In [251]:
# attach the per-subject values in `scaled` to the per-image counts; with no
# `on` given, merge joins on the cols the two frames share
cell_counts.merge(scaled)

Unnamed: 0,sex,treatment,ZT,subject_number,image_name,cell_counts,cell_count_sums,image_n,mean_cell_n
0,F,SD,0,RF018,RF018_2.70_L,307,128.0,6,21.333333
1,F,SD,0,RF018,RF018_2.70_R,312,128.0,6,21.333333
2,F,SD,0,RF018,RF018_3.20_L,278,128.0,6,21.333333
3,F,SD,0,RF018,RF018_3.20_R,297,128.0,6,21.333333
4,F,SD,0,RF018,RF018_4.00_L,264,128.0,6,21.333333
...,...,...,...,...,...,...,...,...,...
877,M,ZT0,0,RM084,RM084_2.52_R,826,117.0,6,19.500000
878,M,ZT0,0,RM084,RM084_3.24_L,1085,117.0,6,19.500000
879,M,ZT0,0,RM084,RM084_3.24_R,961,117.0,6,19.500000
880,M,ZT0,0,RM084,RM084_4.2_L,933,117.0,6,19.500000


In [252]:
# show the per-image cell_counts frame
cell_counts

Unnamed: 0,sex,treatment,ZT,subject_number,image_name,cell_counts
765,F,SD,0,RF018,RF018_2.70_L,307
756,F,SD,0,RF018,RF018_2.70_R,312
813,F,SD,0,RF018,RF018_3.20_L,278
782,F,SD,0,RF018,RF018_3.20_R,297
828,F,SD,0,RF018,RF018_4.00_L,264
...,...,...,...,...,...,...
146,M,ZT0,0,RM084,RM084_2.52_R,826
47,M,ZT0,0,RM084,RM084_3.24_L,1085
72,M,ZT0,0,RM084,RM084_3.24_R,961
83,M,ZT0,0,RM084,RM084_4.2_L,933
