In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import warnings
import sys
sys.path.append('../src/')
warnings.filterwarnings('ignore')

# Data Ingestion

#### As far as data ingestion goes, our sponsors only expect us to upload an experiment file locally. As per our meeting with them on 1/26, we'll be primarily processing three datasets: ObjCell (feature aggregate file), ObjNuclei (everything in the nucleus) and ObjAllCyto (everything outside of the nucleus)

#### For now, I'll be using the PAM194_Keratino_CytoPanel_1 experiment. If we come to an agreement with these preprocessing steps, then we can apply them to all experiments

In [2]:
# Ideally, user would specify an experiment to run pipeline on i.e. PAM194_Keratino_CytoPanel_1

# Experiment = (user input)
# file = ''

# Directory holding the CSV exports for experiment PAM194_Keratino_CytoPanel_1.
# This is the only machine-specific value in this cell: change it here instead of
# editing every path below.
DATA_DIR = 'C:/Users/mdbla/Documents/UW-Capstone/PAM194_Keratino_CytoPanel_1/PAM194_Keratino_CytoPanel_1'

# Reading in the files locally for experiment PAM194_Keratino_CytoPanel_1
obj_cell = f'{DATA_DIR}/pam194ObjCell.csv'
obj_nuclei = f'{DATA_DIR}/pam194ObjNuclei.csv'
obj_cyto = f'{DATA_DIR}/pam194ObjAllCyto.csv'

# ',' is already the default separator for read_csv, so it is not passed explicitly.
cell_data = pd.read_csv(obj_cell)
nuclei_data = pd.read_csv(obj_nuclei)
cyto_data = pd.read_csv(obj_cyto)

In [3]:
# looking at our datasets one by one

# dataset for ObjCell
cell_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,,,0,IFNg,33,Plate 1,,0,...,809.526316,2213.948905,63.3125,62.456274,43.161184,64.916058,456.894737,558.148289,571.819079,835.682482
1,1,2,,,0,IFNg,33,Plate 1,,0,...,140.515945,297.913007,41.003843,44.32035,41.900836,55.248416,423.985791,488.225973,537.927107,690.420186
2,1,3,,,0,IFNg,33,Plate 1,,0,...,416.107143,637.340426,131.801587,141.045184,123.582589,184.291962,1244.634921,1873.365042,1386.792411,1969.450355
3,1,4,,,0,IFNg,33,Plate 1,,0,...,369.692492,442.252926,58.827859,72.721717,56.393994,68.36736,1357.431873,2371.83225,1386.081081,1289.561769
4,1,5,,,0,IFNg,33,Plate 1,,0,...,422.836423,480.79878,44.629263,90.982022,79.689204,90.452439,645.309131,1582.348315,1484.908397,2007.782927


In [4]:
# dataset for ObjNuclei
nuclei_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,PathName_Actin,PathName_DNA,PathName_DNA2,PathName_Golgi,PathName_Mito,PathName_NileRed,PathName_WGA,AreaShape_Area,AreaShape_Orientation,Number_Object_Number
0,1,1,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,133,61.222325,1
1,1,2,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,126,84.552142,2
2,1,3,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,91,67.275934,3
3,1,4,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,150,-18.041571,4
4,1,5,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,196,75.615935,5


In [5]:
# dataset for ObjAllcyto
cyto_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,,,0,IFNg,33,Plate 1,,0,...,797.291005,2800.648649,83.829016,86.225806,45.650794,83.817568,521.57513,564.890323,403.862434,818.189189
1,1,2,,,0,IFNg,33,Plate 1,,0,...,139.078833,304.300176,40.170459,42.896766,41.123471,54.117129,427.400479,495.495029,543.160032,702.931338
2,1,3,,,0,IFNg,33,Plate 1,,0,...,403.603175,583.594488,117.092822,131.679842,106.404151,161.42126,1337.491337,1914.338603,1383.3663,2127.47769
3,1,4,,,0,IFNg,33,Plate 1,,0,...,268.925081,337.824499,58.184106,71.126437,55.187622,66.273639,1030.760265,1957.073276,1290.903583,967.02149
4,1,5,,,0,IFNg,33,Plate 1,,0,...,415.832206,336.310726,40.181446,93.563662,80.755074,84.009464,645.702592,1900.503577,1764.224628,2337.820189


# Dropping Unnecessary Columns

#### For data cleaning, we'll be removing columns that we've deemed unnecessary for our pipeline and NA values from our datasets

In [6]:
# This function serves to drop the columns that we've deemed unnecessary for our pipeline
def drop_columns(data):
    """Remove, in place, the columns the pipeline does not use.

    Dropped columns:
      * every column whose name starts with 'FileName_' or 'PathName_'
      * 'Metadata_Date', 'Metadata_FileLocation', 'Metadata_Frame',
        'Metadata_Run' and 'Metadata_Series'

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # '^' anchors the match to the start of the name, so only columns that really
    # *start with* the prefix are removed (an unanchored pattern would also drop a
    # column that merely contains 'FileName_' somewhere in its name).
    prefixed = list(data.filter(regex='^(?:FileName_|PathName_)'))
    unused_metadata = ['Metadata_Date', 'Metadata_FileLocation', 'Metadata_Frame',
                       'Metadata_Run', 'Metadata_Series']
    # errors='ignore' makes the call idempotent: re-running the cell on an already
    # cleaned frame is a no-op instead of a KeyError.
    data.drop(columns=prefixed + unused_metadata, errors='ignore', inplace=True)
    

In [7]:
drop_columns(cell_data)
drop_columns(nuclei_data)
drop_columns(cyto_data)

In [8]:
# Below is a function that imputes the NAs for each measurement. For now, I'll use the mean, but let me
# know if we should use another method for imputation such as KNN or MICE

def replace_NA(data):
    """Fill missing measurement values with the column mean, in place.

    The first six columns are treated as identifiers/metadata and are left
    untouched; every column from position 6 onwards is treated as a measurement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    None
    """
    measurements = data.columns[6:] # since we know which columns we're dropping, should this subset be fixed?
    for measure in measurements:
        column = data[measure]
        if column.isna().any():
            # Assign the filled column back to the frame. Calling
            # `data[measure].fillna(..., inplace=True)` works on a temporary Series and
            # does not update `data` when pandas Copy-on-Write is active.
            data[measure] = column.fillna(column.mean())
            
print("The sum of NAs in the ObjCell dataset is:", cell_data.isna().sum().sum())
print("The sum of NAs in the ObjNuclei dataset is:", nuclei_data.isna().sum().sum())
print("The sume of NAs in the ObjALlCyto dataset is:", cyto_data.isna().sum().sum())

The sum of NAs in the ObjCell dataset is: 276
The sum of NAs in the ObjNuclei dataset is: 0
The sume of NAs in the ObjALlCyto dataset is: 414


In [9]:
replace_NA(cell_data)
replace_NA(nuclei_data)
replace_NA(cyto_data)

In [10]:
# looking at our cell data after dropping columns
cell_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,IFNg,33,Plate 1,B10,370.0,-60.810939,32.776566,53.650428,...,809.526316,2213.948905,63.3125,62.456274,43.161184,64.916058,456.894737,558.148289,571.819079,835.682482
1,1,2,IFNg,33,Plate 1,B10,3152.0,49.398792,43.897883,73.654626,...,140.515945,297.913007,41.003843,44.32035,41.900836,55.248416,423.985791,488.225973,537.927107,690.420186
2,1,3,IFNg,33,Plate 1,B10,1033.0,25.981245,42.339322,47.96022,...,416.107143,637.340426,131.801587,141.045184,123.582589,184.291962,1244.634921,1873.365042,1386.792411,1969.450355
3,1,4,IFNg,33,Plate 1,B10,1978.0,61.618333,30.14131,30.521827,...,369.692492,442.252926,58.827859,72.721717,56.393994,68.36736,1357.431873,2371.83225,1386.081081,1289.561769
4,1,5,IFNg,33,Plate 1,B10,1090.0,71.432709,39.343563,39.508533,...,422.836423,480.79878,44.629263,90.982022,79.689204,90.452439,645.309131,1582.348315,1484.908397,2007.782927


#### IMPORTANT NOTE FOR TEAM: Below, you'll see that we have a 'Number_Object_Number' feature that is identical to the object number. I'll follow up with our sponsors to see what the difference is between the two and whether we should drop it.

In [11]:
# looking at our nuclei data after dropping columns
nuclei_data.head() 

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Number_Object_Number
0,1,1,IFNg,33,Plate 1,B10,133,61.222325,1
1,1,2,IFNg,33,Plate 1,B10,126,84.552142,2
2,1,3,IFNg,33,Plate 1,B10,91,67.275934,3
3,1,4,IFNg,33,Plate 1,B10,150,-18.041571,4
4,1,5,IFNg,33,Plate 1,B10,196,75.615935,5


In [12]:
cyto_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,Granularity_1_CorrActin,Granularity_1_CorrDNA2,Granularity_1_CorrGolgi,Granularity_1_CorrMito,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,IFNg,33,Plate 1,B10,30.930115,21.388745,23.076034,34.729626,...,797.291005,2800.648649,83.829016,86.225806,45.650794,83.817568,521.57513,564.890323,403.862434,818.189189
1,1,2,IFNg,33,Plate 1,B10,43.908685,69.658228,24.140234,20.181313,...,139.078833,304.300176,40.170459,42.896766,41.123471,54.117129,427.400479,495.495029,543.160032,702.931338
2,1,3,IFNg,33,Plate 1,B10,42.928617,36.212184,26.573823,11.693345,...,403.603175,583.594488,117.092822,131.679842,106.404151,161.42126,1337.491337,1914.338603,1383.3663,2127.47769
3,1,4,IFNg,33,Plate 1,B10,30.272651,19.416316,27.274117,16.974175,...,268.925081,337.824499,58.184106,71.126437,55.187622,66.273639,1030.760265,1957.073276,1290.903583,967.02149
4,1,5,IFNg,33,Plate 1,B10,39.122643,18.667415,24.267978,19.573362,...,415.832206,336.310726,40.181446,93.563662,80.755074,84.009464,645.702592,1900.503577,1764.224628,2337.820189


# Data Standardization

#### Now we need to transform each dataset so that each feature has the same scale. 

#### IMPORTANT NOTE: Danish's PCA function handles the standardization, so we'll let his function handle it

# Outlier Removal

### Last step before PCA, we need to remove the outliers in our datasets. 

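### We'll use the standard deviation (SD) to remove any outliers above and below 5-6 SD from the mean for each feature. Because the mean and SD are computed per feature, this does not require the data to be standardized beforehand (standardization is handled later by the PCA function).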


## Old Method - IQR

#### Below is my attempt at removing our outliers using an IQR method. We won't be using this method for outlier detection. I only kept this function in this notebook for reference.

In [13]:
pd.options.mode.chained_assignment = None  # default='warn'

# This IQR function will serve as a helper function for the function defined in the next cell

def replace_column_outliers(x):
    """Return the values of ``x`` with IQR outliers replaced by NaN.

    A value is an outlier when it lies more than 1.5 * IQR below the 25th
    percentile or more than 1.5 * IQR above the 75th percentile.

    The result is a new pandas Series with a fresh default index (the index of
    ``x`` is not carried over), so callers that need alignment should use
    ``.values``.
    """
    lower_quartile, upper_quartile = np.quantile(x, [.25, .75])
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5*spread
    high_fence = upper_quartile + 1.5*spread
    outside_fences = (x < low_fence) | (x > high_fence)
    return pd.Series(np.where(outside_fences, float('nan'), x))

In [18]:
def drop_feature_outliers(df, thres):
    """IQR-based outlier filter (old method, kept for reference only).

    Works on a copy of ``df``. Feature columns are every column from position 6
    onwards. Within each group, feature values flagged by
    ``replace_column_outliers`` are replaced with NaN. Groups are formed per
    cytokine; cytokines whose label contains 'untr' are treated as one group,
    all other cytokines are further split by dose.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain 'Metadata_Metadata_Cytokine' and
        'Metadata_Metadata_Dose'.
    thres : float
        Fraction of the number of feature columns. Rows are kept only when their
        NaN count across the features is strictly below
        ``round(thres * number_of_features)``.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with outlier values set to NaN and an extra
        'Outlier_Count' column. Rows are ordered by cytokine (and dose) group in
        order of first appearance, not necessarily in the original row order.
    """
    # create a copy of the original dataframe. This copy is what we'll use to count the outliers
    df_copy = df.copy()
    final_df = pd.DataFrame()
    
    features = df_copy.columns[6:]
    threshold = round(thres * len(features)) # number of outlying features at which a row is dropped
    cytokines = df_copy['Metadata_Metadata_Cytokine'].unique()
    
    # The nested iterations below replace outliers in the features for each cytokine at each treatment
    # This is messy for now, but if the implementation is correct, I'll refactor this code into something cleaner
    for cytokine in cytokines:
        cyto_copy = df_copy[df_copy['Metadata_Metadata_Cytokine'] == cytokine]
        if 'untr' in cytokine:
            # 'untr' cytokines are not split by dose: the whole cytokine subset is one group
            for feature in features:
                # .values is needed because the helper returns a Series with a fresh index
                cyto_copy[feature] = replace_column_outliers(cyto_copy[feature]).values
            final_df = pd.concat([final_df,cyto_copy])
        else:
            tmp_df = pd.DataFrame()
            doses = cyto_copy['Metadata_Metadata_Dose'].unique()
            for dose in doses:
                cyto_dose = cyto_copy[cyto_copy['Metadata_Metadata_Dose'] == dose]
                for feature in features:
                    cyto_dose[feature] = replace_column_outliers(cyto_dose[feature]).values
                tmp_df = pd.concat([tmp_df, cyto_dose])
            final_df = pd.concat([final_df, tmp_df])

    # Count null feature values per row. This counts every NaN, so a NaN that was
    # already present before the IQR step is counted as an outlier too.
    outliers = (final_df[features].isnull()).sum(1)
    final_df['Outlier_Count'] = outliers.astype(int)
    final_df = final_df[final_df['Outlier_Count'] < threshold] 
    return final_df

test_cell = drop_feature_outliers(cell_data, 0.7)

# No rows were removed: the result has as many rows as the original data despite the 70% threshold
# (the extra column is 'Outlier_Count')
print(cell_data.shape)
print(test_cell.shape)

# The largest outlier count for any object is far below the 70-80% of features that Caroline mentioned.
# It is computed rather than hard-coded so this cell stays correct for other data or thresholds.
max_outlier_count = test_cell['Outlier_Count'].max()
print(max_outlier_count)
print(test_cell[test_cell['Outlier_Count'] == max_outlier_count]['ImageNumber'].tolist())

(94370, 104)
(94370, 105)
59
[110, 149, 193]


In [28]:
# But when I lower the threshold to 30%, I'm able to remove some data
test2 = drop_feature_outliers(cell_data, 0.3)
print(cell_data.shape)
print(test2.shape)

(94370, 104)
(91946, 105)


## Current Implementation - Outlier Detection using SD

#### This is our current implementation of the outlier detection for our project.

In [79]:
# This helper flags the outliers of a single feature within one cytokine / dose / well group (it returns a boolean mask and does not replace any values)

def replace_outliers_with_sd(df, feature, c, d, w, n):
    """Flag the outliers of one feature inside a single cytokine/dose/well group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Metadata_Metadata_Cytokine', 'Metadata_Metadata_Dose' and
        'Metadata_Well' columns.
    feature : str
        Name of the feature column to test.
    c, d, w :
        Cytokine, dose and well values that select the group.
    n : float
        Number of standard deviations from the group mean beyond which a value
        is an outlier.

    Returns
    -------
    pandas.Series of bool
        Indexed like the selected rows; True where the value lies more than
        ``n`` population standard deviations (ddof=0) from the group mean.
    """
    in_group = (
        (df['Metadata_Metadata_Cytokine'] == c)
        & (df['Metadata_Metadata_Dose'] == d)
        & (df['Metadata_Well'] == w)
    )
    values = df.loc[in_group, feature]
    center = values.mean()
    # ddof=0 on purpose: np.std on a Series computes the population SD
    spread = values.std(ddof=0)
    too_low = values < (center - n * spread)
    too_high = values > (center + n * spread)
    return too_low | too_high

In [80]:
# This is the function that we'll use to tally all of the outliers for a given object within an image (i.e by row)

def outlier_detection(df, sd, thresh=0.0):
    """Split ``df`` into outlier objects and retained objects using an SD rule.

    For every feature (every column from position 6 onwards), a value is an
    outlier when it lies more than ``sd`` population standard deviations
    (ddof=0) from the mean of its cytokine / dose / well group. The outlying
    features are tallied per row, and a row is an outlier when its tally
    reaches the threshold.

    ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Metadata_Metadata_Cytokine', 'Metadata_Metadata_Dose' and
        'Metadata_Well' columns, followed by numeric feature columns.
    sd : float
        Number of standard deviations that defines an outlying value.
    thresh : float, default 0.0
        Fraction of the feature count a row's tally must reach to be treated as
        an outlier. With the default, a single outlying feature is enough.

    Returns
    -------
    (outliers, sub_data) : tuple of pandas.DataFrame
        Rows whose tally is >= the threshold, and the remaining rows. Both keep
        the columns, index and row order of ``df``.
    """
    group_cols = ['Metadata_Metadata_Cytokine',
                  'Metadata_Metadata_Dose',
                  'Metadata_Well']
    features = list(df.columns[6:])

    # Nothing to evaluate: no row can be an outlier.
    if df.empty or not features:
        return df.iloc[0:0].copy(), df.copy()

    # At least one outlying feature is always required. Without the floor, a small
    # `thresh` rounds to 0 and every row would be reported as an outlier.
    threshold = max(1, round(thresh * len(features)))

    # Per-group statistics broadcast back to the original rows. sort=False keeps the
    # groups in order of appearance; dropna=False keeps rows whose cytokine/dose/well
    # is missing in a group of their own instead of losing them.
    grouped = df.groupby(group_cols, sort=False, dropna=False)[features]
    group_mean = grouped.transform('mean')
    group_sd = grouped.transform('std', ddof=0)

    values = df[features]
    is_outlier = (values < group_mean - sd * group_sd) | (values > group_mean + sd * group_sd)

    # Tally the outlying features for each object (i.e. by row)
    outlier_count = is_outlier.sum(axis=1)
    flagged = outlier_count >= threshold

    # I'm only returning the outliers for the team to show our sponsors which images are outliers.
    # We can remove this part of the function in our final product.
    outliers = df[flagged].copy()
    sub_data = df[~flagged].copy()
    return outliers, sub_data

In [75]:
outliers, real_data = outlier_detection(cell_data, 5, 0.3) # at 5 SD and a threshold of 30% of features

print(len(cell_data))
print(len(outliers))
print(len(real_data))

94370
16
94354


In [76]:
outliers, real_data = outlier_detection(cell_data, 5, 0.1) # at 5 SD and a threshold of 10% of features

print(len(cell_data))
print(len(outliers))
print(len(real_data))

outliers.head()

94370
822
93548


Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
178,1,179,IFNg,33,Plate 1,B10,119.0,62.950061,78.496044,79.42638,...,1.767123,2.109091,8.94186,7.61039,7.0,8.928571,8.488372,14.558442,10.48,11.303571
252,1,253,IFNg,33,Plate 1,B10,161.0,88.951047,32.083378,55.657019,...,3985.185567,3387.707865,1190.923077,1930.290698,1687.051546,2703.224719,1866.709402,2725.174419,2532.134021,2262.168539
296,1,297,IFNg,33,Plate 1,B10,125.0,80.759705,59.070665,65.736345,...,1.78125,1.627451,4260.70297,7857.895522,3891.833333,2097.596154,11.693069,14.432836,10.727273,10.980769
603,3,23,IFNg,33,Plate 1,B10,169.0,65.153956,44.594056,55.403075,...,2.326923,2.513158,3040.121951,5787.194444,2964.663462,2257.289474,18.186992,51.157407,32.903846,13.644737
606,3,26,IFNg,33,Plate 1,B10,113.0,-8.265379,42.223382,67.127095,...,4365.725,4925.30303,553.458333,563.241935,544.025,416.0,2405.222222,5106.516129,6388.8125,5575.863636


In [77]:
outliers, real_data = outlier_detection(cell_data, 6, 0.3) # at 6 SD and a threshold of 30% of features

print(len(cell_data))
print(len(outliers))
print(len(real_data))
outliers.head()

94370
4
94366


Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
27864,110,239,EGF,11,Plate 1,F4,369.0,-51.360839,11.024419,68.827641,...,7464.97973,9227.772727,5496.065068,9671.405622,7867.959459,10385.660839,3214.246575,3565.546185,5248.648649,7904.751748
27916,110,291,EGF,11,Plate 1,F4,1048.0,-12.33149,17.319831,21.190759,...,4059.878515,7266.949195,2028.933782,1877.891705,3342.637795,5685.149938,836.231201,1520.998848,2114.320585,3066.188352
27919,110,294,EGF,11,Plate 1,F4,108.0,46.727936,7.909589,63.019057,...,9471.901408,13387.5,1830.271429,3524.484375,4070.676056,6018.54,1281.614286,1280.359375,1024.788732,1977.48
59554,241,102,IL17,33,Plate 2,F10,326.0,-18.888994,30.914512,26.83214,...,1994.551471,8022.578313,5916.073171,9692.560538,1865.180147,5021.971888,256.03252,295.139013,302.040441,235.923695


In [78]:
outliers, real_data = outlier_detection(cell_data, 6, 0.1) # at 6 SD and a threshold of 10% of features

print(len(cell_data))
print(len(outliers))
print(len(real_data))
outliers.head()

94370
483
93887


Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
252,1,253,IFNg,33,Plate 1,B10,161.0,88.951047,32.083378,55.657019,...,3985.185567,3387.707865,1190.923077,1930.290698,1687.051546,2703.224719,1866.709402,2725.174419,2532.134021,2262.168539
296,1,297,IFNg,33,Plate 1,B10,125.0,80.759705,59.070665,65.736345,...,1.78125,1.627451,4260.70297,7857.895522,3891.833333,2097.596154,11.693069,14.432836,10.727273,10.980769
603,3,23,IFNg,33,Plate 1,B10,169.0,65.153956,44.594056,55.403075,...,2.326923,2.513158,3040.121951,5787.194444,2964.663462,2257.289474,18.186992,51.157407,32.903846,13.644737
606,3,26,IFNg,33,Plate 1,B10,113.0,-8.265379,42.223382,67.127095,...,4365.725,4925.30303,553.458333,563.241935,544.025,416.0,2405.222222,5106.516129,6388.8125,5575.863636
1014,4,209,IFNg,33,Plate 1,B10,481.0,71.949532,14.48941,43.093878,...,2007.870056,2834.097264,87.969773,1356.966197,1534.629944,1412.954407,402.390428,1865.552113,2320.19209,2304.173252


In [24]:
outliers, real_data = outlier_detection(cell_data, 3, 0.1) # at 3 SD and at a threshold 10% of features

print(len(cell_data))
print(len(outliers))
print(len(real_data))
outliers.head()

94370
3446
90924


Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
140,1,141,IFNg,33,Plate 1,B10,182.0,-25.678877,34.139606,78.651764,...,6178.445255,5852.793388,837.052632,2235.141593,1672.255474,1539.834711,3995.218045,7269.867257,6854.627737,7152.619835
178,1,179,IFNg,33,Plate 1,B10,119.0,62.950061,78.496044,79.42638,...,1.767123,2.109091,8.94186,7.61039,7.0,8.928571,8.488372,14.558442,10.48,11.303571
252,1,253,IFNg,33,Plate 1,B10,161.0,88.951047,32.083378,55.657019,...,3985.185567,3387.707865,1190.923077,1930.290698,1687.051546,2703.224719,1866.709402,2725.174419,2532.134021,2262.168539
262,1,263,IFNg,33,Plate 1,B10,247.0,-17.789638,29.795388,71.04034,...,1533.048077,4059.596591,1064.497006,1431.65942,480.149038,1089.022727,1303.892216,1706.644928,1201.692308,1242.670455
291,1,292,IFNg,33,Plate 1,B10,608.0,-55.923217,29.210763,63.552757,...,11.916488,50.442748,645.268966,960.450363,616.466809,500.631043,319.714943,516.765133,302.4197,372.229008


In [23]:
outliers, real_data = outlier_detection(cell_data, 4, 0.1) # at 4 SD and at a threshold 10% of features

print(len(cell_data))
print(len(outliers))
print(len(real_data))
outliers.head()

94370
1518
92852


Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
178,1,179,IFNg,33,Plate 1,B10,119.0,62.950061,78.496044,79.42638,...,1.767123,2.109091,8.94186,7.61039,7.0,8.928571,8.488372,14.558442,10.48,11.303571
252,1,253,IFNg,33,Plate 1,B10,161.0,88.951047,32.083378,55.657019,...,3985.185567,3387.707865,1190.923077,1930.290698,1687.051546,2703.224719,1866.709402,2725.174419,2532.134021,2262.168539
296,1,297,IFNg,33,Plate 1,B10,125.0,80.759705,59.070665,65.736345,...,1.78125,1.627451,4260.70297,7857.895522,3891.833333,2097.596154,11.693069,14.432836,10.727273,10.980769
506,2,210,IFNg,33,Plate 1,B10,153.0,-86.141684,25.416338,71.550598,...,817.574074,1254.44086,253.446429,602.244444,295.472222,179.731183,2978.919643,6648.488889,2040.157407,1158.494624
598,3,18,IFNg,33,Plate 1,B10,186.0,16.271919,23.779935,57.056174,...,7.3,48.554455,1533.330435,2073.226891,1303.742857,1164.732673,68.26087,107.394958,47.392857,67.306931


#### As you can see from above, A LOT of data was removed. May need to seek an alternative. 

#### I'll write our data to a clean folder, from which we can then run PCA and our analyses

In [48]:
# Ideally, this should write to a folder that'll serve as a temporary location for the PCA & Analysis scripts to read 
# from. We should discuss how to handle the hand-offs between our scripts
#
# FIXME(review): `clean_cell`, `clean_nuclei` and `clean_cyto` are not assigned anywhere in the cells shown in
# this notebook, so this cell raises NameError on a fresh kernel (Restart & Run All). They presumably should be
# the retained rows returned by the outlier step for each dataset -- confirm which SD / threshold to use and
# assign them explicitly before this cell.
# NOTE(review): the output directory is an absolute, machine-specific path.

clean_cell.to_csv('C:/Users/mdbla/Documents/UW_VM_Capstone_2024/HTI/Preprocessed_Data/PAM194_ObjCell_clean.csv')
clean_nuclei.to_csv('C:/Users/mdbla/Documents/UW_VM_Capstone_2024/HTI/Preprocessed_Data/PAM194_ObjNuclei_clean.csv')
clean_cyto.to_csv('C:/Users/mdbla/Documents/UW_VM_Capstone_2024/HTI/Preprocessed_Data/PAM194_ObjCyto_clean.csv')