In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Data Ingestion

#### As far as data ingestion goes, our sponsors only expect us to upload an experiment file locally. As per our meeting with them on 1/26, we'll be primarily processing three datasets: ObjCell (feature aggregate file), ObjNuclei (everything in the nucleus) and ObjAllCyto (everything outside of the nucleus)

#### For now, I'll be using the PAM194_Keratino_CytoPanel_1 experiment. If we come to an agreement with these preprocessing steps, then we can apply them to all experiments

In [80]:
# TODO: ideally the user specifies which experiment to run the pipeline on
# (e.g. PAM194_Keratino_CytoPanel_1) instead of it being fixed here.

# Folder holding the CSV exports for experiment PAM194_Keratino_CytoPanel_1.
# It is named once so that switching experiment (or machine) means editing a single line.
EXPERIMENT_DIR = 'C:/Users/mdbla/Documents/UW-Capstone/PAM194_Keratino_CytoPanel_1/PAM194_Keratino_CytoPanel_1'

# One file per dataset: ObjCell, ObjNuclei and ObjAllCyto
obj_cell = f'{EXPERIMENT_DIR}/pam194ObjCell.csv'
obj_nuclei = f'{EXPERIMENT_DIR}/pam194ObjNuclei.csv'
obj_cyto = f'{EXPERIMENT_DIR}/pam194ObjAllCyto.csv'

# Reading in the files locally
cell_data = pd.read_csv(obj_cell, sep=',')
nuclei_data = pd.read_csv(obj_nuclei, sep=',')
cyto_data = pd.read_csv(obj_cyto, sep=',')

In [81]:
# looking at our datasets one by one

# dataset for ObjCell
cell_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,,,0,IFNg,33,Plate 1,,0,...,809.526316,2213.948905,63.3125,62.456274,43.161184,64.916058,456.894737,558.148289,571.819079,835.682482
1,1,2,,,0,IFNg,33,Plate 1,,0,...,140.515945,297.913007,41.003843,44.32035,41.900836,55.248416,423.985791,488.225973,537.927107,690.420186
2,1,3,,,0,IFNg,33,Plate 1,,0,...,416.107143,637.340426,131.801587,141.045184,123.582589,184.291962,1244.634921,1873.365042,1386.792411,1969.450355
3,1,4,,,0,IFNg,33,Plate 1,,0,...,369.692492,442.252926,58.827859,72.721717,56.393994,68.36736,1357.431873,2371.83225,1386.081081,1289.561769
4,1,5,,,0,IFNg,33,Plate 1,,0,...,422.836423,480.79878,44.629263,90.982022,79.689204,90.452439,645.309131,1582.348315,1484.908397,2007.782927


In [82]:
# dataset for ObjNuclei
nuclei_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,PathName_Actin,PathName_DNA,PathName_DNA2,PathName_Golgi,PathName_Mito,PathName_NileRed,PathName_WGA,AreaShape_Area,AreaShape_Orientation,Number_Object_Number
0,1,1,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,133,61.222325,1
1,1,2,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,126,84.552142,2
2,1,3,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,91,67.275934,3
3,1,4,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,150,-18.041571,4
4,1,5,,,0,IFNg,33,Plate 1,,0,...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,/Users/carolinestefani/Library/CloudStorage/Bo...,196,75.615935,5


In [83]:
# dataset for ObjAllcyto
cyto_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,,,0,IFNg,33,Plate 1,,0,...,797.291005,2800.648649,83.829016,86.225806,45.650794,83.817568,521.57513,564.890323,403.862434,818.189189
1,1,2,,,0,IFNg,33,Plate 1,,0,...,139.078833,304.300176,40.170459,42.896766,41.123471,54.117129,427.400479,495.495029,543.160032,702.931338
2,1,3,,,0,IFNg,33,Plate 1,,0,...,403.603175,583.594488,117.092822,131.679842,106.404151,161.42126,1337.491337,1914.338603,1383.3663,2127.47769
3,1,4,,,0,IFNg,33,Plate 1,,0,...,268.925081,337.824499,58.184106,71.126437,55.187622,66.273639,1030.760265,1957.073276,1290.903583,967.02149
4,1,5,,,0,IFNg,33,Plate 1,,0,...,415.832206,336.310726,40.181446,93.563662,80.755074,84.009464,645.702592,1900.503577,1764.224628,2337.820189


# Dropping Unnecessary Columns

#### For data cleaning, we'll be removing columns that we've deemed unnecessary for our pipeline and NA values from our datasets

In [84]:
# This function serves to drop the columns that we've deemed unnecessary for our pipeline
def drop_columns(data):
    """Remove file-bookkeeping and unused metadata columns from ``data`` in place.

    Columns removed:
      * every column whose name contains 'FileName_' or 'PathName_'
        (``DataFrame.filter(regex=...)`` matches anywhere in the name,
        not only at the start);
      * 'Metadata_Date', 'Metadata_FileLocation', 'Metadata_Frame',
        'Metadata_Run' and 'Metadata_Series'.

    Columns that are not present are skipped, so running the cell a second
    time (or on a file lacking some of these columns) does not raise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    unused_metadata = ['Metadata_Date', 'Metadata_FileLocation', 'Metadata_Frame',
                       'Metadata_Run', 'Metadata_Series']
    # A single alternation selects the same columns as filtering for each prefix separately.
    file_columns = list(data.filter(regex='FileName_|PathName_').columns)
    # errors='ignore' keeps the function idempotent: already-dropped columns are not an error.
    data.drop(columns=file_columns + unused_metadata, errors='ignore', inplace=True)
    

In [85]:
drop_columns(cell_data)
drop_columns(nuclei_data)
drop_columns(cyto_data)

In [86]:
# Below is a function that imputes these NAs for each measurement. For now, I'll use the mean, but let me 
# know if we should use another method for imputation such as KNN or MICE

def replace_NA(data, first_measurement_col=6):
    """Fill missing values in every measurement column with that column's mean, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.
    first_measurement_col : int, default 6
        Position of the first measurement column; every column from this
        position onward is imputed, earlier columns are left untouched.
        NOTE(review): the default relies on exactly six leading identifier/metadata
        columns remaining after ``drop_columns`` — confirm this for every dataset.

    Returns
    -------
    None
    """
    for measure in data.columns[first_measurement_col:]:
        column = data[measure]
        if column.isna().any():
            # Assign the filled column back to the frame. Calling
            # fillna(inplace=True) on data[measure] acts on an intermediate
            # object: pandas 2.x warns about it, and with Copy-on-Write enabled
            # the frame is left unchanged.
            data[measure] = column.fillna(column.mean())
            
# Report how many missing values each dataset holds before imputation.
for label, frame in [("ObjCell", cell_data), ("ObjNuclei", nuclei_data), ("ObjAllCyto", cyto_data)]:
    print(f"The sum of NAs in the {label} dataset is:", frame.isna().sum().sum())

The sum of NAs in the ObjCell dataset is: 276
The sum of NAs in the ObjNuclei dataset is: 0
The sume of NAs in the ObjALlCyto dataset is: 414


In [87]:
replace_NA(cell_data)
replace_NA(nuclei_data)
replace_NA(cyto_data)

In [88]:
# looking at our cell data after dropping columns
cell_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Granularity_1_CorrActin,Granularity_1_CorrDNA2,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,IFNg,33,Plate 1,B10,370.0,-60.810939,32.776566,53.650428,...,809.526316,2213.948905,63.3125,62.456274,43.161184,64.916058,456.894737,558.148289,571.819079,835.682482
1,1,2,IFNg,33,Plate 1,B10,3152.0,49.398792,43.897883,73.654626,...,140.515945,297.913007,41.003843,44.32035,41.900836,55.248416,423.985791,488.225973,537.927107,690.420186
2,1,3,IFNg,33,Plate 1,B10,1033.0,25.981245,42.339322,47.96022,...,416.107143,637.340426,131.801587,141.045184,123.582589,184.291962,1244.634921,1873.365042,1386.792411,1969.450355
3,1,4,IFNg,33,Plate 1,B10,1978.0,61.618333,30.14131,30.521827,...,369.692492,442.252926,58.827859,72.721717,56.393994,68.36736,1357.431873,2371.83225,1386.081081,1289.561769
4,1,5,IFNg,33,Plate 1,B10,1090.0,71.432709,39.343563,39.508533,...,422.836423,480.79878,44.629263,90.982022,79.689204,90.452439,645.309131,1582.348315,1484.908397,2007.782927


#### IMPORTANT NOTE FOR TEAM: Below, you'll see that we have a 'Number_Object_Number' feature that is identical to the object number. I'll follow up with our sponsors to see what the difference is between the two and whether we should drop it.

In [89]:
# looking at our nuclei data after dropping columns
nuclei_data.head() 

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Number_Object_Number
0,1,1,IFNg,33,Plate 1,B10,133,61.222325,1
1,1,2,IFNg,33,Plate 1,B10,126,84.552142,2
2,1,3,IFNg,33,Plate 1,B10,91,67.275934,3
3,1,4,IFNg,33,Plate 1,B10,150,-18.041571,4
4,1,5,IFNg,33,Plate 1,B10,196,75.615935,5


In [90]:
cyto_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,Granularity_1_CorrActin,Granularity_1_CorrDNA2,Granularity_1_CorrGolgi,Granularity_1_CorrMito,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,IFNg,33,Plate 1,B10,30.930115,21.388745,23.076034,34.729626,...,797.291005,2800.648649,83.829016,86.225806,45.650794,83.817568,521.57513,564.890323,403.862434,818.189189
1,1,2,IFNg,33,Plate 1,B10,43.908685,69.658228,24.140234,20.181313,...,139.078833,304.300176,40.170459,42.896766,41.123471,54.117129,427.400479,495.495029,543.160032,702.931338
2,1,3,IFNg,33,Plate 1,B10,42.928617,36.212184,26.573823,11.693345,...,403.603175,583.594488,117.092822,131.679842,106.404151,161.42126,1337.491337,1914.338603,1383.3663,2127.47769
3,1,4,IFNg,33,Plate 1,B10,30.272651,19.416316,27.274117,16.974175,...,268.925081,337.824499,58.184106,71.126437,55.187622,66.273639,1030.760265,1957.073276,1290.903583,967.02149
4,1,5,IFNg,33,Plate 1,B10,39.122643,18.667415,24.267978,19.573362,...,415.832206,336.310726,40.181446,93.563662,80.755074,84.009464,645.702592,1900.503577,1764.224628,2337.820189


# Data Standardization

#### Now we need to transform each dataset so that each feature has the same scale. 

#### IMPORTANT NOTE: Danish's PCA function handles the standardization, so we'll let his function handle it

# Outlier Removal

#### Last step before PCA, we need to remove the outliers in our datasets. 

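#### The IQR rule does not depend on a feature's scale, so it can be applied before the standardization that happens inside the PCA function. For each feature, we'll flag as outliers any values below Q1 − 1.5·IQR or above Q3 + 1.5·IQR, where IQR = Q3 − Q1.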

#### IMPORTANT NOTE FOR TEAM: The below code does remove a large number of rows from our datasets. If you can review my function for improvements, I'd greatly appreciate it. 

## For Review

### Below is my attempt at removing our outliers using an IQR method. The problem I have is that it's removing too many rows. If you run the cell below with whatever data that you use, you should get the number of outliers for each feature. 

In [102]:
# Globally silences pandas' chained-assignment check (SettingWithCopyWarning); the pandas default is 'warn'.
# NOTE(review): this applies to every cell run afterwards and hides writes to filtered slices —
# prefer taking an explicit .copy() of a filtered frame over disabling the warning.
pd.options.mode.chained_assignment = None  # default='warn'
def replace_column_outliers(x, whisker=1.5):
    """Replace IQR outliers in one feature with NaN.

    A value is an outlier when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``, where Q1 and Q3 are the 25th and 75th percentiles
    of ``x`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Values of a single numeric feature.
    whisker : float, default 1.5
        Multiple of the IQR that defines the fences.

    Returns
    -------
    pandas.Series
        Float series of the same length as ``x`` with outliers set to NaN.
        When ``x`` is a Series its index is kept, so the result aligns with the
        frame it came from; otherwise a default integer index is used.
    """
    # nanquantile ignores NaN already present in x. A plain quantile would
    # return NaN for both quartiles and no value would ever be flagged.
    q1, q3 = np.nanquantile(x, [0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr
    is_outlier = (x < lower_fence) | (x > upper_fence)
    index = x.index if isinstance(x, pd.Series) else None
    return pd.Series(np.where(is_outlier, np.nan, x), index=index)

# Work on one treatment group at a time: cytokine IL9 at dose 11.
sub_df = cyto_data[cyto_data['Metadata_Metadata_Cytokine'] == 'IL9']
# .copy() gives an independent frame, so the assignments below never write into
# a slice of cyto_data.
cyto_dose = sub_df[sub_df['Metadata_Metadata_Dose'] == 11].copy()

# Every column from position 6 onward is treated as a measurement.
features = cyto_dose.columns[6:]

# Missing values before flagging
print(cyto_dose[features].isnull().sum().sum())

for feature in features:
    # np.asarray strips any index from the result, so values are written back by position.
    cyto_dose[feature] = np.asarray(replace_column_outliers(cyto_dose[feature]))

print(cyto_data.shape)
for feature in features:
    print("Number of outliers for feature:", feature, "is ", cyto_dose[feature].isnull().sum())

0
(94370, 138)
Number of outliers for feature: Granularity_1_CorrActin is  56
Number of outliers for feature: Granularity_1_CorrDNA2 is  120
Number of outliers for feature: Granularity_1_CorrGolgi is  50
Number of outliers for feature: Granularity_1_CorrMito is  61
Number of outliers for feature: Granularity_1_CorrNileRed is  59
Number of outliers for feature: Granularity_1_CorrWGA is  120
Number of outliers for feature: Granularity_2_CorrActin is  155
Number of outliers for feature: Granularity_2_CorrDNA2 is  126
Number of outliers for feature: Granularity_2_CorrGolgi is  162
Number of outliers for feature: Granularity_2_CorrMito is  111
Number of outliers for feature: Granularity_2_CorrNileRed is  116
Number of outliers for feature: Granularity_2_CorrWGA is  131
Number of outliers for feature: Granularity_3_CorrActin is  177
Number of outliers for feature: Granularity_3_CorrDNA2 is  133
Number of outliers for feature: Granularity_3_CorrGolgi is  92
Number of outliers for feature: Gra

In [95]:
print(cyto_dose.shape)
print(cyto_dose.dropna().shape)

(3082, 138)
(604, 138)


In [44]:
# Rebuild the treatment-group subset: rows with cytokine IL9, then dose 11.
sub_df = cyto_data[cyto_data['Metadata_Metadata_Cytokine'] == 'IL9']
cyto_dose = sub_df[sub_df['Metadata_Metadata_Dose'] == 11]

def replace_column_outliers(x, whisker=1.5):
    """Replace IQR outliers in one feature with NaN and return a NumPy array.

    A value is an outlier when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``, where Q1 and Q3 are the 25th and 75th percentiles
    of ``x`` and ``IQR = Q3 - Q1``.

    NOTE(review): an earlier cell defines a function with this same name that
    returns a pandas Series; whichever cell ran last is the one in effect.
    Keep a single definition once the approach is settled.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Values of a single numeric feature.
    whisker : float, default 1.5
        Multiple of the IQR that defines the fences.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``x`` with outliers set to NaN.
    """
    # nanquantile ignores NaN already present in x. A plain quantile would
    # return NaN for both quartiles and no value would ever be flagged.
    q1, q3 = np.nanquantile(x, [0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr
    return np.where((x < lower_fence) | (x > upper_fence), np.nan, x)

# Sanity check on a single feature: flag its outliers, then compare the number
# of rows in the group with the number of values that were flagged (set to NaN).
test = pd.Series(replace_column_outliers(cyto_dose['Granularity_1_CorrActin']))

print(len(cyto_dose['Granularity_1_CorrActin']))
test.isnull().sum()

3082


56

In [126]:
nuclei_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,AreaShape_Area,AreaShape_Orientation,Number_Object_Number
0,1,1,IFNg,33,Plate 1,B10,133,61.222325,1
1,1,2,IFNg,33,Plate 1,B10,126,84.552142,2
2,1,3,IFNg,33,Plate 1,B10,91,67.275934,3
3,1,4,IFNg,33,Plate 1,B10,150,-18.041571,4
4,1,5,IFNg,33,Plate 1,B10,196,75.615935,5


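#### As you can see from above, A LOT of data was removed: `dropna()` took the IL9 / dose 11 group from 3,082 rows down to 604. This happens because a row is discarded as soon as any one of its 132 features is flagged, even though each individual feature flags only a small share of the rows. We may need to seek an alternative, e.g. dropping only rows that are flagged in many features, or clipping extreme values instead of removing rows.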

#### I'll write our data to a clean folder from which we can conduct PCA and our analyses onto

In [48]:
# Ideally, this should write to a folder that'll serve as a temporary location for the PCA & Analysis scripts to read 
# from. We should discuss how to handle the hand-offs between our scripts

# NOTE(review): clean_cell, clean_nuclei and clean_cyto are not assigned in any cell visible in this
# notebook — confirm where they are created, otherwise this cell fails on a fresh kernel.

# Output folder named once so the hand-off location can be changed in a single place.
PREPROCESSED_DIR = 'C:/Users/mdbla/Documents/UW_VM_Capstone_2024/HTI/Preprocessed_Data'

clean_cell.to_csv(f'{PREPROCESSED_DIR}/PAM194_ObjCell_clean.csv')
clean_nuclei.to_csv(f'{PREPROCESSED_DIR}/PAM194_ObjNuclei_clean.csv')
clean_cyto.to_csv(f'{PREPROCESSED_DIR}/PAM194_ObjCyto_clean.csv')