In [18]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# We still need to verify how we're supposed to ingest the data with our sponsors. Until then, I'm using the datasets
# I downloaded locally to work on this step

# Location of the locally downloaded CSV. Set the PAM_DATA_FILE environment
# variable to point the notebook at a different copy without editing this
# cell; when it is unset, the original local download path is used.
file = os.environ.get(
    'PAM_DATA_FILE',
    'C:/Users/mdbla/Documents/UW-Capstone/PAM194_Keratino_CytoPanel_1/PAM194_Keratino_CytoPanel_1/pam194ObjPerinuclear.csv',
)

# The comma delimiter is read_csv's default; it is kept explicit here.
pam_data = pd.read_csv(file, sep=',')

pam_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Date,Metadata_FileLocation,Metadata_Frame,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Run,Metadata_Series,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,,,0,IFNg,33,Plate 1,,0,...,163.02963,172.681729,41.057348,49.27853,51.725926,64.18664,305.080645,409.651838,432.438889,386.387033
1,1,2,,,0,IFNg,33,Plate 1,,0,...,735.050114,971.266504,204.086364,228.314904,189.419134,298.246944,1441.497727,2273.65625,1459.938497,1205.997555
2,1,3,,,0,IFNg,33,Plate 1,,0,...,857.140171,929.469945,73.214162,93.752747,72.317949,80.058288,3096.525043,5232.263736,2797.174359,2538.302368
3,1,4,,,0,IFNg,33,Plate 1,,0,...,751.958333,948.933649,41.334815,81.787736,81.738095,90.276461,977.817778,1993.136792,1493.849702,1703.658768
4,1,5,,,0,IFNg,33,Plate 1,,0,...,1294.931868,1797.672131,77.267819,94.722353,88.406593,130.035129,2478.924406,3199.548235,2603.863736,3415.674473


In [6]:
# For now, we'll keep the image number, object number, cytokine, dose, plate, and measurement columns.
# Feel free to change this later as you see fit.

def drop_columns(data):
    """Drop file-bookkeeping and unused metadata columns from ``data`` in place.

    Removed columns:
      * every column whose name contains ``FileName_`` or ``PathName_``
        (the pattern is not anchored, so it matches anywhere in the name);
      * ``Metadata_Date``, ``Metadata_FileLocation``, ``Metadata_Frame``,
        ``Metadata_Run`` and ``Metadata_Series``.

    Columns that are already absent are skipped instead of raising, so the
    function can safely be re-run on a frame it has already cleaned (for
    example when the notebook cell is executed twice).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # One regex pass collects both families of per-image file columns.
    file_columns = list(data.filter(regex='FileName_|PathName_'))
    metadata_columns = ['Metadata_Date', 'Metadata_FileLocation', 'Metadata_Frame',
                        'Metadata_Run', 'Metadata_Series']
    # errors='ignore' keeps the call idempotent: missing labels are not an error.
    data.drop(columns=file_columns + metadata_columns, errors='ignore', inplace=True)
    

In [7]:
# drop_columns modifies pam_data in place, so the head is displayed again
# afterwards to check which columns remain.
drop_columns(pam_data)

pam_data.head()

Unnamed: 0,ImageNumber,ObjectNumber,Metadata_Metadata_Cytokine,Metadata_Metadata_Dose,Metadata_Plate,Metadata_Well,Granularity_1_CorrActin,Granularity_1_CorrDNA2,Granularity_1_CorrGolgi,Granularity_1_CorrMito,...,Texture_Contrast_CorrMito_3_02_256,Texture_Contrast_CorrMito_3_03_256,Texture_Contrast_CorrNileRed_3_00_256,Texture_Contrast_CorrNileRed_3_01_256,Texture_Contrast_CorrNileRed_3_02_256,Texture_Contrast_CorrNileRed_3_03_256,Texture_Contrast_CorrWGA_3_00_256,Texture_Contrast_CorrWGA_3_01_256,Texture_Contrast_CorrWGA_3_02_256,Texture_Contrast_CorrWGA_3_03_256
0,1,1,IFNg,33,Plate 1,B10,39.085821,65.349797,29.429464,30.085395,...,163.02963,172.681729,41.057348,49.27853,51.725926,64.18664,305.080645,409.651838,432.438889,386.387033
1,1,2,IFNg,33,Plate 1,B10,40.965732,52.025753,26.951641,12.008458,...,735.050114,971.266504,204.086364,228.314904,189.419134,298.246944,1441.497727,2273.65625,1459.938497,1205.997555
2,1,3,IFNg,33,Plate 1,B10,26.494443,23.842541,31.133865,16.847527,...,857.140171,929.469945,73.214162,93.752747,72.317949,80.058288,3096.525043,5232.263736,2797.174359,2538.302368
3,1,4,IFNg,33,Plate 1,B10,38.945234,37.122993,23.80509,18.204408,...,751.958333,948.933649,41.334815,81.787736,81.738095,90.276461,977.817778,1993.136792,1493.849702,1703.658768
4,1,5,IFNg,33,Plate 1,B10,33.282087,71.535983,23.37908,23.507941,...,1294.931868,1797.672131,77.267819,94.722353,88.406593,130.035129,2478.924406,3199.548235,2603.863736,3415.674473


In [10]:
# Names of the columns holding at least one NA; the output below shows that
# several of our measurements are affected.
pam_data.isna().any().loc[lambda flags: flags].index.tolist()

['Granularity_1_CorrMito',
 'Granularity_1_CorrWGA',
 'Granularity_2_CorrMito',
 'Granularity_2_CorrWGA',
 'Granularity_3_CorrMito',
 'Granularity_3_CorrWGA',
 'Granularity_4_CorrMito',
 'Granularity_4_CorrWGA',
 'RadialDistribution_MeanFrac_CorrMito_1of3',
 'RadialDistribution_MeanFrac_CorrMito_2of3',
 'RadialDistribution_MeanFrac_CorrMito_3of3',
 'RadialDistribution_MeanFrac_CorrWGA_1of3',
 'RadialDistribution_MeanFrac_CorrWGA_2of3',
 'RadialDistribution_MeanFrac_CorrWGA_3of3']

In [19]:
# We should have a function that imputes these NAs for each measurement. For now, I'll use the mean, but let me
# know if we should use another method for imputation such as KNN or MICE

def replace_NA(data, first_measurement_col=6):
    """Mean-impute missing values in the measurement columns of ``data`` in place.

    Every column from position ``first_measurement_col`` onward is treated as
    a measurement. For each such column that contains at least one NA, the
    NAs are replaced with that column's mean. Columns before that position
    are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.
    first_measurement_col : int, default 6
        Positional index of the first measurement column. The default matches
        the column layout left by ``drop_columns``; pass another value if the
        set of leading identifier/metadata columns changes.

    Returns
    -------
    None
    """
    for measure in data.columns[first_measurement_col:]:
        column = data[measure]
        if column.isna().any():
            # Assign the filled column back to the frame. Calling
            # fillna(inplace=True) on data[measure] acts on an intermediate
            # object and does not update the frame under pandas Copy-on-Write.
            data[measure] = column.fillna(column.mean())
            
# Impute the missing measurement values in pam_data; replace_NA works on the
# frame it is given and returns nothing.
replace_NA(pam_data)


In [None]:
# Before normalization can occur, we'll need to identify any outliers and impute them as needed. 
# Should we use z-scores or box plots to identify them?

In [21]:
# We should apply some form of normalization with our measurements to ensure our models are working with the 
# same scale for all measurements. 

# There are different options for normalizing our features. For now, I'll use the standard scaler from sklearn to
# perform normalization. Let me know if any other method of standardization is preferable.

# Standardise the measurement columns (everything from position 6 onward)
# with sklearn's StandardScaler: fit on the measurements, then transform them.
scaler = StandardScaler()
measurements = pam_data.iloc[:, 6:]
measurements_normalized = scaler.fit(measurements).transform(measurements)

print(measurements_normalized)

[[ 0.67910922  0.4047504   0.47289192 ... -0.48177597 -0.35359681
  -0.49706458]
 [ 0.84852392 -0.31320936  0.23587719 ...  0.86803807  0.76223908
   0.09082475]
 [-0.45560664 -1.83184832  0.63592532 ...  3.01050614  2.21443989
   1.04645887]
 ...
 [ 0.91535315 -0.09809822  0.44240056 ... -0.64620301 -0.66587627
  -0.67759989]
 [-0.84032178  0.64709383 -0.42233521 ... -0.61366953 -0.49908006
  -0.4178257 ]
 [-0.55775986  0.67017715  0.96305834 ... -0.66868671 -0.71602665
  -0.697271  ]]


In [None]:
# QUESTION FOR TEAM: Should normalization occur before or after feature selection? Or does it matter?