In [None]:
import numpy as np
import pandas as pd
import os
import tqdm
from tqdm import tqdm

from PIL import Image

In [None]:
# Root folder (Google Drive mount) holding the plate folders and the metadata CSV files.
dirname = "/content/drive/MyDrive/CS 199/Cell Images/"

# Generate Cell Images



In [None]:
# Make sure both metadata files exist in `dirname` before any image is processed.
# The exact file names are checked (not substrings of the directory listing), so a
# near-miss such as "BBBC021_v1_image.csv.bak" cannot satisfy the check.
required_files = ("BBBC021_v1_image.csv", "BBBC021_v1_moa.csv")
if not all(os.path.isfile(os.path.join(dirname, name)) for name in required_files):
    raise ValueError("BBBC021_v1_image.csv and BBBC021_v1_moa.csv need to be in directory")

print("Found the files, starting the program to process the images.")
# the number of cropped images from the original image
crops = 4

Found the files, starting the program to process the images.


In [None]:
def slide_window(img, dims=(512, 640)):
    """Cut an image into non-overlapping tiles of size ``dims``.

    Parameters
    ----------
    img : numpy.ndarray
        Image whose first two axes are (height, width); any trailing axes
        (e.g. channels) are carried over unchanged.
    dims : tuple of int
        ``(window_height, window_width)`` of each tile.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(n_tiles, window_height, window_width, ...)``
        where ``n_tiles = (height // window_height) * (width // window_width)``.
        Tiles are ordered column by column: all tiles of the left-most column
        strip from top to bottom, then the next strip, and so on. Pixels that
        do not fill a whole window at the bottom/right edge are dropped.
    """
    window_height, window_width = dims
    height, width = img.shape[:2]
    n_rows = height // window_height
    n_cols = width // window_width

    # Size the output from the actual tiling instead of assuming 4 tiles of 512x640x3.
    crop_images = np.zeros((n_rows * n_cols, window_height, window_width) + tuple(img.shape[2:]))

    index = 0
    for left in range(0, n_cols * window_width, window_width):
        for top in range(0, n_rows * window_height, window_height):
            crop_images[index] = img[top:top + window_height, left:left + window_width]
            index += 1
    return crop_images

In [None]:
def anscombe(x):
    """Apply the Anscombe transform ``2 * sqrt(x + 3/8)`` element-wise.

    The input array is cast to float32 first, so the result is float32.
    """
    as_float = x.astype(np.float32)
    shifted = as_float + 3.0 / 8.0
    return 2.0 * np.sqrt(shifted)

In [None]:
def DMSO_normalization(x, y, idx, crops, rm_imgs,
                       out_dir="/content/drive/MyDrive/CS 199/Cell Images/TRAINING SET"):
    '''
    Normalize non-DMSO images by DMSO statistics, crop them and save the crops as PNG.

    For every channel, the mean of the DMSO images is subtracted from the non-DMSO
    images and the result is divided by the DMSO standard deviation. Each normalized
    image is then rescaled to the 0-255 range, split with slide_window, and every
    crop that passes the brightness filter is written to out_dir.

      input x is the transformed non-DMSO image stack, indexed (image, row, col, channel)
      input y is the transformed DMSO image stack with the same axis layout
      input idx holds, for each image in x, the label index of its first crop
      input crops is the number of crops expected per image; it must match what
        slide_window returns, otherwise label indices and file numbers drift apart
      input rm_imgs is a list that is extended in place with the label indices of
        rejected crops; its current length is also used to number the saved files
      input out_dir is the directory the PNG files are written to

    Nothing is returned; results are the saved files and the updated rm_imgs.
    Raises ValueError if slide_window does not return exactly `crops` crops.
    '''
    channels = x.shape[3]
    x = x.astype(np.float32)

    # Subtracting mean DMSO and divide std DMSO from/by non-DMSO images pixel-wise
    for channel in range(channels):
        x[:, :, :, channel] -= y[:, :, :, channel].mean()
        x[:, :, :, channel] /= y[:, :, :, channel].std()

    new_range = 255 - 0
    # A crop is rejected when at most 0.2% of its values exceed new_range / 5.1 (= 50).
    intensity_cutoff = new_range / 5.1
    min_bright_fraction = 0.002

    for image, first_label in zip(x, idx):
        # Map to 8-bit range:
        # NewValue = (((OldValue - OldMin) * (NewMax - NewMin)) / (OldMax - OldMin)) + NewMin
        low = image.min()
        old_range = image.max() - low
        if old_range == 0 or not np.isfinite(old_range):
            # Flat or non-finite image: rescaling would divide by zero / propagate NaN.
            # An all-zero image fails the brightness filter below, so every crop is
            # rejected, which is what the NaN result led to as well.
            scaled = np.zeros_like(image)
        else:
            scaled = ((image - low) * new_range) / old_range

        tiles = slide_window(scaled)
        if len(tiles) != crops:
            raise ValueError(
                "slide_window returned %d crops but %d were expected" % (len(tiles), crops))

        for offset, tile in enumerate(tiles):
            if ((tile > intensity_cutoff).sum() / tile.size) <= min_bright_fraction:
                rm_imgs.append(first_label + offset)
                continue
            # File number = label index minus the number of crops rejected so far,
            # which keeps the saved files consecutively numbered.
            file_name = "bbbc021_%s.png" % str(first_label + offset - len(rm_imgs))
            Image.fromarray(tile.astype("uint8")).save(os.path.join(out_dir, file_name))

    return

In [None]:
# read mechanism file
moa = pd.read_csv(os.path.join(dirname, 'BBBC021_v1_moa.csv'))
# read data file which link images to compound/concentration.
# which can then be linked to moa file
data = pd.read_csv(os.path.join(dirname, 'BBBC021_v1_image.csv'))

In [None]:
# Bookkeeping shared by the processing cells:
#   labels   - one label row per crop
#   rm_imgs  - label indices of the crops to be removed
labels, rm_imgs = [], []
#   count    - running image counter
#   dataset  - running dataset counter, starts at 1
count, dataset = 0, 1

In [None]:
# Iterate through different directories (different plates): every entry of
# `dirname` whose name contains 'Week' is treated as one plate.
for f in (f for f in os.listdir(dirname) if 'Week' in f):

    # Metadata rows of the current plate (folder name matched literally, not as a regex)
    plate_data = data[data['Image_PathName_DAPI'].str.contains(f, regex=False)]

    idx = []      # label index of the first crop of every non-DMSO image
    plate_X = []  # stacked 3-channel images of the plate
    plate_Y = []  # one label row per image in plate_X

    # Iterate through current plate
    for index, row in plate_data.iterrows():
        # Exclude all taxol compounds except certain examples from Week 1 plates
        if row['Image_Metadata_Compound'] == "taxol":
            conc_text = str(row['Image_Metadata_Concentration'])
            keep_taxol = (
                'Week1_' in row['Image_PathName_DAPI'] and
                'D0' in row['Image_Metadata_Well_DAPI'] and
                any(value in conc_text for value in ("0.3", "1.0", "3.0"))
            )
            if not keep_taxol:
                continue

        # Keep only compound/concentration pairs that have a MOA annotation
        mechanism = moa[(moa['compound']      == row['Image_Metadata_Compound']) &
                        (moa['concentration'] == row['Image_Metadata_Concentration'])]
        if mechanism.shape[0] == 0:
            continue

        try:
            # Read the three channel images (DAPI, Tubulin, Actin); skip the row if one
            # cannot be opened. The files are closed as soon as they are converted.
            loaded = {}
            for channel in ('DAPI', 'Tubulin', 'Actin'):
                path = os.path.join(dirname, f, str(row['Image_FileName_' + channel]))
                with Image.open(path) as handle:
                    loaded[channel] = np.array(handle)
        except OSError as error:
            print(error)
            print(f + ": File not found!")
            continue

        # Make it RGB (stack the three channels) and append to list of images of current plate
        img_stack = np.dstack((loaded['Actin'], loaded['Tubulin'], loaded['DAPI']))
        plate_X.append(img_stack)

        # Label row: the first three values of the matching moa row, then plate, well, replicate
        record = mechanism.values.tolist()[0][:3] + [
            row['Image_Metadata_Plate_DAPI'],
            row['Image_Metadata_Well_DAPI'],
            row['Replicate'],
        ]

        # Non-DMSO images get one label per crop; DMSO images are only used for normalization.
        if row['Image_Metadata_Compound'] != 'DMSO':
            labels.extend(list(record) for _ in range(crops))

            idx.append(count)
            count += crops

        plate_Y.append(record)

    if plate_Y:
        plate_Y      = np.asarray(plate_Y)
        dmso_idx     = np.where(plate_Y[:, 0] == "DMSO")[0]
        non_dmso_idx = np.where(plate_Y[:, 0] != "DMSO")[0]

        if len(non_dmso_idx) > 0:
            plate_X = np.asarray(plate_X)
            plate_X = anscombe(plate_X)
            DMSO_normalization(plate_X[non_dmso_idx], plate_X[dmso_idx], idx, crops, rm_imgs)
    else:
        # No row of this plate produced an image (e.g. every file was missing); indexing
        # the empty label array by column would raise, so the plate is skipped.
        print(f + ": no annotated images could be loaded, skipping plate")

    print('Number of compounds transformed = ' + str(count) + '; dataset = ' + str(dataset))
    dataset += 1


Number of compounds transformed = 208; dataset = 1
Number of compounds transformed = 416; dataset = 2
Number of compounds transformed = 512; dataset = 3
[Errno 2] No such file or directory: '/content/drive/MyDrive/CS 199/Cell Images//Week1_22381/Week1_150607_E07_s1_w193232854-973F-4689-B7CB-FB7038C59BDA.tif'
Week1_22381: File not found!
[Errno 2] No such file or directory: '/content/drive/MyDrive/CS 199/Cell Images//Week1_22381/Week1_150607_E07_s2_w1F1EF41B2-EA86-4AC5-8DB4-51D2A6E98F59.tif'
Week1_22381: File not found!
[Errno 2] No such file or directory: '/content/drive/MyDrive/CS 199/Cell Images//Week1_22381/Week1_150607_E07_s3_w160FE277B-535B-428E-A2EF-D9AEB4C4F86D.tif'
Week1_22381: File not found!
[Errno 2] No such file or directory: '/content/drive/MyDrive/CS 199/Cell Images//Week1_22381/Week1_150607_E07_s4_w1822D00C4-D980-412D-9020-F3CEBB36BF8E.tif'
Week1_22381: File not found!
[Errno 2] No such file or directory: '/content/drive/MyDrive/CS 199/Cell Images//Week1_22381/Week1_1506

In [None]:
print("Finished transforming compounds. Starting on adjusting the labels.")
# Drop the labels of rejected crops. A filtered copy is built instead of deleting from
# `labels` in place, so re-running this cell does not remove additional rows.
removed = set(rm_imgs)
labels_kept = [label for position, label in enumerate(labels) if position not in removed]

df = pd.DataFrame(labels_kept,
                  columns=["compound", "concentration", "moa", "plate", "well", "replicate"])
df.to_csv(os.path.join(dirname, 'TRAINING SET', 'bbbc021_labels.csv'))

print("Finished adjusting labels. Image processing has finished.")

Finished transforming compounds. Starting on adjusting the labels.
Finished adjusting labels. Image processing has finished.


In [None]:
# Sanity check: does any 'Week' entry appear more than once in the directory listing?
folders = [entry for entry in os.listdir(dirname) if 'Week' in entry]
folders_set = set(folders)

# A set drops repeated names, so differing sizes mean at least one duplicate.
contains_duplicates = len(folders_set) != len(folders)

print(contains_duplicates)

False
