In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import random
import shutil
import tensorflow_datasets as tfds

from PIL import Image 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# variable
source_root_path = './dicom/NIH_1stPA_Norm_Fib'
dest_root_path = './dicom/Oversampled_5x_cleaned'
findings_of_interest =['Fibrosis','No Finding']
sub_dirs=['training','validation']
sample_multipler = 5

### Splitt our data into training and validation sets

In [4]:

# This expects folders of /validation and /training  each with /Fibrosis and /No Findings subfolders
# let's set aside 20% of our Fibrosis images for validation, and the other 80% for training
fibrosis_images = os.listdir(source_root_path+'/Fibrosis') 
print(fibrosis_images[:3])
random.shuffle(fibrosis_images)
print(fibrosis_images[:3])
print(len(fibrosis_images))
num_for_validation = int(len(fibrosis_images)*.2)
validation_fibrosis_images = fibrosis_images[:num_for_validation]
training_firbosis_images = fibrosis_images[num_for_validation:]
# validation_fibrosis_images = random.sample(fibrosis_images,num_for_validation)
print(len(validation_fibrosis_images),len(training_firbosis_images))
# copy 20% to validation
for image in validation_fibrosis_images:
  source = f'{source_root_path}/Fibrosis/{image}'
  dest = f'{dest_root_path}/validation/Fibrosis/{image}'
  # print( source, dest)
  # break
  shutil.copy(source,dest)
# copy 80% to training
for image in training_firbosis_images:
  source = f'{source_root_path}/Fibrosis/{image}'
  dest = f'{dest_root_path}/training/Fibrosis/{image}'
  # print( source, dest)
  # break
  shutil.copy(source,dest)

['00000945_000.png', '00000733_003.png', '00000324_007.png']
['00005567_008.png', '00017991_008.png', '00008223_000.png']
635
127 508


In [6]:
#Now let's move normal (No Finding) images, we aim to have 5x the base sample of fibrosis training images
fibrosis_training_count = len(os.listdir(dest_root_path+'/training/Fibrosis'))
print(fibrosis_training_count)
count_of_normals_needed = fibrosis_training_count * sample_multipler +num_for_validation
print(count_of_normals_needed)
normal_images = os.listdir(source_root_path+'/No Finding') 
print(len(normal_images))
normal_images = random.sample(normal_images,count_of_normals_needed)
# len(validation_normal_images)

validation_normal_images = normal_images[:num_for_validation]
training_normal_images = normal_images[num_for_validation:]
# validation_normal_images = random.sample(normal_images,num_for_validation)
print(len(validation_normal_images),len(training_normal_images))
# copy a small amount for validation
for image in validation_normal_images:
  source = f'{source_root_path}/No Finding/{image}'
  dest = f'{dest_root_path}/validation/No Finding/{image}'
  # print( source, dest)
  # break
  shutil.copy(source,dest)
# # copy the rest to training
for image in training_normal_images:
  source = f'{source_root_path}/No Finding/{image}'
  dest = f'{dest_root_path}/training/No Finding/{image}'
  # print( source, dest)
  # break
  shutil.copy(source,dest)


508
2667
6170
127 2540


At this point we have 127 CXRs of no findings and 127 CXR with fibrosis in our validation folder.
But we have 5x the number of normals compared to Fibrosis CXRs in our training folder.

### A) OVERSAMPLING

In [8]:
# Let's us create a simple oversampled folder, increase our firbosis images by 5x (don't run this if you want to do augmentation)
root_path =  dest_root_path+'/training/Fibrosis' 
image_paths = os.listdir(root_path)
for filename in image_paths:
  splitF = filename.split('.') 
  for i in range(1,sample_multipler):
    new_filename = f'{splitF[0]}-{i}.{splitF[1]}'
    source = f'{root_path}/{filename}'
    dest = f'{root_path}/{new_filename}'
    shutil.copy(source,dest)

### B) Augmentation

In [None]:

# import numpy as np
# import tensorflow as tf

# data_augmentation = tf.keras.Sequential([
#               tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal'),
#               tf.keras.layers.experimental.preprocessing.RandomRotation(0.2),
#              ])
img_path= os.listdir(root_path)[0]
# Python program to explain cv2.rotate() method
  
# importing cv2
import cv2
  
# path
# path = r'C:\Users\user\Desktop\geeks14.png'
  
# Reading an image in default mode
src = cv2.imread(img_path)
  
# Window name in which image is displayed
window_name = 'Image'
  
# Using cv2.rotate() method
# Using cv2.ROTATE_180 rotate by 
# 180 degrees clockwise
image = cv2.rotate(src, cv2.ROTATE_180)
  
# Displaying the image
cv2.imshow(window_name, image)
cv2.waitKey(0)

In [11]:
# Data augmentation, NOTE YOU MIGHT HAVE TO DUPLICATE THE OUTPUT FOLDER for Augmentation  
# THIS EXPECTS TRAINING/FIBROSIS/ TO ALREADY BE POPULATED WITH SOME .png's
training_fibrosis_path = 'dicom/NIH_Augmentation_5x/training/Fibrosis'
image_names= os.listdir(training_fibrosis_path)
# SHOW TWO IMAGES SIDE BY SIDE
def show_two(im,im2): 
    plt.subplot(1,2,1) 
    plt.title('Original image')
    plt.imshow(im) 
    plt.subplot(1,2,2) 
    plt.title('Augmented image')
    plt.imshow(im2) 
    plt.show() 
# DEFINE OUR IMAGE AUGEMNTATION MODEL USING KERAS 
data_augmentation = tf.keras.Sequential([
  # tf.keras.layers.ZeroPadding1D(padding=200),
  tf.keras.layers.RandomTranslation(0.025,0.025,fill_mode='nearest'),
  tf.keras.layers.RandomZoom(0.025,fill_mode='nearest'),
  tf.keras.layers.RandomRotation(5/360,fill_mode='nearest')
])
# LOAD ONE IMAGE
# img = tf.keras.utils.load_img( './dicom/NIH_Oversampled_5x/training/Fibrosis/00000022_001-1.png')
for file_name in image_names:
  img = tf.keras.utils.load_img( f'{training_fibrosis_path}/{file_name}')
  # img.show()
  image_array  = tf.keras.preprocessing.image.img_to_array(img)/255.0
  splitF = file_name.split('.') 
  for i in range(1,sample_multipler):
    new_filename = f'{splitF[0]}-{i}.{splitF[1]}'
    print(new_filename)
    augmented_image = data_augmentation(image_array)
    tf.keras.utils.save_img(f'{training_fibrosis_path}/{new_filename}', augmented_image.numpy())
    # pil_image=Image.fromarray(augmented_image.numpy())
    # pil_image.show()
    # break
  # break
# show_two(img,augmented_image)
# plt.imshow(augmented_image)

00000945_000-1.png
00000945_000-2.png
00000945_000-3.png
00000945_000-4.png
00000324_007-1.png
00000324_007-2.png
00000324_007-3.png
00000324_007-4.png
00000997_003-1.png
00000997_003-2.png
00000997_003-3.png
00000997_003-4.png
00004893_029-1.png
00004893_029-2.png
00004893_029-3.png
00004893_029-4.png
00017864_000-1.png
00017864_000-2.png
00017864_000-3.png
00017864_000-4.png
00014911_005-1.png
00014911_005-2.png
00014911_005-3.png
00014911_005-4.png
00016434_006-1.png
00016434_006-2.png
00016434_006-3.png
00016434_006-4.png
00021738_000-1.png
00021738_000-2.png
00021738_000-3.png
00021738_000-4.png
00010206_000-1.png
00010206_000-2.png
00010206_000-3.png
00010206_000-4.png
00018362_020-1.png
00018362_020-2.png
00018362_020-3.png
00018362_020-4.png
00004533_001-1.png
00004533_001-2.png
00004533_001-3.png
00004533_001-4.png
00014584_005-1.png
00014584_005-2.png
00014584_005-3.png
00014584_005-4.png
00012316_000-1.png
00012316_000-2.png
00012316_000-3.png
00012316_000-4.png
00001315_000