<a href="https://colab.research.google.com/github/jincy-p-janardhanan/SPP-Pneumonia-Net/blob/ml/Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminaries


In [None]:
# Mount Google Drive at /content/drive; every path used in this notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import math
import os
import shutil
import time
from random import shuffle

import cv2
import h5py
import numpy as np

In [None]:
# Running totals of the images copied into each class folder so far
normal, bacterial, viral = 0, 0, 0

# Create and organize dataset

## Dataset: [COVID-19 Radiography Database](https://www.kaggle.com/tawsifurrahman/covid19-radiography-database)

### Download dataset

In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/radiography/"

In [None]:
% cd /content/drive/MyDrive/radiography/
% pwd

In [None]:
! kaggle datasets download -d tawsifurrahman/covid19-radiography-database

In [None]:
!unzip \*.zip && rm *.zip

### Copy Images

Copy normal images to main_project/Normal

In [None]:
normal_folder = '/content/drive/MyDrive/radiography/COVID-19_Radiography_Dataset/Normal'

In [None]:
# change working directory before copying files
% cd '/content/drive/MyDrive/radiography/COVID-19_Radiography_Dataset/Normal'
! pwd

In [None]:
files = os.listdir(normal_folder)
for f in files:
  if 'Normal' in f and normal < 2780:
    normal += 1
    shutil.copy(normal_folder+f,'/content/drive/MyDrive/main_project/Normal')
print("normal: ", normal, "\t bacterial: ", bacterial, "\t viral: ", viral)

Copy viral pneumonia images to main_project/Viral

In [None]:
viral_folder = '/content/drive/MyDrive/radiography/COVID-19_Radiography_Dataset/Viral Pneumonia'

In [None]:
% cd '/content/drive/MyDrive/radiography/COVID-19_Radiography_Dataset/Viral Pneumonia'
! pwd

In [None]:
files = os.listdir(viral_folder)
for f in files:
  if 'Viral' in f and viral < 2780:
    viral += 1
    shutil.copy(pneumonia_folder+f,'/content/drive/MyDrive/main_project/Viral')

## Dataset: [Chest X-ray Images Pneumonia](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia)

### Download dataset

In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/paultimothymooney/"

In [None]:
% cd /content/drive/MyDrive/paultimothymooney/
% pwd

In [None]:
! kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

In [None]:
!unzip \*.zip && rm *.zip

### Copy Images

Copy bacterial and viral pneumonia images from train folder to main_project/Bacterial and main_project/Viral

In [None]:
pneumonia_folder = '/content/drive/MyDrive/paultimothymooney/chest_xray/train/PNEUMONIA/'

In [None]:
% cd /content/drive/MyDrive/paultimothymooney/chest_xray/train/PNEUMONIA/
! pwd

In [None]:
for f in files:
  if 'bacteria' in f and bacterial < 2780:
    bacterial += 1
    shutil.copy(pneumonia_folder+f,'/content/drive/MyDrive/main_project/Bacterial')
  elif 'virus' in f and viral < 2780:
    viral += 1
    shutil.copy(pneumonia_folder+f,'/content/drive/MyDrive/main_project/Viral')
print("normal: ", normal, "\t bacterial: ", bacterial, "\t viral: ", viral)

Copy the 8 bacterial pneumonia images in val folder to main_project/Bacterial

In [None]:
pneumonia_folder = '/content/drive/MyDrive/paultimothymooney/chest_xray/val/PNEUMONIA/'

In [None]:
% cd /content/drive/MyDrive/paultimothymooney/chest_xray/val/PNEUMONIA/
! pwd

In [None]:
for f in files:
  if 'bacteria' in f and bacterial < 2780:
    bacterial += 1
    shutil.copy(pneumonia_folder+f,'/content/drive/MyDrive/main_project/Bacterial')
print("normal: ", normal, "\t bacterial: ", bacterial, "\t viral: ", viral)

# Convert dataset to h5 file

## Utility functions

- Convert images to numpy array and save in h5 file
- For faster training <br>
[Github Reference](https://github.com/selvam85/Cat-Dog-Classifier/blob/master/DNN_using_plain_TF_Cat_vs_Dog_classifier_Kaggle_dataset/Convert%20Images%20to%20Numpy%20array%20and%20save%20in%20h5%20fomat%20v2.1.ipynb) <br>

Other references: 
[1](https://medium.datadriveninvestor.com/speed-up-your-image-training-on-google-colab-dc95ea1491cf), 
[2](https://medium.com/@selvam85/how-to-work-with-large-training-dataset-in-google-colab-platform-c3499fc10c24)

### Normalize and write data to h5 file

In [None]:
def normalize_and_write_data_into_h5_file(dest_filepath, filepaths_list, n_px, n_channels = 3):
    
    '''
        This function converts images to numpy arrays and writes the array data into a h5 file.

        Each image is resized to n_px x n_px, converted to the requested number of channels,
        scaled by 1/255 and flattened into one row of the float32 dataset "input_data".
        
        dest_filepath - the name of the file with full path that is being created
        filepaths_list - source image file paths which are being converted to numpy arrays
        n_px - number of pixels - will be used as image's height and width
        n_channels - 1 for grayscale, 3 for rgb

        Raises ValueError if n_channels is neither 1 nor 3.
        Raises OSError if an image file cannot be read.
    '''
    
    # The stored row width depends on n_channels, so the colour conversion has to agree with it
    if n_channels == 1:
        color_conversion = cv2.COLOR_BGR2GRAY
    elif n_channels == 3:
        color_conversion = cv2.COLOR_BGR2RGB
    else:
        raise ValueError('n_channels must be 1 (grayscale) or 3 (rgb), got {}'.format(n_channels))

    data_shape = (len(filepaths_list), n_px * n_px * n_channels)
    dataset_name = "input_data"

    # Append mode, because the labels are written into the same file by a separate call
    with h5py.File(dest_filepath, 'a') as f:
        
        # Replace a dataset left over from an earlier run instead of failing on re-creation
        if dataset_name in f:
            del f[dataset_name]
        dataset = f.create_dataset(dataset_name, data_shape, np.float32)
        
        for i, filepath in enumerate(filepaths_list):
            img = cv2.imread(filepath)
            if img is None:
                # cv2.imread signals an unreadable file by returning None rather than raising
                raise OSError('Could not read image file: {}'.format(filepath))

            img = cv2.resize(img, (n_px, n_px), interpolation=cv2.INTER_CUBIC)
            img = cv2.cvtColor(img, color_conversion)
            
            #Normalize the image - convert each pixel value to a value between 0 and 1
            img = img / 255
            
            #Flatten the image and store it as row i of the dataset
            dataset[i, ...] = img.ravel()

### Write labels

Write corresponding labels for each image into the h5 file

In [None]:
def write_labels_into_h5_file(dest_filepath, labels):
    '''
        Write the labels into the dataset "input_labels" of a h5 file as int8 values.

        dest_filepath - the name of the h5 file with full path; opened in append mode
        labels - sequence of integer labels, one per image
    '''
    dataset_name = "input_labels"
    with h5py.File(dest_filepath, 'a') as f:
        # Replace a dataset left over from an earlier run instead of failing on re-creation
        if dataset_name in f:
            del f[dataset_name]
        f.create_dataset(dataset_name, data=np.asarray(labels, dtype=np.int8))

### Set labels

Numbers for labelling
- 0: Normal
- 1: Bacterial
- 2: Viral

In [None]:
def set_label(filepath):
  '''
      Return the numeric class label for an image path.

      1 if the path contains 'Bacterial', otherwise 2 if it contains 'Viral', otherwise 0.
  '''
  # Checked in order, so 'Bacterial' wins when both markers are present
  for marker, label in (('Bacterial', 1), ('Viral', 2)):
    if marker in filepath:
      return label
  return 0

### Combined function for converting images and writing labels

In [None]:
def convert_images_to_data_in_h5_file(src_img_filepath, dest_h5_file_path, n_px, n_channels = 3, batch_size = 1024):
    '''
        Convert every .jpeg / .png image below a directory into one or more h5 files.

        The images are labelled from their paths, shuffled, and split into batches of
        batch_size images; batch i is written to "<dest_h5_file_path><i>.h5" (numbered from 1)
        with its image data and its labels.

        src_img_filepath - root directory that is searched recursively for images
        dest_h5_file_path - destination path including the starting part of the file name
        n_px - number of pixels - will be used as image's height and width
        n_channels - number of channels stored per image
        batch_size - maximum number of images per h5 file

        Returns the number of h5 files written (0 when no image is found).
        Raises ValueError if batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))

    # List of filepaths ending with .jpeg or .png extension in the source directory and its sub-directories
    src_filepaths = [
                     os.path.join(dp, f) 
                     for dp, dn, filenames in os.walk(src_img_filepath) 
                     for f in filenames 
                     if os.path.splitext(f)[1] in ['.jpeg', '.png']
                     ]
    print('total no. of images = ', len(src_filepaths))

    # Create labels based upon the substring contained in the file path
    labels = [set_label(filepath) for filepath in src_filepaths]

    # Everything that is neither normal (0) nor bacterial (1) is counted as viral
    count_normal = labels.count(0)
    count_bacterial = labels.count(1)
    count_viral = len(labels) - count_normal - count_bacterial
    print('Normal:', count_normal, '\t Bacterial:', count_bacterial, '\t Viral:', count_viral)
    
    # Pair each filepath with its label so that both are shuffled together
    pairs = list(zip(src_filepaths, labels))
    shuffle(pairs)
    
    #Number of images
    m = len(pairs)
    # The last batch may hold fewer than batch_size images, hence the ceil
    n_batches = math.ceil(m / batch_size)

    print('No. of complete batches = ', n_batches)
    
    for i in range(n_batches):
        print('Creating file', (i+1))
        
        dest_file_path = dest_h5_file_path + str(i + 1) + ".h5"   
        
        # Slicing the pairs (rather than unzipping the whole list first) also works for an empty list
        batch = pairs[i * batch_size: (i + 1) * batch_size]
        src_filepaths_batch = [filepath for filepath, _ in batch]
        labels_batch = [label for _, label in batch]
        
        normalize_and_write_data_into_h5_file(dest_file_path, src_filepaths_batch, n_px, n_channels)
        write_labels_into_h5_file(dest_file_path, labels_batch)
    return n_batches

## Create h5 files

In [None]:
# Create dataseth5 folder if not already existing
% mkdir -p /content/drive/MyDrive/main_project/dataseth5

In [None]:
# root directory for image files
root_dir = '/content/drive/MyDrive/main_project/'

In [None]:
# image shape parameters
n_px = 128
n_channels = 1

Divide the dataset into 10 batches and convert each batch to an h5 file

In [None]:
# destination file path including filename (starting), for each batch
dest_filepath = '/content/drive/MyDrive/main_project/dataseth5/dataseth5_'

# divides the dataset (2780 images for each of the 3 classes) into 10 batches
batch_size = 2780 * 3 // 10

# create h5 files
# perf_counter() measures elapsed wall-clock time; process_time() would count
# CPU time only and leave out the time spent waiting on file I/O
tic = time.perf_counter()
n_complete_batches = convert_images_to_data_in_h5_file(root_dir, dest_filepath, n_px, n_channels, batch_size)
toc = time.perf_counter()
print('No. of complete batches = ', n_complete_batches, '\n Time taken for creating the h5 files is', (toc-tic)*1000, 'ms')

Convert complete dataset to h5 file

In [None]:
# destination file path including filename (starting), for complete dataset
dest_filepath = '/content/drive/MyDrive/main_project/dataseth5/complete_dataseth5_'

# parameters
n_px = 128
n_channels = 1
# one batch large enough to hold 2780 images for each of the 3 classes
batch_size = 2780 * 3

# create h5 file
# perf_counter() measures elapsed wall-clock time; process_time() would count
# CPU time only and leave out the time spent waiting on file I/O
tic = time.perf_counter()
n_complete_batches = convert_images_to_data_in_h5_file(root_dir, dest_filepath, n_px, n_channels, batch_size)
toc = time.perf_counter()
print('No. of complete batches = ', n_complete_batches, '\n Time taken for creating the h5 file is', (toc-tic)*1000, 'ms')