# **Preprocessing: convert images into NumPy arrays**

## Import

In [1]:
# Dependencies for the whole notebook, all loaded in this single cell.
# NOTE(review): matplotlib, tensorflow and the Keras imports are not referenced
# by any cell visible in this notebook -- confirm they are still needed.
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization

print("Libraries imported")

Libraries imported


## Dataset preparation and Preprocessing

In [9]:
# Target size handed to cv2.resize for every image.
# Change it to (224, 224) and (229, 229) and re-run to get all the arrays.
INPUT_SIZE = (128, 128)
# Class name as written in the split files -> integer label.
mapping = {'normal': 0, 'pneumonia': 1, 'COVID-19': 2}
train_filepath = 'train_split.txt'
test_filepath = 'test_split.txt'

# Load the train and test split files, one entry per line. The later cells
# whitespace-split each line and read field 1 as the image file name and
# field 2 as the class label. Context managers make sure both handles are
# closed as soon as the lines have been read.
with open(train_filepath, 'r') as split_file:
    trainfiles = split_file.readlines()
with open(test_filepath, 'r') as split_file:
    testfiles = split_file.readlines()

# [FROM HERE] exploratory only: nothing below feeds the preprocessing cells.
# It reports how many distinct patients each split holds and how those
# patients are spread over the three classes.
def summarize_patients(split_lines):
    """Count the distinct patients of one split and their per-class totals.

    Each line is whitespace-split; field 0 is used as the patient id and
    field 2 as the class label. A patient is counted once, under the label
    of the first line it appears on.

    Returns a tuple ``(unique_patient_count, counts)`` where ``counts`` maps
    a class label to the number of unique patients first seen with it.
    """
    seen_patients = set()  # O(1) membership test; a list makes the scan quadratic
    per_class = {}
    for line in tqdm(split_lines):
        fields = line.split()
        patient_id, label = fields[0], fields[2]
        if patient_id in seen_patients:
            continue
        seen_patients.add(patient_id)
        per_class[label] = per_class.get(label, 0) + 1
    return len(seen_patients), per_class


# Test split first, then train, so the progress bars keep their usual order.
test_patient_count, test_class_counts = summarize_patients(testfiles)
train_patient_count, train_class_counts = summarize_patients(trainfiles)

print('Total samples for train: ', len(trainfiles))
print('Total samples for test: ', len(testfiles))
print(train_patient_count, 'unique train patients')
print(test_patient_count, 'unique test patients')
# Per-class unique patients, printed as: normal pneumonia COVID-19
# (first line is the train split, second line is the test split).
print(*(train_class_counts.get(label, 0) for label in ('normal', 'pneumonia', 'COVID-19')))
print(*(test_class_counts.get(label, 0) for label in ('normal', 'pneumonia', 'COVID-19')))
# [TO HERE]

100%|████████████████████████████████████████| 1546/1546 [00:00<00:00, 81482.71it/s]
100%|██████████████████████████████████████| 15086/15086 [00:01<00:00, 12594.81it/s]

Total samples for train:  15086
Total samples for test:  1546
14900 unique train patients
1522 unique test patients
7966 5451 1483
885 591 46





In [6]:
# Read every test image, resize it to INPUT_SIZE and normalize it to 0 - 1.
# x_test collects the float32 images, y_test the matching integer labels
# (looked up in `mapping`), both in split-file order.
x_test = []
y_test = []

for line in tqdm(testfiles):
    fields = line.split()
    image_path = os.path.join('data', 'test', fields[1])
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread does not raise on a missing or unreadable file, it
        # returns None; stop here with the offending path instead of
        # failing later inside cv2.resize with an unrelated-looking error.
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    img = cv2.resize(img, INPUT_SIZE)
    x_test.append(img.astype('float32') / 255.0)
    y_test.append(mapping[fields[2]])

print('In x_test we have', len(x_test), 'test images and their shape is', x_test[0].shape)
print('In y_test we have', len(y_test), 'labels')

100%|████████████████████████████████████████████████████| 1546/1546 [00:15<00:00, 99.80it/s]

In x_test we have 1546 test images and their shape is (229, 229, 3)
In y_text we have 1546 labels





In [7]:
# Export to npy to load in for testing.
# The size suffix in the file name is read from the stacked images themselves
# (axis 1 of the (count, rows, cols, channels) array), so the name cannot
# drift out of sync with the data when INPUT_SIZE is changed.
x_test_array = np.asarray(x_test)
np.save('data/x_test_{}.npy'.format(x_test_array.shape[1]), x_test_array)
np.save('data/y_test.npy', y_test)

In [8]:
# Read every train image, resize it to INPUT_SIZE and normalize it to 0 - 1.
# x_train collects the float32 images, y_train the matching integer labels
# (looked up in `mapping`), both in split-file order.
x_train = []
y_train = []

for line in tqdm(trainfiles):
    fields = line.split()
    image_path = os.path.join('data', 'train', fields[1])
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread does not raise on a missing or unreadable file, it
        # returns None; stop here with the offending path instead of
        # failing later inside cv2.resize with an unrelated-looking error.
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    img = cv2.resize(img, INPUT_SIZE)
    x_train.append(img.astype('float32') / 255.0)
    y_train.append(mapping[fields[2]])

print('In x_train we have', len(x_train), 'train images and their shape is', x_train[0].shape)
print('In y_train we have', len(y_train), 'labels')

100%|████████████████████████████████████████████████████████████| 15086/15086 [02:38<00:00, 95.25it/s]

In x_train we have 15086 train images and their shape is (229, 229, 3)
In y_train we have 15086 labels





In [9]:
# Export to npy to load in for training.
# The size suffix in the file name is read from the stacked images themselves
# (axis 1 of the (count, rows, cols, channels) array), so the name cannot
# drift out of sync with the data when INPUT_SIZE is changed.
x_train_array = np.asarray(x_train)
np.save('data/x_train_{}.npy'.format(x_train_array.shape[1]), x_train_array)
np.save('data/y_train.npy', y_train)