In [5]:
from pathlib import Path
import glob
import pandas as pd
import cv2 as cv
import numpy as np
import random
from sklearn.svm import SVC

# Step 1/   Fetching Images

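>We read the images and fill a DataFrame with every image path and its label: No Pneumonia (0) or Pneumonia (1). Viral and bacterial pneumonia are both stored in the PNEUMONIA folder and share label 1 here; they are not distinguished at this step.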

In [2]:
# Build a DataFrame with one row per training image: the image path and a
# binary label (0 for the NORMAL folder, 1 for the PNEUMONIA folder).
root_train_images_dir = Path('../dataset/train')
subdirectories = ['NORMAL', 'PNEUMONIA']

train_data = []
for sub in subdirectories:
    images_dir = root_train_images_dir / sub
    label = 1 if sub == 'PNEUMONIA' else 0
    # Path.glob yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the row order reproducible across machines and runs.
    for img in sorted(images_dir.glob('*.jpeg')):
        train_data.append((img, label))
train_data = pd.DataFrame(train_data, columns=['image', 'label'])
print("Training data:\n{}".format(train_data))


Training data:
                                                 image  label
0               dataset\train\NORMAL\IM-0115-0001.jpeg      0
1               dataset\train\NORMAL\IM-0117-0001.jpeg      0
2               dataset\train\NORMAL\IM-0119-0001.jpeg      0
3               dataset\train\NORMAL\IM-0122-0001.jpeg      0
4               dataset\train\NORMAL\IM-0125-0001.jpeg      0
...                                                ...    ...
5211   dataset\train\PNEUMONIA\person99_virus_183.jpeg      1
5212  dataset\train\PNEUMONIA\person9_bacteria_38.jpeg      1
5213  dataset\train\PNEUMONIA\person9_bacteria_39.jpeg      1
5214  dataset\train\PNEUMONIA\person9_bacteria_40.jpeg      1
5215  dataset\train\PNEUMONIA\person9_bacteria_41.jpeg      1

[5216 rows x 2 columns]


# Step 2/    Image conversion and data augmentation
>We read every image listed in our DataFrame and resize it to 64x64. We then create modified copies of each image, using transformations such as histogram equalization, rotation, blurring, etc. This technique is called data augmentation.

In [9]:
# Names of the dataset variants built below; each one gets its own list of
# images and its own list of labels.
levels = ['base', 'low', 'medium', 'high']
# One independent, initially empty accumulator list per level.
train_features = {level: [] for level in levels}
train_labels = {level: [] for level in levels}

def histogram_equalization(img, label):
    """Add a histogram-equalized copy of ``img`` to every non-base level.

    The image is converted from BGR to YUV, channel 0 (Y) is equalized with
    ``cv.equalizeHist``, and the result is converted back to BGR. A copy of
    that result is appended to ``train_features`` and ``label`` to
    ``train_labels`` for each level other than 'base'.

    Args:
        img: BGR image array accepted by ``cv.cvtColor``.
        label: Label stored alongside each equalized copy.

    Returns:
        None. The module-level accumulators are updated in place.
    """
    yuv = cv.cvtColor(img, cv.COLOR_BGR2YUV)
    yuv[:, :, 0] = cv.equalizeHist(yuv[:, :, 0])
    equalized = cv.cvtColor(yuv, cv.COLOR_YUV2BGR)
    augmented_levels = [name for name in levels if name != 'base']
    for name in augmented_levels:
        # np.array(...) makes a fresh copy, so each level owns its own array.
        train_features[name].append(np.array(equalized))
        train_labels[name].append(label)

def rotation(img, label):
    """Append randomly rotated copies of ``img`` to the augmented levels.

    Two distinct random integer angles in [-180, 180] degrees are drawn. The
    image rotated by the first angle is appended to every level except
    'base'; the image rotated by the second angle is appended to every level
    except 'base' and 'low'. Both rotations are taken about the image centre
    with a 0.7 scale factor, and the area left uncovered by the rotation is
    filled with a constant colour.

    Args:
        img: Image array; ``shape[0]`` is the row count and ``shape[1]`` the
            column count.
        label: Label stored alongside each rotated copy in ``train_labels``.

    Returns:
        None. ``train_features`` and ``train_labels`` are updated in place.
    """
    rows, cols = img.shape[0], img.shape[1]
    center = (cols / 2, rows / 2)
    # warpAffine expects dsize as (width, height), i.e. (cols, rows). The
    # previous (rows, cols) ordering was only correct for square images.
    dsize = (cols, rows)

    rotate_low = random.randint(-180, 180)
    # Re-draw until the second angle differs from the first, so the two
    # rotated copies are never identical.
    rotate_medium = rotate_low
    while rotate_medium == rotate_low:
        rotate_medium = random.randint(-180, 180)

    # (angle, levels that do NOT receive the image rotated by that angle)
    plan = (
        (rotate_low, ('base',)),
        (rotate_medium, ('base', 'low')),
    )
    for angle, skipped in plan:
        matrice = cv.getRotationMatrix2D(center, angle, .7)
        result = cv.warpAffine(img, matrice, dsize, borderMode=cv.BORDER_CONSTANT, borderValue=(144, 159, 162))
        for l in levels:
            if l not in skipped:
                train_features[l].append(np.array(result))
                train_labels[l].append(label)

# Load every training image, resize it to 64x64, store the resized original in
# every level, then add the augmented copies (histogram equalization and
# random rotations) to the non-base levels.
for data in train_data.values:
    image_path = str(data[0])
    img = cv.imread(image_path)
    # cv.imread does not raise on a missing or corrupt file; it returns None,
    # which would otherwise fail inside cv.resize with an opaque error.
    if img is None:
        raise IOError("Could not read image: {}".format(image_path))
    img = cv.resize(img, (64, 64))
    for level in levels:
        train_features[level].append(np.array(img))
        train_labels[level].append(data[1])
    histogram_equalization(img, data[1])
    rotation(img, data[1])

# Summary: number of images and labels collected for each level.
for level in levels:
    print("{}:".format(level.capitalize()))
    print("Features: {}".format(len(train_features[level])))
    print("Labels: {}\n".format(len(train_labels[level])))

#   train_features holds the images to feed to the model, as numpy arrays, and train_labels holds the label corresponding to each image.

Base:
Features: 5216
Labels: 5216

Low:
Features: 15648
Labels: 15648

Medium:
Features: 20864
Labels: 20864

High:
Features: 20864
Labels: 20864

