In [4]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import itertools
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
import random
import cv2
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Activation,Dropout, Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam,  Adagrad
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2


In [None]:
def append_labeled_image(_dir, label):
    """
    Pair every image of one class with that class's label.

    Args:
        _dir (iterable) : Images (or image paths) belonging to a single case;
            consumed exactly once, so a generator is fine.
        label (str) : Label shared by every image in ``_dir``.

    Returns:
        case_list (list) : The items of ``_dir``, in iteration order.
        label_list (list) : ``label`` repeated once per item of ``case_list``.
    """
    # Materialize first so one-shot iterables can still be counted below.
    case_list = list(_dir)
    label_list = [label] * len(case_list)
    return case_list, label_list

In [5]:
def load_data(path, data):
    """
    Load the data in a data frame.

    Collects every ``*.png`` file located directly inside the four class
    sub-directories of ``path``, tags each file with its class label and
    returns the rows in shuffled order.

    Args:
        path (str or pathlib.Path) : Path to data directory.
        data (str) : What type of data it is. ``"test"`` selects the short
            class directory names; any other value selects the
            train / validation directory names.

    Returns:
        df (dataframe) : Pandas dataframe with 2 columns, "images" holding the
        ``pathlib.Path`` of each image and "label" holding its class name.
        Rows are shuffled and the index is reset to 0..n-1.

    Raises:
        ValueError : If no png image is found in any class directory
            (e.g. ``path`` is wrong or ``data`` does not match the layout).
    """
    # The docstring promises plain strings work; the "/" joins need a Path.
    path = Path(path)

    # (sub-directory name, label) for each class, in a fixed order.
    if data == "test":
        class_dirs = [
            ("squamous.cell.carcinoma", "Squamous cell carcinoma"),
            ("normal", "Normal"),
            ("large.cell.carcinoma", "Large cell carcinoma"),
            ("adenocarcinoma", "Adenocarcinoma"),
        ]
    else:
        # train / validation data
        class_dirs = [
            ("squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa", "Squamous cell carcinoma"),
            ("normal", "Normal"),
            ("large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa", "Large cell carcinoma"),
            ("adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib", "Adenocarcinoma"),
        ]

    # Gather image paths class by class, keeping labels aligned with images.
    images = []
    labels = []
    for dir_name, label in class_dirs:
        class_images = list((path / dir_name).glob("*.png"))
        images.extend(class_images)
        labels.extend([label] * len(class_images))

    # Fail with an explicit message instead of a pandas "Length mismatch" error.
    if not images:
        raise ValueError(f"No .png images found under {path} for data={data!r}")

    # Named columns keep the frame well-formed regardless of row count.
    df = pd.DataFrame({"images": images, "label": labels})

    # Shuffle the rows so classes are interleaved, then renumber the index.
    return df.sample(frac=1).reset_index(drop=True)

In [7]:
def preprocess_image(image_path, img_size=(224, 224)):
    """
    Read one image from disk, resize it, convert it to RGB and scale it into range [0, 1].

    Args:
        image_path (pathlib.Path or str) : path of the image; converted to
            string before being handed to OpenCV.
        img_size (tuple) : target size passed to ``cv2.resize`` as its
            ``dsize`` argument, i.e. (width, height).

    Returns:
        img (ndarray) : a single floating point image with values in [0, 1]
        and channels in RGB order. With OpenCV's default color loading the
        shape is (height, width, 3).

    Raises:
        ValueError : If OpenCV cannot read the file (missing path or not a
            decodable image).
    """
    img = cv2.imread(str(image_path))  # Load image (BGR format)

    # cv2.imread signals failure by returning None rather than raising;
    # without this check the failure surfaces as a cryptic cv2.resize error.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    img = cv2.resize(img, img_size)  # Resize to model input size
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    img = img / 255.0   # Normalize to [0, 1]
    return img

def process_dataframe(df, label_encoder, fit_encoder=False):
    """
    Turn a dataframe of image paths and labels into model-ready arrays.

    Every entry of the "images" column is run through ``preprocess_image`` and
    the results are stacked along a new first axis. The "label" column is
    integer encoded with ``label_encoder``.

    Args:
        df (DataFrame) : Pandas dataframe with an "images" and a "label" column.
        label_encoder : LabelEncoder object.
        fit_encoder (boolean) : When True the encoder is fitted on the labels
            before transforming them (``fit_transform``); when False an
            already fitted encoder is only applied (``transform``).

    Returns:
        X (ndarray) : Preprocessed images.
        y (ndarray) : Integer encoded labels.
    """
    # Images are processed before the labels are touched.
    processed = [preprocess_image(image_path) for image_path in df["images"]]
    X = np.stack(processed)

    # Pick the encoder method once, then apply it to the label column.
    encode = label_encoder.fit_transform if fit_encoder else label_encoder.transform
    y = encode(df["label"])

    return X, y