In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math
import time
import cv2
from sklearn import metrics
import gc

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models


from sklearn.model_selection import train_test_split

import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
# Kaggle input locations of the train / test image folders.
folder_train = "/kaggle/input/histopathologic-cancer-detection/train/"
folder_test = "/kaggle/input/histopathologic-cancer-detection/test/"

# Label table for the training images and the sample submission file;
# the "id" column of the latter is used as the list of test image names.
label_df = pd.read_csv("/kaggle/input/histopathologic-cancer-detection/train_labels.csv")
sample_df = pd.read_csv("/kaggle/input/histopathologic-cancer-detection/sample_submission.csv")

# NumPy arrays of the columns consumed by the plotting helper below.
train_name, train_label = (label_df[column].to_numpy() for column in ("id", "label"))
test_name = sample_df["id"].to_numpy()

def plot_photo(N, nc = 4, pos = True):
    """Show N randomly chosen training images of one class in a grid.

    Parameters
    ----------
    N : int
        Number of images to draw; they are sampled without replacement from
        the training images of the requested class.
    nc : int, default 4
        Number of grid columns; the number of rows is ceil(N / nc).
    pos : bool, default True
        If True sample images labelled 1, otherwise images labelled 0.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read one of the selected image files.

    Notes
    -----
    Reads the module-level ``train_name``, ``train_label`` and
    ``folder_train``. The global NumPy RNG is re-seeded with 1 on every call,
    so repeated calls with the same arguments show the same images.
    """
    nr = math.ceil(N / nc)
    # squeeze=False keeps `ax` two-dimensional even when the grid has a single
    # row or column, so ax[i, j] below is always valid.
    fig, ax = plt.subplots(nr, nc, figsize = (15, nr*3.2), squeeze = False)

    np.random.seed(1)

    wanted_label = 1 if pos else 0
    candidates = train_name[train_label == wanted_label]
    names = np.random.choice(candidates, N, replace = False)

    for k, name in enumerate(names):
        i, j = divmod(k, nc)
        fname = folder_train + name + ".tif"
        img = cv2.imread(fname)
        if img is None:
            # cv2.imread reports an unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: {fname}")
        # OpenCV loads channels in BGR order; matplotlib expects RGB.
        ax[i, j].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # Hide the unused panels of the last row when N is not a multiple of nc.
    for k in range(N, nr * nc):
        i, j = divmod(k, nc)
        ax[i, j].axis("off")

In [None]:
# Load the training labels into a DataFrame named 'train' (every column read as str).
# Print the shape of the resulting DataFrame.

train = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv', dtype=str)

# The CSV ids carry no file extension (the plotting helper above appends ".tif"
# itself), but the cells below use the 'id' column directly as a file name,
# both when reading images and as x_col of flow_from_dataframe.
# Re-running this cell is safe because `train` is re-read from disk first.
train['id'] = train['id'] + '.tif'

print('Training Set Size:', train.shape)

train.head()

In [None]:
# Share of each label in the training set, shown as a single-row table.

train['label'].value_counts().div(len(train)).to_frame().sort_index().T

In [None]:
# Sample images of the original dataset: 16 random training images in a 4x4 grid,
# each captioned with its class label.

train_path = "../input/histopathologic-cancer-detection/train"
print('Training Images:', len(os.listdir(train_path)))

sample = train.sample(n=16).reset_index()

plt.figure(figsize=(8,8))

for i, row in sample.iterrows():

    # Image files are stored as "<id>.tif"; add the extension only if the id
    # does not already carry it.
    fname = row.id if row.id.endswith('.tif') else row.id + '.tif'
    # plt.imread is used because no `mpimg` alias is imported in this notebook.
    img = plt.imread(os.path.join(train_path, fname))
    label = row.label

    plt.subplot(4,4,i+1)
    plt.imshow(img)
    plt.text(0, -5, f'Class {label}', color='k')

    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Example of images for Class = 1 (first 16 rows with label '1')
img_names = train.loc[train['label'] == '1', 'id'].head(16)

plt.figure(figsize=[8,8])
for i, img_name in enumerate(img_names, start=1):
    # Image files are stored as "<id>.tif"; add the extension only if missing.
    fname = img_name if img_name.endswith('.tif') else img_name + '.tif'
    # plt.imread already yields RGB channel order, which is what imshow
    # expects, so no channel reordering is applied (that is only needed for
    # arrays loaded with OpenCV).
    img = plt.imread("../input/histopathologic-cancer-detection/train/%s" % fname)
    plt.subplot(4, 4, i)
    plt.imshow(img)
    plt.text(0, -5, 'Class 1', color='k')
    plt.axis('off')
plt.show()

In [None]:
# Example of images for Class = 0 (first 16 rows with label '0')
img_names0 = train.loc[train['label'] == "0", 'id'].head(16)

plt.figure(figsize=[8,8])
for i, img_name0 in enumerate(img_names0, start=1):
    # Image files are stored as "<id>.tif"; add the extension only if missing.
    fname0 = img_name0 if img_name0.endswith('.tif') else img_name0 + '.tif'
    # plt.imread already yields RGB channel order, which is what imshow
    # expects, so no channel reordering is applied (that is only needed for
    # arrays loaded with OpenCV).
    img = plt.imread("../input/histopathologic-cancer-detection/train/%s" % fname0)
    plt.subplot(4, 4, i)
    plt.imshow(img)
    plt.text(0, -5, 'Class 0', color='k')
    plt.axis('off')
plt.show()

In [None]:
# Data generators, built from the original dataset.
RANDOM_SEED = 1982

# flow_from_dataframe looks up every x_col value as a file name inside
# `directory`, and the images are stored as "<id>.tif". Make sure each id
# carries the extension (ids that already end in ".tif" are left untouched).
train_files = train.assign(
    id=lambda frame: frame['id'].where(frame['id'].str.endswith('.tif'), frame['id'] + '.tif')
)

# Stratified 80/20 split so both parts keep the label proportions.
train_df, valid_df = train_test_split(
    train_files, test_size=0.2, random_state=RANDOM_SEED, stratify=train_files.label
)

print(train_df.shape)
print(valid_df.shape)

# Image data generators for the training set and the validation set.
# Both scale the pixel values by a factor of 1/255; only the training
# generator applies random augmentation.
# NOTE(review): Keras documents `rescale` as applied after all other
# transformations, so channel_shift_range=0.1 presumably acts on 0-255 pixel
# values and is close to a no-op — confirm whether a larger value was intended.

train_datagen = ImageDataGenerator(
    rescale=1./255,
    vertical_flip = True,
    horizontal_flip = True,
    rotation_range=90,
    zoom_range=0.2, 
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    channel_shift_range=0.1,
    fill_mode='nearest')

valid_datagen = ImageDataGenerator(rescale=1/255)

# Data loaders: batches of 96x96 images with one-hot ('categorical') labels.

BATCH_SIZE = 64

train_loader = train_datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = train_path,
    x_col = 'id',
    y_col = 'label',
    batch_size = BATCH_SIZE,
    seed = 1,
    shuffle = True,
    class_mode = 'categorical',
    target_size = (96,96)
)

# NOTE(review): the validation loader is shuffled as well; if predictions are
# later compared against valid_loader.classes, shuffle=False would be needed
# to keep them aligned — confirm against the evaluation cells.
valid_loader = valid_datagen.flow_from_dataframe(
    dataframe = valid_df,
    directory = train_path,
    x_col = 'id',
    y_col = 'label',
    batch_size = BATCH_SIZE,
    seed = 1,
    shuffle = True,
    class_mode = 'categorical',
    target_size = (96,96)
)

#Look at some augmented images
def plotImages(images_arr):
    """Draw the given images on a fixed 3x5 grid of axes and show the figure.

    Images are paired with the grid panels in order and pairing stops at
    whichever runs out first, so at most 15 images are drawn; panels without
    an image are left as empty axes.
    """
    _, grid = plt.subplots(3, 5, figsize=(10,10))
    panels = grid.flatten()
    for picture, panel in zip(images_arr, panels):
        panel.imshow(picture)
    plt.tight_layout()
    plt.show()
    
    
# Request batch 0 from the training loader 15 times and keep the first image
# of each returned batch.
# NOTE(review): presumably every access re-applies the random augmentation, so
# this previews 15 variants of one training image — confirm.
augmented_images = [
    batch_images[0]
    for batch_images, _batch_labels in (train_loader[0] for _ in range(15))
]
plotImages(augmented_images)