### Breast Histopathology Images 

In [None]:
! pip install kaggle 

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files stored there
drive.mount('/content/drive')

In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy kaggle.json (presumably the Kaggle API token) from the mounted Drive into ~/.kaggle
! cp /content/drive/MyDrive/Stat_Docs/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Change permission: 600 = read/write for the owner only, no access for group or others
! chmod 600 ~/.kaggle/kaggle.json

### Download Histopathology Dataset

Link To Breast Histopathology Dataset [Here](https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images)

In [None]:
# unzip content
! unzip /content/breast-histopathology-images.zip -d /content/breast-histopathology-images

In [None]:
# check disk usage: df -h reports used/free space per filesystem in human-readable units
! df -h

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from skimage.io import imread
import cv2
from sklearn.utils import resample 

In [None]:
# Collect every PNG under the extracted dataset folder.
# glob returns files in arbitrary filesystem order; sorting makes the list
# (and everything derived from it) identical from run to run.
breast_imgs = sorted(
    glob.glob('/content/breast-histopathology-images/IDC_regular_ps50_idx5/**/*.png', recursive=True)
)
# Fail with a clear message instead of an IndexError on the next line
assert breast_imgs, 'No PNG files found - check that the dataset was downloaded and unzipped'
breast_imgs[0]

In [None]:
# Plot the first images of the list on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
# plt.subplots(3, 3) returns a 3x3 array of Axes, so axes[i] is a whole row;
# axes.flat walks the individual Axes, and zip stops after the 9 panels are filled.
for ax, img_path in zip(axes.flat, breast_imgs):
    img = cv2.imread(img_path)
    # OpenCV loads channels as BGR while matplotlib expects RGB
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

In [None]:
# Build a dataframe with one row per image: its path and its class label.
# Every path ends in ".png", so index -5 is the character right before the
# extension; that digit is taken as the label.
# The frame is built in one constructor call instead of filling it row by row
# with .loc, which is slow for a large file list.
df = pd.DataFrame({
    'path': breast_imgs,
    'target': [int(img_path[-5]) for img_path in breast_imgs],
})

In [None]:
# Display the path/target dataframe
df

In [None]:
# Split the frame by label: rows with target 1 go to pos, rows with target 0 to neg
pos, neg = (df.loc[df['target'].eq(label)] for label in (1, 0))

In [None]:
# A bare expression is only displayed when it is the last line of a cell,
# so both shapes are returned together as (pos shape, neg shape).
pos.shape, neg.shape

In [None]:
# Show the first 40 cancer images on a 4x10 grid
fig, axes = plt.subplots(4, 10, figsize=(20, 10))
fig.suptitle('Cancer', fontsize=20)
# axes.flat walks the grid row by row, so panel k sits at row k // 10, column k % 10
for k, ax in enumerate(axes.flat):
    ax.imshow(imread(pos.iloc[k]["path"]))
# leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# Show the first 40 non-cancer images on a 4x10 grid
fig, axes = plt.subplots(4, 10, figsize=(20, 10))
fig.suptitle('No Cancer', fontsize=20)
# axes.flat walks the grid row by row, so panel k sits at row k // 10, column k % 10
for k, ax in enumerate(axes.flat):
    ax.imshow(imread(neg.iloc[k]["path"]))
# leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# Class distribution: how many images carry each label
target_counts = df['target'].value_counts()  # computed once, reused for x and y
plt.figure(figsize=(10, 5))
ax = sns.barplot(x=target_counts.index, y=target_counts, palette='Set1')
plt.xlabel('State of Cancer')
# each dataframe row is one image file, so the bars count images, not patients
plt.ylabel('Number of Images')
plt.title('Distribution of Cancer States')
plt.legend(handles=ax.patches[:2], labels=['0 = No Cancer', '1 = Cancer'], loc='upper right')
plt.show()

In [None]:
# Oversample the positive class by drawing rows with replacement.
# random_state pins the draw so a fresh run selects the same rows.
# NOTE(review): n_samples is hard-coded; the classes only end up equal-sized
#   if neg has 135000 rows - confirm against neg.shape.
# NOTE(review): the duplicates are created before the train/test split, so
#   copies of one image can land in both sets - confirm this is acceptable.
pos_balanced = resample(pos, replace=True, n_samples=135000, random_state=42)

In [None]:
# Stack the negatives on top of the resampled positives.
# ignore_index=True renumbers the rows 0..n-1; otherwise the labels inherited
# from df would repeat, because resampling with replacement duplicates rows.
df_balanced = pd.concat([neg, pos_balanced], ignore_index=True)

# Illustration of the result:
#
# neg DataFrame
#    path                target
# 0  /path/img1_0.jpg    0
# 1  /path/img2_0.jpg    0
#
# pos_balanced DataFrame
#    path                target
# 0  /path/img3_1.jpg    1
# 1  /path/img4_1.jpg    1
#
# df_balanced (result)
#    path                target
# 0  /path/img1_0.jpg    0
# 1  /path/img2_0.jpg    0
# 2  /path/img3_1.jpg    1
# 3  /path/img4_1.jpg    1

In [None]:
df_balanced.to_csv('balanced_dataframe.csv', index=False)

# index=False leaves the row index out of the file.
#
# Without index=False (the first, unnamed column is the index):
# ,path,target
# 0,/path/img1_0.jpg,0
# 1,/path/img2_0.jpg,0
# 2,/path/img3_1.jpg,1
#
# With index=False:
# path,target
# /path/img1_0.jpg,0
# /path/img2_0.jpg,0
# /path/img3_1.jpg,1


In [None]:
# print balanced dataframe (negatives concatenated with the resampled positives)
df_balanced

In [None]:
# Shuffle the rows with a fixed seed so a fresh run yields the same order,
# then take them as an array whose rows are (path, target) pairs.
values = df_balanced.sample(frac=1, random_state=42).to_numpy()

In [None]:
# Load every image as a 50x50 grayscale array (X) together with its label (y)
X = []
y = []
# each row of values is (image path, label)
for feature, label in values:
    image = cv2.imread(feature, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when a file cannot be read;
    # stop here with the offending path rather than failing inside cv2.resize
    if image is None:
        raise FileNotFoundError(f'Could not read image: {feature}')
    # force a uniform 50x50 size so the list stacks into one array
    image = cv2.resize(image, (50, 50), interpolation=cv2.INTER_LINEAR)
    X.append(image)
    y.append(label)
X = np.array(X)
y = np.array(y)


In [None]:
# split data into train (80%) and test (20%)
#   random_state : same split on every run
#   stratify=y   : keep the class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# check data
print(f'X Training Data: {len(X_train)}')
print(f'X Testing Data: {len(X_test)}')

In [None]:
# import tensorflow and sklearn modules 
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix
import itertools
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the labels for the 2-unit softmax output.
# The ndim guards keep the cell re-runnable: encoding an array that is already
# one-hot would add a third axis instead of leaving it unchanged.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=2)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=2)

In [None]:
# early stopping: end training once val_loss has not improved for 10 epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model keeps the weights of the last epoch that ran.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, mode='min', verbose=1, restore_best_weights=True
)

# reduce learning rate on plateau: halve it after 3 epochs without val_loss
# improvement, but never go below 0.0001
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=3, verbose=1, factor=0.5, min_lr=0.0001)

In [None]:
# build model: three Conv -> BatchNorm -> MaxPool stages (32, 64, 128 filters)
# followed by a dense head with dropout and a 2-way softmax output
model = Sequential([
    # feature extractor
    Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(50, 50, 1)),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    # classifier head
    Flatten(),
    Dense(512, activation='relu', kernel_initializer='he_uniform'),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_initializer='he_uniform'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as the reported metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Print the layer-by-layer overview
model.summary()

In [None]:
# Train for up to 30 epochs in batches of 64; the callbacks may stop the run
# early or lower the learning rate.
# NOTE(review): the held-out test split doubles as validation data here, so it
#   also steers both callbacks - confirm this is intended.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=30,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, lr_reduction],
)

In [None]:
# Predict class probabilities for the test images
y_pred = model.predict(X_test)
# y_test is one-hot at this point; collapse it to class indices.
# The ndim guard lets the cell be re-run: a second argmax over axis 1 of the
# already 1-D array would raise an AxisError.
if y_test.ndim == 2:
    y_test = np.argmax(y_test, axis=1)
# most probable class per image
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes))

In [None]:
# Confusion matrix of true vs. predicted class indices, drawn as a heatmap
confusion_mtx = confusion_matrix(y_test, y_pred_classes)
f, ax = plt.subplots(figsize=(8, 8))
# fmt='d': the cells are integer counts, so print them without a decimal part
sns.heatmap(confusion_mtx, annot=True, linewidths=0.01, cmap="BuPu", linecolor="gray", fmt='d', ax=ax)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Accuracy per epoch: training curve first, then the validation curve
acc_fig, acc_ax = plt.subplots(figsize=(20, 8))
for series_key in ('accuracy', 'val_accuracy'):
    acc_ax.plot(history.history[series_key])
acc_ax.set_title('Model Accuracy')
acc_ax.set_ylabel('accuracy')
acc_ax.set_xlabel('epoch')
# legend entries follow the plotting order above
acc_ax.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:
# Loss per epoch: training curve first, then the validation curve
loss_fig, loss_ax = plt.subplots(figsize=(20, 8))
for series_key in ('loss', 'val_loss'):
    loss_ax.plot(history.history[series_key])
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
# legend entries follow the plotting order above
loss_ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from tensorflow.keras.models import save_model
# Write the trained model to breast_cancer_model.h5 in the working directory
save_model(model, 'breast_cancer_model.h5')