<a href="https://colab.research.google.com/github/heesukjang/W207_AppliedML_Fall2022/blob/main/W207_Breast_Cancer_IDC_Classification_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FALL 2022<br>
**Heesuk Jang**

In [1]:
import pandas as pd
import numpy as np
import os
import random
import joblib
import glob

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib.patches as patches

from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import RandomFlip
from tensorflow.keras.layers import RandomZoom
from tensorflow.keras.layers import RandomRotation
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
import cv2 as cv
import skimage.io as skio

# Required to read the data from Kaggle
from google.colab import drive
drive.mount('/content/gdrive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/Kaggle"

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

Mounted at /content/gdrive


In [3]:
%pwd

'/content'

# Load data as a zip file directly from [Kaggle](https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images) to Colab

In [4]:
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

Downloading breast-histopathology-images.zip to /content
 99% 3.08G/3.10G [00:17<00:00, 223MB/s]
100% 3.10G/3.10G [00:17<00:00, 189MB/s]


# Unzip the downloaded zip and remove the original zip file

In [5]:
!unzip \*.zip && rm *.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1551_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1601_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1651_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1701_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1751_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1801_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1851_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1901_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y1951_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y2001_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y2251_class0.png  
  inflating: IDC_regular_ps50_idx5/9346/0/9346_idx5_x2351_y2301_class0.png  
  inflating

In [9]:
%ls

[0m[01;34m10253[0m/  [01;34m10305[0m/  [01;34m12881[0m/  [01;34m13018[0m/  [01;34m14153[0m/  [01;34m16085[0m/  [01;34m8980[0m/  [01;34m9255[0m/
[01;34m10254[0m/  [01;34m10306[0m/  [01;34m12882[0m/  [01;34m13019[0m/  [01;34m14154[0m/  [01;34m16165[0m/  [01;34m8984[0m/  [01;34m9256[0m/
[01;34m10255[0m/  [01;34m10307[0m/  [01;34m12883[0m/  [01;34m13020[0m/  [01;34m14155[0m/  [01;34m16166[0m/  [01;34m9022[0m/  [01;34m9257[0m/
[01;34m10256[0m/  [01;34m10308[0m/  [01;34m12884[0m/  [01;34m13021[0m/  [01;34m14156[0m/  [01;34m16167[0m/  [01;34m9023[0m/  [01;34m9258[0m/
[01;34m10257[0m/  [01;34m12241[0m/  [01;34m12886[0m/  [01;34m13022[0m/  [01;34m14157[0m/  [01;34m16531[0m/  [01;34m9029[0m/  [01;34m9259[0m/
[01;34m10258[0m/  [01;34m12242[0m/  [01;34m12890[0m/  [01;34m13023[0m/  [01;34m14188[0m/  [01;34m16532[0m/  [01;34m9035[0m/  [01;34m9260[0m/
[01;34m10259[0m/  [01;34m12626[0m/  [01;34m12891

# Read image files from **IDC_regular_ps50_idx5**

In [63]:
def read_image_files(binary_class):
  current_working_dir = os.getcwd()
  folder = 'IDC_regular_ps50_idx5/*/'

  image_files = []
  for img in glob.glob(os.path.join(current_working_dir, folder + binary_class + '/') + '*.png'):
    image_files.append(img)
  return image_files

class_1_malignant = read_image_files('1')
class_0_benign = read_image_files('0')
full_data = class_1_malignant + class_0_benign

In [69]:
print('class_1:\n',class_1_malignant[:2])
print('\nclass_1 and class_0 combined:\n', full_data[0], '\n', full_data[-1])

class_1:
 ['/content/IDC_regular_ps50_idx5/9346/1/9346_idx5_x2051_y2251_class1.png', '/content/IDC_regular_ps50_idx5/9346/1/9346_idx5_x1701_y1701_class1.png']

class_1 and class_0 combined:
 /content/IDC_regular_ps50_idx5/9346/1/9346_idx5_x2051_y2251_class1.png 
 /content/IDC_regular_ps50_idx5/13400/0/13400_idx5_x1651_y2201_class0.png


# Total number of image files

In [57]:
def check_class_size(class_1, class_0):
  class_1_size, class_0_size = len(class_1), len(class_0)
  count = pd.Series([class_1_size, class_0_size])
  percent = round(count/(class_1_size + class_0_size)*100, 2)
  df_perc = pd.concat({'class_count':count, 'class_percent(%)':percent}, axis=1)
  df_perc['class'] = ['Class 1 (Malignant)', 'Class 0 (Benign)']
  df_perc = df_perc[['class','class_count','class_percent(%)']]
  print('Total Count = ', class_1_size + class_0_size)
  return df_perc

check_class_size(class_1_malignant, class_0_benign)

Total Count =  277524


Unnamed: 0,class,class_count,class_percent(%)
0,Class 1 (Malignant),78786,28.39
1,Class 0 (Benign),198738,71.61
