### Breast Histopathology Images 

In [None]:
! pip install kaggle 

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored in Drive can be read with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy kaggle.json from the mounted Drive folder (MyDrive/Stat_Docs) into ~/.kaggle.
! cp /content/drive/MyDrive/Stat_Docs/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600) so that other
# users on the machine cannot read it.
! chmod 600 ~/.kaggle/kaggle.json

### Download Histopathology Dataset

The dataset is available on Kaggle: [Breast Histopathology Images](https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images)

In [None]:
# unzip content
! unzip /content/breast-histopathology-images.zip -d /content/breast-histopathology-images

In [None]:
# Report used/available space for each mounted filesystem; -h prints sizes in
# human-readable units.
! df -h

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from skimage.io import imread
import cv2

In [None]:
# Recursively collect every PNG under the extracted IDC_regular_ps50_idx5 folder.
# glob makes no ordering guarantee, so the paths are sorted to keep the listing
# (and everything derived from it) identical from one run to the next.
breast_imgs = sorted(
    glob.glob('/content/breast-histopathology-images/IDC_regular_ps50_idx5/**/*.png', recursive=True)
)

# Fail early with a clear message instead of an IndexError on the line below.
assert breast_imgs, 'No PNG files found - was the dataset archive extracted?'

# Show one path as a sanity check of the directory layout.
breast_imgs[0]

In [None]:
# Preview the first nine images of the listing in a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(10, 10))

# plt.subplots(3, 3) returns a 2-D array of Axes, so it cannot be indexed with
# a single integer; `.flat` walks the grid row by row. zip() stops at whichever
# runs out first, so fewer than nine images simply leaves panels empty.
for ax, img_path in zip(axes.flat, breast_imgs):
    # OpenCV loads images in BGR channel order while matplotlib expects RGB;
    # convert so the colours are displayed correctly.
    img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    ax.imshow(img)

In [None]:
# Build a table with one row per image: its path and its integer class label.
# The label is the character immediately before the '.png' extension
# (path[-5]); this presumes file names end in '...class0.png' / '...class1.png'
# - TODO confirm against the dataset.
# Constructing the frame from whole columns avoids writing cells one at a time
# with .loc, and the default RangeIndex matches the 0..n-1 index used before.
df = pd.DataFrame({
    'path': breast_imgs,
    'target': [int(img_path[-5]) for img_path in breast_imgs],
})

In [None]:
# Inspect the path/target DataFrame.
df

In [None]:
# Split the table by label: target 1 -> `pos`, target 0 -> `neg`.
pos = df.loc[df['target'].eq(1)]
neg = df.loc[df['target'].eq(0)]

In [None]:
# Only the last expression of a cell is echoed, so return both shapes together
# as a (positive, negative) tuple.
pos.shape, neg.shape

In [None]:
# print cancer images 
fig, axes = plt.subplots(4, 10, figsize=(20,10))
fig.suptitle('Cancer', fontsize=20)
for i in range(4):
    for j in range(10):        
        image = imread(pos.iloc[j + 10*i]["path"])
        axes[i,j].imshow(image)
plt.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# print non-cancer images 
fig, axes = plt.subplots(4, 10, figsize=(20,10))
fig.suptitle('No Cancer', fontsize=20)
for i in range(4):
    for j in range(10):        
        image = imread(neg.iloc[j + 10*i]["path"])
        axes[i,j].imshow(image)
plt.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# Class balance: one bar per label, bar height = number of images with that label.
# Count once and sort by label so the bars are in 0, 1 order, matching the
# legend entries below.
class_counts = df['target'].value_counts().sort_index()

plt.figure(figsize=(10, 5))
ax = sns.barplot(x=class_counts.index, y=class_counts.values, palette='Set1')
plt.xlabel('State of Cancer')
# Each row of `df` is one image file, so the counts are images, not patients.
plt.ylabel('Number of Images')
plt.title('Distribution of Cancer States')
# The first two bar patches correspond to labels 0 and 1 respectively.
plt.legend(handles=ax.patches[:2], labels=['0 = No Cancer', '1 = Cancer'], loc='upper right')
plt.show()