# Computer Vision Final Project: Breast Cancer Tumor Classification
## Section 2 Spring 2023
## Nicholas Lee, Nic Brathwaite, Amir Moayed


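Source: BreaKHis dataset, downloaded from Kaggle (`ambarish/breakhis`; see the optional download section below).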

Publication: 
Spanhol, F., Oliveira, L. S., Petitjean, C., Heutte, L., A Dataset for Breast Cancer Histopathological Image Classification, IEEE Transactions on Biomedical Engineering (TBME), 63(7):1455-1462, 2016. [pdf]



## Libraries

In [None]:
# Google colab for data storage
from google.colab import drive
from google.colab import files


import os
import warnings
import random
from datetime import date

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import PIL

In [None]:
# Mount Google Drive at /content/drive; the project paths configured below all live under this mount point
drive.mount('/content/drive')


Mounted at /content/drive


## Notebook Setup

In [None]:
# Paths
main_path = "/content/drive/MyDrive/"

project_path = "w281_final_project/"
data_path = "Data/"
save_path = "".join([date.today().strftime('%Y%m%d'), "_eda_models/"])

project_path = "".join([main_path, project_path])
data_path = "".join([project_path, data_path])
save_path = "".join([project_path, save_path])

# Sanity Check
print("project path: ", project_path)
print("data path: ", data_path)
print("save path: ", save_path)


# Pandas viewing options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

warnings.filterwarnings("ignore")

#silence TF
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

random.seed(2)
%matplotlib inline

# Graphing options
sns_old_theme = sns.set_theme()

project path:  /content/drive/MyDrive/w281_final_project/
data path:  /content/drive/MyDrive/w281_final_project/Data/
save path:  /content/drive/MyDrive/w281_final_project/20230309_eda_models/


Example images

Description of variation in dataset (categories, size/resolution, etc.)

Description of the intended classification problem (list of output categories)

Estimate of the approximate number of images in each category

Initial guess at the types of image features that will be useful for classification


In [None]:
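# Optional one-time setup, currently disabled (uncomment the lines that are needed).
# NOTE: the GIF-export cell below uses `imageio`, so its import must be enabled before that cell is run.
# import imageio
# imageio.plugins.freeimage.download()
# !pip install itk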

In [None]:
"""save images as a single gif """
# NOTE(review): `imageio` and `img` are not defined by any earlier visible cell;
# import imageio and load the image stack before running this cell.
# `img` is assumed to be indexable along its first axis (img[z] is one frame) — TODO confirm.

# initialize the animation file; the context manager closes it even if a frame fails
with imageio.get_writer('output.gif', mode='I') as gif_writer:
  for z in range(img.shape[0]):
    # draw the current slice without grid lines or tick marks
    plt.imshow(img[z], cmap = "gray")
    # grid(False) always hides the grid; grid(visible=None) merely toggles its current state
    plt.grid(False)
    plt.xticks(ticks=[])
    plt.yticks(ticks=[])

    # round-trip through a temporary PNG so the frame is captured as matplotlib renders it
    plt.savefig('temp.png', dpi=200)
    plt.close()
    try:
      image = imageio.imread('temp.png')
    finally:
      # remove the temporary file even when reading it back fails
      os.remove('temp.png')

    # write the current image in the gif
    gif_writer.append_data(image)


## Download Kaggle dataset (optional)

In [None]:
downloadData = False

if downloadData == True: 
  files.upload()

  # move file to ~/.kaggle
  !mkdir -p ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 /root/.kaggle/kaggle.json

  # install kaggle client
  !pip install -q kaggle

  # set working directory to data storage folder
  %cd /content/drive/MyDrive/w281_final_project/Data # replace with your own folder location such as the shared drive

  !kaggle datasets download -d ambarish/breakhis

  # upzip
  !unzip /content/drive/MyDrive/w281_final_project/Data/breakhis.zip > /dev/null
  !echo dataset downloaded! GO TIME

Saving kaggle.json to kaggle.json
/content/drive/MyDrive/w281_final_project/Data
breakhis.zip: Skipping, found more recently modified local copy (use --force to force download)
replace BreaKHis_v1/BreaKHis_v1/histology_slides/breast/README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: dataset downloaded! GO TIME


## Create Metadata

In [None]:
# Print the dataset README; it describes the samples, the image acquisition and the image filename format
!cat /content/drive/MyDrive/w281_final_project/Data/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/README.txt

Samples
* Samples are generated from breast tissue biopsy slides,
stained with hematoxylin and eosin (HE).
* prepared for histological study and labelled by pathologists of the P&D Lab
* breast tumor specimens assessed by Immunohistochemistry (IHC)
* Core Needle Biopsy (CNB) and Surgical Open Biopsy (SOB)
* section of ~3µm thickness

Image acquisition
* Olympus BX-50 system microscope with a relay lens with magnification of 3.3× coupled to a Samsung digital color camera SCC-131AN
* magnification 40×, 100×, 200×, and 400× (objective lens 4×, 10×, 20×, and 40× with ocular lens 10×)
* camera pixel size 6.5 µm
* raw images without normalization nor color color standardization
* resulting images saved in 3-channel RGB, 8-bit depth in each channel, PNG format


Format of image filename

   <BIOPSY_PROCEDURE>_<TUMOR_CLASS>_<TUMOR_TYPE>_<YEAR>-<SLIDE_ID>-<MAGNIFICATION>-<SEQ>

   <BIOPSY_PROCEDURE>::=CNB|SOB
   <TUMOR_CLASS>::=M|B
   <TUMOR_TYPE>::=<BENIGN_TYPE>|<MALIGNANT_TYPE>
   <BENIGN_TYP

In [None]:
class fileMetaData:
  """Holds the name of one image file.

  Attributes:
    fileName: the file name passed to the constructor, stored unchanged.
  """

  def __init__(self, fileName):
    # `self` must be the first parameter; without it the constructor cannot be called with a file name
    self.fileName = fileName
  

In [None]:
# categories: short code -> readable name.
# 'B' and 'M' are the <TUMOR_CLASS> codes listed in the dataset README.
categories = {'B': 'Benign',
              'M': 'Malignant',
              # the remaining keys are presumably the <TUMOR_TYPE> codes ('A' appears
              # in that position in the example filename) — confirm against the README
              'A': 'Adenosis',
              'F': 'Fibroadenoma',
              'TA': 'Tubular Adenoma',
              'PT': 'Phyllodes Tumor',
              'DC': 'Ductal Carcinoma',
              'LC': 'Lobular Carcinoma',
              'MC': 'Mucinous Carcinoma (Colloid)',
              'PC': 'Papillary Carcinoma'}

In [None]:
# Template with one key per filename field, every value initialised to an empty string.
# NOTE(review): not referenced by any other visible cell — confirm whether it is still needed.
labels = {
    "BIOPSY_PROCEDURE": "", 
    "TUMOR_CLASS": "", 
    "TUMOR_TYPE":"", 
    "YEAR": "",
    "SLIDE_ID": "", 
    "MAGNIFICATION": "",
    "SEQ":""
}

In [None]:
# Recursively gather the full path of every file found under the breast-slides folder
fileList = [
  os.path.join(folder, name)
  for folder, _subfolders, names in os.walk(
    os.path.expanduser("/content/drive/MyDrive/w281_final_project/Data/BreaKHis_v1/BreaKHis_v1/histology_slides/breast")
  )
  for name in names
]

In [None]:
# Show the number of paths and the first few entries instead of dumping the whole list
len(fileList), fileList[:5]

In [None]:
# Example filename used to prototype the parsing steps in the following cells
file = "SOB_B_A-14-22549AB-100-001.png"

In [None]:
# Keep only the text before the first "." (drops the extension)
test = file.split(".")[0]

In [None]:
# Break the name into its "_"-separated pieces
test = test.split("_")

In [None]:
# The last "_" piece still holds several "-"-separated fields; split those out
test2 = test[-1].split("-")

In [None]:
# Drop the last piece, which has already been split into test2.
# NOTE: this reassigns `test` from itself, so re-running the cell removes one more element each time.
test = test[:-1]

In [None]:
# Leading "_" pieces followed by the "-" pieces give the full list of fields
metaValues = [*test, *test2]

In [None]:
# Inspect the parsed fields of the example filename
metaValues

['SOB', 'B', 'A', '14', '22549AB', '100', '001']

In [None]:
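# Filename layout, seven elements:
#   <BIOPSY_PROCEDURE>_<TUMOR_CLASS>_<TUMOR_TYPE>-<YEAR>-<SLIDE_ID>-<MAGNIFICATION>-<SEQ>
# e.g. SOB_B_A-14-22549AB-100-001 — note that the actual files use "-" (not "_") between TUMOR_TYPE and YEAR.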

In [None]:
def metaDataCreator(fileName):
  """
    Separate out the parts of a file name to get the annotations.

    Expected layout (seven fields), e.g. SOB_B_A-14-22549AB-100-001.png:
      <BIOPSY_PROCEDURE>_<TUMOR_CLASS>_<TUMOR_TYPE>-<YEAR>-<SLIDE_ID>-<MAGNIFICATION>-<SEQ>

    Args:
      fileName: image file name; leading directories and the extension
        are ignored.

    Returns:
      dict mapping each field name to its string value, or None when the
      name does not split into exactly seven fields.
  """

  metaKeys = ['BIOPSY_PROCEDURE',
            'TUMOR_CLASS',
            'TUMOR_TYPE',
            'YEAR',
            'SLIDE_ID',
            'MAGNIFICATION',
            'SEQ']

  # Parse only the final path component, so "_", "-" or "." inside parent
  # folder names cannot leak into (or break) the parsed fields.
  baseName = os.path.basename(fileName)
  # Everything before the first "." is the annotated part of the name
  stem = baseName.split(".")[0]
  # The "_" pieces come first; the last of them carries the "-"-separated fields
  *leading, last = stem.split("_")
  metaValues = leading + last.split("-")
  if len(metaValues) != len(metaKeys):
    return None
  return dict(zip(metaKeys, metaValues))

In [None]:
# Check the parser against the example filename
metaDataCreator(file)

{'BIOPSY_PROCEDURE': 'SOB',
 'TUMOR_CLASS': 'B',
 'TUMOR_TYPE': 'A',
 'YEAR': '14',
 'SLIDE_ID': '22549AB',
 'MAGNIFICATION': '100',
 'SEQ': '001'}

In [None]:
import glob

In [None]:
# Recursively collect every .png under the breast-slides folder (overwrites the earlier os.walk listing)
fileList = glob.glob(
  "/content/drive/MyDrive/w281_final_project/Data/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/**/*.png",
  recursive = True,
)

In [None]:
def fileNameGetter(f):
  """Return the text after the last "/" in f (all of f when it contains no "/")."""
  _, _, tail = f.rpartition("/")
  return tail

In [None]:
# Reduce every path to its bare file name
fileList = [fileNameGetter(path) for path in fileList]

In [None]:
# Map every file name to its parsed annotations (None when a name does not split into seven fields)
meta = {name: metaDataCreator(name) for name in fileList}
# Show the entry count and a small sample instead of dumping the whole dictionary
len(meta), dict(list(meta.items())[:3])

{'SOB_B_A-14-22549AB-100-004.png': {'BIOPSY_PROCEDURE': 'SOB',
  'TUMOR_CLASS': 'B',
  'TUMOR_TYPE': 'A',
  'YEAR': '14',
  'SLIDE_ID': '22549AB',
  'MAGNIFICATION': '100',
  'SEQ': '004'},
 'SOB_B_A-14-22549AB-100-002.png': {'BIOPSY_PROCEDURE': 'SOB',
  'TUMOR_CLASS': 'B',
  'TUMOR_TYPE': 'A',
  'YEAR': '14',
  'SLIDE_ID': '22549AB',
  'MAGNIFICATION': '100',
  'SEQ': '002'},
 'SOB_B_A-14-22549AB-100-003.png': {'BIOPSY_PROCEDURE': 'SOB',
  'TUMOR_CLASS': 'B',
  'TUMOR_TYPE': 'A',
  'YEAR': '14',
  'SLIDE_ID': '22549AB',
  'MAGNIFICATION': '100',
  'SEQ': '003'},
 'SOB_B_A-14-22549AB-100-001.png': {'BIOPSY_PROCEDURE': 'SOB',
  'TUMOR_CLASS': 'B',
  'TUMOR_TYPE': 'A',
  'YEAR': '14',
  'SLIDE_ID': '22549AB',
  'MAGNIFICATION': '100',
  'SEQ': '001'},
 'SOB_B_A-14-22549AB-100-007.png': {'BIOPSY_PROCEDURE': 'SOB',
  'TUMOR_CLASS': 'B',
  'TUMOR_TYPE': 'A',
  'YEAR': '14',
  'SLIDE_ID': '22549AB',
  'MAGNIFICATION': '100',
  'SEQ': '007'},
 'SOB_B_A-14-22549AB-100-006.png': {'BIOPSY_PROCED

In [None]:
# write out meta data: serialise the `meta` dictionary to JSON text and save it as metaData.json
# NOTE(review): the relative path resolves against the current working directory — confirm
# this is where the file is meant to be saved.
import json
json_object = json.dumps(meta)
# "w" mode creates the file or overwrites an existing one
with open("metaData.json", "w") as outfile:
  outfile.write(json_object)

# Citation(s)