# Data Augmenta

## Steps:
- Load Data
- Resize to Uniform size
- (Fit Colors)
- Normalize
- (Augment Pictures)


In [42]:
# Imports
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix

# Get table with features and target labels

In [None]:
# Load the data
imageDataframe = pd.read_csv('./inputs/image_data.csv')
imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
...,...,...,...,...
7904,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7905,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7906,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7907,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X


Change filepath to reflect raw dataset

In [44]:
imageDataframe['path_to_image'] = imageDataframe['path_to_image'].replace("BreaKHis_v1", "BreaKHis_v1 2", regex=True)
imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
1,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
2,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
3,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
4,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
...,...,...,...,...
7904,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X
7905,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X
7906,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X
7907,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X


# Check for missing values

In [45]:
# Check for missing values in the column Benign or Malignant
imageDataframe[imageDataframe['Benign or Malignant'].isnull()]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1 2/histology_slides/breast/malignan...,,,
3228,BreaKHis_v1 2/histology_slides/breast/malignan...,,,
4536,BreaKHis_v1 2/histology_slides/breast/malignan...,,,


In [46]:
# We see that all the missing values from the Benign or Malignant column are actually Malignant cancers (filename)
imageDataframe['Benign or Malignant'] = imageDataframe['Benign or Malignant'].fillna('Malignant')

In [47]:
# Check for missing values in the column Cancer Type
imageDataframe[imageDataframe['Cancer Type'].isnull()]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,
3093,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,
3228,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,
4536,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,


In [48]:
# We fill all missing values with Mucinous Carcinoma
imageDataframe['Cancer Type'] = imageDataframe['Cancer Type'].fillna('Mucinous Carcinoma')

# And the one at position 4536 is overwritten with the correct value
imageDataframe.iloc[4536, 2] = 'Ductal Carcinoma'

In [49]:
# Check where there is no magnification value
imageDataframe[imageDataframe['Magnification'].isnull()]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Mucinous Carcinoma,
3093,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Mucinous Carcinoma,
3228,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Mucinous Carcinoma,
4536,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Ductal Carcinoma,


In [50]:
# And fill the magnification accordingly
imageDataframe.iloc[2871, 3] = '100X'
imageDataframe.iloc[3093, 3] = '200X'
imageDataframe.iloc[3228, 3] = '400X'
imageDataframe.iloc[4536, 3] = '40X'

In [51]:
# Encode the column of the tumor class - binary => LabelEncoder
classEncoder = LabelEncoder()
imageDataframe['Benign or Malignant'] = classEncoder.fit_transform(imageDataframe['Benign or Malignant'])

# Encode the column of the tumor subclass - multi-class => OneHotEncoder
subClassEncoder= OneHotEncoder(sparse_output = False)
imageDataframe[subClassEncoder.get_feature_names_out(['Cancer Type'])] = subClassEncoder.fit_transform(imageDataframe[['Cancer Type']])

# Should we drop Magnification column???????????????????????????????????????????????
imageDataframe.drop(columns=['Cancer Type', 'Magnification'], inplace = True)

imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type_Adenosis,Cancer Type_Ductal Carcinoma,Cancer Type_Fibroadenoma,Cancer Type_Lobular Carcinoma,Cancer Type_Mucinous Carcinoma,Cancer Type_Papillary Carcinoma,Cancer Type_Phyllodes Tumor,Cancer Type_Tubular Adenoma
0,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7904,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7905,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7906,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7907,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Train-test split
X = imageDataframe.iloc[:, :1]
y_binary = imageDataframe.iloc[:, 1:2]
y_multi = imageDataframe.iloc[:, 2:]

# Train-test split with stratification due to class imbalance
X_train_text, X_test_text, y_binary_train, y_binary_test = train_test_split(X, y_binary, test_size=0.2, random_state=42, stratify = y_binary)

In [53]:
import os
from PIL import Image
from pathlib import Path

os.mkdir('../binaryProcessedDataset')
os.mkdir('../binaryProcessedDataset/Train')
os.mkdir('../binaryProcessedDataset/Test')
os.mkdir('../binaryProcessedDataset/Train/Benign')
os.mkdir('../binaryProcessedDataset/Train/Malignant')
os.mkdir('../binaryProcessedDataset/Test/Benign')
os.mkdir('../binaryProcessedDataset/Test/Malignant')

targetScale = (256, 256)
targetFolder = '../binaryProcessedDataset'

def scaleDownImagesBinary(df):
  for index, row in df.iterrows():
    imageFilePath = "../DeepLearning24_25/" + row['path_to_image']
    img = Image.open(imageFilePath)
    img = img.resize(targetScale)  

    trainOrTest = 'Train' if row['path_to_image'] in X_train_text['path_to_image'].values else 'Test'
    className = 'Benign' if row['Benign or Malignant'] == 0 else 'Malignant'
    imageFileName = os.path.basename(imageFilePath)

    img.save(f"{targetFolder}\{trainOrTest}\{className}\{imageFileName}")

scaleDownImagesBinary(imageDataframe)


  img.save(f"{targetFolder}\{trainOrTest}\{className}\{imageFileName}")
  img.save(f"{targetFolder}\{trainOrTest}\{className}\{imageFileName}")
  img.save(f"{targetFolder}\{trainOrTest}\{className}\{imageFileName}")


In [64]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training data generator with augmentations
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Testing/validation data generator (only rescaling)
test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

trainDir = '../binaryProcessedDataset/Train/'
testDir = '../binaryProcessedDataset/Test/'

# Training data generator
train_generator = train_datagen.flow_from_directory(
    trainDir,
    batch_size=128,
    class_mode='binary'
)

# Testing data generator
test_generator = test_datagen.flow_from_directory(
    testDir,
    batch_size=128,
    class_mode='binary'
)

Found 6327 images belonging to 2 classes.
Found 1582 images belonging to 2 classes.


# Creating and evaluating the model - binary

In [65]:
# Create model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model,Sequential
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten

model = Sequential()
model.add(Conv2D(32, (4, 4), activation='relu', input_shape=(256, 256, 3)))
model.add(Conv2D(64, (4, 4), activation='relu'))
model.add(MaxPooling2D((2, 2)))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [66]:
model.fit(
    train_generator,
    validation_data = test_generator,
    epochs=1,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_steps=test_generator.samples // test_generator.batch_size
)


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 11s/step - accuracy: 0.6867 - loss: 10.7572 - val_accuracy: 0.8470 - val_loss: 0.4459


<keras.src.callbacks.history.History at 0x28978cb1850>