# Data Augmenta

## Steps:
- Load Data
- Resize to Uniform size
- (Fit Colors)
- Normalize
- (Augment Pictures)


In [2]:
# Imports
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix

#mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Get table with features and target labels

In [3]:
# Load the data
imageDataframe = pd.read_csv('/content/drive/MyDrive/0DEEPLEARNINGPROJECT/image_data.csv')
imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
...,...,...,...,...
7904,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7905,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7906,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7907,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X


Change filepath to reflect raw dataset

In [4]:
imageDataframe['path_to_image'] = imageDataframe['path_to_image'].replace("BreaKHis_v1", "BreaKHis_v1 2", regex=True)
imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
1,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
2,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
3,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
4,BreaKHis_v1 2/histology_slides/breast/benign/S...,Benign,Adenosis,100X
...,...,...,...,...
7904,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X
7905,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X
7906,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X
7907,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Lobular Carcinoma,200X


# Check for missing values

In [5]:
# Check for missing values in the column Benign or Malignant
imageDataframe[imageDataframe['Benign or Malignant'].isnull()]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1 2/histology_slides/breast/malignan...,,,
3228,BreaKHis_v1 2/histology_slides/breast/malignan...,,,
4536,BreaKHis_v1 2/histology_slides/breast/malignan...,,,


In [6]:
# We see that all the missing values from the Benign or Malignant column are actually Malignant cancers (filename)
imageDataframe['Benign or Malignant'] = imageDataframe['Benign or Malignant'].fillna('Malignant')

In [7]:
# Check for missing values in the column Cancer Type
imageDataframe[imageDataframe['Cancer Type'].isnull()]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,
3093,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,
3228,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,
4536,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,,


In [8]:
# We fill all missing values with Mucinous Carcinoma
imageDataframe['Cancer Type'] = imageDataframe['Cancer Type'].fillna('Mucinous Carcinoma')

# And the one at position 4536 is overwritten with the correct value
imageDataframe.iloc[4536, 2] = 'Ductal Carcinoma'

In [9]:
# Check where there is no magnification value
imageDataframe[imageDataframe['Magnification'].isnull()]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Mucinous Carcinoma,
3093,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Mucinous Carcinoma,
3228,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Mucinous Carcinoma,
4536,BreaKHis_v1 2/histology_slides/breast/malignan...,Malignant,Ductal Carcinoma,


In [10]:
# And fill the magnification accordingly
imageDataframe.iloc[2871, 3] = '100X'
imageDataframe.iloc[3093, 3] = '200X'
imageDataframe.iloc[3228, 3] = '400X'
imageDataframe.iloc[4536, 3] = '40X'

In [11]:
# Encode the column of the tumor class - binary => LabelEncoder
classEncoder = LabelEncoder()
imageDataframe['Benign or Malignant'] = classEncoder.fit_transform(imageDataframe['Benign or Malignant'])

# Encode the column of the tumor subclass - multi-class => OneHotEncoder
subClassEncoder= OneHotEncoder(sparse_output = False)
imageDataframe[subClassEncoder.get_feature_names_out(['Cancer Type'])] = subClassEncoder.fit_transform(imageDataframe[['Cancer Type']])

# Should we drop Magnification column???????????????????????????????????????????????
imageDataframe.drop(columns=['Cancer Type', 'Magnification'], inplace = True)

imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type_Adenosis,Cancer Type_Ductal Carcinoma,Cancer Type_Fibroadenoma,Cancer Type_Lobular Carcinoma,Cancer Type_Mucinous Carcinoma,Cancer Type_Papillary Carcinoma,Cancer Type_Phyllodes Tumor,Cancer Type_Tubular Adenoma
0,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BreaKHis_v1 2/histology_slides/breast/benign/S...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7904,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7905,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7906,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7907,BreaKHis_v1 2/histology_slides/breast/malignan...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
# Train-test split
X = imageDataframe.iloc[:, :1]
y_binary = imageDataframe.iloc[:, 1:2]
y_multi = imageDataframe.iloc[:, 2:]

# Train-test split
X_train_text, X_test_text, y_binary_train, y_binary_test = train_test_split(X, y_binary, test_size=0.2, random_state=42, stratify = y_binary)

In [None]:
import os
from PIL import Image
from pathlib import Path

os.mkdir('../binaryProcessedDataset')
os.mkdir('../binaryProcessedDataset/Train')
os.mkdir('../binaryProcessedDataset/Test')
os.mkdir('../binaryProcessedDataset/Train/Benign')
os.mkdir('../binaryProcessedDataset/Train/Malignant')
os.mkdir('../binaryProcessedDataset/Test/Benign')
os.mkdir('../binaryProcessedDataset/Test/Malignant')

targetScale = (256, 256)
targetFolder = '../binaryProcessedDataset'

def scaleDownImagesBinary(df):
  for index, row in df.iterrows():
    imageFilePath = "../DeepLearning24_25/" + row['path_to_image']
    img = Image.open(imageFilePath)
    img = img.resize(targetScale)

    trainOrTest = 'Train' if row['path_to_image'] in X_train_text['path_to_image'].values else 'Test'
    className = 'Benign' if row['Benign or Malignant'] == 0 else 'Malignant'
    imageFileName = os.path.basename(imageFilePath)

    img.save(f"{targetFolder}\{trainOrTest}\{className}\{imageFileName}")

scaleDownImagesBinary(imageDataframe)


FileNotFoundError: [Errno 2] No such file or directory: '/DeepLearning24_25/BreaKHis_v1 2/histology_slides/breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-011.png'

In [53]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training data generator with augmentations
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    rotation_range=10,
    width_shift_range=0.05,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2
)


trainDir = '/content/drive/MyDrive/0DEEPLEARNINGPROJECT/binaryProcessedDataset/Train/'
testDir = '/content/drive/MyDrive/0DEEPLEARNINGPROJECT/binaryProcessedDataset/Test/'

# Training data generator
train_generator = train_datagen.flow_from_directory(
    trainDir,
    batch_size=32,
    #color_mode = 'grayscale',
    class_mode='binary',
    subset='training'
)

# Testing data generator
validation_generator = train_datagen.flow_from_directory(
    trainDir,
    batch_size=32,
    class_mode='binary',
    #color_mode = 'grayscale',
    subset='validation'
)

# create a balanced class weights dictionary
lables=train_generator.classes
print(lables)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(lables),
    y=lables
)
class_weights = dict(enumerate(class_weights))
class_weights

Found 5063 images belonging to 2 classes.
Found 1264 images belonging to 2 classes.
[0 0 0 ... 1 1 1]


{0: 1.5941435768261965, 1: 0.7284892086330935}

# Creating and evaluating the model - binary

In [54]:
# Create model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model,Sequential
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten

model = Sequential()
model.add(Conv2D(64, (4, 4), activation='relu', input_shape=(256, 256, 3)))
model.add(Conv2D(64, (4, 4), activation='relu'))
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(32, (4, 4), activation='relu'))
model.add(Conv2D(32, (4, 4), activation='relu'))
model.add(MaxPooling2D((2, 2)))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.fit(
    train_generator,
    validation_data = validation_generator,
    class_weight = class_weights,
    epochs=10,
    verbose=1
)

Epoch 1/10


  self._warn_if_super_not_called()


[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 884ms/step - accuracy: 0.6466 - loss: 1.0711 - val_accuracy: 0.8434 - val_loss: 0.4617
Epoch 2/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 854ms/step - accuracy: 0.7808 - loss: 0.5408 - val_accuracy: 0.8085 - val_loss: 0.4595
Epoch 3/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 862ms/step - accuracy: 0.7965 - loss: 0.5209 - val_accuracy: 0.7951 - val_loss: 0.5018
Epoch 4/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 845ms/step - accuracy: 0.7978 - loss: 0.5266 - val_accuracy: 0.8070 - val_loss: 0.4557
Epoch 5/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 863ms/step - accuracy: 0.8051 - loss: 0.5024 - val_accuracy: 0.8410 - val_loss: 0.4318
Epoch 6/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 859ms/step - accuracy: 0.8024 - loss: 0.5116 - val_accuracy: 0.8339 - val_loss: 0.4471
Epoch 7/10
[1m

In [42]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


validation_generator.reset()  # Ensures predictions and labels are aligned

prediction = model.predict(validation_generator)
y_pred = np.round(prediction).astype(int).flatten()

y_true = validation_generator.classes  # Ground truth labels

#print(y_pred)
#f1 = f1_score(y_true, y_pred, average='binary')
#print(f1)

print(classification_report(y_true, y_pred))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 660ms/step
              precision    recall  f1-score   support

           0       0.29      0.16      0.20       396
           1       0.68      0.82      0.74       868

    accuracy                           0.61      1264
   macro avg       0.48      0.49      0.47      1264
weighted avg       0.56      0.61      0.58      1264

