# Create the corrected, detailed .csv file with relative paths


### Imports

In [1]:
import pandas as pd

# Mount drive, if necessary
# from google.colab import drive
# drive.mount('/content/drive')

### Fix missing values in the file

In [None]:
# Read the raw image index from disk and display it; the cells below correct this frame
imageDataframe = pd.read_csv(filepath_or_buffer='../dataset_index/image_data.csv')
imageDataframe

In [None]:
# Display every row whose 'Benign or Malignant' entry is missing
imageDataframe.loc[imageDataframe['Benign or Malignant'].isna()]

In [4]:
# Label every row that has no 'Benign or Malignant' entry as 'Malignant'
# (per the original inspection note, the file names of these rows indicate malignant samples)
unlabelled = imageDataframe['Benign or Malignant'].isna()
imageDataframe.loc[unlabelled, 'Benign or Malignant'] = 'Malignant'

In [None]:
# Display every row whose 'Cancer Type' entry is missing
imageDataframe.loc[imageDataframe['Cancer Type'].isna()]

In [6]:
# First, every row without a 'Cancer Type' is set to 'Mucinous Carcinoma'
imageDataframe.loc[imageDataframe['Cancer Type'].isna(), 'Cancer Type'] = 'Mucinous Carcinoma'

# Then the cell at row position 4536, column position 2 is overwritten with the correct value.
# This must run after the blanket fill above, otherwise it would be pointless to reorder.
# NOTE(review): column position 2 is presumably 'Cancer Type' -- confirm against the CSV header
imageDataframe.iloc[4536, 2] = 'Ductal Carcinoma'

In [None]:
# Display every row whose 'Magnification' entry is missing
imageDataframe.loc[imageDataframe['Magnification'].isna()]

In [8]:
# (row position, magnification) pairs; each value is written to column position 3
# NOTE(review): column position 3 is presumably 'Magnification' -- confirm against the CSV header
for row_position, magnification in ((2871, '100X'), (3093, '200X'), (3228, '400X'), (4536, '40X')):
    imageDataframe.iloc[row_position, 3] = magnification

In [None]:
# Count the missing values left in each column after the corrections
imageDataframe.isnull().sum(axis=0)

### Add relative path to the files

In [None]:
# Take the last '/'-separated component of each image path as the file name
imageDataframe["file_name"] = imageDataframe["path_to_image"].str.split("/").str[-1]

# Prefix every file name with '/tmp/'. The concatenation is vectorised instead of
# a per-row apply/lambda: a missing file name then stays NaN (and is visible in the
# isna() check that follows) rather than raising a TypeError in the middle of the column.
# NOTE: despite the column name, '/tmp/<file_name>' is an absolute path into the tmp folder.
imageDataframe['rel_path'] = '/tmp/' + imageDataframe['file_name']

imageDataframe

In [None]:
# Re-count the missing values per column now that the path columns were added
imageDataframe.isnull().sum(axis=0)

### Save .csv file

In [12]:
# Write the corrected frame to a new CSV; index=False leaves the row index out of the file
imageDataframe.to_csv(path_or_buf='../dataset_index/processed_image_data.csv', index=False)

# Build file structure for running the model



In [None]:
from tqdm import tqdm
from PIL import Image
import os
import concurrent.futures

# SET THIS TO THE WORKSPACE DIRECTORY: THE BreaKHis DATASET MUST BE DOWNLOADED AND EXTRACTED THERE
workspace_dir = 'PATH/TO/WORKSPACE/WHERE/DATASET/IS/STORED'

# Destination folder for the resized images; created here if it does not exist yet
output_dir = '../images_plain'
os.makedirs(name=output_dir, exist_ok=True)

# function to read, resize and save the images
def process_and_save_image(row):
    """Resize one image to 256x256 and save it into ``output_dir``.

    Args:
        row: Record exposing ``path_to_image`` (joined onto ``workspace_dir``
            to locate the source image) and ``file_name`` (name of the file
            written inside ``output_dir``).

    Returns:
        None. Processing is best effort: a failure never raises, it is
        reported as a ``RuntimeWarning`` naming the image and the error.
    """
    # Local import so this function stays self-contained within its cell.
    import warnings

    input_path = os.path.join(workspace_dir, row.path_to_image)
    output_path = os.path.join(output_dir, row.file_name)
    try:
        with Image.open(input_path) as img:
            img.resize((256, 256)).save(output_path)
    except Exception as e:
        # Keep going with the remaining images, but make the failure visible.
        # Silently passing here hides e.g. an unset workspace_dir, in which
        # case every single image fails while the progress bar still completes.
        warnings.warn(
            f"Could not process image {input_path!r}: {e!r}",
            RuntimeWarning,
        )

# itertuples is used to iterate over the rows faster
rows = imageDataframe.itertuples(index=False)

# The images are processed in parallel on a thread pool
with concurrent.futures.ThreadPoolExecutor() as executor:

    # executor.map is wrapped in tqdm to show a progress bar; the loop only
    # drains the result iterator so that every row gets processed
    for _unused_result in tqdm(executor.map(process_and_save_image, rows), total=len(imageDataframe)):
        pass