In [3]:
import h5py
import pandas as pd
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
import pickle
import time

## Preprocess metadata

In [7]:
# read metadata
path = r'.\balanced_augmented_labels.csv'

demo_data = pd.read_csv(path)
demo_data

Unnamed: 0,image_id,binaryLabel,age,sex,Tirads
0,2_1_aug_0,0,49,M,2
1,2_1_aug_1,0,49,M,2
2,2_1_aug_2,0,49,M,2
3,2_1_aug_3,0,49,F,2
4,2_1_aug_4,0,49,F,2
...,...,...,...,...,...
1740,400_1_aug_0,1,63,F,4a
1741,400_1_aug_1,1,63,F,4a
1742,400_1_aug_2,1,63,F,4a
1743,400_1_aug_3,1,63,F,4a


In [8]:
Counter(demo_data['dataset'])

KeyError: 'dataset'

In [17]:
# add image path to the metadata
pathlist = demo_data['image_id'].values.tolist()
paths = ['\\' + i + '.jpg' for i in pathlist]
demo_data['Path'] = paths

In [18]:
demo_data['Path'].head()

0    \2_1_aug_0.jpg
1    \2_1_aug_1.jpg
2    \2_1_aug_2.jpg
3    \2_1_aug_3.jpg
4    \2_1_aug_4.jpg
Name: Path, dtype: object

In [None]:
# remove age/sex == null 
demo_data = demo_data[~demo_data['age'].isnull()]
demo_data = demo_data[~demo_data['sex'].isnull()]
len(demo_data)

In [None]:
# unify the value of sensitive attributes
sex = demo_data['sex'].values
sex[sex == 'male'] = 'M'
sex[sex == 'female'] = 'F'
demo_data['Sex'] = sex
demo_data

In [10]:
# split subjects to different age groups
demo_data['Age_multi'] = demo_data['age'].values.astype('int')
demo_data['Age_multi'] = np.where(demo_data['Age_multi'].between(-1,19), 0, demo_data['Age_multi'])
demo_data['Age_multi'] = np.where(demo_data['Age_multi'].between(20,39), 1, demo_data['Age_multi'])
demo_data['Age_multi'] = np.where(demo_data['Age_multi'].between(40,59), 2, demo_data['Age_multi'])
demo_data['Age_multi'] = np.where(demo_data['Age_multi'].between(60,79), 3, demo_data['Age_multi'])
demo_data['Age_multi'] = np.where(demo_data['Age_multi']>=80, 4, demo_data['Age_multi'])

demo_data['Age_binary'] = demo_data['age'].values.astype('int')
demo_data['Age_binary'] = np.where(demo_data['Age_binary'].between(-1, 60), 0, demo_data['Age_binary'])
demo_data['Age_binary'] = np.where(demo_data['Age_binary']>= 60, 1, demo_data['Age_binary'])
demo_data

Unnamed: 0,image_id,binaryLabel,age,sex,Tirads,Path,Age_multi,Age_binary
0,2_1_aug_0,0,49,M,2,./augmented_images/2_1_aug_0.jpg,2,0
1,2_1_aug_1,0,49,M,2,./augmented_images/2_1_aug_1.jpg,2,0
2,2_1_aug_2,0,49,M,2,./augmented_images/2_1_aug_2.jpg,2,0
3,2_1_aug_3,0,49,F,2,./augmented_images/2_1_aug_3.jpg,2,0
4,2_1_aug_4,0,49,F,2,./augmented_images/2_1_aug_4.jpg,2,0
...,...,...,...,...,...,...,...,...
1740,400_1_aug_0,1,63,F,4a,./augmented_images/400_1_aug_0.jpg,3,1
1741,400_1_aug_1,1,63,F,4a,./augmented_images/400_1_aug_1.jpg,3,1
1742,400_1_aug_2,1,63,F,4a,./augmented_images/400_1_aug_2.jpg,3,1
1743,400_1_aug_3,1,63,F,4a,./augmented_images/400_1_aug_3.jpg,3,1


In [None]:
# convert to binary labels
# benign: bcc, bkl, dermatofibroma, nv, vasc
# maglinant: akiec, mel

labels = demo_data['dx'].values.copy()
labels[labels == 'akiec'] = '1'
labels[labels == 'mel'] = '1'
labels[labels != '1'] = '0'

labels = labels.astype('int')

demo_data['binaryLabel'] = labels
demo_data

## Split train/val/test

In [19]:
# Split the data into train, test, and validation sets (70%, 20%, 10%)
train_meta, test_val_meta = train_test_split(demo_data, test_size=0.3, random_state=42)
test_meta, val_meta = train_test_split(test_val_meta, test_size=1/3, random_state=42)

In [20]:
train_meta.to_csv(r'./split/new_train.csv')
val_meta.to_csv(r'./split/new_val.csv')
test_meta.to_csv(r'./split/new_test.csv')

In [26]:
# Path where images are stored
image_dir = r'C:\Users\saium\Desktop\thyroid\augmented_images'
output_path = r'C:\Users\saium\Desktop\thyroid\pkls'

## Save images into pickle files
This is optional, but if you are training many models, this step can save a lot of time by reducing the data IO.

In [28]:
# Function to read, resize, and save images as pickle
def save_images_as_pickle(meta_df, output_filename):
    images = []
    start = time.time()
    for i in range(len(meta_df)):
        img = cv2.imread(image_dir + meta_df.iloc[i]['Path'])
        if img is not None:
            img = cv2.resize(img, (256, 256))
            images.append(img)
    end = time.time()
    print(f"Time taken for {output_filename}: {end - start} seconds")

    with open(output_path + output_filename, 'wb') as f:
        pickle.dump(images, f)

# Process and save images for train, test, and validation
save_images_as_pickle(train_meta, '\\train_images.pkl')
save_images_as_pickle(test_meta, '\\test_images.pkl')
save_images_as_pickle(val_meta, '\\val_images.pkl')

print("Images for train, test, and val sets have been split and saved as pickle files.")

Time taken for \train_images.pkl: 1.3611629009246826 seconds
Time taken for \test_images.pkl: 0.27084994316101074 seconds
Time taken for \val_images.pkl: 0.1318039894104004 seconds
Images for train, test, and val sets have been split and saved as pickle files.


In [29]:
# Function to check the number of images in a pickle file
def check_number_of_images_in_pickle(pickle_file_path):
    with open(pickle_file_path, 'rb') as f:
        images = pickle.load(f)
    print(f"Number of images in {pickle_file_path}: {len(images)}")

# Example usage
check_number_of_images_in_pickle('./pkls/train_images.pkl')
check_number_of_images_in_pickle('./pkls/test_images.pkl')
check_number_of_images_in_pickle('./pkls/val_images.pkl')


Number of images in ./pkls/train_images.pkl: 1221
Number of images in ./pkls/test_images.pkl: 349
Number of images in ./pkls/val_images.pkl: 175
