In [2]:
import tensorflow as tf
import os
import os.path as osp
import pandas as pd
import numpy as np
import shutil

In [3]:
'''
Split dogs-vs-cats (PetFinder) dataset into train and validation set. Choose 1k images from cat and dog and move to validation set
'''
for class_name in ['dog', 'cat']:
    val_indices = np.random.choice(range(12499), 1000, replace=False)
    for idx in val_indices:
        shutil.move(f"/usr/xtmp/vision/datasets/dogs-vs-cats/train/{class_name}/{class_name}.{idx}.jpg", f"/usr/xtmp/vision/datasets/dogs-vs-cats/val/{class_name}/{class_name}.{idx}.jpg")


'\nSplit dogs-vs-cats (PetFinder) dataset into train and validation set. Choose 1k images from cat and dog and move to validation set\n'

In [4]:
'''
Combine melanoma data from 2019 and 2020. Split dataset into train and validation set. 
Save images in folders corresponding to their classes (melanoma and no-melanoma)
Use splitting method from https://github.com/mlfoundations/imagenet-applications-transfer/blob/main/datasets/melanoma.py
'''

# First need to add 'tfrecord' attribute to train.csv for the 2020 dataset.
# This is included in the 2019 dataset and is needed to split the data.
# Read in tfrecords and make a dataframe with the image_names, targets, and tfrecord values
feature_description = {
    'image_name': tf.io.FixedLenFeature([], tf.string),
    'target': tf.io.FixedLenFeature([], tf.int64),
}

# Function to parse a single example
def _parse_function(example_proto):
    # Parse the input `tf.train.Example` protocol buffer
    return tf.io.parse_single_example(example_proto, feature_description)

tfrec_df = pd.DataFrame(columns=['image_name', 'target', 'tfrecord'])
for i, tfrecord_path in enumerate([f'/usr/xtmp/vision/datasets/melanoma/other_2020_data/tfrecords/train{x:02d}-2071.tfrec' for x in range(15)] +
                               [f'/usr/xtmp/vision/datasets/melanoma/other_2020_data/tfrecords/train15-2061.tfrec']):
    assert osp.exists(tfrecord_path), f'{tfrecord_path} does not exist.'
    # Create a TFRecordDataset
    dataset = tf.data.TFRecordDataset(tfrecord_path)

    parsed_dataset = dataset.map(_parse_function)
    image_names = []
    targets = []
    for rec in parsed_dataset:
        image_names.append(rec['image_name'].numpy().decode('utf-8'))
        targets.append(rec['target'].numpy())

    temp_df = pd.DataFrame({'image_name': image_names, 'target': targets, 'tfrecord': [i]*len(targets)})
    tfrec_df = pd.concat([tfrec_df, temp_df])

print(tfrec_df)

2023-12-05 18:07:42.317093: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-12-05 18:07:42.419624: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-12-05 18:07:44.293994: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-12-05 18:07:46.521154: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executo

        image_name target tfrecord
0     ISIC_3139230      0        0
1     ISIC_1939931      0        0
2     ISIC_8681290      0        0
3     ISIC_5403296      0        0
4     ISIC_5598875      0        0
...            ...    ...      ...
2056  ISIC_6364051      0       15
2057  ISIC_4645138      0       15
2058  ISIC_1626843      0       15
2059  ISIC_6927363      0       15
2060  ISIC_6553041      0       15

[33126 rows x 3 columns]


In [5]:
'''
Next merge train.csv with tfrec_df. Save a new version of train.csv with the tfrecord attribute
'''
train_df = pd.read_csv('/usr/xtmp/vision/datasets/melanoma/other_2020_data/train.csv')
merged = train_df.merge(tfrec_df, on='image_name')
assert np.all(merged['target_x'] == merged['target_y']), "Targets don't match for all images"
merged['target'] = merged['target_x']
merged = merged.drop(['target_x', 'target_y'], axis=1)
print(merged.keys())
merged.to_csv('/usr/xtmp/vision/datasets/melanoma/melanoma-2020/train.csv')

Index(['image_name', 'patient_id', 'sex', 'age_approx',
       'anatom_site_general_challenge', 'diagnosis', 'benign_malignant',
       'tfrecord', 'target'],
      dtype='object')


In [11]:
'''
Split melanoma data into train/val and melanoma/no-melanoma
Found val_folds using
    folds = set(df['tfrecord'].values)
    folds = [f for f in folds if f >= 0]
    # Finding new splits each time you call MelanomaDataset, train and test could overlap
    # Okay bc calling np.random.seed each time
    train_splits = np.random.choice(folds, size=int(0.8*len(folds)), replace=False).tolist()
    val_splits = [fold for fold in folds if fold not in train_splits]
'''
val_folds = {
    '2020': [3, 6, 8, 9],
    '2019': [0, 5, 12, 16, 21, 29]
}

for year in ['2019', '2020']:
    df = pd.read_csv(f'/usr/xtmp/vision/datasets/melanoma/melanoma-{year}/train.csv')
    train_df = df[~df['tfrecord'].isin(val_folds[year])]
    val_df = df[df['tfrecord'].isin(val_folds[year])]
    print(f'Full dataset: {len(df)}, train: {len(train_df)}, test: {len(val_df)}, prop_val: {len(val_df)/len(df):.2f}')
    for split_name, df_split in zip(['train', 'val'], [train_df, val_df]):
        no_df = df_split[df_split['target'] == 0]
        mela_df = df_split[df_split['target'] == 1]
        print(f'Full split: {len(df_split)}, no: {len(no_df)}, mela: {len(mela_df)}, prop_mela: {len(mela_df) / len(df_split):.2f}')

        for class_name, class_df in zip(['melanoma', 'no-melanoma'], [mela_df, no_df]):
            for i, row in class_df.iterrows():
                image_name = row['image_name']
                shutil.move(f'/usr/xtmp/vision/datasets/melanoma/melanoma-{year}/train/{image_name}.jpg', 
                            f'/usr/xtmp/vision/datasets/melanoma/{split_name}/{class_name}/{image_name}.jpg')

Full dataset: 25331, train: 20280, test: 5051, prop_val: 0.20
Full split: 20280, no: 16652, mela: 3628, prop_mela: 0.18
Full split: 5051, no: 4157, mela: 894, prop_mela: 0.18
Full dataset: 33126, train: 24842, test: 8284, prop_val: 0.25
Full split: 24842, no: 24399, mela: 443, prop_mela: 0.02
Full split: 8284, no: 8143, mela: 141, prop_mela: 0.02
