# Create Dataset for GAN
Based on the code from the YOLO dataset creation, this notebook builds a folder structure of symlinked images laid out as described in the README here:
https://github.com/POSTECH-CVLab/PyTorch-StudioGAN?tab=readme-ov-file#dataset

In [1]:
import os, pickle, json
import pandas as pd
from pathlib import Path

def get_aws_grouping(rootFolder):
    """Load the pickled label groupings stored under ``rootFolder``.

    Args:
        rootFolder: Directory expected to contain ``new_aws_ratings.pickle``.

    Returns:
        The object unpickled from ``new_aws_ratings.pickle``. The rest of this
        notebook treats it as a dict mapping a label to a list of image paths.
        If the file does not exist, a message is printed and an empty dict is
        returned, so callers that use ``.items()`` / ``.values()`` keep
        working instead of failing on a list.
    """
    ratingsPath = os.path.join(rootFolder, "new_aws_ratings.pickle")
    try:
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at ratings files that come from a trusted source.
        with open(ratingsPath, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        print("nothing found in ratings folder")
        return {}

def cumulative_length(d):
    """Return the combined length of every value stored in the mapping ``d``."""
    total = 0
    for entries in d.values():
        total += len(entries)
    return total

# Load the label groupings from the current working directory and report how
# many images they cover in total.
groups = get_aws_grouping(".")
labelled_total = cumulative_length(groups)
print(f"found details for {labelled_total} images")

# Load the pickled crypt table (presumably the de-duplicated crypt metadata,
# going by the file name).
with Path('output_crypts_dedupe.pickle').open('rb') as f:
    df = pickle.load(f)

# Last expression of the cell, so the notebook renders a preview of the table.
df.head()

found details for 55948 images


Unnamed: 0,FileName,OriginalImage,PatchXLocation,PatchYLocation,OriginalXLocation,OriginalYLocation,cropHeight,cropWidth,padding,JustFileName
0,9c256bec-a1f2-433d-86f8-57c67b0fe26f.png,/openpatho-colorectal-unitopatho/unitopatho/70...,0,0,546,382,211,165,"[167, 144, 168, 145]",9c256bec-a1f2-433d-86f8-57c67b0fe26f.png
1,28b45f1f-93db-4ae2-a07e-fa236bb31c33.png,/openpatho-colorectal-unitopatho/unitopatho/70...,0,0,551,65,270,198,"[151, 115, 151, 115]",28b45f1f-93db-4ae2-a07e-fa236bb31c33.png
2,e3a308e3-14f6-4c13-84e5-4a8a8b07bc92.png,/openpatho-colorectal-unitopatho/unitopatho/70...,0,0,555,72,180,142,"[179, 160, 179, 160]",e3a308e3-14f6-4c13-84e5-4a8a8b07bc92.png
3,7e98a85b-3788-4d85-a6e7-f0a0a10d6624.png,/openpatho-colorectal-unitopatho/unitopatho/70...,0,0,795,0,230,97,"[201, 135, 202, 135]",7e98a85b-3788-4d85-a6e7-f0a0a10d6624.png
4,b83eecc9-3fb9-429a-bb56-8cf5a0a6589f.png,/openpatho-colorectal-unitopatho/unitopatho/70...,1536,0,2585,985,187,266,"[117, 156, 117, 157]",b83eecc9-3fb9-429a-bb56-8cf5a0a6589f.png


## Read Human Labels
Read in and summarise the human labelling effort thus far.

In [2]:
# Every key in `groups` is a label; "unlabelled" is kept as the first option.
possible_labels = ["unlabelled", *groups.keys()]

# Flatten all groups into one list of bare file names (directory part removed)
# so they can be compared against df['JustFileName'].
all_labelled_files = [
    os.path.basename(imagepath)
    for listImages in groups.values()
    for imagepath in listImages
]

# Rows of the crypt table that have a human label, and the distinct original
# images those rows came from.
is_labelled = df['JustFileName'].isin(all_labelled_files)
matching_rows = df[is_labelled]
matched_values = matching_rows['OriginalImage'].unique().tolist()
print(f"using the labels from the human efforts, we have crypts in {len(matched_values)} slides")

using the labels from the human efforts, we have crypts in 855 slides


## Add in extra labels from the RESNET Labeller
If we treat any classifier prediction with ≥ 99.95% confidence (the `0.9995` threshold used below) as being as accurately labelled as our human labelling, we can add in a lot more labelled crypts:

In [7]:
# Load the pickled list of classifier result frames and stack them into one.
with open("list-of-frames.pickle", "rb") as frames_file:
    list_of_dataframes = pickle.load(frames_file)
classifier_results = pd.concat(list_of_dataframes, ignore_index=True)

# Attach the classifier output to the crypt table; the inner join keeps only
# crypts that have a classifier result.
merged_df = pd.merge(
    df,
    classifier_results,
    how='inner',
    left_on='JustFileName',
    right_on='filenames',
)

# Confidence is the largest entry of each row's `probs`; keep only rows at or
# above the 0.9995 cut-off.
merged_df = merged_df.assign(confidence=merged_df['probs'].map(max))
merged_df = merged_df.loc[merged_df['confidence'] >= 0.9995]
print(f"This gives us {len(merged_df)} new labels")
merged_df.head()

This gives us 36607 new labels


Unnamed: 0,FileName,OriginalImage,PatchXLocation,PatchYLocation,OriginalXLocation,OriginalYLocation,cropHeight,cropWidth,padding,JustFileName,filenames,modelledClass,probs,confidence
1,28b45f1f-93db-4ae2-a07e-fa236bb31c33.png,/openpatho-colorectal-unitopatho/unitopatho/70...,0,0,551,65,270,198,"[151, 115, 151, 115]",28b45f1f-93db-4ae2-a07e-fa236bb31c33.png,28b45f1f-93db-4ae2-a07e-fa236bb31c33.png,Something Irrelevant,"[6.8051518e-06, 1.4318189e-05, 0.99989164, 8.7...",0.999892
7,3191da06-1d3b-46c5-a580-f937c2668aec.png,/openpatho-colorectal-unitopatho/unitopatho/70...,3072,0,4932,17,171,186,"[157, 164, 157, 165]",3191da06-1d3b-46c5-a580-f937c2668aec.png,3191da06-1d3b-46c5-a580-f937c2668aec.png,Something Irrelevant,"[7.984322e-05, 1.313553e-06, 0.99991155, 7.327...",0.999912
9,1be9d24f-a78e-4294-90dc-b9bc7a2f827d.png,/openpatho-colorectal-unitopatho/unitopatho/70...,3072,0,4682,1456,318,148,"[176, 91, 176, 91]",1be9d24f-a78e-4294-90dc-b9bc7a2f827d.png,1be9d24f-a78e-4294-90dc-b9bc7a2f827d.png,Something Irrelevant,"[0.00013213811, 3.034871e-06, 0.9998472, 1.762...",0.999847
10,4e7f2de6-75cf-4608-bb53-9131e46d5671.png,/openpatho-colorectal-unitopatho/unitopatho/70...,3072,0,4426,1127,250,197,"[151, 125, 152, 125]",4e7f2de6-75cf-4608-bb53-9131e46d5671.png,4e7f2de6-75cf-4608-bb53-9131e46d5671.png,Something Irrelevant,"[6.265589e-05, 3.949634e-07, 0.99992156, 1.531...",0.999922
13,94f182b6-021e-4293-917a-ede4d99fc661.png,/openpatho-colorectal-unitopatho/unitopatho/70...,4608,0,5269,956,226,245,"[127, 137, 128, 137]",94f182b6-021e-4293-917a-ede4d99fc661.png,94f182b6-021e-4293-917a-ede4d99fc661.png,Something Irrelevant,"[1.282365e-05, 1.3702127e-06, 0.9999703, 1.541...",0.99997


## Extract from dataFrame to a dictionary
This just makes it faster/easier to do the next sorting steps, and to over-write the classes from the resnet with our human ones:

In [8]:
# Export the classifier labels first: bare file name -> modelled class.
fileNameLookup = merged_df.set_index('JustFileName')['modelledClass'].to_dict()

# Assume the human labels are better: they are written second, so they
# over-write any classifier label for the same file.
# The lookup is keyed by bare file name (JustFileName), so strip any directory
# part from the human-labelled entries, the same way the label-summary cell
# does. A path-style key would never collide with the classifier entry and the
# over-write would silently not happen.
for key, listImages in groups.items():
    for image in listImages:
        fileNameLookup[os.path.basename(image)] = key

## Filter out dud image files

In [9]:
import os
from PIL import Image

def is_image(file_path, verbose=False):
    """Check if the file exists and is a valid image file.

    The file is opened with PIL and then passed through ``Image.verify()``.
    ``Image.open`` on its own is lazy and only identifies the file from its
    header, so a truncated or corrupt file could otherwise be reported as
    valid.

    Args:
        file_path (str): Path to the file to check.
        verbose (bool): If True, print messages about the file's validity.
    Returns:
        bool: True if the file exists and is a valid image, False otherwise.
        This function never raises for a bad file; every failure is reported
        as False.
    """
    # Check if the file exists (directories and missing paths both fail here)
    if not os.path.isfile(file_path):
        if verbose:
            print(f"File does not exist: {file_path}")
        return False

    # Attempt to open the file as an image and check its integrity
    try:
        with Image.open(file_path) as img:
            # Read the format before verify(): the image object must not be
            # used for anything else once verify() has been called.
            image_format = img.format
            img.verify()
    except (IOError, SyntaxError) as e:
        if verbose:
            print(f"Invalid image file: {file_path}. Error: {e}")
        return False
    except Exception as e:
        # Deliberately broad: any other decoder error still means "not usable".
        if verbose:
            print(f"Other error when loading the image file {e}")
        return False

    if verbose:
        print(f"Valid image file with format: {image_format}")
    return True

## Linking Loop
Loop to symlink the files into the right directory structure (the originals are left where they are).

To have a better chance of an even class split after the train/val split we shuffle the dictionary too.

In [10]:
from tqdm.notebook import tqdm
import random

# Convert dictionary items to a list of (file name, label) tuples and shuffle
# them, so that the train/val split is likely to have broadly even classes.
items = list(fileNameLookup.items())
random.shuffle(items)

# Drop files that are missing or unreadable *before* working out the split
# point. Computing the split from the unfiltered count would let every skipped
# file shift the real train/validation ratio away from 75/25.
validItems = [
    (file, label)
    for file, label in tqdm(items)
    if is_image("B:/restained_new/" + file)
]

trainSplit = int(len(validItems) * 0.75)  # 75/25 split of train and validation images.

# Counting from 0 puts exactly `trainSplit` files in train; starting the
# counter at 1 with `i < trainSplit` would leave train one file short.
for i, (file, label) in enumerate(validItems):
    split = "train" if i < trainSplit else "valid"
    destPath = f"../small_dataset/{split}/{label}/{file}"
    srcPath = "B:/restained_new/" + file

    # Create the <split>/<label>/ folder on first use, then link the source
    # image into it. os.symlink raises FileExistsError if the link is already
    # there, e.g. when this cell is re-run without clearing ../small_dataset.
    os.makedirs(os.path.dirname(destPath), exist_ok=True)
    os.symlink(srcPath, destPath)

  0%|          | 0/89650 [00:00<?, ?it/s]

In [13]:
# NOTE(review): process_images comes from the local sizeUpWithBorder module,
# which is not shown here. Going by the module name it presumably resizes/pads
# the images under ../small_dataset/ -- confirm what it modifies before re-running.
from sizeUpWithBorder import process_images
process_images("../small_dataset/")

Processing Images: 100%|█████████████████████████████████████████████████████████| 89200/89200 [19:00<00:00, 78.18it/s]
