In [1]:
import pandas as pd 
import numpy as np 
import torch
import os 

In [2]:
mapping = {
    "A172": "Glioblastoma",
    "BT474": "Ductal Carcinoma",
    "BV2": "Microglial",
    "Huh7": "Tumorigenic",
    "MCF7": "Breast Cancer",
    "SHSY5Y": "Neuroblastoma",
    "SkBr3": "Adenocarcinoma",
    "SKOV3": "Adenocarcinoma"
}

In [3]:
for t in mapping:
    num_in = len([f for f in os.listdir(os.path.join('../images/', t))])
    print(f'Number of images of {mapping[t]} is {num_in}')

Number of images of Glioblastoma is 608
Number of images of Ductal Carcinoma is 672
Number of images of Microglial is 608
Number of images of Tumorigenic is 600
Number of images of Breast Cancer is 735
Number of images of Neuroblastoma is 704
Number of images of Adenocarcinoma is 704
Number of images of Adenocarcinoma is 608


Now, we have to create the labels file

In [32]:
import pandas as pd 
 
df = pd.DataFrame(columns=['label'])
df.index.name = 'filename'
for t in mapping:
    for f in os.listdir(os.path.join('../images/', t)):
        df.loc[os.path.join(t, f), :] = mapping[t]

df.to_csv('../images/labels.csv', index=True)

In [33]:
pd.read_csv('../images/labels.csv')

Unnamed: 0,filename,label
0,A172/A172_Phase_A7_1_00d08h00m_3.tif,Glioblastoma
1,A172/A172_Phase_A7_2_02d04h00m_1.tif,Glioblastoma
2,A172/A172_Phase_B7_1_03d00h00m_3.tif,Glioblastoma
3,A172/A172_Phase_C7_2_01d04h00m_3.tif,Glioblastoma
4,A172/A172_Phase_C7_1_01d00h00m_2.tif,Glioblastoma
...,...,...
5234,SKOV3/SKOV3_Phase_E4_1_01d04h00m_2.tif,Adenocarcinoma
5235,SKOV3/SKOV3_Phase_E4_2_01d12h00m_4.tif,Adenocarcinoma
5236,SKOV3/SKOV3_Phase_G4_1_00d20h00m_1.tif,Adenocarcinoma
5237,SKOV3/SKOV3_Phase_H4_2_01d00h00m_3.tif,Adenocarcinoma


Now that we've created a labels file, we can create the PyTorch dataset and generate our train-test split

In [68]:
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

class CellDataset(Dataset):
    def __init__(self, images_path, label_path):
        self.images_path = images_path
        self.labels = pd.read_csv(label_path)
        self.toimage = transforms.ToTensor()
        
    def __len__(self):
        return self.labels.shape[0]
    
    def __getitem__(self, idx):
        img_path, label = self.labels.iloc[idx]['filename'], self.labels.iloc[idx]['label']
        img = Image.open(os.path.join(self.images_path, img_path))
        return self.toimage(img), label
            

In [69]:
test = CellDataset('../images/', '../images/labels.csv')

torch.Size([1, 520, 704])