In [21]:
from data_preparation import DataRepo, filter_valid_images
from environs import Env
from pathlib import Path
import pandas as pd
env = Env()
env.read_env()
CELL_DIR = Path('/lustre/scratch2/ws/0/s2558947-hema_pytorch/cell_class/data') / "cell_images"
WSI_DIR = Path('/lustre/scratch2/ws/0/s2558947-hema_pytorch/cell_class/data') / "wsi_images"
data_repo = DataRepo("https://172.26.62.216:8000", dataset_ids=[2], segmentation_set_ids=[2], annotator_ids=[12], cell_dir=CELL_DIR, wsi_dir=WSI_DIR)
cell_df = data_repo.build_cell_label_data()
wsi_df = data_repo.build_wsi_label_data()

In [11]:
# Build df with valid promyelocyte cell images as cell_df["y"]==1 and "other" cell_df["y"]==0
Myeloblast_label = 3
Monoblast_label = 4
cell_df["y"] = cell_df["label"].apply(lambda label: 1 if label == Myeloblast_label or Monoblast_label else 0)
### drop all cells where label == 0
cell_df = cell_df[cell_df['y'] == 1]
# There is one row for each label of a cell, but we will only keep every cell id once.
# Descending sort makes sure, that we keep all true (1) labels
cell_df = cell_df.sort_values("y", ascending=False).drop_duplicates(subset="cell_id",keep="first")
# Drop label column as it would be misleading, because we dropped unneeded labels
cell_df = cell_df.drop("label", axis=1)
cell_df = filter_valid_images(cell_df)

# Same procedure for wsi. Build df with valid AML M3 images as wsi_df["y"]==1 and "other" wsi_df["y"]==0
m3_label = "m3"
#wsi_df["y"] = wsi_df["label"].apply(lambda label: 1 if label == m3_label else 0)
wsi_df["y"] = 1
wsi_df = wsi_df.sort_values("y", ascending=False).drop_duplicates(subset="wsi_id",keep="first")
wsi_df = wsi_df.drop("label", axis=1)
wsi_df = filter_valid_images(wsi_df)

# Combine dfs to get all cells labeled as promyelocyte/other (combined_df.y_cell == 1 or 0)
# with their wsi images labeled as m3/other (combined_df.y_wsi == 1 or 0)
combined_df = cell_df.join(wsi_df.set_index('wsi_id'), on="wsi_id", lsuffix="_cell", rsuffix="_wsi")

In [12]:
# add npm1 and flt3i labels each in new column
from pathlib import Path
import numpy as np

p = Path('/lustre/scratch2/ws/0/s2558947-hema_pytorch/data/flt3i/FLT3I')
p2 = Path('/lustre/scratch2/ws/0/s2558947-hema_pytorch/data/npm1/npm1')
flt3i_list = [str(i).split('/')[-1] for i in p.rglob('*.png')]
npm1_list = [str(i).split('/')[-1] for i in p2.rglob('*.png')]

combined_df['flt3i'] = combined_df['img_path_wsi'].apply(lambda x: 1 if x.split('/')[-1] in flt3i_list else 0)
combined_df['npm1'] = combined_df['img_path_wsi'].apply(lambda x: 1 if x.split('/')[-1] in npm1_list else 0)

In [13]:
print(combined_df['npm1'].value_counts(), combined_df['flt3i'].value_counts())

1    2424
0    1596
Name: npm1, dtype: int64 0    2970
1    1050
Name: flt3i, dtype: int64


In [67]:
def balance_classes(input_df, label_column):
    input_df = input_df.groupby(label_column)
    input_df = pd.DataFrame(input_df.apply(lambda x: x.sample(input_df.size().min(), random_state=123).reset_index(drop=True)))
    input_df.reset_index(drop=True, inplace=True)
    return input_df

def split_wrt_wsi(input_df, label, test_fraction=0.2):
    # splits data into train and test (according to test_fraction) and keeps data separated with respect to img_path_wsi
    train_fraction = 1 - test_fraction # fraction of data used for training (rest for testing)

    col_names = list(input_df.columns)
    df_train = pd.DataFrame(columns=col_names)
    df_test = pd.DataFrame(columns=col_names)

    # shuffle data
    input_df = input_df.sample(frac=1, random_state=123)

    #init: put 1 entry into df_train to avoid devision by zero in if condition
    df_train = df_train.append(input_df.iloc[0], ignore_index=True)
    input_df.drop([0], inplace=True)

    for i in input_df.iloc:
        if len(df_train) / (len(df_train) + len(df_test)) >= train_fraction:
            if i['img_path_wsi'] not in df_train['img_path_wsi'].values:
                df_test = df_test.append(i, ignore_index=True)
            else:
                df_train = df_train.append(i, ignore_index=True)
        else:
            if i['img_path_wsi'] not in df_test['img_path_wsi'].values:
                df_train = df_train.append(i, ignore_index=True)
            else:
                df_test = df_test.append(i, ignore_index=True)
                
    df_train = df_train[['img_path_cell', label]]
    df_train.drop_duplicates(inplace=True)
    df_test = df_test[['img_path_cell', label]]
    df_test.drop_duplicates(inplace=True)
    return df_train, df_test

In [87]:
label = 'npm1'
balanced_df = balance_classes(combined_df, label)
df_train, df_test = split_wrt_wsi(balanced_df, label)

#save
df_train.to_csv('data/'+label+'_train.csv', index= False)
df_test.to_csv('data/'+label+'_test.csv', index= False)

In [88]:
df_test[label].value_counts()

0    443
1    374
Name: npm1, dtype: int64

In [89]:
df_train[label].value_counts()

1    1222
0    1152
Name: npm1, dtype: int64

import pandas as pd
df = pd.read_csv('data/npm1_train.csv')

In [1]:
# build dataset from imagefolder
import torch
import torchvision
from pathlib import Path
import pandas as pd
from torchvision import transforms
torch.manual_seed(1)

label = 'flt3i'

data_path = '/scratch/ws/0/s2558947-hema_pytorch/data/flt3i'

dataset = torchvision.datasets.ImageFolder(
    data_path,
    transforms.Compose([
        transforms.RandomResizedCrop((1500,2000)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]))

indices = torch.randperm(len(dataset)).tolist()
split=0.8
splitpoint=int(len(dataset)*split)

p = Path(data_path).glob('**/*')
dirs = [x for x in p if x.is_dir()]
dirs = sorted(dirs)

files = []
labels = []
for dir in dirs:
    p2 = dir.glob('**/*')
    files2 = [str(x) for x in p2 if x.is_file()]
    #files2 = sorted(files2)
    files.extend(files2)

labels = [0 if '/non_' in x else 1 for x in files]
files = ['/lustre/scratch2/ws/0/s2558947-hema_pytorch/cell_class/data/wsi_images/' + x.split('/')[-1] for x in files]


In [2]:
df = pd.DataFrame({'img_path': files, label: labels}, columns=['img_path', label])
#df
#df.to_csv('data/'+label+'_wsi.csv', index= False)

In [24]:
splitpoint=int(len(dataset)*(1-0.2))
indices = torch.randperm(len(dataset)).tolist()

In [22]:
indices

[452,
 425,
 364,
 210,
 99,
 380,
 217,
 45,
 37,
 254,
 67,
 158,
 93,
 172,
 119,
 419,
 222,
 434,
 140,
 465,
 401,
 112,
 34,
 257,
 38,
 273,
 68,
 389,
 197,
 108,
 159,
 88,
 168,
 354,
 455,
 405,
 227,
 318,
 194,
 125,
 242,
 171,
 428,
 152,
 1,
 126,
 83,
 251,
 252,
 313,
 0,
 316,
 228,
 132,
 193,
 271,
 362,
 315,
 54,
 74,
 219,
 424,
 444,
 402,
 70,
 299,
 253,
 5,
 136,
 304,
 64,
 329,
 286,
 232,
 266,
 12,
 335,
 407,
 411,
 435,
 426,
 290,
 280,
 43,
 334,
 312,
 270,
 432,
 263,
 153,
 89,
 155,
 340,
 230,
 149,
 184,
 448,
 110,
 186,
 366,
 449,
 58,
 374,
 382,
 341,
 177,
 367,
 264,
 438,
 471,
 62,
 287,
 32,
 215,
 23,
 234,
 17,
 321,
 303,
 25,
 397,
 198,
 148,
 430,
 282,
 470,
 306,
 129,
 97,
 332,
 337,
 381,
 81,
 423,
 245,
 417,
 279,
 49,
 443,
 170,
 224,
 16,
 317,
 195,
 463,
 142,
 283,
 300,
 320,
 410,
 319,
 441,
 388,
 231,
 92,
 404,
 377,
 277,
 80,
 211,
 403,
 365,
 302,
 209,
 144,
 272,
 84,
 213,
 346,
 412,
 348,
 13,
 131,

In [28]:
df.iloc[indices,:]

Unnamed: 0,img_path,flt3i
348,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,0
92,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,1
32,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,1
422,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,0
34,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,1
...,...,...
58,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,1
157,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,1
390,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,0
236,/lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...,1


In [None]:
/lustre/scratch2/ws/0/s2558947-hema_pytorch/cell_class/data

In [67]:
dataset.classes

['FLT3I', 'non_FLT3I']

In [3]:
import tempfile

In [7]:
with tempfile.NamedTemporaryFile(mode='w+') as temp:
    df.to_csv(temp.name)
    
    print(temp.name)
    df2 = pd.read_csv(temp.name)
    print(df2)

/tmp/tmpdwr8fpin
     Unnamed: 0                                           img_path  flt3i
0             0  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
1             1  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
2             2  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
3             3  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
4             4  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
..          ...                                                ...    ...
469         469  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
470         470  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
471         471  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
472         472  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
473         473  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0

[474 rows x 3 columns]


In [14]:
with tempfile.TemporaryDirectory() as tmpdirname:
    df.to_csv(tmpdirname+"/test.csv")
    
    print(tmpdirname)
    df2 = pd.read_csv(tmpdirname+"/test.csv")
    print(df2)

/tmp/tmpwqzxv0pc
     Unnamed: 0                                           img_path  flt3i
0             0  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
1             1  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
2             2  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
3             3  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
4             4  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      1
..          ...                                                ...    ...
469         469  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
470         470  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
471         471  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
472         472  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0
473         473  /lustre/scratch2/ws/0/s2558947-hema_pytorch/ce...      0

[474 rows x 3 columns]


In [15]:
df3 = pd.read_csv(tmpdirname+"/test.csv")
df3

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpwqzxv0pc/test.csv'