In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import scipy
from scipy import ndimage

import glob

In [2]:
# train_ct_all_list=list(glob.glob(r"/ssd8/2023COVID19/Train_Valid_dataset/train_pure_crop/*/*/*"))
# print(len(train_ct_all_list))
# valid_ct_all_list=list(glob.glob(r"/ssd8/2023COVID19/Train_Valid_dataset/valid_pure_crop/*/*/*")) 
# print(len(valid_ct_all_list))

In [3]:
# Training slices: keep the two location columns and join them into one image path.
train_slices = pd.read_csv('./chih_4_fold_covid_train_df.csv')
train_ct_all_pd = train_slices[['path', 'slice_name']]
train_ct_all_pd['full_path'] = train_ct_all_pd['path'] + '/' + train_ct_all_pd['slice_name']
train_ct_all_list = train_ct_all_pd['full_path'].values.tolist()

# Validation slices: same construction, then keep only rows whose full path
# contains the text 'challenge'.
valid_slices = pd.read_csv('./chih_4_fold_covid_valid_df.csv')
valid_ct_all_pd = valid_slices[['path', 'slice_name']]
valid_ct_all_pd['full_path'] = valid_ct_all_pd['path'] + '/' + valid_ct_all_pd['slice_name']
is_challenge = valid_ct_all_pd['full_path'].str.contains('challenge')
valid_ct_all_pd = valid_ct_all_pd[is_challenge]
valid_ct_all_list = valid_ct_all_pd['full_path'].values.tolist()

print(train_ct_all_pd.shape, valid_ct_all_pd.shape)

(541028, 3) (42360, 3)


In [4]:
# For every training slice, count the pixels that hole filling adds to the
# thresholded image: pixels that are 0 after thresholding but enclosed by
# 255-valued pixels (presumably the lung field -- confirm with the data owner).
train_area=[]
for path in tqdm(train_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns None;
        # fail here with the offending path instead of deep inside scipy.
        raise OSError(f"cv2.imread could not read image: {path}")
    # Size-5 minimum filter over the whole array, then a hard threshold at 100.
    eroded = ndimage.minimum_filter(img, 5)
    binary = np.where(eroded < 100, 0, 255)
    first_channel = binary[:, :, 0]
    filled = ndimage.binary_fill_holes(first_channel)
    # filled*255 and first_channel differ (by 255) exactly on the filled-in pixels.
    hole_pixels = filled * 255 - first_channel
    train_area.append(hole_pixels.sum() / 255)
    

In [5]:
# One row per processed training slice: its path and the area measured for it.
train_area_df = pd.DataFrame(
    list(zip(train_ct_all_list, train_area)),
    columns=['path', 'area'],
)

# Append the new rows to the previously saved area table and store the result.
train_area_df_base = pd.read_csv("/ssd8/2023COVID19/Train_Valid_dataset/train_area_df1.csv")
new_challenge_train_area_df = pd.concat([train_area_df_base, train_area_df])
new_challenge_train_area_df.to_csv(
    "/ssd8/2023COVID19/Train_Valid_dataset/train_area_df1_challenge.csv",
    encoding='utf-8-sig',
    index=False,
)
print(new_challenge_train_area_df.shape)

In [6]:
# For every validation slice, count the pixels that hole filling adds to the
# thresholded image: pixels that are 0 after thresholding but enclosed by
# 255-valued pixels (presumably the lung field -- confirm with the data owner).
valid_area=[]
for path in tqdm(valid_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns None;
        # fail here with the offending path instead of deep inside scipy.
        raise OSError(f"cv2.imread could not read image: {path}")
    # Size-5 minimum filter over the whole array, then a hard threshold at 100.
    eroded = ndimage.minimum_filter(img, 5)
    binary = np.where(eroded < 100, 0, 255)
    first_channel = binary[:, :, 0]
    filled = ndimage.binary_fill_holes(first_channel)
    # filled*255 and first_channel differ (by 255) exactly on the filled-in pixels.
    hole_pixels = filled * 255 - first_channel
    valid_area.append(hole_pixels.sum() / 255)

100%|██████████| 42360/42360 [27:27<00:00, 25.72it/s]


In [7]:
# One row per processed validation slice: its path and the area measured for it.
valid_area_df=pd.DataFrame(list(zip(valid_ct_all_list, valid_area)), columns = ['path', 'area'])

# Append the new rows to the previously saved area table.
valid_area_df_base = pd.read_csv("/ssd8/2023COVID19/Train_Valid_dataset/valid_area_df1.csv")
new_challenge_valid_area_df = pd.concat([valid_area_df_base, valid_area_df])
# Only the merged table is written to this path. Dumping valid_area_df alone to
# the same file beforehand would just be overwritten by this call.
new_challenge_valid_area_df.to_csv("/ssd8/2023COVID19/Train_Valid_dataset/valid_area_df1_challenge.csv",index=False, encoding='utf-8-sig')

print(new_challenge_valid_area_df.shape)

(150886, 2)


In [2]:
# Reload the merged per-slice area tables from disk.
train_area_df, valid_area_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/ssd8/2023COVID19/Train_Valid_dataset/train_area_df1_challenge.csv",
        "/ssd8/2023COVID19/Train_Valid_dataset/valid_area_df1_challenge.csv",
    )
)
print(train_area_df.shape, valid_area_df.shape)

(539939, 2) (150886, 2)


In [3]:
# Derive the grouping keys from every slice path, then order the rows by them.
for area_df in (train_area_df, valid_area_df):
    # ct_path: everything before the last "/" of the slice path.
    area_df["ct_path"] = area_df["path"].apply(lambda p: p.rpartition("/")[0])
    # ct_slice: the file name up to its first ".", parsed as an integer.
    area_df["ct_slice"] = area_df["path"].apply(
        lambda p: int(p.rpartition("/")[2].partition(".")[0])
    )
    # Sorted in place so each ct_path's rows end up in ascending slice order.
    area_df.sort_values(by=['ct_path', 'ct_slice'], inplace=True)



In [4]:
def sum_max(a, w=0.4):
    """Locate the contiguous window of ``a`` with the largest sum.

    The window length is ``k = ceil(len(a) * w)``. Every start index in
    ``0 .. len(a) - k`` is examined; on ties the earliest window wins.

    Args:
        a: 1-D sequence or array of numbers (it is sliced and passed to ``np.sum``).
        w: fraction of ``len(a)`` that the window should cover.

    Returns:
        ``(start, stop)`` such that ``a[start:stop]`` is the selected window.
        When no window fits (``k > len(a)``) the result is ``(0, k)``.
    """
    n = len(a)
    k = int(np.ceil(n * w))
    best_start = 0
    # Seed below every possible sum. Seeding with 0 meant a window whose sum was
    # <= 0 could never be selected, so start 0 was returned even when it was not
    # the best window. Results for non-negative data are unchanged.
    best_sum = -np.inf
    for start in range(n - k + 1):
        window_sum = np.sum(a[start:start + k])  # evaluated once per window
        # Strict '>' keeps the earliest window on ties; a NaN sum never compares
        # greater, so windows containing NaN are skipped.
        if window_sum > best_sum:
            best_sum = window_sum
            best_start = start
    return best_start, best_start + k



In [5]:
# Map every training CT series (ct_path) to the [start, stop] window that
# sum_max selects over that series' per-slice areas (window fraction 0.5).
ct_path_list = train_area_df["ct_path"].unique()
train_dic = {}
# A single groupby pass replaces re-filtering the whole frame once per series.
# sort=False yields groups in first-appearance order (the order unique() gives),
# and rows inside a group keep the frame's existing order.
for ct_path, ct_df in tqdm(train_area_df.groupby("ct_path", sort=False), total=len(ct_path_list)):
    train_dic[ct_path] = list(sum_max(ct_df["area"].values, 0.5))

100%|██████████| 3032/3032 [03:45<00:00, 13.45it/s]


In [6]:
# Sanity check: number of distinct ct_path keys stored in train_dic.
len(train_dic)

3032

In [7]:
# Map every validation CT series (ct_path) to the [start, stop] window that
# sum_max selects over that series' per-slice areas (window fraction 0.5).
ct_path_list = valid_area_df["ct_path"].unique()
valid_dic = {}
# A single groupby pass replaces re-filtering the whole frame once per series.
# sort=False yields groups in first-appearance order (the order unique() gives),
# and rows inside a group keep the frame's existing order.
for ct_path, ct_df in tqdm(valid_area_df.groupby("ct_path", sort=False), total=len(ct_path_list)):
    valid_dic[ct_path] = list(sum_max(ct_df["area"].values, 0.5))

100%|██████████| 704/704 [00:16<00:00, 42.58it/s]


In [8]:
# Sanity check: number of distinct ct_path keys stored in valid_dic.
len(valid_dic)

704

In [9]:
import pickle

In [10]:
# Persist the training ct_path -> [start, stop] mapping with the newest pickle protocol.
train_dic_pickle_path = '/ssd8/2023COVID19/Train_Valid_dataset/train_dic1_05_challenge.pickle'
with open(train_dic_pickle_path, 'wb') as pickle_file:
    pickle.dump(train_dic, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Persist the validation ct_path -> [start, stop] mapping with the newest pickle protocol.
valid_dic_pickle_path = '/ssd8/2023COVID19/Train_Valid_dataset/valid_dic1_05_challenge.pickle'
with open(valid_dic_pickle_path, 'wb') as pickle_file:
    pickle.dump(valid_dic, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)