In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import scipy
from scipy import ndimage

import glob

In [2]:
# Collect every file two levels below test_crop/ (pattern "*/*").
# glob.glob already returns a list, so no extra list() wrapper is needed.
test_ct_all_list = glob.glob("/ssd8/2023COVID19/Train_Valid_dataset/test_crop/*/*")

In [3]:
# Number of paths matched by the glob pattern.
len(test_ct_all_list)

785605

In [4]:
def _enclosed_hole_area(img, threshold=100, filter_size=5):
    """Count the pixels that hole-filling adds to the thresholded foreground.

    Steps:
      1. Take the per-pixel minimum over the channel axis (if there is one).
      2. Apply a ``filter_size`` x ``filter_size`` minimum filter.
      3. Mark pixels whose filtered value is >= ``threshold`` as foreground.
      4. Fill the holes of that foreground mask and count the pixels that are
         in the filled mask but not in the foreground.

    Args:
        img: 2-D (H, W) or 3-D (H, W, C) image array.
        threshold: filtered values below this are treated as background.
        filter_size: side length of the square minimum-filter window.

    Returns:
        The number of newly filled pixels, as a float.
    """
    # With the defaults on a 3-channel image this matches channel 0 of
    # ndimage.minimum_filter(img, 5): a size-5 window along a length-3 axis
    # (default 'reflect' mode) already spans all three channels, so reducing
    # over channels first gives the same values with a 2-D filter instead of
    # a 3-D one.
    gray = img.min(axis=2) if img.ndim == 3 else img
    filtered = ndimage.minimum_filter(gray, size=filter_size)
    foreground = filtered >= threshold
    filled = ndimage.binary_fill_holes(foreground)
    # binary_fill_holes only adds pixels, so this is exactly the hole area.
    return float(np.count_nonzero(filled & ~foreground))


test_area = []
for path in tqdm(test_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # fail with the offending path instead of an opaque error downstream.
        raise FileNotFoundError(f"cv2.imread could not read image: {path}")
    test_area.append(_enclosed_hole_area(img))

100%|██████████| 785605/785605 [4:57:38<00:00, 43.99it/s]    


In [5]:
# One (path, area) record per entry, in the same order as test_ct_all_list.
area_records = list(zip(test_ct_all_list, test_area))
test_area_df = pd.DataFrame.from_records(area_records, columns=['path', 'area'])

In [6]:
# Show the frame; pandas renders a truncated head/tail preview.
test_area_df

Unnamed: 0,path,area
0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,36700.0
1,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,313.0
2,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,33961.0
3,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,22438.0
4,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,22742.0
...,...,...
785600,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,1185.0
785601,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,37777.0
785602,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,40605.0
785603,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,3479.0


In [7]:
# Persist the path/area table (without the index column) so the multi-hour
# area computation does not have to be repeated.
area_csv_path = "/ssd8/2023COVID19/Train_Valid_dataset/test_area_df.csv"
test_area_df.to_csv(area_csv_path, index=False)

In [8]:
# Split every path on "/" once and derive both columns from the pieces:
#   ct_path  - everything before the last "/" (the containing folder)
#   ct_slice - the last component up to its first ".", parsed as an int
path_parts = test_area_df["path"].str.split("/")
test_area_df["ct_path"] = path_parts.str[:-1].str.join("/")
test_area_df["ct_slice"] = path_parts.str[-1].str.split(".").str[0].map(int)

In [9]:
# Order rows by folder first, then by numeric slice index within each folder.
test_area_df = test_area_df.sort_values(by=['ct_path', 'ct_slice'])

In [10]:
# Replace the shuffled post-sort index with a fresh 0..n-1 index.
test_area_df = test_area_df.reset_index(drop=True)

In [12]:
# Show the sorted frame, now including the ct_path and ct_slice columns.
test_area_df

Unnamed: 0,path,area,ct_path,ct_slice
0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,366.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,0
1,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,366.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,1
2,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,572.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,2
3,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,533.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,3
4,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,1324.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,4
...,...,...,...,...
785600,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,2155.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,367
785601,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,2339.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,368
785602,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,2407.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,369
785603,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,2342.0,/ssd8/2023COVID19/Train_Valid_dataset/test_cro...,370


In [13]:
def sum_max(a,w=0.4):
    """Find the contiguous window of ``a`` with the largest sum.

    The window length is ``ceil(len(a) * w)``, clamped to ``[0, len(a)]`` so
    that the returned bounds always index into ``a``.

    Args:
        a: 1-D sequence or array of finite numbers.
        w: fraction of ``len(a)`` the window should cover.

    Returns:
        ``(start, end)`` such that ``a[start:end]`` is the best window.
        Ties go to the earliest window; an empty input gives ``(0, 0)``.
    """
    values = np.asarray(a)
    n = len(values)
    k = min(max(int(np.ceil(n * w)), 0), n)
    # prefix[i] is the sum of the first i values, so every window sum is a
    # single subtraction instead of a fresh O(k) np.sum per start position.
    prefix = np.concatenate(([0], np.cumsum(values)))
    window_sums = prefix[k:] - prefix[:n - k + 1]
    # argmax returns the first maximum (earliest window on ties) and, unlike a
    # running maximum seeded with 0, also works when every window sum is
    # negative.
    start = int(np.argmax(window_sums))
    return start, start + k

In [14]:
# Distinct ct_path values, in order of first appearance.
ct_path_list = test_area_df["ct_path"].unique()

In [15]:
# Map each ct_path to the [start, end) row window (half of its rows, w=0.5)
# whose summed area is largest.
# A single groupby pass replaces filtering the whole frame once per ct_path;
# sort=False keeps the groups in order of first appearance, i.e. the same key
# order as test_area_df["ct_path"].unique().
test_dic = {}
area_by_ct = test_area_df.groupby("ct_path", sort=False)["area"]
for ct_path, ct_areas in tqdm(area_by_ct, total=area_by_ct.ngroups):
    test_dic[ct_path] = list(sum_max(ct_areas.values, 0.5))

100%|██████████| 4308/4308 [07:00<00:00, 10.24it/s]


In [16]:
import pickle

# Serialize the ct_path -> [start, end] mapping with the newest pickle protocol.
window_pickle_path = '/ssd8/2023COVID19/Train_Valid_dataset/test_dic1_05.pickle'
with open(window_pickle_path, 'wb') as out_file:
    pickle.dump(test_dic, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# ct_path_list=test_area_df["ct_path"].unique()
# for i in range(100):
    
#     tmp_df=test_area_df[test_area_df["ct_path"]==ct_path_list[i]].reset_index(drop=True)
#     a,b=test_dic[ct_path_list[i]]
#     print(ct_path_list[i])
#     plt.plot(tmp_df["area"])
#     plt.plot(tmp_df["area"][a:b])
#     print(a,b)
#     plt.show()