In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import scipy
from scipy import ndimage

import glob

In [2]:
# Every training slice image: the pattern matches entries exactly three
# directory levels below train_pure_crop. glob.glob already returns a list.
train_ct_all_list = glob.glob(r"/home/fate/covid19_CT/input/train_pure_crop/*/*/*")

In [3]:
# Sanity check: number of training image paths matched by the glob pattern.
len(train_ct_all_list)

433432

In [4]:
# Every validation slice image: the pattern matches entries exactly three
# directory levels below valid_pure_crop. glob.glob already returns a list.
valid_ct_all_list = glob.glob(r"/home/fate/covid19_CT/input/valid_pure_crop/*/*/*")

In [5]:
# Sanity check: number of validation image paths matched by the glob pattern.
len(valid_ct_all_list)

105359

In [6]:
# Per-image "enclosed dark area" for the training slices.
# For each image: take the local minimum over a 5-wide window, binarise at 100,
# fill the holes of the bright mask, and count the pixels added by the fill
# (dark pixels completely surrounded by bright ones).
# NOTE(review): presumably this approximates the lung cross-section visible in
# a CT slice - confirm with the author.
train_area=[]
for path in tqdm(train_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None instead of raising when a file is missing or
        # cannot be decoded; stop here and name the offending file rather than
        # failing with an opaque error inside the filter call.
        raise OSError(f"cv2.imread could not read image: {path}")
    # A scalar size applies to every axis, so the window also spans the
    # channel axis of the 3-D array returned by cv2.imread.
    eroded=ndimage.minimum_filter(img,5)
    # Binarise: values below 100 become 0, everything else 255.
    binary=np.where(eroded<100,0,255)
    channel=binary[:,:,0]
    filled=scipy.ndimage.binary_fill_holes(channel)
    # filled*255 - channel is 255 exactly where a hole was filled and 0
    # elsewhere, so sum/255 is the number of filled pixels.
    hole_pixels=filled*255-channel
    train_area.append(hole_pixels.sum()/255)
    

  0%|          | 1250/433432 [00:12<1:09:55, 103.00it/s]


KeyboardInterrupt: 

In [None]:
# Pair every training image path with its computed area.
# zip() silently stops at the shorter input, so an interrupted area loop would
# otherwise produce a truncated table without any warning.
if len(train_area)!=len(train_ct_all_list):
    raise ValueError(
        f"train_area has {len(train_area)} entries for "
        f"{len(train_ct_all_list)} image paths; run the area loop to completion first"
    )
train_area_df=pd.DataFrame((zip(train_ct_all_list, train_area)), columns = ['path', 'area'])
# NOTE(review): the saved table is loaded into `train_area_df_` (trailing
# underscore), a name that no visible cell reads - confirm whether it was
# meant to replace `train_area_df`.
train_area_df_=pd.read_csv("/home/fate/covid19_CT/input/train_area_df1.csv")

In [None]:
# Per-image "enclosed dark area" for the validation slices.
# For each image: take the local minimum over a 5-wide window, binarise at 100,
# fill the holes of the bright mask, and count the pixels added by the fill
# (dark pixels completely surrounded by bright ones).
# NOTE(review): presumably this approximates the lung cross-section visible in
# a CT slice - confirm with the author.
valid_area=[]
for path in tqdm(valid_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None instead of raising when a file is missing or
        # cannot be decoded; stop here and name the offending file rather than
        # failing with an opaque error inside the filter call.
        raise OSError(f"cv2.imread could not read image: {path}")
    # A scalar size applies to every axis, so the window also spans the
    # channel axis of the 3-D array returned by cv2.imread.
    eroded=ndimage.minimum_filter(img,5)
    # Binarise: values below 100 become 0, everything else 255.
    binary=np.where(eroded<100,0,255)
    channel=binary[:,:,0]
    filled=scipy.ndimage.binary_fill_holes(channel)
    # filled*255 - channel is 255 exactly where a hole was filled and 0
    # elsewhere, so sum/255 is the number of filled pixels.
    hole_pixels=filled*255-channel
    valid_area.append(hole_pixels.sum()/255)

In [None]:
# Pair every validation image path with its computed area.
# zip() silently stops at the shorter input, so an interrupted area loop would
# otherwise produce a truncated table without any warning.
if len(valid_area)!=len(valid_ct_all_list):
    raise ValueError(
        f"valid_area has {len(valid_area)} entries for "
        f"{len(valid_ct_all_list)} image paths; run the area loop to completion first"
    )
valid_area_df=pd.DataFrame((zip(valid_ct_all_list, valid_area)), columns = ['path', 'area'])

In [None]:
# Save the validation path/area table to CSV (index omitted) so it can be
# reloaded later without recomputing the per-image areas.
valid_area_df.to_csv("/home/fate/covid19_CT/input/valid_area_df1.csv",index=False)

In [10]:
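# Optional shortcut: uncomment to load previously saved area tables instead of
# recomputing them image by image. Both CSV files are assumed to exist already
# (TODO confirm how train_area_df1.csv was produced).
# train_area_df=pd.read_csv("/home/fate/covid19_CT/input/train_area_df1.csv")
# valid_area_df=pd.read_csv("/home/fate/covid19_CT/input/valid_area_df1.csv")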

In [11]:
def _ct_dir(image_path):
    """Return the part of image_path before its last "/" ("" if there is none)."""
    return image_path.rpartition("/")[0]


def _slice_number(image_path):
    """Return the file name's text before its first ".", converted to int."""
    file_name = image_path.rpartition("/")[2]
    return int(file_name.partition(".")[0])


# Tag every image row with the directory it lives in (one directory per CT
# scan) and its numeric file name, then order the rows scan by scan and slice
# by slice. Both tables are modified in place.
for area_df in (train_area_df, valid_area_df):
    area_df["ct_path"] = area_df["path"].apply(_ct_dir)
    area_df["ct_slice"] = area_df["path"].apply(_slice_number)
    area_df.sort_values(by=["ct_path", "ct_slice"], inplace=True)



In [13]:
def sum_max(a,w=0.4):
    """Locate the contiguous window of ``a`` with the largest sum.

    The window length is ``ceil(len(a) * w)``, clamped to ``[0, len(a)]``.

    Args:
        a: 1-D sequence or array of numbers (must support ``len`` and slicing).
        w: fraction of the sequence that the window should cover.

    Returns:
        ``(start, stop)`` as Python ints such that ``a[start:stop]`` is the
        first window reaching the maximum sum; ``(0, 0)`` for empty input.
    """
    n = len(a)
    # Clamp so that w > 1 or w < 0 cannot yield a window longer than the data
    # or one of negative length.
    k = min(max(int(np.ceil(n * w)), 0), n)
    best_start = 0
    # Start from -inf rather than 0 so the best window is still found when
    # every window sum is negative.
    best_sum = float("-inf")
    for start in range(n - k + 1):
        window_sum = np.sum(a[start:start + k])  # evaluated once per window
        # Strict comparison keeps the earliest window on ties.
        if window_sum > best_sum:
            best_sum = window_sum
            best_start = start
    return best_start, best_start + k



In [20]:
# For every training CT scan, find the run of consecutive rows (half of the
# scan's rows, rounded up) with the largest summed area and store its
# [start, stop] row offsets within that scan under the scan's ct_path.
ct_path_list=train_area_df["ct_path"].unique()
train_dic={}
# One groupby pass instead of re-filtering the whole frame with a boolean mask
# for each scan. sort=False yields groups in first-appearance order (the order
# unique() reports) and rows keep their in-frame order within each group.
for ct_path,ct_df in tqdm(train_area_df.groupby("ct_path",sort=False)):
    train_dic[ct_path]=list(sum_max(ct_df["area"].values,0.5))

100%|██████████| 1990/1990 [01:04<00:00, 30.86it/s]


In [16]:
# For every validation CT scan, find the run of consecutive rows (half of the
# scan's rows, rounded up) with the largest summed area and store its
# [start, stop] row offsets within that scan under the scan's ct_path.
ct_path_list=valid_area_df["ct_path"].unique()
valid_dic={}
# One groupby pass instead of re-filtering the whole frame with a boolean mask
# for each scan. sort=False yields groups in first-appearance order (the order
# unique() reports) and rows keep their in-frame order within each group.
for ct_path,ct_df in tqdm(valid_area_df.groupby("ct_path",sort=False)):
    valid_dic[ct_path]=list(sum_max(ct_df["area"].values,0.5))

100%|██████████| 484/484 [00:02<00:00, 217.76it/s]


In [18]:
import pickle

In [None]:
# Serialise the training lookup (ct_path -> [start, stop]) with the newest
# pickle protocol available to this interpreter.
with open('/home/fate/covid19_CT/output/train_dic1_05.pickle', 'wb') as pickle_file:
    pickle.dump(train_dic, pickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Serialise the validation lookup (ct_path -> [start, stop]) with the newest
# pickle protocol available to this interpreter.
with open('/home/fate/covid19_CT/output/valid_dic1_05.pickle', 'wb') as pickle_file:
    pickle.dump(valid_dic, pickle_file, pickle.HIGHEST_PROTOCOL)