In [87]:
import os
import numpy as np
import cv2
import shutil
from imutils import paths

In [88]:
# Number of cross-validation folds; the cells below build, copy and iterate
# over range(SPLIT) splits.
SPLIT = 5

In [89]:
# Folder that holds the raw videos (and stand-alone images) to be processed.
video_dir = "../data_pocus/cleaned_data_videos"
# Folder the extracted frames are written to, one sub-folder per class.
out_image_dir = "../data_pocus/cleaned_data_images"
# The split-building cells read the frames back from that same folder.
base_dir = out_image_dir
# Root folder of the per-fold copies ("split0", "split1", ...).
CROSS_VAL_DIR = "../data_pocus/cross_validation_data"
# Frames saved per second of video: every int(fps / FRAMERATE)-th frame is
# written, so 2 means two frames per second.
FRAMERATE = 2

### Make directories

In [4]:
# Create the output folders. exist_ok=True keeps the cell safe to re-run and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs(out_image_dir, exist_ok=True)
os.makedirs(CROSS_VAL_DIR, exist_ok=True)
# One sub-folder per class. "pneunomia" (sic) is the folder name the rest of
# the notebook relies on, so the spelling must not be changed here alone.
for mod in ["covid", "pneunomia", "regular"]:
    os.makedirs(os.path.join(out_image_dir, mod), exist_ok=True)

## Videos to images

In [5]:
# Walk over every entry of video_dir: stand-alone images are copied unchanged,
# videos are sampled at roughly FRAMERATE frames per second and the sampled
# frames are written as jpg files into out_image_dir/<label>/.
vid_files = os.listdir(video_dir)
for i in range(len(vid_files)):

    # skip hidden files (.DS_Store etc.)
    if vid_files[i][0] == ".":
        continue

    # define video path
    print("video", vid_files[i], "number ", i, "out of ", len(vid_files))
    video_path = os.path.join(video_dir, vid_files[i])

    # determine the class label from the first three characters of the name
    if vid_files[i][:3] == "Cov":
        label = "covid"
    elif vid_files[i][:3] == "Pne" or vid_files[i][:3] == "pne":
        label = "pneunomia"
    elif vid_files[i][:3] == "Reg":
        label = "regular"
    else:
        raise ValueError("Wrong label! " + vid_files[i])
    out_path = os.path.join(out_image_dir, label)

    # copy if image
    if vid_files[i][-3:] == "jpg" or vid_files[i][-3:] == "png":
        shutil.copy(video_path, out_path)
        continue

    # read and write if video
    cap = cv2.VideoCapture(video_path)
    try:
        frameRate = cap.get(cv2.CAP_PROP_FPS)
        # Clamp to 1: for videos slower than FRAMERATE fps the integer
        # division yields 0, which would make the modulo below raise
        # ZeroDivisionError. In that case every frame is kept.
        every_x_image = max(1, int(frameRate / FRAMERATE))
        print("actual framerate", frameRate, "--> taking every ", every_x_image, "th image")
        while cap.isOpened():
            # index of the frame that the next read() returns
            frameId = cap.get(cv2.CAP_PROP_POS_FRAMES)
            ret, frame = cap.read()
            if not ret:
                break
            if frameId % every_x_image == 0:
                # The full video file name (with extension) is kept as prefix,
                # so the text before the first "." identifies the video.
                filename = os.path.join(out_path, vid_files[i] + "_frame%d.jpg" % frameId)
                cv2.imwrite(filename, frame)
    finally:
        # release the capture even if reading or writing a frame fails
        cap.release()

video Reg-Grep-Normal.gif number  0 out of  45
actual framerate 10.0 --> taking every  5 th image
video Pneu-Atlas-pneumonia2.gif number  1 out of  45
actual framerate 10.0 --> taking every  5 th image
video Cov-Atlas-+(43).gif number  2 out of  45
actual framerate 10.0 --> taking every  5 th image
video Cov-B_ConvexProb_score1.jpg number  3 out of  45
video Pneu-Atlas-pneumonia.gif number  4 out of  45
actual framerate 10.0 --> taking every  5 th image
video Reg-Atlas-lungcurtain.gif number  5 out of  45
actual framerate 10.0 --> taking every  5 th image
video Reg-Youtube_Video_29_Lung_POCUS_right.mp4 number  6 out of  45
actual framerate 30.0 --> taking every  15 th image
video Reg-Butterfly.mp4 number  7 out of  45
actual framerate 30.0 --> taking every  15 th image
video Cov-Atlas+(45).gif number  8 out of  45
actual framerate 10.0 --> taking every  5 th image
video Cov-C_Convex_Prob_score2.jpg number  9 out of  45
video Cov-Butterfly-COVID Skip Lesion.mp4 number  10 out of  45
act

## Make splits of approximately equal test size

In [90]:
# Build SPLIT folds per class such that all frames of one video land in the
# same fold and the folds hold approximately equal numbers of frames.
# Result: split_test[j][modality] is the list of frame file names of fold j.
split_test = [{} for _ in range(SPLIT)]

for modality in ["covid", "pneunomia", "regular"]:
    p_vids = []
    p_fn = []
    # frame file names grouped by the video they belong to; the video name is
    # the part of the file name before the first "."
    files_per_video = {}
    for cov_data in os.listdir(os.path.join(base_dir, modality)):
        if cov_data[0] == ".":
            continue
        vid = cov_data.split(".")[0]
        p_fn.append(cov_data)
        p_vids.append(vid)
        files_per_video.setdefault(vid, []).append(cov_data)
    # unique video names (sorted) and the number of frames of each video
    vid_names, count = np.unique(p_vids, return_counts=True)
    name_list = [[v] for v in vid_names]

    # TODO: get the number of frames with opencv instead

    # summarize to number of split (always merge the ones with smallest count)
    while len(count) > SPLIT:
        arg_inds = np.argsort(count)
        # merge smallest counts
        count[arg_inds[0]] = count[arg_inds[0]] + count[arg_inds[1]]
        count = np.delete(count, arg_inds[1])
        # merge video names in smallest counts
        name_list[arg_inds[0]].extend(name_list[arg_inds[1]])
        del name_list[arg_inds[1]]

    # A class with fewer videos than folds cannot fill every fold; pad with
    # empty folds instead of failing with an IndexError below.
    if len(name_list) < SPLIT:
        print("warning:", modality, "has only", len(name_list), "videos for", SPLIT, "splits")
        name_list.extend([] for _ in range(SPLIT - len(name_list)))

    # add the file names (instead of video names) to the overall split list
    for j in range(SPLIT):
        split_test[j][modality] = [
            fn for vid in name_list[j] for fn in files_per_video[str(vid)]
        ]

    

## Copy the data of each split into the cross_val directory

In [7]:
# Materialise the folds on disk as CROSS_VAL_DIR/split<k>/<class>/<frame file>.
# NOTE: files copied by an earlier run are not removed here, so the target
# folders should be empty when the split assignment has changed.
for split_ind in range(SPLIT):
    # make directory for this split (exist_ok keeps the cell re-runnable)
    split_path = os.path.join(CROSS_VAL_DIR, "split" + str(split_ind))
    os.makedirs(split_path, exist_ok=True)
    # add each data type
    for modality, mod_split_files in split_test[split_ind].items():
        # make directory for each modality
        mod_path = os.path.join(split_path, modality)
        os.makedirs(mod_path, exist_ok=True)
        # copy all files of this class that belong to the split
        for fname in mod_split_files:
            shutil.copy(os.path.join(base_dir, modality, fname), mod_path)

## Cross-validation loop

In [8]:
# For the pocus-splitted data

# MAIN LOOP FOR CROSS VAL
# Sanity check of the folds: for every split, the images of folder
# "split<split>" form the test set and all other images the train set; the
# video names of both sets are printed and must not overlap.
for split in range(SPLIT):
    print("-------------- SPLIT ", split, "-------------------")

    train_labels, test_labels, test_files = [], [], []
    train_data, test_data = [], []
    test_dir_name = "split" + str(split)

    # loop over split0, split1 etc
    for imagePath in paths.list_images(CROSS_VAL_DIR):

        # path layout: .../split<k>/<label>/<file name>
        path_parts = imagePath.split(os.path.sep)
        # Compare the whole folder name: looking only at its last character
        # would confuse e.g. "split10" with "split0" once SPLIT exceeds 10.
        is_test = path_parts[-3] == test_dir_name
        # extract the class label from the folder name
        label = path_parts[-2]
        file_name = path_parts[-1]

        # TESTING: the video name (text before the first ".") stands in for
        # the image data, which is not loaded in this check
        image = file_name.split(".")[0]

        # update the data and labels lists, respectively
        if is_test:
            test_labels.append(label)
            test_data.append(image)
            test_files.append(file_name)
        else:
            train_labels.append(label)
            train_data.append(image)

    # Test printouts
    train_videos = np.unique(train_data)
    test_videos = np.unique(test_data)
    print("train", train_videos)
    print("test", test_videos)
    assert len(set(train_videos).intersection(set(test_videos))) == 0, "intersection train test nonempty"
    print(len(train_videos) + len(test_videos))

-------------- SPLIT  0 -------------------
train ['Cov-Butterfly-COVID Lung 1' 'Cov-Butterfly-COVID Lung 2'
 'Cov-Butterfly-COVID Skip Lesion' 'Cov-MSU-COVID Lung 2-Blines'
 'Cov-MSU-SkipLesions' 'Cov-clarius' 'Cov-grepmed-blines-pocus-'
 'Cov-grepmed2' 'Cov-grepmed3' 'Pneu-Atlas-pneumonia'
 'Pneu-Atlas-pneumonia-AirBronch' 'Pneu-Atlas-pneumonia2'
 'Pneu-grep-pneumonia1' 'Pneu-grep-pneumonia3' 'Pneu-grep-pneumonia4'
 'Reg-Butterfly' 'Reg-Grep-Alines' 'Reg-Grep-Normal' 'Reg-NormalLungs'
 'Reg-Youtube-Video_902_Lung_POCUS'
 'Reg-Youtube-Video_902_Lung_POCUS-left'
 'Reg-Youtube_Video_29_Lung_POCUS_left'
 'Reg-Youtube_Video_29_Lung_POCUS_right' 'Reg-bcpocus' 'Reg-nephropocus'
 'pneu-everyday' 'pneu-gred-6' 'pneu-gred-7' 'pneu-radiopaeda']
test ['Cov-Atlas+(44)' 'Cov-Atlas+(45)' 'Cov-Atlas-+(43)' 'Cov-Atlas-Day+1'
 'Cov-Atlas-Day+2' 'Cov-Atlas-Day+3' 'Cov-Atlas-Day+4'
 'Cov-B_ConvexProb_score1' 'Cov-C_ConvexProb_score3'
 'Cov-C_Convex_Prob_score2' 'Cov-D_Convex_Prob_score3'
 'Pneu-grep-pne

## Data for covid paper model

In [104]:
# Folder with the per-fold images ("split0", "split1", ...).
txt_in_path = "../data_pocus/cross_validation_data"
# Folder the fold listing text files are written to.
txt_out_path = "../data_pocus/cross_val_txt"
# Translates the class folder names into the label strings written to the
# text files; the "pneunomia" key (sic) matches the folder name on disk.
name_mapping = dict(
    covid="COVID-19",
    pneunomia="pneumonia",
    regular="normal",
)

In [111]:
# Write one train and one test listing per fold into txt_out_path. Every line
# has the form "<video name>\t<image path>\t<label>".
os.makedirs(txt_out_path, exist_ok=True)  # open() does not create the folder

for split_ind in range(SPLIT):
    # fold split_ind is the test set, all remaining folds form the train set
    train_data, test_data = [], []
    for iter_fold in range(SPLIT):
        fold_images = paths.list_images(os.path.join(txt_in_path, "split" + str(iter_fold)))
        if iter_fold == split_ind:
            test_data.extend(fold_images)
        else:
            train_data.extend(fold_images)

    train_test_data = {"train": train_data, "test": test_data}

    for traintest in ["train", "test"]:
        out_test_file = os.path.join(txt_out_path, "fold_" + str(split_ind) + "_" + traintest + ".txt")
        # "w" rather than "a": re-running the cell must replace the listing
        # instead of appending a duplicate copy of every line.
        with open(out_test_file, 'w') as outfile:
            for line in train_test_data[traintest]:
                # .../<class folder>/<file name>, split with os.path so the
                # code does not depend on "/" being the separator
                class_dir, file_name = os.path.split(line)
                # video name = text before the first ".", blanks replaced
                vid_name = file_name.split(".")[0].replace(" ", "_")
                label = name_mapping[os.path.basename(class_dir)]
                out_line = "\t".join([vid_name, line, label])
                outfile.write(out_line + "\n")
