In [1]:
import pandas as pd

In [2]:
# Filesystem locations used throughout the notebook (relative paths).

# CSV inputs: one DICOM file-name list per split, plus the RLE annotation file.
annot_csv = "../data/train-rle.csv"
train_dicom_names = "../data/train-dicom-names.csv"
test_dicom_names = "../data/test-dicom-names.csv"

# Image directories.
train_path = "../data/dicom-images-train/"
test_path = "../data/dicom-images-test/"
sample_data = "../data/sample images/"

### Overview of Data

An issue another user reported; check whether it applies to our download as well:
> I was able to download all train data (10712 images) - all I had to use sleep(0.025) before two api calls for each study_uid in download_images.py script. The remaining issue is that there is 37 images in train set which don't have annotations. Two options there - drop them from training, or assume its rle-encoding is "-1""

Each DICOM image is nested two folders deep. E.g.
- ../data/dicom-images-test/
  - 1.2.276.0.7230010.3.1.2.8323329.6769.1517875200.68560/ <-- level-1 folder (one of many)
    - 1.2.276.0.7230010.3.1.3.8323329.6769.1517875200.68559/ <-- level-2 folder (the only one inside)
      - 1.2.276.0.7230010.3.1.4.8323329.6769.1517875200.68561.dcm <-- single dicom image

Pipe a recursive file listing to `wc`: ```find . -type f | wc -l``` This gets us the number of images.

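Now we need the names of all the dicom images: ```find ../data/dicom-images-train/ -type f -printf "%f\n" | paste -sd ',' >> ../data/train-dicom-names``` We will use this list to cross-reference the annotations csv and also to check for duplicate data. NOTE: `dicom_count_checker` below reads `../data/train-dicom-names.csv` and counts one file name per line, so the file it consumes must be newline-separated (i.e. produced without the `paste -sd ','` step) and carry the `.csv` extension. TODO: confirm the exact command that generated the file.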


Train Imgs: 10712  -  Annotation rows: 11582 (8296 = -1, i.e. no disorder annotated; 3286 = disorder annotated), roughly 2.5:1 negative to positive

Test Imgs: 1377  -  Annotations: No .dcm images are present in the annotation file.

Duplicate Dicom Imgs: No

##### ISSUE: 37 Dicom images are not accounted for in the annotation list. 



In [21]:
def dicom_count_checker(path, assumed_count):
    '''Validate a DICOM file-name list and return its names as a set.

    Reads ``path`` line by line (one file name per line) and checks that the
    file holds exactly ``assumed_count`` lines and that no name is repeated,
    i.e. that we downloaded the expected amount of data with no duplicates.

    Args:
        path: path of the text file holding one DICOM file name per line.
        assumed_count: number of file names the list is expected to contain.

    Returns:
        set of the file names read from ``path`` (trailing newline removed).

    Raises:
        AssertionError: if the number of lines differs from ``assumed_count``
            or the list contains duplicate names.
    '''
    counted_imgs = 0
    data_set = set()
    with open(path, 'r') as infile:
        for line in infile:
            counted_imgs += 1
            data_set.add(line.strip('\n'))  # drop the line terminator only

    # Two plain boolean checks. Do not join them with `&`: it is the bitwise
    # operator and binds tighter than `==`, so `a == n & b == n` is parsed as
    # the chained comparison `a == (n & b) == n`.
    assert counted_imgs == assumed_count, (
        '%s: expected %s names, found %s' % (path, assumed_count, counted_imgs))
    assert len(data_set) == counted_imgs, (
        '%s: %s duplicate name(s) found' % (path, counted_imgs - len(data_set)))
    print(path + ': count assumption valid, no duplicates!')
    return data_set

# Load both file-name lists, validating each against the image count we expect.
train_names = dicom_count_checker(path=train_dicom_names, assumed_count=10712)
test_names = dicom_count_checker(path=test_dicom_names, assumed_count=1377)



../data/train-dicom-names.csv: count assumption valid, no duplicates!
../data/test-dicom-names.csv: count assumption valid, no duplicates!


In [30]:
# Ensure no data leakage between train/test: no file name may appear in both
# splits. On failure, report a few of the shared names so the leak can be
# tracked down (the message is only built when the assertion fails).
assert train_names.isdisjoint(test_names), (
    'train/test overlap, e.g.: %s' % sorted(train_names & test_names)[:5])
print('no data leakage')

no data leakage


In [78]:
# Walk the annotation CSV once and collect:
#   count              - number of annotation rows (header excluded)
#   count_no_annot     - rows whose second column is '-1' (no disorder present)
#   unique_annotations - distinct first-column ids with '.dcm' appended, so
#                        they can be compared with the dicom file-name sets
count = 0
count_no_annot = 0
unique_annotations = set()

with open(annot_csv, 'r') as infile:
    next(infile, None)  # skip the header row; the default tolerates an empty file
    for line in infile:
        if not line.strip():
            continue  # ignore blank lines (e.g. a trailing newline at end of file)
        cols = line.split(',')
        image_id = cols[0].strip()
        encoding = cols[1].strip()  # drop the newline and any padding around the value
        count += 1
        if encoding == '-1':
            count_no_annot += 1
        unique_annotations.add(image_id + '.dcm')

print('total annotations: ', count)
print('total rows w/o annotations: ',count_no_annot)
print('total unique annotations: ',len(unique_annotations))

total annotations:  11582
total rows w/o annotations:  8296
total unique annotations:  10675


In [82]:
# The annotated file names and the test file names must share no element.
# On failure, report a few of the offending names (the message is only built
# when the assertion fails).
assert unique_annotations.isdisjoint(test_names), (
    'annotated test files found, e.g.: %s'
    % sorted(unique_annotations & test_names)[:5])
print('no annotated test files... yay!')

no annotated test files... yay!


In [85]:
# The problem in numbers: training images (first line) vs. distinct annotated
# images (second line) -- not every training image is annotated.
print(len(train_names), len(unique_annotations), sep='\n')

10712
10675


In [101]:
# Here are the bad actors... There are no mentions of these files in the annotations.
# Do we treat these as a -1? or are these valid cases of the disorder... or drop them!
#
# A symmetric difference also contains names that are annotated but absent
# from the training set. Check that there are none, so that everything in
# `sym_diff` really is a training image without an annotation.
assert unique_annotations <= train_names, (
    'annotated files missing from the training set, e.g.: %s'
    % sorted(unique_annotations - train_names)[:5])
sym_diff = train_names.symmetric_difference(unique_annotations)
print(len(sym_diff))
sym_diff

37


{'1.2.276.0.7230010.3.1.4.8323329.10231.1517875222.737143.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.10362.1517875223.377845.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.10407.1517875223.567351.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.10599.1517875224.488727.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.1068.1517875166.144255.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.11104.1517875231.169401.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.11215.1517875231.757436.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.11557.1517875233.601090.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.11566.1517875233.640521.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.11577.1517875233.694347.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.11584.1517875233.731531.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.12062.1517875237.179186.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.13378.1517875244.961609.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.13415.1517875245.218707.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.13620.1517875246.884737.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.14557.1