# Initial setup
## Load dependencies

In [23]:
import os
import numpy as np
from pathlib import Path
import time
from shutil import copyfile

## Define locations for data
We seperate output directories into raw and non-raw/final. This is because we need to do a set of preprocessing stages to the raw data before we make the final dataset.

In [53]:
# inputs
input_image_dir = '//media/jake/1tb_ssd/mctv_analysis/head_scans'
input_labels_dir = '//media/jake/1tb_ssd/mctv_analysis/mctv_resfiles'

# outputs
main_dir = '//media/jake/data/jake/mctnet_data'
raw_image_dir = '//media/jake/data/jake/mctnet_data/raw_images'
raw_labels_dir = '//media/jake/data/jake/mctnet_data/raw_labels'
image_dir = '//media/jake/data/jake/mctnet_data/images'
labels_dir = '//media/jake/data/jake/mctnet_data/labels'

if not os.path.isdir(image_dir):
    os.makedirs(image_dir)

if not os.path.isdir(labels_dir):
    os.makedirs(labels_dir)

# Collate images
Our original images are organised in a messy way. Let's collate them together and fix that up.

First, let's find all the images in the input image directory

In [54]:
nii_files = []
dicom_dirs = []
for root, dirs, files in os.walk(input_image_dir):
    for i, file in enumerate(files):
        if file.endswith('.nii'):
            nii_files.append(root + '/' + file)
        if file.endswith('.dcm'):
            dicom_dirs.append(root)

# remove duplicate dicom_dirs
dicom_dirs = list(set(dicom_dirs))

print('Found ' + str(len(nii_files)) + ' nifti files')
print('Found ' + str(len(dicom_dirs)) + ' dicom directories')

Found 26 nifti files
Found 70 dicom directories


Let's plot the first 10

In [55]:
nii_files[:10]

['//media/jake/1tb_ssd/mctv_analysis/head_scans/Brachyscelus.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Vibilia_01_FEG191211_087_filterted.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG200130_103_head_04.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Scypholanceola_head_02_FEG191022_076.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Platyscelus_02_FEG191112_082.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Phronima_05_FEG200107_090.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG200130_102_head_05.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Hyperia 01_segmented eyes.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Phronima_04_FEG200107_089.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/psyllid_20190906_male_eye/psyllid_20190906_male_eye.nii']

In [56]:
dicom_dirs[:10]

['//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Streetsia_01_head_sp_4_USNM1450785_3',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_03_sp_6_A',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Scina_02_sp_2_1450794_slices',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG181024_head_sp_8_1423158/Paraphronima_sp_8_1423158-slices',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Hyperia_head_7_1432188A',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Cystisoma_FEG190802_040_brain',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Streetsia_head_sp_4_USNM1450785_2',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Streetsia_head_sp_4_USNM1450785_1',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/P_crassipes_FEG190213_003b_02_head']

# Move nifti to raw images folder
Let's move all the nifti files to the raw_images directory.


In [57]:
for d in nii_files:
    p = Path(d)
    filename = p.stem
    new_path = raw_image_dir + "/" + filename + '.nii'
    if not os.path.isfile(new_path):
        copyfile(d, new_path)
        print(f'Copied nifti {d} to {new_path}\n')
    else:
        print(f'File {new_path} already exists, so skipped this copy\n')


File //media/jake/data/jake/mctnet_data/raw_images/Brachyscelus.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Vibilia_01_FEG191211_087_filterted.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Paraphronima_FEG200130_103_head_04.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Scypholanceola_head_02_FEG191022_076.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Platyscelus_02_FEG191112_082.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Phronima_05_FEG200107_090.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Paraphronima_FEG200130_102_head_05.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Hyperia 01_segmented eyes.nii already exists, so skipped this copy

File //media/jake/da

# Convert dicom to nifti
A bunch of files are in the dicom format. Let's convert these to nifti, so that all our data are consistent and in single files.

To do this, we will be using dcm2niix found at https://github.com/rordenlab/dcm2niix
This can be installed on linux via
```
sudo apt install dicom2nixx
```


We have the dicom directories in `dicom_dirs` but we also need to extract the output path and the filename. Let's do that here

In [62]:
import pprint

conversion_info = []

for d in dicom_dirs:
    p = Path(d)
    filename = p.stem
    #new_path = raw_image_dir + "/" + filename + '.nii'

    conversion_info.append(
        dict(
            in_dir=d,
            out_dir=raw_image_dir,
            filename=filename
        )
    )

# print first 5 to check if looks ok
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(conversion_info[:5])

[ { 'filename': 'Paraphronima_head_04_FEG200130_103',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103',
    'out_dir': '//media/jake/data/jake/mctnet_data/raw_images'},
  { 'filename': 'Streetsia_01_head_sp_4_USNM1450785_3',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/head_scans/Streetsia_01_head_sp_4_USNM1450785_3',
    'out_dir': '//media/jake/data/jake/mctnet_data/raw_images'},
  { 'filename': 'Eupronoe_03_sp_6_A',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_03_sp_6_A',
    'out_dir': '//media/jake/data/jake/mctnet_data/raw_images'},
  { 'filename': 'Scina_02_sp_2_1450794_slices',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/head_scans/Scina_02_sp_2_1450794_slices',
    'out_dir': '//media/jake/data/jake/mctnet_data/raw_images'},
  { 'filename': 'Paraphronima_sp_8_1423158-slices',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG181024_head_sp_8_1423158/Paraphronima_sp_8_

Now let's create a yaml file to tell dicom2niix what to do

In [63]:
import yaml

data = dict(
    Options = dict(
      isGz='false',
      isFlipY='false',
      isVerbose='false',
      isCreateBIDS='false',
      isOnlySingleFile='false'
    ),
    Files = conversion_info
)
with open('dicom2niix_batch_info.yml', 'w') as outfile:
    yaml.dump(data, outfile)

and let's run dicom2niix

In [None]:
!dcm2niibatch dicom2niix_batch_info.yml

dcm2niibatch using Chris Rorden's dcm2niiX version v1.0.20181125  (JP2:OpenJPEG) GCC9.3.0 (64-bit Linux)
yaml_path: dicom2niix_batch_info.yml
Compression will be faster with 'pigz' installed
Found 1496 DICOM file(s)
Error: Instance number (0020,0013) not found: //media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103/FEG200130_103_0000.dcm
Error: Instance number (0020,0013) not found: //media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103/FEG200130_103_0001.dcm
Error: Instance number (0020,0013) not found: //media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103/FEG200130_103_0002.dcm
Error: Instance number (0020,0013) not found: //media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103/FEG200130_103_0003.dcm
Error: Instance number (0020,0013) not found: //media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103/FEG200130_103_0004.dcm
Error: Instance number (0020,0013)

# Move raw labels
Now that all our images are in our raw_images folder. Let's move the raw labels over too.

# DON'T USE
# Convert dicom to nifti alternative (doesn't seem to work too well with some dicom files.
# appears to be a problem with dicom2nifti)

Let's start by making a function that can convert our dicom directories to nifti.

In [17]:
def convert_nifti_to_dicom(d):
    print(f'Starting conversion of {d} \n...\n')

    p = Path(d)
    filename = p.stem
    new_path = raw_image_dir + "/" + filename + '.nii'
    
    if not os.path.isfile(new_path):
        dicom2nifti.dicom_series_to_nifti(d, new_path, reorient_nifti=True)
        print(f'Converted dicom {d} to {new_path}\n')
    else:
        print(f'File {new_path} already exists, so skipped this conversion\n')


To make this faster, let's create some multiprocessing helper functions

In [26]:
import multiprocessing

# split a list into evenly sized chunks
def chunks(l, n):
    return [l[i:i+n] for i in range(0, len(l), n)]

def do_job(job_id, data, func):
    for item in data:
        print(f'Worker {job_id} doing job\n')
        func(item)

def dispatch_jobs(job_number, data, func):
    total = len(data)
    chunk_size = total / job_number
    slice = chunks(data, int(chunk_size))
    jobs = []

    for i, s in enumerate(slice):
        j = multiprocessing.Process(target=do_job, args=(i, s, func))
        jobs.append(j)
    for j in jobs:
        j.start()

Now let's run this function over our data!

In [27]:
print(f'Converting {len(dicom_dirs)} dicom scans to nifti')

starttime = time.time()

dispatch_jobs(6, dicom_dirs, convert_nifti_to_dicom)

print('That took {} seconds'.format(time.time() - starttime))

Converting 70 dicom scans to nifti
Worker 0 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/flammula_male_23_6_left 
...
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_03_sp_6_A 
...
Worker 1 doing job

File /media/jake/data/jake/mctnet_data/raw_images/flammula_male_23_6_left.nii already exists, so skipped this conversion


Worker 0 doing jobWorker 2 doing job

Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_03_sp_6_A/Leptocotis_FEG190214_004a_body_slices 
...
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Hyperia_FEG190604_013A-slices-body 
...


File /media/jake/data/jake/mctnet_data/raw_images/Leptocotis_FEG190214_004a_body_slices.nii already exists, so skipped this conversion
Worker 3 doing job

Worker 0 doing jobStarting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Lanceola_01_head_FEG190802_037 
...
Worker 4 doing jobThat took 0.05334925651550293 seconds


S

Process Process-32:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_767898/2836884920.py", line 10, in do_job
    func(item)
  File "/tmp/ipykernel_767898/1759632588.py", line 9, in convert_nifti_to_dicom
    dicom2nifti.dicom_series_to_nifti(d, new_path, reorient_nifti=True)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 78, in dicom_series_to_nifti
    return dicom_array_to_nifti(dicom_input, output_file, reorient_nifti)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 112, in dicom_array_to_nifti
    if not are_imaging_dicoms(dicom_list):
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 151, in are_

Converted dicom /media/jake/1tb_ssd/mctv_analysis/head_scans/Cystisoma_FEG20190212_01_head to /media/jake/data/jake/mctnet_data/raw_images/Cystisoma_FEG20190212_01_head.nii

Worker 6 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Scina_02_sp_2_1450794_slices 
...

File /media/jake/data/jake/mctnet_data/raw_images/Scina_02_sp_2_1450794_slices.nii already exists, so skipped this conversion

Worker 6 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Leptocotis_01_head_FEG190214_005a 
...

File /media/jake/data/jake/mctnet_data/raw_images/Leptocotis_01_head_FEG190214_005a.nii already exists, so skipped this conversion

Worker 6 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Vibilia_FEG191113_084_blurry 
...

Converted dicom /media/jake/1tb_ssd/mctv_analysis/head_scans/Schypholanceola_FEG190801_036_head to /media/jake/data/jake/mctnet_data/raw_images/Schypholanceola_FEG190801_036_head.nii

Worker 0 doi

Process Process-29:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_767898/2836884920.py", line 10, in do_job
    func(item)
  File "/tmp/ipykernel_767898/1759632588.py", line 9, in convert_nifti_to_dicom
    dicom2nifti.dicom_series_to_nifti(d, new_path, reorient_nifti=True)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 78, in dicom_series_to_nifti
    return dicom_array_to_nifti(dicom_input, output_file, reorient_nifti)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 112, in dicom_array_to_nifti
    if not are_imaging_dicoms(dicom_list):
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 151, in are_