# Initial setup
## Load dependencies

In [7]:
import os
import numpy as np
from pathlib import Path
import time
from shutil import copyfile
import pandas as pd

## Define locations for data
We seperate output directories into raw and non-raw. This is because we need to do a set of preprocessing stages to the raw data before we make the final dataset.

In [8]:
# inputs
input_image_dir = '//media/jake/1tb_ssd/mctv_analysis/Head Scans'
input_labels_dir = '//home/jake/projects/mctv_resfiles'

# outputs
main_dir = '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data'
raw_image_dir = '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images'
raw_labels_dir = '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_labels'
image_dir = '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/images'
labels_dir = '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/labels'

if not os.path.isdir(image_dir):
    os.makedirs(image_dir)

if not os.path.isdir(labels_dir):
    os.makedirs(labels_dir)

# Collate images
Our original images are organised in a messy way. Let's collate them together and fix that up.

First, let's find all the images in the input image directory

In [9]:
nii_files = []
dicom_dirs = []
for root, dirs, files in os.walk(input_image_dir):
    for i, file in enumerate(files):
        if file.endswith('.nii'):
            nii_files.append(root + '/' + file)
        if file.endswith('.dcm'):
            dicom_dirs.append(root)

# remove duplicate dicom_dirs
dicom_dirs = list(set(dicom_dirs))

print('Found ' + str(len(nii_files)) + ' nifti files')
print('Found ' + str(len(dicom_dirs)) + ' dicom directories')

Found 42 nifti files
Found 86 dicom directories


Let's plot the first 10

In [8]:
nii_files[:10]

['//media/jake/1tb_ssd/mctv_analysis/head_scans/Brachyscelus.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Vibilia_01_FEG191211_087_filterted.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG200130_103_head_04.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Scypholanceola_head_02_FEG191022_076.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Platyscelus_02_FEG191112_082.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Phronima_05_FEG200107_090.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG200130_102_head_05.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Hyperia 01_segmented eyes.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Phronima_04_FEG200107_089.nii',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/psyllid_20190906_male_eye/psyllid_20190906_male_eye.nii']

In [9]:
dicom_dirs[:10]

['//media/jake/1tb_ssd/mctv_analysis/head_scans/Scina_02_sp_2_1450794_slices',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_01_sp_6_F',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Cystisoma_FEG20190212_01_head',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/P_crassipes_FEG190213_002_head_eyesdamaged',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Pronoe_sp_6_E',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_FEG181024_head_sp_3_1423158a',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/P_crassipes_FEG200130_104',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Phronima_head_1450842_head_2',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/flammula_male_23_6_left',
 '//media/jake/1tb_ssd/mctv_analysis/head_scans/Paraphronima_head_04_FEG200130_103']

# Move nifti to raw images folder
Let's move all the nifti files to the raw_images directory.


In [57]:
for d in nii_files:
    p = Path(d)
    filename = p.stem
    new_path = raw_image_dir + "/" + filename + '.nii'
    if not os.path.isfile(new_path):
        copyfile(d, new_path)
        print(f'Copied nifti {d} to {new_path}\n')
    else:
        print(f'File {new_path} already exists, so skipped this copy\n')


File //media/jake/data/jake/mctnet_data/raw_images/Brachyscelus.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Vibilia_01_FEG191211_087_filterted.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Paraphronima_FEG200130_103_head_04.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Scypholanceola_head_02_FEG191022_076.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Platyscelus_02_FEG191112_082.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Phronima_05_FEG200107_090.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Paraphronima_FEG200130_102_head_05.nii already exists, so skipped this copy

File //media/jake/data/jake/mctnet_data/raw_images/Hyperia 01_segmented eyes.nii already exists, so skipped this copy

File //media/jake/da

# Convert dicom to nifti
A bunch of files are in the dicom format. Let's convert these to nifti, so that all our data are consistent and in single files.

To do this, we will be using dcm2niix found at https://github.com/rordenlab/dcm2niix
This can be installed on linux via
```
sudo apt install dicom2nixx
```


We have the dicom directories in `dicom_dirs` but we also need to extract the output path and the filename. Let's do that here

In [10]:
import pprint

conversion_info = []

for d in dicom_dirs:
    p = Path(d)
    filename = p.stem
    #new_path = raw_image_dir + "/" + filename + '.nii'

    conversion_info.append(
        dict(
            in_dir=d,
            out_dir=raw_image_dir,
            filename=filename
        )
    )

# print first 5 to check if looks ok
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(conversion_info[:5])

[ { 'filename': 'Schypholanceola_FEG190802_038_01_head',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/Head '
              'Scans/Schypholanceola_FEG190802_038_01_head',
    'out_dir': '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images'},
  { 'filename': 'Paraphronima_sp_8_1423158-slices',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/Head '
              'Scans/Phronima_sp_5_USNM1450842/Paraphronima_sp_8_1423158-slices',
    'out_dir': '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images'},
  { 'filename': 'Streetsia_sp_4_1450785b',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/Head '
              'Scans/Streetsia_head_sp_4_1450785b/Streetsia_sp_4_1450785b',
    'out_dir': '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images'},
  { 'filename': 'FEG210804_117-Phrosina-semilunata-body-registered',
    'in_dir': '//media/jake/1tb_ssd/mctv_analysis/Head '
              'Scans/FEG210804_117-Phrosina-semilunata-body-re

Now let's create a yaml file to tell dicom2niix what to do

In [11]:
import yaml

data = dict(
    Options = dict(
      isGz='false',
      isFlipY='false',
      isVerbose='false',
      isCreateBIDS='false',
      isOnlySingleFile='false'
    ),
    Files = conversion_info
)
with open('dicom2niix_batch_info.yml', 'w') as outfile:
    yaml.dump(data, outfile)

and let's run dicom2niix (warning this may generate a huge output, so clear this afterwards)

In [None]:
!dcm2niibatch dicom2niix_batch_info.yml

Let's now look at the scans that are in the directory to make sure everything is set up properly.

In [13]:
print(
    f'There are {len(dicom_dirs)} dicom directories that should have been converted to \n\
        nifti and {len(nii_files)} nifti files that should have been moved \n\
        i.e. a total of {len(dicom_dirs) + len(nii_files)} nifti files.'
)

There are 70 dicom directories that should have been converted to 
        nifti and 26 nifti files that should have been moved 
        i.e. a total of 96 nifti files.


In [34]:
raw_nii_files = []
for file in os.listdir(raw_image_dir):
    if file.endswith('.nii'):
        raw_nii_files.append(raw_image_dir + '/' + file)


print(f'There are {len(raw_nii_files)} nifti files in the output folder')

There are 151 nifti files in the output folder


In [35]:
assert len(raw_nii_files) == (len(dicom_dirs) + len(nii_files)), 'the number of files do not match up'

AssertionError: the number of files do not match up

In [36]:
print(raw_nii_files[:5])

['//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images/Paraphronima_head_05_FEG200130_102.nii', '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images/Vibilia_FEG191112_081_blurrya.nii', '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images/Phronima_02_head_1450842_head_3.nii', '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images/Phronima_01_head_sp_5_USNM1450842.nii', '//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jake/mctnet_data/raw_images/FEG191028_078_dcm.nii']


If the above assert statement causes an error, then that means you may need to manually sort through these files to see what might have happened wrong in the conversion process. If some did not convert, then try to convert them manually via a gui interface. If there are too many files, go through them and check to see which scan matches up with the annotations.


# Checking and linking the image files to the annotated files manually

Let's create a csv file with the paths to all the image files we moved/converted. Then, let's add the annotated files we know of to this csv file

In [53]:
df = pd.DataFrame(
 {
    'image_file_path': raw_nii_files,
    'raw_annotated_file_path': '',
    'name': '',
    'converted_from_dicom': ''
 }
)
df

Unnamed: 0,image_file_path,raw_annotated_file_path,name
0,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
1,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
2,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
3,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
4,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
...,...,...,...
146,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
147,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
148,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,
149,//mnt/d37c99c5-3b94-47b9-9965-c66fd9a16e23/jak...,,


In [54]:
newfile_path = 'raw_image_info.csv'
if not os.path.isfile(newfile_path):
    df.to_csv('raw_image_info.csv', index=False)
else:
    print(newfile_path + ' already exists, so did not overwrite.')

raw_image_info.csv already exists, so did not overwrite.


Open this up with libreoffice (alternatively, just open the file with another program like excel)

In [None]:
!libreoffice --calc raw_image_info.csv

javaldx: Could not find a Java Runtime Environment!
Please ensure that a JVM and the package libreoffice-java-common
is installed.
If it is already installed then try removing ~/.config/libreoffice/4/user/config/javasettings_Linux_*.xml


Once you have created this csv file and filled it in, you should be ready to move onto step 02.

# Convert dicom to nifti alternative (doesn't seem to work too well with some dicom files.
# appears to be a problem with dicom2nifti or its inability to deal with file errors)

Let's start by making a function that can convert our dicom directories to nifti.

In [17]:
def convert_nifti_to_dicom(d):
    print(f'Starting conversion of {d} \n...\n')

    p = Path(d)
    filename = p.stem
    new_path = raw_image_dir + "/" + filename + '.nii'
    
    if not os.path.isfile(new_path):
        dicom2nifti.dicom_series_to_nifti(d, new_path, reorient_nifti=True)
        print(f'Converted dicom {d} to {new_path}\n')
    else:
        print(f'File {new_path} already exists, so skipped this conversion\n')


To make this faster, let's create some multiprocessing helper functions

In [26]:
import multiprocessing

# split a list into evenly sized chunks
def chunks(l, n):
    return [l[i:i+n] for i in range(0, len(l), n)]

def do_job(job_id, data, func):
    for item in data:
        print(f'Worker {job_id} doing job\n')
        func(item)

def dispatch_jobs(job_number, data, func):
    total = len(data)
    chunk_size = total / job_number
    slice = chunks(data, int(chunk_size))
    jobs = []

    for i, s in enumerate(slice):
        j = multiprocessing.Process(target=do_job, args=(i, s, func))
        jobs.append(j)
    for j in jobs:
        j.start()

Now let's run this function over our data!

In [27]:
print(f'Converting {len(dicom_dirs)} dicom scans to nifti')

starttime = time.time()

dispatch_jobs(6, dicom_dirs, convert_nifti_to_dicom)

print('That took {} seconds'.format(time.time() - starttime))

Converting 70 dicom scans to nifti
Worker 0 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/flammula_male_23_6_left 
...
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_03_sp_6_A 
...
Worker 1 doing job

File /media/jake/data/jake/mctnet_data/raw_images/flammula_male_23_6_left.nii already exists, so skipped this conversion


Worker 0 doing jobWorker 2 doing job

Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Eupronoe_03_sp_6_A/Leptocotis_FEG190214_004a_body_slices 
...
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Hyperia_FEG190604_013A-slices-body 
...


File /media/jake/data/jake/mctnet_data/raw_images/Leptocotis_FEG190214_004a_body_slices.nii already exists, so skipped this conversion
Worker 3 doing job

Worker 0 doing jobStarting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Lanceola_01_head_FEG190802_037 
...
Worker 4 doing jobThat took 0.05334925651550293 seconds


S

Process Process-32:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_767898/2836884920.py", line 10, in do_job
    func(item)
  File "/tmp/ipykernel_767898/1759632588.py", line 9, in convert_nifti_to_dicom
    dicom2nifti.dicom_series_to_nifti(d, new_path, reorient_nifti=True)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 78, in dicom_series_to_nifti
    return dicom_array_to_nifti(dicom_input, output_file, reorient_nifti)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 112, in dicom_array_to_nifti
    if not are_imaging_dicoms(dicom_list):
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 151, in are_

Converted dicom /media/jake/1tb_ssd/mctv_analysis/head_scans/Cystisoma_FEG20190212_01_head to /media/jake/data/jake/mctnet_data/raw_images/Cystisoma_FEG20190212_01_head.nii

Worker 6 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Scina_02_sp_2_1450794_slices 
...

File /media/jake/data/jake/mctnet_data/raw_images/Scina_02_sp_2_1450794_slices.nii already exists, so skipped this conversion

Worker 6 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Leptocotis_01_head_FEG190214_005a 
...

File /media/jake/data/jake/mctnet_data/raw_images/Leptocotis_01_head_FEG190214_005a.nii already exists, so skipped this conversion

Worker 6 doing job
Starting conversion of /media/jake/1tb_ssd/mctv_analysis/head_scans/Vibilia_FEG191113_084_blurry 
...

Converted dicom /media/jake/1tb_ssd/mctv_analysis/head_scans/Schypholanceola_FEG190801_036_head to /media/jake/data/jake/mctnet_data/raw_images/Schypholanceola_FEG190801_036_head.nii

Worker 0 doi

Process Process-29:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_767898/2836884920.py", line 10, in do_job
    func(item)
  File "/tmp/ipykernel_767898/1759632588.py", line 9, in convert_nifti_to_dicom
    dicom2nifti.dicom_series_to_nifti(d, new_path, reorient_nifti=True)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 78, in dicom_series_to_nifti
    return dicom_array_to_nifti(dicom_input, output_file, reorient_nifti)
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 112, in dicom_array_to_nifti
    if not are_imaging_dicoms(dicom_list):
  File "/home/jake/projects/mctnet/venv/lib/python3.8/site-packages/dicom2nifti/convert_dicom.py", line 151, in are_