# import

In [1]:
import os
import io
import json
import pandas as pd
import pandas_profiling
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pydicom

from PIL import Image
from pydicom.pixel_data_handlers.util import apply_voi_lut
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# dataset

In [2]:
# Load the annotation table (one row per image) and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer='dataset/mod_train_df.csv')
train_df.head(n=5)

Unnamed: 0,id,boxes,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,000a312787f2,"[{""xmin"": 119, ""ymin"": 107, ""xmax"": 273, ""ymax...",5776db0cec75,0,1,0,0
1,0012ff7358bc,"[{""xmin"": 142, ""ymin"": 50, ""xmax"": 324, ""ymax""...",9d514ce429a7,0,1,0,0
2,001398f4ff4f,"[{""xmin"": 408, ""ymin"": 397, ""xmax"": 550, ""ymax...",28dddc8559b2,0,0,0,1
3,001bd15d1891,"[{""xmin"": 117, ""ymin"": 240, ""xmax"": 251, ""ymax...",dfd9fdd85a3e,0,1,0,0
4,0022227f5adf,"[{""xmin"": 390, ""ymin"": 128, ""xmax"": 469, ""ymax...",84543edc24c2,0,0,1,0


In [3]:
# Print column dtypes and non-null counts as a quick sanity check of the loaded frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4294 entries, 0 to 4293
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        4294 non-null   object
 1   boxes                     4294 non-null   object
 2   StudyInstanceUID          4294 non-null   object
 3   Negative for Pneumonia    4294 non-null   int64 
 4   Typical Appearance        4294 non-null   int64 
 5   Indeterminate Appearance  4294 non-null   int64 
 6   Atypical Appearance       4294 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 235.0+ KB


# TFRecord

In [4]:
# Hold out 10% of the rows for evaluation. The fixed ``random_state`` makes the
# shuffle deterministic, so re-running the notebook writes the same rows to the
# train and eval TFRecord files.
# NOTE: this rebinds ``train_df`` to the training part only; re-run the cell
# that loads the CSV before re-running this one, or the split is applied twice.
train_df, eval_df = train_test_split(train_df, test_size=0.1, random_state=42)
len(train_df), len(eval_df)

(3864, 430)

In [5]:
def int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature``.

    ``value`` is one scalar; it is placed in a one-element list before being
    handed to ``tf.train.Int64List``.
    """
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def int64_list_feature(value):
    """Wrap a sequence of integers in a ``tf.train.Feature``.

    ``value`` is passed to ``tf.train.Int64List`` as-is, so it must already be
    an iterable of integers.
    """
    payload = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=payload)

def bytes_feature(value):
    """Wrap a single bytes object in a ``tf.train.Feature``.

    ``value`` is one item; it is placed in a one-element list before being
    handed to ``tf.train.BytesList``.
    """
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def bytes_list_feature(value):
    """Wrap a sequence of bytes objects in a ``tf.train.Feature``.

    ``value`` is passed to ``tf.train.BytesList`` as-is, so it must already be
    an iterable of bytes.
    """
    payload = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=payload)

def float_list_feature(value):
    """Wrap a sequence of floats in a ``tf.train.Feature``.

    ``value`` is passed to ``tf.train.FloatList`` as-is, so it must already be
    an iterable of numbers.
    """
    payload = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=payload)

In [6]:
def create_tf_example(dir_name, file_name, data):
    """Build a ``tf.train.Example`` for one image and its bounding boxes.

    Args:
        dir_name: Directory that contains the image file.
        file_name: Name of the image file inside ``dir_name``.
        data: DataFrame with one row per box and the columns ``class``,
            ``xmin``, ``ymin``, ``xmax`` and ``ymax``. The coordinates are
            presumably pixels of this image, since they are divided by its
            width and height -- confirm against the annotation source.

    Returns:
        A ``tf.train.Example`` holding the file's raw bytes, the image size,
        the box corners as fractions of the image width/height, and the class
        of every box as both UTF-8 text and integer id.
    """
    image_path = os.path.join(dir_name, '{}'.format(file_name))
    with tf.io.gfile.GFile(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()

    # The image is opened only to learn its dimensions; the bytes stored in the
    # example are the ones read from disk, not a re-encoded copy.
    width, height = Image.open(io.BytesIO(raw_bytes)).size

    source_name = file_name.encode('utf8')

    # One tuple per box, evaluated row by row, then transposed into the six
    # parallel lists the feature map needs.
    per_box = [
        (
            row['xmin'] / width,
            row['xmax'] / width,
            row['ymin'] / height,
            row['ymax'] / height,
            row['class'].encode('utf8'),
            class_text_to_int(row['class']),
        )
        for _, row in data.iterrows()
    ]
    # zip(*[]) yields nothing, so fall back to six empty lists for a box-less image.
    columns = [list(column) for column in zip(*per_box)] or [[] for _ in range(6)]
    xmins, xmaxs, ymins, ymaxs, classes_text, classes = columns

    feature_map = {
        'image/height': int64_feature(height),
        'image/width': int64_feature(width),
        'image/filename': bytes_feature(source_name),
        'image/source_id': bytes_feature(source_name),
        'image/encoded': bytes_feature(raw_bytes),
        'image/format': bytes_feature(b'jpg'),
        'image/object/bbox/xmin': float_list_feature(xmins),
        'image/object/bbox/xmax': float_list_feature(xmaxs),
        'image/object/bbox/ymin': float_list_feature(ymins),
        'image/object/bbox/ymax': float_list_feature(ymaxs),
        'image/object/class/text': bytes_list_feature(classes_text),
        'image/object/class/label': int64_list_feature(classes),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [7]:
def create_df(data):
    """Expand one annotation row into a DataFrame with one row per box.

    Args:
        data: A row of the annotation table (a ``pandas.Series`` or any
            mapping) holding a JSON-encoded list of boxes under ``'boxes'``
            and the flag columns ``'Typical Appearance'``,
            ``'Indeterminate Appearance'`` and ``'Atypical Appearance'``.

    Returns:
        A DataFrame with the columns ``class``, ``xmin``, ``ymin``, ``xmax``
        and ``ymax``. Every row carries the same class: the name of the first
        flag column whose value is 1. A row with no boxes yields an empty
        frame with those columns.

    Raises:
        ValueError: If the row contains boxes but none of the three flag
            columns is 1, so the boxes cannot be given a class.
    """
    boxes = json.loads(data['boxes'])

    # Checked in this order; the first column equal to 1 decides the class.
    class_columns = (
        'Typical Appearance',
        'Indeterminate Appearance',
        'Atypical Appearance',
    )
    label = next((name for name in class_columns if data[name] == 1), None)

    if label is None:
        if boxes:
            # Previously this path crashed later with an UnboundLocalError on
            # the never-assigned class name; fail with an explicit message.
            raise ValueError(
                'row has {} box(es) but none of {} is set to 1'.format(
                    len(boxes), list(class_columns)
                )
            )
        # No class and no boxes: report it and fall through to an empty frame.
        print("ERROR")

    # A distinct name for each box keeps the ``data`` parameter from being shadowed.
    records = [
        (label, box['xmin'], box['ymin'], box['xmax'], box['ymax'])
        for box in boxes
    ]
    return pd.DataFrame(records, columns=['class', 'xmin', 'ymin', 'xmax', 'ymax'])

In [8]:
def class_text_to_int(value):
    """Return the integer id (1, 2 or 3) assigned to a class name.

    Raises ``KeyError`` when ``value`` is not one of the three known names.
    """
    # Position in this tuple, counted from 1, is the class id.
    ordered_labels = (
        'Typical Appearance',
        'Indeterminate Appearance',
        'Atypical Appearance',
    )
    lookup = {label: number for number, label in enumerate(ordered_labels, start=1)}
    return lookup[value]

In [9]:
# Turn single-quoted pseudo-JSON into real JSON so ``json.loads`` in
# ``create_df`` accepts it. ``regex=False`` makes this a literal replacement on
# every pandas version; the earlier pattern r'\'' only matched a quote when
# pandas interpreted it as a regular expression.
# ``assign`` returns a new frame, which avoids writing into a slice of the
# frame that ``train_test_split`` produced.
train_df = train_df.assign(boxes=train_df['boxes'].str.replace("'", '"', regex=False))

os.makedirs('dataset/tfrecord', exist_ok=True)

missing_train_rows = []  # index labels of rows whose DICOM file is not on disk

# The context manager closes the record file even when building an example
# raises part-way through the loop.
with tf.io.TFRecordWriter('dataset/tfrecord/train.tfrecord') as writer:
    for index, data in tqdm(train_df.iterrows(), total=len(train_df)):
        matches = glob('dataset/train/'+data['StudyInstanceUID']+'/*/'+data['id']+'.dcm')
        if not matches:
            missing_train_rows.append(index)
            continue
        path = matches[0]

        # The image that gets embedded is the .jpg with the same stem, in the
        # same directory as the DICOM file.
        dir_name = os.path.dirname(path)
        file_name = os.path.basename(path)[:-4]+'.jpg'

        input_data = create_df(data)
        tf_example = create_tf_example(dir_name, file_name, input_data)
        writer.write(tf_example.SerializeToString())

# ``DataFrame.drop`` returns a new frame, so calling it inside the loop without
# keeping the result removed nothing. Drop the skipped rows once, after
# iteration, so ``train_df`` matches what was written.
train_df = train_df.drop(index=missing_train_rows)

3864it [00:04, 808.00it/s]


In [10]:
# Turn single-quoted pseudo-JSON into real JSON so ``json.loads`` in
# ``create_df`` accepts it. ``regex=False`` makes this a literal replacement on
# every pandas version; the earlier pattern r'\'' only matched a quote when
# pandas interpreted it as a regular expression.
# ``assign`` returns a new frame, which avoids writing into a slice of the
# frame that ``train_test_split`` produced.
eval_df = eval_df.assign(boxes=eval_df['boxes'].str.replace("'", '"', regex=False))

os.makedirs('dataset/tfrecord', exist_ok=True)

missing_eval_rows = []  # index labels of rows whose DICOM file is not on disk

# The context manager closes the record file even when building an example
# raises part-way through the loop.
with tf.io.TFRecordWriter('dataset/tfrecord/eval.tfrecord') as writer:
    for index, data in tqdm(eval_df.iterrows(), total=len(eval_df)):
        matches = glob('dataset/train/'+data['StudyInstanceUID']+'/*/'+data['id']+'.dcm')
        if not matches:
            missing_eval_rows.append(index)
            continue
        path = matches[0]

        # The image that gets embedded is the .jpg with the same stem, in the
        # same directory as the DICOM file.
        dir_name = os.path.dirname(path)
        file_name = os.path.basename(path)[:-4]+'.jpg'

        input_data = create_df(data)
        tf_example = create_tf_example(dir_name, file_name, input_data)
        writer.write(tf_example.SerializeToString())

# ``DataFrame.drop`` returns a new frame, so calling it inside the loop without
# keeping the result removed nothing. Drop the skipped rows once, after
# iteration, so ``eval_df`` matches what was written.
eval_df = eval_df.drop(index=missing_eval_rows)

430it [00:00, 807.21it/s]
