# Imports

In [30]:
import pandas as pd
import seaborn as sns
import numpy as np
import cv2
import pydicom as dicom
import matplotlib.pylab as plt
import matplotlib.patches as patches

from tqdm import tqdm
from collections import Counter
import os
from PIL import Image
import joblib
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [11]:
data=pd.read_csv("merged_data.csv",index_col=0)

In [12]:
data.head()

Unnamed: 0,patientId,x,y,width,height,Target,class,age,sex
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0,No Lung Opacity / Not Normal,51.0,Female
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0,No Lung Opacity / Not Normal,48.0,Female
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0,No Lung Opacity / Not Normal,19.0,Male
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0,Normal,28.0,Male
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,Lung Opacity,32.0,Female


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30227 entries, 0 to 30226
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   patientId  30227 non-null  object 
 1   x          9555 non-null   float64
 2   y          9555 non-null   float64
 3   width      9555 non-null   float64
 4   height     9555 non-null   float64
 5   Target     30227 non-null  int64  
 6   class      30227 non-null  object 
 7   age        30227 non-null  float64
 8   sex        30227 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
train_image_dir="./stage_2_train_images/"
test_image_dir="./stage_2_test_images/"

In [7]:
train_processed_image_dir='./stage_2_train_processed_images_640/'
if not os.path.exists(train_processed_image_dir):
    os.makedirs(train_processed_image_dir)
    
test_processed_image_dir='./stage_2_test_processed_images_640/'
if not os.path.exists(test_processed_image_dir):
    os.makedirs(test_processed_image_dir)    

In [8]:
image_dimension=640
scale=image_dimension/1024

print(scale)

0.625


In [15]:
def process_dicom_image_and_save_compressed_image(srcDir, finalDir, fileName):
    img_np_array = dicom.read_file(os.path.join(srcDir,fileName)).pixel_array
    img = cv2.resize(img_np_array, (image_dimension, image_dimension), interpolation=cv2.INTER_AREA)
    #https://stackoverflow.com/a/54465855/1225413
    kernel = np.array([[-1,-1,-1], 
                       [-1, 9,-1],
                       [-1,-1,-1]])
    sharpened = cv2.filter2D(img, -1, kernel)
    
#     https://stackoverflow.com/a/6917185/1225413
    sharpened =  (255.0 / sharpened.max() * (sharpened - sharpened.min())).astype(np.uint8)
    im = Image.fromarray(sharpened)
    im.save(os.path.join(finalDir,os.path.splitext(fileName)[0]+".png"))
    return

In [14]:
process_dicom_image_and_save_compressed_image(train_image_dir, train_processed_image_dir, list(data[4:5]["patientId"])[0]+".dcm")

In [16]:
%%time
for i in tqdm(data["patientId"].values):
    process_dicom_image_and_save_compressed_image(train_image_dir,train_processed_image_dir, i+".dcm")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30227/30227 [14:17<00:00, 35.25it/s]

Wall time: 14min 17s





In [17]:
%%time
for (root,dirs,files) in os.walk(test_image_dir):    
    for name in tqdm(files):
        process_dicom_image_and_save_compressed_image(root, test_processed_image_dir, name)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:23<00:00, 35.86it/s]

Wall time: 1min 23s





In [18]:
%%time
X_train_list=[]
x_min_list=[]
y_min_list=[]
width_list=[]
height_list=[]
y_train_list=[]
class_list=[]
patient_id_list=[]
target_class_list=[]
for i in tqdm(data[["patientId","x","y","width","height","Target","class"]].values):    
    X_train=train_processed_image_dir+i[0]+".png"    
    y_train=i[5]
    patient_id_list.append(i[0])
    X_train_list.append(X_train)
    x_min_list.append(i[1]*scale)
    y_min_list.append(i[2]*scale)
    width_list.append(i[3]*scale)
    height_list.append(i[4]*scale)
    y_train_list.append(y_train)
    if(y_train==1):
        target_class_list.append("Pneumonia")
    else:
        target_class_list.append("Not Pneumonia")
    class_list.append(i[6])

train_data = pd.DataFrame(zip(patient_id_list, X_train_list, x_min_list, y_min_list, width_list, height_list,
                              y_train_list,target_class_list, class_list),
                          columns=["patientId","path","x","y","width","height", "target", "target_class", "class"])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30227/30227 [00:00<00:00, 539306.40it/s]

Wall time: 103 ms





In [19]:
train_data.head(10)

Unnamed: 0,patientId,path,x,y,width,height,target,target_class,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,./stage_2_train_processed_images_640/0004cfab-...,,,,,0,Not Pneumonia,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,./stage_2_train_processed_images_640/00313ee0-...,,,,,0,Not Pneumonia,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,./stage_2_train_processed_images_640/00322d4d-...,,,,,0,Not Pneumonia,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,./stage_2_train_processed_images_640/003d8fa0-...,,,,,0,Not Pneumonia,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,165.0,95.0,133.125,236.875,1,Pneumonia,Lung Opacity
5,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,351.25,95.0,160.0,283.125,1,Pneumonia,Lung Opacity
6,00569f44-917d-4c86-a842-81832af98c30,./stage_2_train_processed_images_640/00569f44-...,,,,,0,Not Pneumonia,No Lung Opacity / Not Normal
7,006cec2e-6ce2-4549-bffa-eadfcd1e9970,./stage_2_train_processed_images_640/006cec2e-...,,,,,0,Not Pneumonia,No Lung Opacity / Not Normal
8,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,201.875,360.625,100.0,65.0,1,Pneumonia,Lung Opacity
9,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,434.375,359.375,101.25,85.625,1,Pneumonia,Lung Opacity


In [20]:
%%time
X_test_list=[]
for i in tqdm(data[["patientId"]].values):
    X_test=test_processed_image_dir+i[0]+".png"
    X_test_list.append(X_test)

test_data = pd.DataFrame(zip(X_test_list),columns=["path"])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30227/30227 [00:00<00:00, 1509480.02it/s]

Wall time: 29 ms





train_data.to_csv("train_data.csv")

test_data.to_csv("test_data.csv")

# Processing/Creating TF Record  <font size="2">(https://blog.roboflow.com/create-tfrecord/) </font>

In [21]:
%%time
x_min_list=[]
y_min_list=[]
x_max_list=[]
y_max_list=[]
for row in train_data.iterrows():
    if(row[1]["target"]==0):
        x_min_list.append(0.0)
        y_min_list.append(0.0)
        x_max_list.append(0.0)
        y_max_list.append(0.0)
    else:
        x_min_list.append(row[1]["x"])
        y_min_list.append(row[1]["y"])
        x_max_list.append(row[1]["x"]+row[1]["width"])
        y_max_list.append(row[1]["y"]+row[1]["height"])

Wall time: 1.05 s


In [22]:
train_data["x_min"]=x_min_list
train_data["y_min"]=y_min_list
train_data["x_max"]=x_max_list
train_data["y_max"]=y_max_list

In [23]:
train_data.to_csv("train_data_640.csv")

In [24]:
train_data=pd.read_csv("train_data_640.csv")

## Train Validation Split

In [53]:
# https://stackoverflow.com/a/24151789/1225413
train_df, val_df = train_test_split(train_data, test_size=0.2, stratify=train_data["target"], random_state=43)

In [54]:
train_df.shape

(24181, 14)

In [31]:
# from __future__ import division
# from __future__ import print_function
# from __future__ import absolute_import

# import os
import io
# import sys
# import pandas as pd
import tensorflow as tf

from PIL import Image
from collections import namedtuple, OrderedDict

In [32]:
gpus = tf.config.list_physical_devices('GPU')

if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0], [
                tf.config.experimental.VirtualDeviceConfiguration(
                    memory_limit=12432)
            ])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)

In [33]:
#https://github.com/keshavarzm/TailwindTraders_28ov70d3/blob/6c56e9ca61b02c5aa7536d225feef697a1d8c718/Source/TailwindTraders.Mobile/Resources/AR/guide/generate_tfrecord.py

def int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def int64_list_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def bytes_list_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def float_list_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

In [34]:
#One class only - https://github.com/tensorflow/models/issues/3365#issuecomment-670983385
def class_text_to_int(row_label):
    if row_label == 1:
        return 1
    else:
        None

def split(df, group):
    data = namedtuple('data', ['path', 'object'])
    gb = df.groupby(group)
    return [data(filename, gb.get_group(x)) for filename, x in zip(gb.groups.keys(), gb.groups)]

In [37]:
examples = pd.read_csv("train_data_320.csv")
grouped = split(examples, 'path')

26684


In [52]:
grouped[0]

data(path='./stage_2_train_processed_images/0004cfab-14fd-4e49-80ba-63a80b6bddd6.png', object=   Unnamed: 0                             patientId  \
0           0  0004cfab-14fd-4e49-80ba-63a80b6bddd6   

                                                path   x   y  width  height  \
0  ./stage_2_train_processed_images/0004cfab-14fd... NaN NaN    NaN     NaN   

   target   target_class                         class  x_min  y_min  x_max  \
0       0  Not Pneumonia  No Lung Opacity / Not Normal    0.0    0.0    0.0   

   y_max  
0    0.0  )

In [35]:
def standardize(image_data):
    image_data -= np.mean(image_data, axis=0)
    image_data /= np.std(image_data, axis=0)
    return image_data

def create_tf_example(group, path):
#     with tf.gfile.GFile(os.path.join(path, '{}'.format(group.path)), 'rb') as fid:
#         encoded_png = fid.read()
#     print("type -",type(group))
    file_path=group[0]
#     print(file_path)    
    image = tf.io.read_file(file_path) # load the raw data from the file as a string

#     image = standardize(image)    
    image = tf.image.decode_png(image)
    width, height, c = image.shape
    
    image = tf.image.encode_png(image).numpy()
    
#     encoded_png_io = io.BytesIO(encoded_png)
#     image = Image.open(encoded_png_io)


#     image = image.encode("utf-8")
    
    filename = os.path.splitext(os.path.split(file_path)[1])[0].encode('utf8')
#     print(filename)
    image_format = b'png'
    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for index, row in group.object.iterrows():
#         print(row)
        if(row["target"]==0):
            xmins.append(0.0)
            xmaxs.append(0.0)
            ymins.append(0.0)
            ymaxs.append(0.0)
        else:
#             xmins.append(row['x_min'] / width)
#             xmaxs.append(row['x_max'] / width)
#             ymins.append(row['y_min'] / height)
#             ymaxs.append(row['y_max'] / height)
            xmins.append(row['x_min'])
            xmaxs.append(row['x_max'])
            ymins.append(row['y_min'])
            ymaxs.append(row['y_max'])
            classes_text.append(row['target_class'].encode('utf8'))
            classes.append(class_text_to_int(row['target']))

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': int64_feature(height),
        'image/width': int64_feature(width),
        'image/filename': bytes_feature(filename),
        'image/source_id': bytes_feature(filename),
        'image/encoded': bytes_feature(image),
        'image/format': bytes_feature(image_format),
        'image/object/bbox/xmin': float_list_feature(xmins),
        'image/object/bbox/xmax': float_list_feature(xmaxs),
        'image/object/bbox/ymin': float_list_feature(ymins),
        'image/object/bbox/ymax': float_list_feature(ymaxs),
        'image/object/class/text': bytes_list_feature(classes_text),
        'image/object/class/label': int64_list_feature(classes),
    }))
    return tf_example

## Faster R-CNN TF Example method

In [60]:
def create_fastr_rcnn_tf_example(group, path):
    file_path=group[0]
    image = tf.io.read_file(file_path) # load the raw data from the file as a string

    image = tf.image.decode_png(image)
    width, height, c = image.shape
    
    image = tf.image.encode_png(image).numpy()
    
    filename = os.path.splitext(os.path.split(file_path)[1])[0].encode('utf8')
    image_format = b'png'
    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for index, row in group.object.iterrows():
        if(row["target"]==0):
            xmins.append(0.0)
            xmaxs.append(0.0)
            ymins.append(0.0)
            ymaxs.append(0.0)
        else:
            # https://stackoverflow.com/a/46815432/1225413
            xmin = row["x"] / image_dimension
            ymin = row["y"] / image_dimension
            
            xmax = (row["x"] + row["width"]) / image_dimension
            ymax = (row["y"] + row["height"]) / image_dimension
            
            #print(xmin, ymin, xmax, ymax)
            
            xmins.append(xmin)
            xmaxs.append(xmax)
            ymins.append(ymin)
            ymaxs.append(ymax)
            classes_text.append(row['target_class'].encode('utf8'))
            classes.append(class_text_to_int(row['target']))

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': int64_feature(height),
        'image/width': int64_feature(width),
        'image/filename': bytes_feature(filename),
        'image/source_id': bytes_feature(filename),
        'image/encoded': bytes_feature(image),
        'image/format': bytes_feature(image_format),
        'image/object/bbox/xmin': float_list_feature(xmins),
        'image/object/bbox/xmax': float_list_feature(xmaxs),
        'image/object/bbox/ymin': float_list_feature(ymins),
        'image/object/bbox/ymax': float_list_feature(ymaxs),
        'image/object/class/text': bytes_list_feature(classes_text),
        'image/object/class/label': int64_list_feature(classes),
    }))
    return tf_example

In [70]:
%%time
    import traceback
    output_path ='train.tfrecord'

    writer = tf.io.TFRecordWriter(output_path)
    path = os.path.join('stage_2_train_processed_images')
    examples = pd.read_csv("train_data.csv")
    grouped = split(examples, 'path')

    # added
    file_errors = 0

    for group in tqdm(grouped):
        try:
            tf_example = create_tf_example(group, path)
            writer.write(tf_example.SerializeToString())
        except Exception as e:
            print(e)
            traceback.print_exc()
            # added
            file_errors +=1
            break
#             pass

    writer.close()

    # added
    print("FINISHED. There were %d errors" %file_errors)

    output_path = os.path.join(os.getcwd(), output_path)
    print('Successfully created the TFRecords: {}'.format(output_path))

26684


100%|███████████████████████████████████████████████████████████████████████████| 26684/26684 [01:24<00:00, 315.48it/s]

FINISHED. There were 0 errors
Successfully created the TFRecords: C:\29_Case_Study_2\train.tfrecord
Wall time: 1min 27s





In [45]:
%%time
    import traceback
    output_path ='train_split.tfrecord'

    writer = tf.io.TFRecordWriter(output_path)
    path = os.path.join('stage_2_train_processed_images')
#     examples = pd.read_csv("train_data.csv")
    grouped = split(train_df, 'path')

    # added
    file_errors = 0

    for group in tqdm(grouped):
        try:
            tf_example = create_tf_example(group, path)
            writer.write(tf_example.SerializeToString())
        except Exception as e:
            print(e)
            traceback.print_exc()
            # added
            file_errors +=1
            break
#             pass

    writer.close()

    # added
    print("FINISHED. There were %d errors" %file_errors)

    output_path = os.path.join(os.getcwd(), output_path)
    print('Successfully created the TFRecords: {}'.format(output_path))

NameError: name 'train_df' is not defined

In [23]:
%%time
    import traceback
    output_path ='eval_split.tfrecord'

    writer = tf.io.TFRecordWriter(output_path)
    path = os.path.join('stage_2_train_processed_images')
#     examples = pd.read_csv("train_data.csv")
    grouped = split(val_df, 'path')

    # added
    file_errors = 0

    for group in tqdm(grouped):
        try:
            tf_example = create_tf_example(group, path)
            writer.write(tf_example.SerializeToString())
        except Exception as e:
            print(e)
            traceback.print_exc()
            # added
            file_errors +=1
            break
#             pass

    writer.close()

    # added
    print("FINISHED. There were %d errors" %file_errors)

    output_path = os.path.join(os.getcwd(), output_path)
    print('Successfully created the TFRecords: {}'.format(output_path))

100%|█████████████████████████████████████████████████████████████████████████████| 5896/5896 [00:22<00:00, 266.23it/s]


FINISHED. There were 0 errors
Successfully created the TFRecords: C:\29_Case_Study_2\eval_split.tfrecord
Wall time: 22.8 s


## Creating Dataframe where target is 1(only bounding box cases)

In [25]:
train_data.isna().sum()

Unnamed: 0          0
patientId           0
path                0
x               20672
y               20672
width           20672
height          20672
target              0
target_class        0
class               0
x_min               0
y_min               0
x_max               0
y_max               0
dtype: int64

In [61]:
only_bounding_box_df=train_data.loc[train_data["target"]==1]
only_bounding_box_df.head()

Unnamed: 0.1,Unnamed: 0,patientId,path,x,y,width,height,target,target_class,class,x_min,y_min,x_max,y_max
4,4,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,165.0,95.0,133.125,236.875,1,Pneumonia,Lung Opacity,165.0,95.0,298.125,331.875
5,5,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,351.25,95.0,160.0,283.125,1,Pneumonia,Lung Opacity,351.25,95.0,511.25,378.125
8,8,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,201.875,360.625,100.0,65.0,1,Pneumonia,Lung Opacity,201.875,360.625,301.875,425.625
9,9,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,434.375,359.375,101.25,85.625,1,Pneumonia,Lung Opacity,434.375,359.375,535.625,445.0
14,14,00aecb01-a116-45a2-956c-08d2fa55433f,./stage_2_train_processed_images_640/00aecb01-...,180.0,201.25,58.75,84.375,1,Pneumonia,Lung Opacity,180.0,201.25,238.75,285.625


In [62]:
only_bounding_box_df=only_bounding_box_df.drop(columns="Unnamed: 0", axis=1)
only_bounding_box_df.head()

Unnamed: 0,patientId,path,x,y,width,height,target,target_class,class,x_min,y_min,x_max,y_max
4,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,165.0,95.0,133.125,236.875,1,Pneumonia,Lung Opacity,165.0,95.0,298.125,331.875
5,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,351.25,95.0,160.0,283.125,1,Pneumonia,Lung Opacity,351.25,95.0,511.25,378.125
8,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,201.875,360.625,100.0,65.0,1,Pneumonia,Lung Opacity,201.875,360.625,301.875,425.625
9,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,434.375,359.375,101.25,85.625,1,Pneumonia,Lung Opacity,434.375,359.375,535.625,445.0
14,00aecb01-a116-45a2-956c-08d2fa55433f,./stage_2_train_processed_images_640/00aecb01-...,180.0,201.25,58.75,84.375,1,Pneumonia,Lung Opacity,180.0,201.25,238.75,285.625


In [63]:
only_bounding_box_df.isna().sum()

patientId       0
path            0
x               0
y               0
width           0
height          0
target          0
target_class    0
class           0
x_min           0
y_min           0
x_max           0
y_max           0
dtype: int64

only_bounding_box_df["width"]=640
only_bounding_box_df["height"]=640

In [64]:
only_bounding_box_df

Unnamed: 0,patientId,path,x,y,width,height,target,target_class,class,x_min,y_min,x_max,y_max
4,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,165.000,95.000,133.125,236.875,1,Pneumonia,Lung Opacity,165.000,95.000,298.125,331.875
5,00436515-870c-4b36-a041-de91049b9ab4,./stage_2_train_processed_images_640/00436515-...,351.250,95.000,160.000,283.125,1,Pneumonia,Lung Opacity,351.250,95.000,511.250,378.125
8,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,201.875,360.625,100.000,65.000,1,Pneumonia,Lung Opacity,201.875,360.625,301.875,425.625
9,00704310-78a8-4b38-8475-49f4573b2dbb,./stage_2_train_processed_images_640/00704310-...,434.375,359.375,101.250,85.625,1,Pneumonia,Lung Opacity,434.375,359.375,535.625,445.000
14,00aecb01-a116-45a2-956c-08d2fa55433f,./stage_2_train_processed_images_640/00aecb01-...,180.000,201.250,58.750,84.375,1,Pneumonia,Lung Opacity,180.000,201.250,238.750,285.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30220,c1e73a4e-7afe-4ec5-8af6-ce8315d7a2f2,./stage_2_train_processed_images_640/c1e73a4e-...,197.500,315.000,111.875,170.625,1,Pneumonia,Lung Opacity,197.500,315.000,309.375,485.625
30221,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8,./stage_2_train_processed_images_640/c1ec14ff-...,380.625,290.000,150.000,177.500,1,Pneumonia,Lung Opacity,380.625,290.000,530.625,467.500
30222,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8,./stage_2_train_processed_images_640/c1ec14ff-...,115.625,186.250,142.500,236.875,1,Pneumonia,Lung Opacity,115.625,186.250,258.125,423.125
30225,c1f7889a-9ea9-4acb-b64c-b737c929599a,./stage_2_train_processed_images_640/c1f7889a-...,356.250,245.625,163.125,215.625,1,Pneumonia,Lung Opacity,356.250,245.625,519.375,461.250


In [65]:
train_bb_df, val_bb_df = train_test_split(only_bounding_box_df, test_size=0.2, random_state=43)

## Creating TF Record only using BB Dataframe

In [66]:
%%time
import traceback
output_path ='train_bb_split.tfrecord'

writer = tf.io.TFRecordWriter(output_path)
path = os.path.join('stage_2_train_processed_images')
#     examples = pd.read_csv("train_data.csv")
grouped = split(train_bb_df, 'path')

# added
file_errors = 0

for group in tqdm(grouped):
    try:
        tf_example = create_fastr_rcnn_tf_example(group, path)
        writer.write(tf_example.SerializeToString())
    except Exception as e:
        print(e)
        traceback.print_exc()
        # added
        file_errors +=1
        break

writer.close()

# added
print("FINISHED. There were %d errors" %file_errors)

output_path = os.path.join(os.getcwd(), output_path)
print('Successfully created the TFRecords: {}'.format(output_path))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5372/5372 [01:56<00:00, 46.28it/s]


FINISHED. There were 0 errors
Successfully created the TFRecords: c:\29_Case_Study_2\train_bb_split.tfrecord
Wall time: 1min 57s


In [67]:
%%time
import traceback
output_path ='eval_bb_split.tfrecord'

writer = tf.io.TFRecordWriter(output_path)
path = os.path.join('stage_2_train_processed_images')
#     examples = pd.read_csv("train_data.csv")
grouped = split(val_bb_df, 'path')

# added
file_errors = 0

for group in tqdm(grouped):
    try:
        tf_example = create_fastr_rcnn_tf_example(group, path)
        writer.write(tf_example.SerializeToString())
    except Exception as e:
        print(e)
        traceback.print_exc()
        # added
        file_errors +=1
        break

writer.close()

# added
print("FINISHED. There were %d errors" %file_errors)

output_path = os.path.join(os.getcwd(), output_path)
print('Successfully created the TFRecords: {}'.format(output_path))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1761/1761 [00:38<00:00, 46.26it/s]

FINISHED. There were 0 errors
Successfully created the TFRecords: c:\29_Case_Study_2\eval_bb_split.tfrecord
Wall time: 38.3 s





In [68]:
raw_dataset = tf.data.TFRecordDataset("train_bb_split.tfrecord")

for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "image/encoded"
    value {
      bytes_list {
        value: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\002\200\000\000\002\200\010\000\000\000\000)\246\226\377\000\000 \000IDATx\234\344\375\311\263dk\266\037\010\255\346\353\366\336\336\235\023\315m\362\245^\276\'\025\022\262\242\212\306\2009\2301b\300\230\177\213?\001\243&\01400&e\306\000\254\014\254\240\004\206\251$\275zRIJ\275\314\274]\304i\274\331\315\327\254\265\030\270\237\023\'\"N\334\033\221\371r\3042\273q\217\373n|\373\336?_\355o\255\217\341\357E\274\002D!D;\377\375\276 !\222\275\377\036\343{\357}\360\362O\022\244\207\277\236\371\334\217\205\037w\301\217\316\364\367sA\377\377!\177\334\315BH\037\341\345\323;\343\303\207 \200\235_\231\331\345\037Z\010\300:5\273\354B3\366\200\0000\275w,\300\303a\357\336\373\370\345\345\205\331#:\020\001\254\203\371\323_T\223\002\000\316h\235\235\377\370\350s\2379H\r\021\300\372\t,\032\000Pf\004\320XB\t\023\374\372\177u\365\235\271\340\243~\245x\277\372a\255W\326

<h1>Observation:</h1>

<ol>
    <li>Since the images have high resolution, I have rescaled the image to 224*224.</li>
    <li>Sharpening the image will help to improve performance and training the model.</li>
    <li>All the images are written in separate directory, so original data is not affected.</li>
<ol>