In [8]:
import shutil
import os, sys, random
import xml.etree.ElementTree as ET
from glob import glob
import pandas as pd
from shutil import copyfile
import pandas as pd
from sklearn import preprocessing, model_selection
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import patches
import numpy as np
import os

  from pandas.core.computation.check import NUMEXPR_INSTALLED


### 0. Import Data

Notebook to load and pre-process public data. Same code as one using on Google cloud to train model

In [10]:
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt comet_ml  # install

import torch
import utils
display = utils.notebook_init()  # checks

YOLOv5 🚀 v7.0-226-gdd9e338 Python-3.8.5 torch-2.0.1 CPU


Setup complete ✅ (16 CPUs, 32.0 GB RAM, 905.2/931.5 GB disk)


In [11]:
!git clone 'https://github.com/Shenggan/BCCD_Dataset.git'

Cloning into 'BCCD_Dataset'...
remote: Enumerating objects: 800, done.[K
remote: Total 800 (delta 0), reused 0 (delta 0), pack-reused 800[K
Receiving objects: 100% (800/800), 7.39 MiB | 19.05 MiB/s, done.
Resolving deltas: 100% (378/378), done.


In [12]:
annotations = sorted(glob('BCCD_Dataset/BCCD/Annotations/*.xml'))

df = []
cnt = 0
for file in annotations:
  prev_filename = file.split('/')[-1].split('.')[0] + '.jpg'
  filename = str(cnt) + '.jpg'
  row = []
  parsedXML = ET.parse(file)
  for node in parsedXML.getroot().iter('object'):
    blood_cells = node.find('name').text
    xmin = int(node.find('bndbox/xmin').text)
    xmax = int(node.find('bndbox/xmax').text)
    ymin = int(node.find('bndbox/ymin').text)
    ymax = int(node.find('bndbox/ymax').text)

    row = [prev_filename, filename, blood_cells, xmin, xmax, ymin, ymax]
    df.append(row)
  cnt += 1

data = pd.DataFrame(df, columns=['prev_filename', 'filename', 'cell_type', 'xmin', 'xmax', 'ymin', 'ymax'])

data[['prev_filename','filename', 'cell_type', 'xmin', 'xmax', 'ymin', 'ymax']].to_csv('blood_cell_detection.csv', index=False)


In [13]:
img_width = 640
img_height = 480

def width(df):
  return int(df.xmax - df.xmin)
def height(df):
  return int(df.ymax - df.ymin)
def x_center(df):
  return int(df.xmin + (df.width/2))
def y_center(df):
  return int(df.ymin + (df.height/2))
def w_norm(df):
  return df/img_width
def h_norm(df):
  return df/img_height

df = pd.read_csv('blood_cell_detection.csv')

le = preprocessing.LabelEncoder()
le.fit(df['cell_type'])
print(le.classes_)
labels = le.transform(df['cell_type'])
df['labels'] = labels

df['width'] = df.apply(width, axis=1)
df['height'] = df.apply(height, axis=1)

df['x_center'] = df.apply(x_center, axis=1)
df['y_center'] = df.apply(y_center, axis=1)

df['x_center_norm'] = df['x_center'].apply(w_norm)
df['width_norm'] = df['width'].apply(w_norm)

df['y_center_norm'] = df['y_center'].apply(h_norm)
df['height_norm'] = df['height'].apply(h_norm)

df.head(5)

['Platelets' 'RBC' 'WBC']


Unnamed: 0,prev_filename,filename,cell_type,xmin,xmax,...,y_center,x_center_norm,width_norm,y_center_norm,height_norm
0,BloodImage_00000.jpg,0.jpg,WBC,260,491,...,276,0.585938,0.360938,0.575,0.414583
1,BloodImage_00000.jpg,0.jpg,RBC,78,184,...,385,0.204687,0.165625,0.802083,0.20625
2,BloodImage_00000.jpg,0.jpg,RBC,63,169,...,286,0.18125,0.165625,0.595833,0.20625
3,BloodImage_00000.jpg,0.jpg,RBC,214,320,...,411,0.417187,0.165625,0.85625,0.20625
4,BloodImage_00000.jpg,0.jpg,RBC,414,506,...,398,0.71875,0.14375,0.829167,0.19375


In [14]:
df_train, df_valid = model_selection.train_test_split(df, test_size=0.1, random_state=13, shuffle=True)
print(df_train.shape, df_valid.shape)

(4399, 16) (489, 16)


In [15]:
os.mkdir('bcc/')
os.mkdir('bcc/images/')
os.mkdir('bcc/images/train/')
os.mkdir('bcc/images/valid/')

os.mkdir('bcc/labels/')
os.mkdir('bcc/labels/train/')
os.mkdir('bcc/labels/valid/')

In [16]:
def segregate_data(df, img_path, label_path, train_img_path, train_label_path):
  filenames = []
  for filename in df.filename:
    filenames.append(filename)
  filenames = set(filenames)

  for filename in filenames:
    yolo_list = []

    for _,row in df[df.filename == filename].iterrows():
      yolo_list.append([row.labels, row.x_center_norm, row.y_center_norm, row.width_norm, row.height_norm])

    yolo_list = np.array(yolo_list)
    txt_filename = os.path.join(train_label_path,str(row.prev_filename.split('.')[0])+".txt")
    # Save the .img & .txt files to the corresponding train and validation folders
    np.savetxt(txt_filename, yolo_list, fmt=["%d", "%f", "%f", "%f", "%f"])
    shutil.copyfile(os.path.join(img_path,row.prev_filename), os.path.join(train_img_path,row.prev_filename))

In [17]:
%%time
src_img_path = "BCCD_Dataset/BCCD/JPEGImages/"
src_label_path = "BCCD_Dataset/BCCD/Annotations/"

train_img_path = "bcc/images/train"
train_label_path = "bcc/labels/train"

valid_img_path = "bcc/images/valid"
valid_label_path = "bcc/labels/valid"

segregate_data(df_train, src_img_path, src_label_path, train_img_path, train_label_path)
segregate_data(df_valid, src_img_path, src_label_path, valid_img_path, valid_label_path)

CPU times: user 1.78 s, sys: 552 ms, total: 2.33 s
Wall time: 2.54 s
