In [50]:
import random
import pandas as pd
import numpy as np
import sklearn
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import requests
from io import BytesIO
from scipy import stats
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import tensorflow as tf
import math
import cv2


In [51]:
# Locations of the three CSV exports (cases, box coordinates, images).
case_path, coord_path, img_path = (
    './data/thyroidCSV/cases.csv',
    './data/thyroidCSV/coors.csv',
    './data/thyroidCSV/images.csv',
)

# Read each export into its own DataFrame, in the same order as the paths above.
case_df, coord_df, img_df = map(pd.read_csv, (case_path, coord_path, img_path))

In [52]:
# check img sizes
all_img_path = './data/all_images_by_coord_id/'

# One name per coord_df row, numbered from 1: img_1 ... img_<len(coord_df)>.
# NOTE(review): the names carry no file extension — confirm the files on disk match.
img_ids = ['img_{}'.format(idx + 1) for idx in range(len(coord_df))]


In [67]:
# Load every image as a NumPy array and pad 200 constant-valued rows onto the bottom.
# Each file is opened inside a `with` block so its handle is released as soon as the
# pixels have been copied out; opening all files first as a list of Image objects
# keeps one open file per image alive at the same time.
imgs = []
for im_id in img_ids:
    with Image.open(all_img_path + im_id) as im:
        pixels = np.array(im)
    # copyMakeBorder(src, top, bottom, left, right, borderType)
    imgs.append(cv2.copyMakeBorder(pixels, 0, 200, 0, 0, cv2.BORDER_CONSTANT))


In [68]:
# Wrap each padded array back into a PIL image (rebinds `imgs`).
imgs = list(map(Image.fromarray, imgs))




In [71]:
# Target bounding size for the shrink step below.
new_size = (416, 416)

# thumbnail() works in place on each image and returns nothing, hence a plain loop.
for pil_img in imgs:
    pil_img.thumbnail(new_size)

    

In [77]:
# resize() returns a new image of exactly new_size; the images in `imgs` are left as they are.
new_imgs = list(map(lambda pil_img: pil_img.resize(new_size), imgs))




(416, 416)

In [76]:
# Gather the .size of every image, then show the distinct values.
sizes = []
for pil_img in imgs:
    sizes.append(pil_img.size)
set(sizes)


{(416, 382), (416, 416)}

In [7]:
# Prefix every stored filename with the local data directory.
files = img_df['filename'].values
filenames = []
for rel_path in files:
    filenames.append('./data/' + rel_path)



In [8]:
# Overwrite the filename column in place with the './data/'-prefixed paths.
# NOTE(review): `filenames` is built from this same column, so re-running both cells
# prefixes the paths a second time.
img_df['filename'] = filenames


In [9]:
# Preview the first 10 rows of img_df.
img_df.head(10)

Unnamed: 0,id,case_id,filename,mark
0,1,1,./data/cimalab/bening/545/1.jpg,"[{""points"": [{""x"": 413, ""y"": 91}, {""x"": 406, ""..."
1,2,1,./data/cimalab/bening/545/2.jpg,"[{""points"": [{""x"": 291, ""y"": 76}, {""x"": 281, ""..."
2,3,5,./data/cimalab/bening/549/1.jpg,"[{""points"": [{""x"": 292, ""y"": 123}, {""x"": 293, ..."
3,4,5,./data/cimalab/bening/549/2.jpg,"[{""points"": [{""x"": 259, ""y"": 142}, {""x"": 258, ..."
4,5,6,./data/cimalab/bening/550/1.jpg,"[{""points"": [{""x"": 236, ""y"": 110}, {""x"": 236, ..."
5,6,6,./data/cimalab/bening/550/2.jpg,"[{""points"": [{""x"": 229, ""y"": 92}, {""x"": 226, ""..."
6,7,6,./data/cimalab/bening/550/3.jpg,"[{""points"": [{""x"": 146, ""y"": 159}, {""x"": 173, ..."
7,8,7,./data/cimalab/bening/551/1.jpg,"[{""points"": [{""x"": 260, ""y"": 117}, {""x"": 257, ..."
8,9,7,./data/cimalab/bening/551/2.jpg,"[{""points"": [{""x"": 283, ""y"": 116}, {""x"": 271, ..."
9,10,7,./data/cimalab/bening/551/3.jpg,"[{""points"": [{""x"": 310, ""y"": 161}, {""x"": 302, ..."


In [10]:
# Preview the first 3 rows of case_df.
case_df.head(3)


Unnamed: 0,id,num,cancer,age,sex,composition,echogenicity,margins,calcification,tirads,reportbacaf,reporteco
0,1,545,,72.0,F,solid,marked hypoechogenecity,ill- defined,,5,,
1,2,546,,63.0,F,solid,isoechogenicity,microlobulated,,4c,,
2,3,547,,61.0,F,spongiform appareance,hypoechogenecity,well defined smooth,,4a,,


In [11]:
# Attach each box's image path by matching coord_df.image_id against img_df.id.
# One id -> filename lookup replaces re-filtering the whole of img_df for every
# coord row. drop_duplicates keeps the first row per id, which matches the
# previous `.values[0]` pick.
filename_by_id = img_df.drop_duplicates('id').set_index('id')['filename']
missing_ids = set(coord_df['image_id']) - set(filename_by_id.index)
if missing_ids:
    # The per-row lookup used to raise IndexError on an unknown id; keep failing loudly
    # instead of silently writing NaN paths.
    raise IndexError('image ids not found in img_df: {}'.format(sorted(missing_ids)))
coord_df['img_file'] = coord_df['image_id'].map(filename_by_id)



In [12]:
# Preview the first 10 rows of coord_df, including the img_file column.
coord_df.head(10)



Unnamed: 0,id,image_id,x,y,w,h,img_file
0,1,1,385,91,68,98,./data/cimalab/bening/545/1.jpg
1,2,2,271,72,100,143,./data/cimalab/bening/545/2.jpg
2,3,3,292,101,79,56,./data/cimalab/bening/549/1.jpg
3,4,4,258,95,69,53,./data/cimalab/bening/549/2.jpg
4,5,5,235,71,104,138,./data/cimalab/bening/550/1.jpg
5,6,6,204,66,160,166,./data/cimalab/bening/550/2.jpg
6,7,7,366,69,115,79,./data/cimalab/bening/550/3.jpg
7,8,8,253,109,65,78,./data/cimalab/bening/551/1.jpg
8,9,9,252,116,61,82,./data/cimalab/bening/551/2.jpg
9,10,10,261,121,59,79,./data/cimalab/bening/551/3.jpg


In [13]:
# Preview the first 3 rows of img_df.
img_df.head(3)

Unnamed: 0,id,case_id,filename,mark
0,1,1,./data/cimalab/bening/545/1.jpg,"[{""points"": [{""x"": 413, ""y"": 91}, {""x"": 406, ""..."
1,2,1,./data/cimalab/bening/545/2.jpg,"[{""points"": [{""x"": 291, ""y"": 76}, {""x"": 281, ""..."
2,3,5,./data/cimalab/bening/549/1.jpg,"[{""points"": [{""x"": 292, ""y"": 123}, {""x"": 293, ..."


In [21]:
# Shuffle once, then cut into 80% train / 10% validate / 10% test.
# DataFrame.sample() draws from NumPy's random state, not the stdlib `random` module,
# so random.seed() alone does not make the shuffle repeatable; random_state does.
SPLIT_SEED = 5
random.seed(SPLIT_SEED)  # kept so any later stdlib-random use stays seeded as before
new_df = coord_df.copy()
shuffled = new_df.sample(frac=1, random_state=SPLIT_SEED)

# Same cut points as before, applied with positional slices instead of np.split.
train_end, validate_end = int(.8*len(new_df)), int(.9*len(new_df))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
# NOTE(review): the split is per coord row, so rows sharing an image_id can land in
# different splits — confirm that is acceptable.

In [22]:
# NOTE: head(5) is not the last expression of the cell, so the notebook does not display it.
train.head(5)
# image_id value of every row in the training split, as a NumPy array.
num_list = train.image_id.values


[1,
 2,
 4,
 5,
 6,
 8,
 9,
 10,
 11,
 13,
 14,
 16,
 17,
 18,
 19,
 22,
 23,
 24,
 25,
 31,
 34,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 46,
 47,
 48,
 50,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 71,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 109,
 110,
 111,
 112,
 112,
 113,
 114,
 116,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 139,
 139,
 140,
 140,
 142,
 142,
 143,
 143,
 144,
 144,
 145,
 145,
 146,
 147,
 148,
 150,
 151,
 152,
 153,
 154,
 156,
 158,
 158,
 159,
 159,
 160,
 160,
 161,
 161,
 165,
 165,
 166,
 166,
 167,
 167,
 168,
 169,
 169,
 170,
 170,
 171,
 171,
 173,
 173,
 174,
 175,
 175,
 176,
 178,
 179,
 179,
 180,
 181,
 182,
 182,
 183,
 184,
 184,
 185,
 186,
 187,
 189,
 189,
 190,
 191,
 192,
 193,

In [15]:
# DARKNET/AlexeyAB implementation
# def find_center_scaled(x, y, w, h):
#     # pass in x, y, w, h from df.values
#     end_x = x + w
#     end_y = y - h
#     center_x = (x + end_x)/2
#     center_y = (y + end_y)/2
#     # method suggested here: https://github.com/AlexeyAB/Yolo_mark/issues/60
#     scaled_x = center_x/560
#     scaled_y = center_y/360
#     scaled_w = w/560
#     scaled_h = h/360
    
#     return scaled_x, scaled_y, scaled_w, scaled_h


# def write_files(df_name):
#     str_name =[x for x in globals() if globals()[x] is df_name][0]
#     txt_files = []
#     num_list = df_name.image_id.values
#     for num in num_list:
#         df = df_name[df_name.image_id == num]
#         dim = df.values[0]
#         s_x, s_y, s_w, s_h = find_center_scaled(dim[2], dim[3], dim[4], dim[5])
#         f = open('./data/{}_annot/img_{}.txt'.format(str_name, num),"w+")
#         f.write('1 {} {} {} {}'.format(s_x, s_y, s_w, s_h))
#         f.close()
#         txt_files.append('./data/{}/img_{}.txt'.format(str_name, num))
#         img = Image.open(df.img_file.values[0])
#         img.save('./data/'+str_name+'/img_'+str(num)+'.jpg', 'JPEG')
#     f = open('./data/{}.txt'.format(str_name), 'w+')
#     for t in txt_files:
#         f.write(t+'\n')
#     f.close()


In [25]:
# VOC format
import xml.etree.ElementTree as ET
import xml.dom.minidom
def prep_for_VOC(x, y, w, h, folder, filename, scale=.75,
                 base_dir='/Users/kathleensullivan/PythonNYCDSA/Machine_Learning/capstone/data/',
                 img_size=None):
    """Build a Pascal-VOC style ``<annotation>`` element for one bounding box.

    Parameters
    ----------
    x, y, w, h : number
        Box origin and extent. Each value is multiplied by ``scale`` and rounded
        before use.
    folder : str
        Any string containing 'train', 'valid' or 'test'; it is collapsed to
        'train', 'validate' or 'test' respectively (first match in that order).
        Other strings are used unchanged.
    filename : str
        Image file name written to ``<filename>`` and appended to ``<path>``.
    scale : float, optional
        Factor applied to x, y, w and h. Defaults to the previously hard-coded 0.75.
    base_dir : str, optional
        Prefix of ``<path>``; joined by plain concatenation, so it must end with a
        path separator. Defaults to the previously hard-coded absolute directory.
    img_size : (width, height) or None, optional
        Values written to ``<size>``. When None (the default) the scaled box
        width/height are written there, exactly as before.
        NOTE(review): VOC readers usually expect the image dimensions in ``<size>``,
        so callers that know them should pass ``img_size``.

    Returns
    -------
    xml.etree.ElementTree.Element
        The ``<annotation>`` root with a single ``<object>`` named 'lesion'.
    """
    def add_text(parent, tag, text):
        # Create <tag>text</tag> under parent and hand the new element back.
        child = ET.SubElement(parent, tag)
        child.text = text
        return child

    # Collapse names such as 'train_annot' to the bare split name.
    for marker, split_name in (('train', 'train'), ('valid', 'validate'), ('test', 'test')):
        if marker in folder:
            folder = split_name
            break

    x, y, w, h = (round(value * scale) for value in (x, y, w, h))
    size_w, size_h = (w, h) if img_size is None else img_size

    root = ET.Element('annotation')
    add_text(root, 'folder', folder)
    add_text(root, 'filename', filename)
    add_text(root, 'path', base_dir + folder + '/' + filename)
    add_text(ET.SubElement(root, 'source'), 'database', 'Unknown')

    sz_elem = ET.SubElement(root, 'size')
    add_text(sz_elem, 'width', str(size_w))
    add_text(sz_elem, 'height', str(size_h))
    add_text(sz_elem, 'depth', '3')

    add_text(root, 'segmented', '0')

    obj_elem = ET.SubElement(root, 'object')
    add_text(obj_elem, 'name', 'lesion')
    add_text(obj_elem, 'pose', 'Unspecified')
    add_text(obj_elem, 'truncated', '0')
    add_text(obj_elem, 'difficult', '0')

    # Child order (xmin, xmax, ymin, ymax) is kept as it was.
    bb_elem = ET.SubElement(obj_elem, 'bndbox')
    add_text(bb_elem, 'xmin', str(x))
    add_text(bb_elem, 'xmax', str(x + w))
    add_text(bb_elem, 'ymin', str(y))
    add_text(bb_elem, 'ymax', str(y + h))
    return root
    

In [26]:
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom


def write_files(df_name, name=None):
    """Write one VOC XML annotation and one PNG per image id in ``df_name``.

    For every distinct ``image_id`` (in first-seen order) this writes:
      * ``./data/<name>_annot/img_<id>.xml`` — the annotation, with one ``<object>``
        per row of that image (previously only the first row was written, so the
        extra boxes of multi-box images were dropped);
      * ``./data/<name>/img_<id>.png`` — the source image converted to mode 'LA'.
    Finally ``./data/<name>.txt`` is written with one listed path per image.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Must provide the columns image_id, x, y, w, h and img_file.
    name : str, optional
        Split name used in the output paths. When omitted, the name of the first
        module-level variable bound to this exact DataFrame object is used, as
        before; that lookup raises IndexError if no such variable exists.

    Notes
    -----
    The target directories must already exist.
    NOTE(review): the list file records ``./data/<name>/img_<id>.xml`` although the
    XML is written under ``./data/<name>_annot/`` — kept as-is, confirm with the consumer.
    NOTE(review): images are saved as 2-channel 'LA' while prep_for_VOC writes a
    depth of '3' — confirm which one is intended.
    """
    if name is None:
        # Legacy lookup: find the global whose value *is* the frame passed in.
        str_name = [x for x in globals() if globals()[x] is df_name][0]
    else:
        str_name = name
    annot_folder = str_name + '_annot'

    txt_files = []
    # groupby visits each image id once, so files are not rewritten and the list
    # file gets no duplicate entries when an image has several boxes.
    for num, rows in df_name.groupby('image_id', sort=False):
        img_name = 'img_{}.png'.format(num)
        # Columns are read by name rather than by position; tolist() yields plain
        # Python numbers so the rounded values print without a decimal part.
        boxes = rows[['x', 'y', 'w', 'h']].values.tolist()

        root = prep_for_VOC(*boxes[0], annot_folder, img_name)
        for box in boxes[1:]:
            # Reuse the element builder and move its <object> into the main tree.
            root.append(prep_for_VOC(*box, annot_folder, img_name).find('object'))

        # Pretty-printing the root element (not the document) omits the XML
        # declaration, which was previously sliced off by character count.
        dom = minidom.parseString(ET.tostring(root))
        xmlstr = dom.documentElement.toprettyxml().strip()
        with open('./data/{}_annot/img_{}.xml'.format(str_name, num), 'w') as f:
            f.write(xmlstr)
        txt_files.append('./data/{}/img_{}.xml'.format(str_name, num))

        # Close the source file as soon as the converted copy has been saved.
        with Image.open(rows.img_file.values[0]) as src:
            src.convert('LA').save('./data/'+str_name+'/img_'+str(num)+'.png', 'PNG')

    with open('./data/{}.txt'.format(str_name), 'w+') as f:
        f.writelines(t + '\n' for t in txt_files)
    

In [19]:


# Down-sample the splits for a quick trial run of the file writers.
# The cell used to start with `train=train.loc[]`; an empty subscript is a
# SyntaxError, so the whole cell could not run. It selected nothing and is dropped.
# random_state makes the subsets repeatable across kernel restarts.
train = train.sample(30, random_state=5)
validate = validate.sample(10, random_state=5)


In [20]:
# write partial files for testing
# write_files(train)
# write_files(validate)

In [21]:
#### full files
# write_files(validate)
# write_files(train)
# write_files(test)


In [22]:
# Work on a copy so later edits to key_points do not touch coord_df.
key_points = coord_df.copy()
# Preview the first 5 rows (head() default).
key_points.head()

Unnamed: 0,id,image_id,x,y,w,h,img_file
0,1,1,385,91,68,98,./data/cimalab/bening/545/1.jpg
1,2,2,271,72,100,143,./data/cimalab/bening/545/2.jpg
2,3,3,292,101,79,56,./data/cimalab/bening/549/1.jpg
3,4,4,258,95,69,53,./data/cimalab/bening/549/2.jpg
4,5,5,235,71,104,138,./data/cimalab/bening/550/1.jpg


In [None]:
# Create ground truth files

def write_gr_truth(df_name, name=None):
    """Write ground-truth box files and image copies for every image id in ``df_name``.

    For every distinct ``image_id`` (in first-seen order) this writes:
      * ``./data/<name>_annot/img_<id>.txt`` — one line per box of that image in the
        form ``lesion <left> <top> <right> <bottom>``, each coordinate divided by 4
        and rounded;
      * ``./Object-Detection-Metrics/groundtruth/<name>/img_<id>.png`` — the source
        image re-saved as PNG.
    Finally ``./data/<name>.txt`` is written with one listed path per image.

    The previous version could not be parsed at all (unbalanced quote in the
    format string) and would have kept only the first box of a multi-box image.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Must provide the columns image_id, x, y, w, h and img_file.
    name : str, optional
        Split name used in the output paths. When omitted, the name of the first
        module-level variable bound to this exact DataFrame object is used; that
        lookup raises IndexError if no such variable exists.

    Notes
    -----
    The target directories must already exist.
    NOTE(review): boxes are divided by 4 but the image is saved at its original
    size — confirm the evaluation expects that scale.
    NOTE(review): the list file records ``./data/<name>/img_<id>.txt`` although the
    text file is written under ``./data/<name>_annot/`` — kept as-is.
    """
    if name is None:
        # Legacy lookup: find the global whose value *is* the frame passed in.
        str_name = [x for x in globals() if globals()[x] is df_name][0]
    else:
        str_name = name

    txt_files = []
    # One pass per image id, so repeated ids neither rewrite files nor duplicate
    # entries in the list file.
    for num, rows in df_name.groupby('image_id', sort=False):
        box_lines = []
        # Columns are read by name rather than by position.
        for x, y, w, h in rows[['x', 'y', 'w', 'h']].values.tolist():
            left, top = round(x/4), round(y/4)
            right, bottom = round((w + x)/4), round((h + y)/4)
            box_lines.append('lesion {} {} {} {}'.format(left, top, right, bottom))

        with open('./data/{}_annot/img_{}.txt'.format(str_name, num), "w+") as f:
            f.write('\n'.join(box_lines))
        txt_files.append('./data/{}/img_{}.txt'.format(str_name, num))

        # Close the source file as soon as the copy has been saved.
        with Image.open(rows.img_file.values[0]) as img:
            img.save('./Object-Detection-Metrics/groundtruth/'+str_name+'/img_'+str(num)+'.png', 'PNG')

    with open('./data/{}.txt'.format(str_name), 'w+') as f:
        f.writelines(t + '\n' for t in txt_files)