In [1]:
import cv2
import pandas as pd
import numpy as np
import os
import glob
import shutil
import pytesseract
import re
import time
import argparse
from statistics import mode
import struct
import imghdr
from collections import Counter

In [2]:
!pwd

/home/mark/lambda/pill_identifier


In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
output_dir = "./results"
input_dir = "./images/"

In [5]:
def apply_threshold(img, argument):
    switcher = {
        1: cv2.threshold(cv2.GaussianBlur(img, (9, 9), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
        2: cv2.threshold(cv2.GaussianBlur(img, (7, 7), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
        3: cv2.threshold(cv2.GaussianBlur(img, (5, 5), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
        4: cv2.threshold(cv2.medianBlur(img, 5), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
        5: cv2.threshold(cv2.medianBlur(img, 3), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
        6: cv2.adaptiveThreshold(cv2.GaussianBlur(img, (5, 5), 0), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2),
        7: cv2.adaptiveThreshold(cv2.medianBlur(img, 3), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2),
    }
    return switcher.get(argument, "Invalid method")


# def crop_image(img, start_x, start_y, end_x, end_y):
#     cropped = img[start_y:end_y, start_x:end_x]
#     return cropped

In [6]:

def get_string(img_path, method):
    # Read image using opencv
    img = cv2.imread(img_path)
    file_name = os.path.basename(img_path).split('.')[0]
    file_name = file_name.split()[0]

    output_path = os.path.join(output_dir, file_name)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Crop the areas where provision number is more likely present
#     img = crop_image(img, pnr_area[0], pnr_area[1], pnr_area[2], pnr_area[3])
    # img = cv2.resize(img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)

    # Convert to gray
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply dilation and erosion to remove some noise
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    #  Apply threshold to get image with only black and white
    img = apply_threshold(img, method)
    save_path = os.path.join(output_path, file_name + "_filter_" + str(method) + ".jpg")
    cv2.imwrite(save_path, img)

    # Recognize text with tesseract for python
    result = pytesseract.image_to_string(img, lang="eng")

    return result

In [7]:
# !wget https://pillbox.nlm.nih.gov/downloads/pillbox_production_images_full_201805.zip

In [8]:
# !unzip pillbox_production_images_full_201805.zip

In [9]:
url = 'pillbox_201805.tab'
df_ = pd.read_csv(url,sep='\t')

In [10]:
df = pd.concat([df_[(df_.SPLIMPRINT.notnull()) & (df_.image_id.notnull()) & (df_.image_id != 'no_product_image')][['image_id','SPLIMPRINT']],pd.read_csv('pill_ids.csv')],ignore_index=True)

In [11]:
df.head()

Unnamed: 0,image_id,SPLIMPRINT
0,50844-0175-10_NLMIMAGE10_5135A8AD,44;175
1,00093720656,9;3;7206
2,604290566,WATSON;744;1
3,007811034,GG55;5
4,59762-0145-01_NLMIMAGE10_333E99F4,FELDENE;PFIZER;323


In [12]:
df[df.image_id == '005361500']

Unnamed: 0,image_id,SPLIMPRINT
8428,5361500,CPC;1490


In [13]:
df.to_csv('pillbox_201904_clean.csv')

Index(['image_id', ' SPLIMPRINT'], dtype='object')
Index(['image_id', ' SPLIMPRINT'], dtype='object')

In [14]:
df.dtypes

image_id      object
SPLIMPRINT    object
dtype: object

In [15]:
df[df['SPLIMPRINT'].str.contains("M;L;15")]

Unnamed: 0,image_id,SPLIMPRINT
5034,00378-1823-01_D723EBAF,M;L;15


In [16]:
df[df['SPLIMPRINT'].str.contains("3229")]

Unnamed: 0,image_id,SPLIMPRINT
6236,00002-3229-30_3E1E1F50,LILLY;3229;40;mg


In [17]:
#regex = r"([\w\d]+)(?!\s+\d+\s+mg)"
regex = r"([\w\d]+)"

In [18]:
im_names = glob.glob(os.path.join(input_dir, '*.png')) + \
           glob.glob(os.path.join(input_dir, '*.jpg')) + \
           glob.glob(os.path.join(input_dir, '*.jpeg'))

In [19]:
print('len(im_names)',len(im_names))
df.shape

len(im_names) 8041


(8437, 2)

In [20]:
# dft = df.copy(deep=True)

In [21]:
def get_image_size(fname):
    '''Determine the image type of fhandle and return its size.
    from draco'''
    with open(fname, 'rb') as fhandle:
        head = fhandle.read(24)
        if len(head) != 24:
            return
        if imghdr.what(fname) == 'png':
            check = struct.unpack('>i', head[4:8])[0]
            if check != 0x0d0a1a0a:
                return
            width, height = struct.unpack('>ii', head[16:24])
        elif imghdr.what(fname) == 'gif':
            width, height = struct.unpack('<HH', head[6:10])
        elif imghdr.what(fname) == 'jpeg':
            try:
                fhandle.seek(0) # Read 0xff next
                size = 2
                ftype = 0
                while not 0xc0 <= ftype <= 0xcf:
                    fhandle.seek(size, 1)
                    byte = fhandle.read(1)
                    while ord(byte) == 0xff:
                        byte = fhandle.read(1)
                    ftype = ord(byte)
                    size = struct.unpack('>H', fhandle.read(2))[0] - 2
                # We are at a SOFn block
                fhandle.seek(1, 1)  # Skip `precision' byte.
                height, width = struct.unpack('>HH', fhandle.read(4))
            except Exception: #IGNORE:W0703
                return
        else:
            return
        return width, height

In [22]:

# def get_counter(imprints, tag):
#     dirname = os.path.basename(dirpath)
#     ann_dirpath = join(dirpath, 'ann')
#     letters = ''
#     lens = []
#     for filename in os.listdir(ann_dirpath):
#         json_filepath = join(ann_dirpath, filename)
#         ann = json.load(open(json_filepath, 'r'))
#         tags = ann['tags']
#         if tag in tags:
#             description = ann['description']
#             lens.append(len(description))
#             letters += description
#     print('Max pill length in "%s":' % dirname, max(Counter(lens).keys()))
#     return Counter(letters)
# c_val = get_counter('../data/anpr_ocr__train', 'val')
# c_train = get_counter('../data/anpr_ocr__train', 'train')
# letters_train = set(c_train.keys())
# letters_val = set(c_val.keys())
# if letters_train == letters_val:
#     print('Letters in train and val do match')
# else:
#     raise Exception()
# # print(len(letters_train), len(letters_val), len(letters_val | letters_train))
# letters = sorted(list(letters_train))
# print('Letters:', ' '.join(letters))

NameError: name 'dirpath' is not defined

In [44]:
!rm json/*.json

In [50]:
worked = 0
failed = 0
i = 0
# imprints = []

for name_ in im_names:
    #     print('name_', name_)
    #     print('i',i)
    name = name_[9:-4]
    try:
        if len(df[df.image_id == name]) != 1:
            print(f"name {name} has {len(df[df.image_id == name])} rows")
        row = df[df.image_id == name].iloc[0]
        spli = row['SPLIMPRINT']
#         print('spli', spli)
        if spli.find(' OR ') > 0:
            spli = spli[0:spli.index(' OR ')]
        if spli.find('[or];') > 0:
            spli = spli[0:spli.index('[or];')]
        imprint = spli
#         if len(spli) > 10:
#             if spli[0:6] == "LILLY":
#                 spli = spli.split(';')[1]
        
#         print(row.index)
#         imprint = spli.replace(';', '')           
#         print('imprint', imprint)
#         imprints.extend(imprint)
    except:
        print(f'name {name} not found for {name_} {row["SPLIMPRINT"]} | {imprint} | {spli}')
#         %debug         
        break   
    with open('./json/' + name + '.png.json', "wt") as file:
#         print('writing:' + './json/' + name + '.png.json')
        w, h = get_image_size(name_)
        if i < 7000:
            file.write(f'{{"tags": ["train"], "description": "{imprint}", "objects": ["{row["SPLIMPRINT"]}"],\
                       "size": {{"height": {h}, "width": {w}}}}}')
        else:
            file.write(f'{{"tags": ["val"], "description": "{imprint}", "objects": ["{row["SPLIMPRINT"]}"],\
                       "size": {{"height": {h}, "width": {w}}}}}')
#     print('name', name)

    i += 1
    continue
#     print('row[8]', row[8])
    imprint = row[8].split(';')[1] if len(
        row[8]) > 10 else row[8].replace(';', '')
#     print('inprint', imprint)
    g1 = None
    for ti in range(1, 8):
        r = get_string(name_, ti)
        try:
            m = re.match(regex, r)
            g1 = m.groups(1)[0] if m.groups(1) else None
        except:
            g1 = None
#         print(f'correct: {g1 == imprint} result: {r} match: {g1}')
        if g1 == imprint:
            break
    if g1 == imprint:
        worked += 1
    else:
        failed += 1
    if i % 100 == 0:
        print('i', i)
    i += 1
print('worked', worked)
print('failed', failed)

name 50419-0403-01_SPLIMAGE30_9A194D0A has 2 rows
name 499380101 has 2 rows
name 65597-0101-30_35131A98 has 2 rows
name 00093019901 has 3 rows
name 524270286 has 2 rows
name 60429-203_M_LH3 has 2 rows
name 00093745701 has 7 rows
name 00172409760 has 8 rows
name 633040535 has 2 rows
name 605052551 has 2 rows
name 00093213101 has 2 rows
name 4ec4203e-f1f8-0398-e054-00144ff88e88 has 2 rows
name 00093075601 has 2 rows
name 615700079 has 2 rows
name 00555015904 has 4 rows
name 000548183 has 2 rows
name 499380102 has 2 rows
name 2ab8564f-f95f-7512-e054-00144ff88e88 has 3 rows
name 00093517220 has 2 rows
name 60429-201_M_LH1 has 2 rows
name 000548174 has 2 rows
name 000548179 has 2 rows
name 00093738456 has 5 rows
name 00093777201 has 2 rows
name 50111-0434-01_SPLIMAGE30_F22FF97F has 2 rows
name 00093226801 has 3 rows
name 00093112201 has 2 rows
name 00781-1852-20_NLMIMAGE10_50402831 has 2 rows
name 633040828 has 2 rows
name 00555015802 has 2 rows
name 00555083102 has 2 rows
name 000544297 ha

In [None]:
image_id = '50419-0403-01_SPLIMAGE30_9A194D0A'
df[df.image_id == image_id]

In [None]:
print('worked', worked)
print('failed', failed)

In [None]:
m = re.match(regex,"3227 10 mg")

In [None]:
m.groups(1)

In [None]:
#left <img src="/images/pills/nlm/00002-3229-30_SPLIMAGE30_3E1E1F50.jpg
#right <img src="/images/pills/fio/LLY32291.JPG" 