# What is this?

Because my artificial data set didn't work, I need to collect real examples of numbers and symbols from real KenKen puzzles. This notebook does that by cutting 28x28 images of the numbers and symbols out of example puzzles and saving them so they can be categorized manually afterwards.

In [276]:
import os
import string
import random
import math
from copy import deepcopy
from hashlib import sha1

import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np 

from scipy import ndimage as ndi
from scipy.misc import imsave

from skimage.transform import resize
from skimage.measure import find_contours
from skimage.morphology import erosion, dilation, rectangle

%matplotlib inline

In [232]:
# Folder holding the example puzzle images that are read as input
IMAGE_DIR = 'puzzle_examples/'
# Folder that receives the processed puzzles and the extracted symbol images
SAVED_IMAGE_DIR = 'true_kenken_num_images/'
# Random filename suffixes handed out so far (guards against name collisions)
UNIQUE_IDS = set()
# Fingerprints of the symbol images saved so far (used to skip repeats)
UNIQUE_IMAGES = set()
# Side length, in pixels, of every extracted symbol image
IMAGE_SIZE = 28
# All-ones (blank) image; derived from IMAGE_SIZE so the two can never disagree
NULL_IMAGE = np.ones((IMAGE_SIZE, IMAGE_SIZE))



In [77]:
#build a random suffix used to name a saved image
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Characters are picked independently with ``random.choice``, so repeats
    are possible; by default the alphabet is upper-case letters plus digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

In [294]:
#Write one extracted image into SAVED_IMAGE_DIR; it is categorized by hand later
def save_image(image, x,y):
    """Save ``image`` as ``<x>_<y><random suffix>.jpg`` inside SAVED_IMAGE_DIR.

    ``x`` and ``y`` must already be strings (they are concatenated into the
    file name). The random suffix is redrawn until it is one that has not
    been recorded in UNIQUE_IDS, and is then recorded there.
    """
    # keep drawing until the suffix is new for this session
    while True:
        suffix = id_generator()
        if suffix not in UNIQUE_IDS:
            break
    UNIQUE_IDS.add(suffix)

    filename = ''.join([x, '_', y, suffix, '.jpg'])
    imsave(os.path.join(SAVED_IMAGE_DIR, filename), image)

In [79]:
#delete every JPG in the saved folder
#and reset UNIQUE_IMAGES and UNIQUE_IDS
def clear_saved_images():
    """Remove all files ending in ``jpg`` from SAVED_IMAGE_DIR and empty both tracking sets."""
    jpg_paths = [os.path.join(SAVED_IMAGE_DIR, name)
                 for name in os.listdir(SAVED_IMAGE_DIR)]
    for path in jpg_paths:
        if path.endswith('jpg'):
            os.remove(path)

    # forget every suffix and image fingerprint recorded so far
    for tracker in (UNIQUE_IDS, UNIQUE_IMAGES):
        tracker.clear()

In [310]:
def binarize(matrix, threshold=0.3):
    """Return a two-level copy of ``matrix``.

    Every value ``<= threshold`` becomes 0.0 and every other value becomes
    1.0. The input is never modified.

    Parameters
    ----------
    matrix : numpy.ndarray or nested sequence
        Pixel values. Arrays of any number of dimensions are accepted;
        nested sequences (e.g. a list of lists) must be two-dimensional.
    threshold : float, optional
        Cut-off value; values equal to the threshold map to 0.0.

    Returns
    -------
    Same kind of object as ``matrix`` (an array keeps its dtype).
    """
    if isinstance(matrix, np.ndarray):
        # A single vectorised comparison replaces a Python-level loop over
        # every pixel, which was very slow on the 200x200 puzzle images.
        two_level = np.where(matrix <= threshold, 0.0, 1.0)
        return two_level.astype(matrix.dtype)

    # Fallback for plain nested sequences: copy, then threshold element-wise
    temp = deepcopy(matrix)
    for row in temp:
        for n in range(len(row)):
            row[n] = 0.0 if row[n] <= threshold else 1.0
    return temp

In [351]:
def _first_black_pixel(binary, rows, cols, default):
    """Return ``[row, col]`` of the first zero pixel found scanning ``rows`` x ``cols``, else ``default``."""
    for m in rows:
        for n in cols:
            if binary[m][n] == 0:
                return [m, n]
    return default


def _blank_grid_lines(image, min_black=45):
    """Set to 1 (white), in place, every row and then every column with at least ``min_black`` zero pixels."""
    black_per_row = (image == 0).sum(axis=1)
    image[black_per_row >= min_black, :] = 1
    # columns are counted after the rows were cleared, as in a sequential scan
    black_per_col = (image == 0).sum(axis=0)
    image[:, black_per_col >= min_black] = 1
    return image


def _preprocess_puzzle(image_data):
    """Turn a grayscale puzzle image into a 200x200 two-level image with borders and grid lines removed."""
    resized_image = resize(image_data, (200, 200))
    binar = binarize(resized_image, 0.6)

    # Locate the outer border by looking for a black pixel in the 10x10
    # window of the top-left and of the bottom-right corner. If a window
    # holds no black pixel, fall back to the full frame (no cropping) rather
    # than failing or reusing corners found for a previous image.
    up_left = _first_black_pixel(binar, range(10), range(10), [0, 0])
    low_right = _first_black_pixel(binar, range(199, 189, -1),
                                   range(199, 189, -1), [199, 199])

    cropped = resized_image[up_left[0]:low_right[0]+1, up_left[1]:low_right[1]+1]
    resized_image = resize(cropped, (200, 200))
    undilated = binarize(resized_image, 0.4)

    # dilate the binarized image, then binarize the dilation again
    final = binarize(dilation(undilated, rectangle(2, 2)))

    # inside the upper part of each of the 4x4 cells (50 px pitch) restore
    # the undilated pixels
    for i in range(4):
        for j in range(4):
            final[i*50+3:i*50+25, j*50+3:j*50+44] = undilated[i*50+3:i*50+25, j*50+3:j*50+44]

    # Remove borders and grid lines: any row/column with >= 45 black pixels
    _blank_grid_lines(final, 45)

    # add some final erosion (black) to fill out numbers and ensure they're connected
    return binarize(erosion(final, rectangle(2, 1)), 0.)


def _drop_nested_boxes(boxes):
    """Return the boxes that are not contained in a different box of the same list."""
    kept = []
    for box in boxes:
        nested = False
        for other in boxes:
            if other != box and \
               box[0] >= other[0] and box[1] <= other[1] and \
               box[2] >= other[2] and box[3] <= other[3]:
                nested = True
                break
        if not nested:
            kept.append(box)
    return kept


def _merge_aligned_boxes(boxes, tolerance=3):
    """Merge the first group of boxes whose column midpoints differ by less than ``tolerance``.

    Mainly used for division symbols, whose parts are stacked vertically.
    Only one group is merged; when no group exists ``boxes`` is returned
    unchanged.
    """
    midpoints = [(box[3] - box[2]) / 2.0 + box[2] for box in boxes]
    for mid in midpoints:
        group = [j for j, m in enumerate(midpoints) if abs(m - mid) < tolerance]
        if len(group) > 1:
            members = np.array([boxes[j] for j in group])
            merged = np.array([members[:, 0].min(), members[:, 1].max(),
                               members[:, 2].min(), members[:, 3].max()])
            others = [box for j, box in enumerate(boxes) if j not in group]
            return others + [merged]
    return boxes


def _symbol_boxes(region):
    """Return ``[row_min, row_max, col_min, col_max]`` boxes for the symbols found in ``region``."""
    # one bounding box per contour line
    boxes = [[ctr[:, 0].min(), ctr[:, 0].max(), ctr[:, 1].min(), ctr[:, 1].max()]
             for ctr in find_contours(region, .9)]
    # This isn't very efficient and will likely need a better algorithm for
    # images taken with a camera but it works well for now
    return _merge_aligned_boxes(_drop_nested_boxes(boxes))


def fetch_images():
    """Extract number/symbol images from every puzzle image in IMAGE_DIR.

    For each file: read it as grayscale, preprocess it (crop to the border,
    binarize, strip grid lines), save the processed puzzle to
    SAVED_IMAGE_DIR under the original file name, then search the 4x4 grid
    of cell regions for contours. Each bounding box found is resized to
    IMAGE_SIZE x IMAGE_SIZE and written with ``save_image`` unless it equals
    NULL_IMAGE or an identical image was already saved (tracked in
    UNIQUE_IMAGES by SHA-1 hex digest). Progress is printed; files that
    raise IOError are reported and skipped. Returns None.
    """
    # NOTE(review): ndi.imread and scipy.misc.imsave were removed in
    # SciPy 1.2, so this function needs an older SciPy to run as written.
    for image in os.listdir(IMAGE_DIR):
        image_file = os.path.join(IMAGE_DIR,image)
        print('Reading ', image_file)
        try:
            image_data = ndi.imread(image_file, mode = 'L')
            final = _preprocess_puzzle(image_data)

            #save the processed puzzle image to the new directory
            imsave(os.path.join(SAVED_IMAGE_DIR, image), final)

            total_images_found = 0
            # x is the cell row, y the cell column of the 4x4 puzzle grid
            for x in range(4):
                for y in range(4):
                    #the region of the cell that is searched for contours
                    region = final[x*50+3:x*50+39, y*50+3:y*50+46]

                    num_sym_found = 0
                    for box in _symbol_boxes(region):
                        try:
                            roi = region[int(box[0]):int(box[1]+1), int(box[2]):int(box[3])+1]
                            roi = resize(roi, (IMAGE_SIZE, IMAGE_SIZE))

                            #skip null images
                            if np.array_equal(roi, NULL_IMAGE):
                                continue

                            # hash objects compare by identity, so the hex
                            # digest is what has to be stored and looked up
                            digest = sha1(np.ascontiguousarray(roi).tobytes()).hexdigest()
                            if digest not in UNIQUE_IMAGES:
                                save_image(roi, str(x),str(y))
                                UNIQUE_IMAGES.add(digest)
                                num_sym_found += 1
                        except ValueError as e:
                            print('Problem saving image, ', e)

                    print('{} numbers/symbols found in quadrant {},{}'.format(num_sym_found, x,y))
                    total_images_found+=num_sym_found

            print('{} numbers/symbols found in image'.format(total_images_found))

        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')



In [369]:
# Remove previously saved JPGs and reset the ID/image tracking sets,
# then extract the symbols from every puzzle image in IMAGE_DIR
clear_saved_images()
fetch_images()

('Reading ', 'puzzle_examples/.DS_Store')
('Could not read:', 'puzzle_examples/.DS_Store', ':', IOError("cannot identify image file 'puzzle_examples/.DS_Store'",), "- it's ok, skipping.")
('Reading ', 'puzzle_examples/22.jpg')
1 numbers/symbols found in quadrant 0,0
2 numbers/symbols found in quadrant 0,1
0 numbers/symbols found in quadrant 0,2
2 numbers/symbols found in quadrant 0,3
0 numbers/symbols found in quadrant 1,0
1 numbers/symbols found in quadrant 1,1
2 numbers/symbols found in quadrant 1,2
0 numbers/symbols found in quadrant 1,3
3 numbers/symbols found in quadrant 2,0
0 numbers/symbols found in quadrant 2,1
0 numbers/symbols found in quadrant 2,2
0 numbers/symbols found in quadrant 2,3
2 numbers/symbols found in quadrant 3,0
0 numbers/symbols found in quadrant 3,1
2 numbers/symbols found in quadrant 3,2
0 numbers/symbols found in quadrant 3,3
15 numbers/symbols found in image
