In [None]:
import functools
import glob
import os

import matplotlib.pyplot as plt
import matplotlib as mpl

import PIL.Image
import PIL.ImageFilter

from tqdm import tqdm

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, TapTool, OpenURL
from bokeh.transform import jitter

# Route Bokeh output (the show() calls in later cells) into the notebook.
output_notebook()

In [None]:
# Ground-truth file lists: one image path per line, at most 1000 entries each.
# The files are read inside `with` blocks so the handles are closed right away,
# and empty lines (typically the one produced by a trailing newline) are
# dropped so they are not later treated as image paths.
with open("blanks.list") as blanks_file:
    blanks = [name for name in blanks_file.read().splitlines() if name][:1000]
with open("notblanks.list") as notblanks_file:
    notblanks = [name for name in notblanks_file.read().splitlines() if name][:1000]

In [None]:
def image_crop_ratio(image, left, top, right, bottom):
    """Crop `image` to a box expressed as fractions of its width and height.

    Each of `left`, `top`, `right` and `bottom` must lie in [0, 1] (checked
    with `assert`). `left`/`right` are scaled by the image width and
    `top`/`bottom` by the image height, truncated to integers, and passed to
    `image.crop()`, whose result is returned.
    """
    for ratio in (left, top, right, bottom):
        assert 0 <= ratio <= 1
    width, height = image.size[0], image.size[1]
    box = (
        int(width * left),
        int(height * top),
        int(width * right),
        int(height * bottom),
    )
    return image.crop(box)

In [None]:
def image_histogram_normalized(image, mask=None):
    """Return the grayscale histogram of `image` as fractions of the pixel count.

    The image is converted to mode "L" and its histogram is taken (optionally
    restricted by `mask`, which is forwarded to `histogram()`). Every count is
    divided by the sum of all counts, so the 256 returned values add up to 1.
    """
    counts = image.convert("L").histogram(mask)
    total = sum(counts)
    assert len(counts) == 256, "After grayscale conversion, image should have 256 levels of grey"
    return list(map(lambda count: count / total, counts))

In [None]:
def histogram_make_bins(histogram, bin_count):
    """Return the positions at which the running sum of `histogram` crosses
    successive multiples of 1/bin_count.

    Walks `histogram` while accumulating its values. Each time the running
    sum reaches the next threshold (1/bin_count, 2/bin_count, ...), the
    current index is recorded; one index may be recorded several times when
    a single entry crosses several thresholds. At most `bin_count - 1`
    indices are returned.
    """
    step = 1/bin_count
    threshold = step
    cumulative = 0
    crossings = []
    for level, share in enumerate(histogram):
        cumulative += share
        # A single level can satisfy several thresholds at once.
        while cumulative >= threshold:
            crossings.append(level)
            threshold += step
    keep = bin_count - 1
    return crossings[:keep]

In [None]:
def histogram_maximalize(histogram):
    """Rescale `histogram` so that its largest value becomes 1.

    Every entry is divided by the maximum entry; a new list is returned.
    """
    peak = max(histogram)
    scaled = []
    for value in histogram:
        scaled.append(value / peak)
    return scaled

In [None]:
def image_average(image):
    """Return the mean grayscale value of `image`, divided by 256.

    The image is converted to mode "L"; the sum of its pixel values is
    divided by the width, then the height, then 256.
    """
    grey_total = sum(image.convert("L").getdata())
    width, height = image.size[0], image.size[1]
    return grey_total / width / height / 256

In [None]:
def image_shrink(image, white_threshold=200, noise_threshold=0.2, thickness=16):
    """Trim mostly-white strips off the edges of `image` until none is left.

    The four border strips (left, right, top, bottom, in that order), each
    `thickness` pixels thick, are inspected. A strip counts as "quiet" when
    the fraction of its pixels whose grayscale value is below
    `white_threshold` is smaller than `noise_threshold`. The first quiet strip
    found is cropped away and the inspection restarts on the remainder.

    Returns the remaining image once no border strip is quiet (the very same
    object that was passed in when nothing was trimmed), or a 1x1 white
    mode-"L" image when either dimension has shrunk to `thickness` or less.

    Implemented as a loop rather than recursion: one strip is removed per
    step, so a large image combined with a small `thickness` would otherwise
    exceed Python's recursion limit.
    """
    while True:
        width, height = image.size[0], image.size[1]
        # if we shrunk the image all the way, return a single white pixel
        if width <= thickness or height <= thickness:
            return PIL.Image.new("L", (1, 1), 255)
        # (strip to check, number of pixels in the strip, rest of the image)
        zones = (
            # left
            ((0, 0, thickness, height),
             thickness * height,
             (thickness, 0, width, height)),
            # right
            ((width - thickness, 0, width, height),
             thickness * height,
             (0, 0, width - thickness, height)),
            # top
            ((0, 0, width, thickness),
             thickness * width,
             (0, thickness, width, height)),
            # bottom
            ((0, height - thickness, width, height),
             thickness * width,
             (0, 0, width, height - thickness)),
        )
        for strip_box, strip_pixel_count, remainder_box in zones:
            strip = image.crop(strip_box).convert("L")
            # Count directly instead of materialising a list of dark pixels.
            dark_pixels = sum(1 for p in strip.getdata() if p < white_threshold)
            if dark_pixels / strip_pixel_count < noise_threshold:
                image = image.crop(remainder_box)
                break
        else:
            # No border strip is quiet any more: this is the content box.
            return image

In [None]:
def image_improve_dynamic_range(image):
    """Return a contrast-stretched grayscale version of `image`.

    The image is converted to mode "L". With its pixel values sorted in
    ascending order, the value at index len//1000 is taken as the black point
    and the value at index len//5 as the white point. Pixel values are then
    mapped linearly (`pixel * scale + offset`, applied by `putdata`) so that
    the black point lands on 0 and the white point on 256.

    When no stretch can be computed -- the image has no pixels, or the black
    and white points coincide (e.g. a flat image), which previously raised
    IndexError / ZeroDivisionError -- the grayscale image is returned
    unchanged.
    """
    image = image.convert("L")
    pixels = sorted(image.getdata())
    if not pixels:
        return image
    white_threshold = pixels[len(pixels)//5]
    black_threshold = pixels[len(pixels)//1000]
    if white_threshold == black_threshold:
        # Zero-width range: there is nothing to stretch.
        return image
    scale = 256/(white_threshold-black_threshold)
    offset = -scale*black_threshold
    newimage = PIL.Image.new("L", image.size)
    newimage.putdata(image.getdata(), scale, offset)
    return newimage


In [None]:
# Glyph colour for each image label.
colors = {
    "notblank": "black",
    "blank": "red",
    "unknown": "yellow",
}

In [None]:
# Line dash style for each image label.
linestyles = {
    "notblank": "solid",
    "blank": "dotted",
    "unknown": "dashdot",
}

In [None]:
# Integer index for each image label.
# NOTE(review): the second key is "written", whereas `colors` and `linestyles`
# use "blank" -- confirm which label set is intended.
indices = {
    "notblank": 1,
    "written": 2,
    "unknown": 3,
}

In [None]:
# Plain (title, "@column") hover fields. Note that the following cell rebinds
# TOOLTIPS to an HTML template, so this list is only in effect if that cell
# is not run.
TOOLTIPS = [
    (title, "@" + column)
    for title, column in (
        ("image", "filename"),
        ("label", "label"),
        ("stampiness", "stampiness"),
        ("writiness", "writiness"),
        ("x", "x"),
        ("y", "y"),
    )
]

In [None]:
# HTML hover template: a 256px-wide <img> whose src is /files/<filename>,
# followed by the file name and the label with both scores. The @name
# placeholders refer to columns of the ColumnDataSource built by image_load.
# This rebinds TOOLTIPS, replacing the list of (title, field) pairs assigned
# in the previous cell.
TOOLTIPS = """
    <div>
    <img width="256" src="/files/@filename">
    <p>@filename</p>
    <p>@label - stampiness=@stampiness - writiness=@writiness</p>
    </div>
"""

In [None]:
@functools.cache
def image_load(filename, index=0):
    """Load one image and compute its "stampiness" and "writiness" scores.

    * stampiness: `image_average` of the top-right region (right quarter of
      the width, top third of the height) after `image_shrink` trimmed its
      quiet borders.
    * writiness: `image_average` of the region spanning 1/8..3/4 of the width
      and 1/4..7/8 of the height after `image_improve_dynamic_range`.

    The label is "blank" or "notblank" depending on which of the module-level
    lists `blanks` / `notblanks` contains `filename`.

    Returns a dict with keys `cds` (a ColumnDataSource holding a single point
    with x=`index`, y=writiness, plus filename/label/stampiness/writiness
    columns for the tooltips), `label`, `stampiness` and `writiness`.

    Raises ValueError if `filename` is in neither list. Errors raised while
    opening or processing the image propagate unchanged.

    Results are memoised by `functools.cache` on (filename, index), so repeated
    calls return the same dict and the same ColumnDataSource object.
    """
    # The context manager closes the underlying file as soon as the scores
    # have been computed; all pixel access therefore happens inside it.
    with PIL.Image.open(filename) as image:
        stamp_area = image_crop_ratio(image, 3/4, 0, 1, 1/3)
        shrunk_area = image_shrink(stamp_area)
        stampiness = image_average(shrunk_area)

        text_area = image_crop_ratio(image, 1/8, 1/4, 3/4, 7/8)
        dynamic = image_improve_dynamic_range(text_area)
        writiness = image_average(dynamic)

    # Only the writiness score is plotted. (The histogram and its 1000 bins
    # used to be computed here as well, but the result was discarded.)
    bins = [ writiness ]

    if filename in blanks:
        label = "blank"
    elif filename in notblanks:
        label = "notblank"
    else:
        raise ValueError(f"{filename} is neither blank or notblank")
    cds = ColumnDataSource(dict(
        # A single value is placed at `index`; a series would use 0..n-1.
        x = list(range(len(bins))) if len(bins)>1 else [index],
        y = bins,
        filename = len(bins) * [filename],
        label = len(bins) * [label],
        stampiness = len(bins) * [stampiness],
        writiness = len(bins) * [writiness],
    ))
    return dict(cds=cds, label=label, stampiness=stampiness, writiness=writiness)

In [None]:
# Classify up to `sample_size` blank and non-blank images with two fixed
# thresholds, plot every image (x = stampiness, y = writiness) and count the
# right/wrong guesses per label in `stats`.
errors = []      # (filename, exception) for every image that failed to load
sample_size = 1000
stats = {}       # (label, guess_was_right) -> number of images
f = figure(title="Der Schriebdetektor 4000", tooltips=TOOLTIPS)
f.add_tools(TapTool(callback=OpenURL(url="/files/@filename")))
filenames = blanks[:sample_size] + notblanks[:sample_size]
# enumerate() has no length, so tqdm needs `total` to show real progress.
for index, filename in tqdm(enumerate(filenames), total=len(filenames)):
    try:
        data = image_load(filename, index)
    except Exception as e:
        # Best effort: remember the failure and carry on with the next image.
        errors.append((filename, e))
        continue
    label = data["label"]
    if label == "front":
        continue
    if len(data["cds"].data["x"]) == 1:
        # Low stampiness or low writiness both count as "not blank".
        if data["stampiness"] < 0.75 or data["writiness"] < 0.98:
            guess = "notblank"
        else:
            guess = "blank"
        color = "green" if guess==label else "red"
        method = f.star if label=="notblank" else f.circle
        kwargs = dict(size=12)
        stats[label, label==guess] = stats.get((label, label==guess), 0) + 1
    else:
        # `color` used to be left unset here (NameError, or a stale value from
        # the previous image); use the per-label colour instead.
        color = colors[label]
        method = f.line
        kwargs = dict()
    method(x="stampiness", y="y", source=data["cds"], color=color, **kwargs)
show(f)
stats

In [None]:
# Same experiment as the cell above (the two cells were byte-identical):
# classify up to `sample_size` blank and non-blank images with two fixed
# thresholds, plot every image (x = stampiness, y = writiness) and count the
# right/wrong guesses per label in `stats`.
errors = []      # (filename, exception) for every image that failed to load
sample_size = 1000
stats = {}       # (label, guess_was_right) -> number of images
f = figure(title="Der Schriebdetektor 4000", tooltips=TOOLTIPS)
f.add_tools(TapTool(callback=OpenURL(url="/files/@filename")))
filenames = blanks[:sample_size] + notblanks[:sample_size]
# enumerate() has no length, so tqdm needs `total` to show real progress.
for index, filename in tqdm(enumerate(filenames), total=len(filenames)):
    try:
        data = image_load(filename, index)
    except Exception as e:
        # Best effort: remember the failure and carry on with the next image.
        errors.append((filename, e))
        continue
    label = data["label"]
    if label == "front":
        continue
    if len(data["cds"].data["x"]) == 1:
        # Low stampiness or low writiness both count as "not blank".
        if data["stampiness"] < 0.75 or data["writiness"] < 0.98:
            guess = "notblank"
        else:
            guess = "blank"
        color = "green" if guess==label else "red"
        method = f.star if label=="notblank" else f.circle
        kwargs = dict(size=12)
        stats[label, label==guess] = stats.get((label, label==guess), 0) + 1
    else:
        # `color` used to be left unset here (NameError, or a stale value from
        # the previous image); use the per-label colour instead.
        color = colors[label]
        method = f.line
        kwargs = dict()
    method(x="stampiness", y="y", source=data["cds"], color=color, **kwargs)
show(f)
stats

In [None]:
# Plot every image in IMAGES, coloured by its label: a single-point source is
# drawn as a circle, a longer series as a line.
# NOTE(review): IMAGES is not defined in the cells visible here -- confirm
# where it comes from.
f = figure(title="Der Stampfendetektor 3000", tooltips=TOOLTIPS)
for filename in IMAGES:
    data = image_load(filename)
    label = data["label"]
    if label != "front":
        color = colors[label]
        is_single_point = len(data["cds"].data["x"]) == 1
        method = f.circle if is_single_point else f.line
        kwargs = {"size": 16} if is_single_point else {}
        method(x="x", y="y", source=data["cds"], color=color, **kwargs)
show(f)

In [None]:
# Scratch cell: apply the same contrast stretch as image_improve_dynamic_range
# to a single image, writing the result back into that image, and display it.
# NOTE(review): IMAGES is not defined in the cells visible here -- confirm
# where it comes from.
i = PIL.Image.open(IMAGES[41]).convert("L")
# Sorted ascending: index len//5 and index len//1000 are used as the white
# and black points respectively.
pixels = sorted(list(i.getdata()))
white_threshold = pixels[len(pixels)//5]
black_threshold = pixels[len(pixels)//1000]
# Linear map sending the black point to 0 and the white point to 256.
scale = 256/(white_threshold-black_threshold)
offset = -scale*black_threshold
# Rewrites the image from its own pixel data, rescaled by scale/offset.
i.putdata(i.getdata(), scale, offset)
i               
# Earlier experiments on the stamp region, left disabled:
#stamp = image_crop_ratio(i, 3/4, 0, 1, 1/3)
#h = image_histogram_normalized(stamp)
#image_shrink(stamp)
#plt.plot(h)
#i

In [None]:
import json, math

# Model output: a list of dicts, one per image. The code below reads, for
# each image, its "label", one score per label name, and "file_name".
with open("_brownie/inference.json") as f:
    data = json.load(f)

# A score above THRESHOLD*MAXVAL counts as a confident prediction.
THRESHOLD = 0.8
# Sorted so that the x-axis order and the palette assignment are identical on
# every kernel restart (iteration order of a set of strings is not).
LABELS = sorted(set(img["label"] for img in data))
# Largest score over all images and all labels. (This used to look only at
# whichever label happened to come first in the unordered set.)
MAXVAL = max(img[label] for img in data for label in LABELS)

for i, img in enumerate(data):
    img["i"] = i
    # specific code comes here
    if img["blank"] > THRESHOLD*MAXVAL:
        img["guess"] = "right" if img["label"] == "blank" else "wrong"
    elif img["with_address"] > THRESHOLD*MAXVAL:
        img["guess"] = "right" if img["label"] == "with_address" else "wrong"
    else:
        img["guess"] = "unsure"
    #
    img["max"] = max(img[label] for label in LABELS)
    # Alternative radial layout, currently disabled:
    #img["x"] = sum([img[label] * math.sin(2*i*math.pi/len(LABELS)) for (i, label) in enumerate(LABELS) ])
    #img["y"] = sum([img[label] * math.cos(2*i*math.pi/len(LABELS)) for (i, label) in enumerate(LABELS) ])

# Materialised as a list: a bare zip() could only be iterated once.
PALETTE = list(zip(LABELS, ("red", "green", "blue")))

# Hover template. The caption uses @file_name, the same column as the <img>
# source and the tap-to-open URL (it previously referred to @filename).
TOOLTIPS = """
<div>
<img width="256" src="/files/@file_name">
<p>@file_name</p>
<p>@label ({})</p>
</div>
""".format(", ".join([f"{label}=@{label}/{MAXVAL}" for label in LABELS]))

def make_cds(list_of_dicts):
    """Build a ColumnDataSource with one column per key of the first dict.

    Each column lists that key's value for every dict, in order, so every
    dict must contain all keys of the first one. An empty input yields an
    empty ColumnDataSource without any columns.
    """
    if not list_of_dicts:
        return ColumnDataSource()
    return ColumnDataSource({
        key: [ row[key] for row in list_of_dicts ]
        for key in list_of_dicts[0]
    })

f = figure(
    title="Model Performance (model: BROWNIE)",
    tooltips=TOOLTIPS,
    sizing_mode="stretch_width",
    x_range=list(LABELS),
)
f.add_tools(TapTool(callback=OpenURL(url="/files/@file_name")))
for guess, color, func, size in (
    ("right", "green", f.star, 6),
    ("wrong", "red", f.circle, 10),
    ("unsure", "black", f.square, 4),
    ):
    matching = [ img for img in data if img["guess"]==guess ]
    if not matching:
        # Nothing to draw, and the empty source built for it would lack the
        # "max" and "label" columns the glyph refers to.
        continue
    func(
        source=make_cds(matching),
        color=color, size=size,
        y="max", x=jitter("label", width=0.6, range=f.x_range),
        legend_label=guess,
    )
f.legend.location="left"
show(f)
