## Calculate distribution from a list of images

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from IPython.display import display, HTML

# import matplotlib and numpy
import matplotlib.pyplot as plt 
import matplotlib.image as mpimage
import matplotlib.patches as patches

from matplotlib import cm
import colorcet as cc
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np 
import pandas as pd
import cv2

# import operating system and glob libraries
import os

import time
from datetime import datetime

import scipy
from scipy import ndimage

from pathlib import Path

from calcStatistics import *

cmap_grey = cc.cm.linear_grey_0_100_c0
noDisplaySamples = 5


In [3]:
class ImageReader :
    #channels the number of channels that must be returned from open
    def __init__(self, outChannels=3):
        self.outChannels = outChannels
    def open(self, fn):
        """Return an image with the number of outChannels"""
        raise NotImplementedError

class JPGImageReader(ImageReader) :
    #outChannels the number of channels that must be returned from open
    #ie if the image is rgb then it is converted to graysale
    def __init__(self, outChannels=3):
        super().__init__(outChannels)

    def open(self, fn):
        """ Opens an image using OpenCV given the file path.

        Arguments:
            fn: the file path of the image

        Returns:
            The image in RGB og Grayscale format as numpy array of floats 
            normalized to range between 0.0 - 1.0
        """
        flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
        if not os.path.exists(fn) and not str(fn).startswith("http"):
            raise OSError('No such file or directory: {}'.format(fn))
        elif os.path.isdir(fn) and not str(fn).startswith("http"):
            raise OSError('Is a directory: {}'.format(fn))
        else:
            #res = np.array(Image.open(fn), dtype=np.float32)/255
            #if len(res.shape)==2: res = np.repeat(res[...,None],3,2)
            #return res
            try:
                if str(fn).startswith("http"):
                    req = urllib.urlopen(str(fn))
                    image = np.asarray(bytearray(req.read()), dtype="uint8")
                    im = cv2.imdecode(image, flags).astype(np.float32)/255
                else:
                    im = cv2.imread(str(fn), flags).astype(np.float32)/255
                if im is None: raise OSError(f'File not recognized by opencv: {fn}')

                if self.outChannels==3:
                    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 
                elif len(im.shape)==3 and im.shape[2] == 3:
                    #Y = 0.299 R + 0.587 G + 0.114 B
                    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY )
                    im = np.expand_dims(im,2)
                    
                return im
            except Exception as e:
                raise OSError('Error handling image at: {}'.format(fn)) from e
                
class GrayImageReader(ImageReader):
    def __init__(self): super().__init__(outChannels=1)
        
    def open(self, fn):
        im = cv2.imread(str(fn),cv2.IMREAD_ANYDEPTH)
        #fastai expect 3 dimension as for jpgs
        return np.expand_dims(im,2)/65536.0                

In [4]:
def calcHistogram(reader, files, nbBins):
    startClock  = time.clock()
    bins        = np.linspace(0,1,nbBins+1)
    histograms  = []
    filenames   = []
    for i in range(len(files)):
        path        = Path(files[i])
        im          = reader.open(path)
        #hist,bins   = np.histogram(np.unique(im.flatten()), bins)
        hist,bins   = np.histogram(im.flatten(), bins)
                
        histograms.append(hist.astype(int))
        filenames.append(Path(path.stem).name)
                
        if i>0 and i-(int(i/50)*50) == 0 :
            progress      = (i+1)/len(files)
            percentage    = int( np.around(progress *100) )
            timePassed    = time.clock() - startClock
            remainingTime = timePassed *(len(files)-i) / i
            print( f"Processed: {percentage}% - number:{i} - sekunder: {int(timePassed)} s mangler: {int(remainingTime)}" )
    
    return bins, histograms, filenames


In [5]:
def showStatistics(hists):    
    mHist = hists.mean(axis=0).values
    bins  = np.asarray(list(map(float, hists.columns)))
    
    print(f"Characteristics of mean histograms")
    np.set_printoptions(formatter={'float_kind':lambda x: "%.3f" % x})
    print(f"Raw data: mean, std, mode, median, mode and median skewness : {np.round(mean_sd(bins, mHist),3)}")
    #print(f"Smoothed: mean, std, mode, median, mode and median skewness : {np.round(mean_sd(df.bins, df.hist_smoothed),3)}")
    np.set_printoptions()
    

In [7]:
def plotHistograms( df, ixCutStart, ixCutEnd ):
    #make som plotting
    fig = plt.figure(figsize = (12,5)) 
    ax1 = fig.add_subplot(111)
    ax1.set_title("histograms")
    bins = np.asarray(list(map(float, df.columns)))
    b    = bins[ixCutStart:-ixCutEnd]
    for i in range(df.shape[0]):
        h = df.iloc[i,ixCutStart:-ixCutEnd]
        h = np.asarray(h)
        ax1.plot( b, h )

In [9]:
def mainProcess(path, files, reader, ixCutStart, ixCutEnd):
    startClock  = time.clock()
    print("Number of images to be processed: ", len(files))
    #plt.imshow(np.squeeze(reader.open(files[0])),cmap=cmap_grey);

    bins, histograms, filenames  = calcHistogram(reader, files, 255)
    middelBins = 0.5*np.asarray( bins[0:-1]+bins[1:] )
    hists = pd.DataFrame(index=filenames, data = np.asarray(histograms), columns=np.round(middelBins,6) ) 
    hists.to_csv(path / "histograms.csv" )
    print("\nHistograms")
    display(hists.sample(noDisplaySamples))
    
    
    half_bin = 0.5*(bins[1]-bins[0])
    middelBins = bins[ixCutStart:-ixCutEnd-1]+half_bin    

    stats = calcStatistics(middelBins, hists, ixCutStart, ixCutEnd)
    stats.insert(1,"hdogvscat","")
    stats.hdogvscat = np.asarray([f[0:3] for f in filenames])
    
    stats.to_csv(path / "histogram_statistics.csv", index=False )
    print("\nHistogram statistics")
    display(stats.sample(noDisplaySamples))
    
    showStatistics (hists)
#    plotHistograms(hists, ixCutStart, ixCutEnd)
#    plt.show()
    print(f"The images were processes in {int(time.clock() - startClock)} seconds" )   


In [10]:
print("Data:For imagenet - dog&cats")
path  = Path("../../../fastai-data/data/dogscats/")
files = list( path.glob("**/*.jpg") )
print(f"Imagepath: {path}")

imagenet_stats_mean = np.asarray([0.485, 0.456, 0.406])
imagenet_stats_std  = np.asarray([0.229, 0.224, 0.225])
cv2_rgb2gray        = np.asarray([0.299, 0.587, 0.114])
inet_mean           = round(sum(cv2_rgb2gray*imagenet_stats_mean),3)
inet_std            = round(np.sqrt(sum(cv2_rgb2gray*imagenet_stats_std*imagenet_stats_std)),3) 
print(f"\nImagenet grayscale has mean og std = ({inet_mean}, {inet_std})")


ixCutStart = 5
ixCutEnd   = 5
mainProcess(path, files[:1000], JPGImageReader( outChannels=1 ), ixCutStart, ixCutEnd)


Data:For imagenet - dog&cats
Imagepath: ../../../fastai-data/data/dogscats

Imagenet grayscale has mean og std = (0.459, 0.226)
Number of images to be processed:  1000
Processed: 5% - number:50 - sekunder: 0 s mangler: 11
Processed: 10% - number:100 - sekunder: 1 s mangler: 10
Processed: 15% - number:150 - sekunder: 1 s mangler: 9
Processed: 20% - number:200 - sekunder: 2 s mangler: 8
Processed: 25% - number:250 - sekunder: 2 s mangler: 8
Processed: 30% - number:300 - sekunder: 3 s mangler: 7
Processed: 35% - number:350 - sekunder: 3 s mangler: 7
Processed: 40% - number:400 - sekunder: 4 s mangler: 6
Processed: 45% - number:450 - sekunder: 5 s mangler: 6
Processed: 50% - number:500 - sekunder: 5 s mangler: 5
Processed: 55% - number:550 - sekunder: 6 s mangler: 5
Processed: 60% - number:600 - sekunder: 6 s mangler: 4
Processed: 65% - number:650 - sekunder: 7 s mangler: 3
Processed: 70% - number:700 - sekunder: 7 s mangler: 3
Processed: 75% - number:750 - sekunder: 8 s mangler: 2
Process

Unnamed: 0,0.001961,0.005882,0.009804,0.013725,0.017647,0.021569,0.02549,0.029412,0.033333,0.037255,...,0.962745,0.966667,0.970588,0.97451,0.978431,0.982353,0.986275,0.990196,0.994118,0.998039
dog.5144,0,8,11,10,11,11,20,19,31,23,...,0,0,0,0,0,0,0,0,0,0
dog.996,16,11,16,18,33,43,31,81,109,134,...,10,10,8,10,10,12,10,5,10,2
dog.1247,0,0,1,1,5,3,8,5,7,7,...,78,64,89,92,113,129,213,259,517,1351
dog.873,36,21,71,303,758,3671,4859,4836,3189,2922,...,11,8,20,15,20,14,16,10,8,11
dog.8186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



Histogram statistics


Unnamed: 0,hfilenames,hdogvscat,hMean,hMedian,hMode,hModeW,hSD,hModeSkewness,hMedianSkewness
644,dog.8096,dog,0.3893,0.38235,0.24902,0.24104,0.19154,0.73235,0.03625
608,dog.9213,dog,0.39891,0.36275,0.3549,0.35,0.21353,0.20611,0.16938
383,dog.2834,dog,0.23583,0.22157,0.22157,0.22095,0.10742,0.13273,0.13273
77,dog.4238,dog,0.48725,0.47647,0.46863,0.46852,0.25005,0.07447,0.0431
660,dog.686,dog,0.50392,0.48824,0.50784,0.50712,0.1619,-0.02426,0.09686


Characteristics of mean histograms
Raw data: mean, std, mode, median, mode and median skewness : [0.464 0.453 0.998 0.997 0.246 -2.167 0.047]
The images were processes in 12 seconds
