In [None]:
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import tifffile
import sys, time, os, csv
import cProfile, pstats
from jutils import tqdm_alias as tqdm

import psfmatrix, lfimage
import projector, lfdeconv
import special_fftconvolve as special
import jutils as util
import py_light_field as plf

In [None]:
# Benchmark settings. tileFactor is the number of copies of the input image
# stacked together for the timing run; projectorClass is instantiated and
# handed to the back-projection as its projector.
# (maxiter is not referenced by any of the cells visible in this notebook.)
maxiter = 8
tileFactor = 2
projectorClass = projector.Projector_allC

# Input light-field image and PSF matrix used by the timing run.
inputImagePath = '/Users/jonny/Movies/Nils files/Rectified/Left/Cam_Left_40_X1_N19.tif'
hMatrixPath = 'PSFmatrix/reducedPSFMatrix_M22.2NA0.5MLPitch125fml3125from-156to156zspacing4Nnum19lambda520n1.33.mat'
inputImage = lfimage.LoadLightFieldTiff(inputImagePath)
hMatrix = psfmatrix.LoadMatrix(hMatrixPath)

### Performance issues to consider
- ConvolvePart4 is only threaded over the images, i.e. it is single-threaded if we only have one image! That is definitely not optimal, and I need to work out what to do about it. Is there any reason I cannot thread over aa,bb? The advantage would be that the threading would be at a higher level. The disadvantage would be that I would only have a limited number of units to thread over (even if there were a large number of images). I suppose I could do both(!). Perhaps the complication is just that I would be threading at quite a high level, with a lot going on below it... One way or another, I do need to come up with a solution. It's possible that I could thread calculateRow, but that would need care because the work elements would be so small.
- Note that the GPU version should not suffer from this issue (because it's threaded at a much lower level anyway).
- There is a general trend where work elements that take longer have a lower thread efficiency. It really does seem like what is happening is just natural variation in run time. It is causing a problem because I have hard sync points at which I wait before allowing anything else to continue at all. I may just have to accept that, unless I really want to over-engineer things and restructure my C code.
- (I have tried making memory-aligned Python arrays, but that does not seem to have sped anything up)

In [None]:
if True:
    # Timing measurements: back-project a stack of tileFactor identical copies
    # of the input image, with statistics directed to 'timestats.txt' for the
    # measured run only.
    inputImageXN = np.tile(inputImage[np.newaxis,:,:],(tileFactor,1,1))
    # We will only process one plane, in the interests of speed,
    # although we must remember that that means the analysis may not
    # be representative of a full deconvolution pipeline
    planesToRun = range(0,1)
    # Prime the cache first (this run happens before the stats file is set,
    # so it is not part of the measurement)
    Htf = lfdeconv.BackwardProjectACC(hMatrix, inputImageXN, planes=planesToRun, progress=tqdm, logPrint=False, projector=projectorClass())
    # Now actually measure
    plf.SetStatsFile('timestats.txt', False)
    try:
        Htf = lfdeconv.BackwardProjectACC(hMatrix, inputImageXN, planes=planesToRun, progress=tqdm, logPrint=False, projector=projectorClass())
    finally:
        # Always undo the stats-file setting, even if the measured run raises;
        # otherwise later work in this kernel would keep being logged.
        # NOTE(review): the meaning of the second argument is not visible
        # here - the original values are passed through unchanged.
        plf.SetStatsFile('', True)

In [None]:
if True:
    # Read the tab-separated statistics log written during the timing run.
    # Each row is a label followed by numeric columns; differences between
    # consecutive rows give the per-interval statistics, attributed to the
    # label of the earlier row.
    # NOTE(review): column 0 is presumably elapsed time and column 1 CPU time
    # (their ratio is reported as "efficiency") - confirm against the writer.
    with open('timestats.txt') as f:
        rows = list(csv.reader(f, delimiter='\t'))
    vals = []
    delta = np.array(rows[-1][1:]).astype(np.double) - np.array(rows[0][1:]).astype(np.double)
    print('Overall efficiency', delta[1]/delta[0], delta[0], delta[1])
    # Intervals whose column-0 delta exceeds thresh are always printed, and
    # are clamped to thresh on the x axis of the scatter plot.
    thresh = 0.04
    plt.figure()
    plt.title('Parallelism with tile factor {0}'.format(tileFactor))
    deltas = []
    categories = dict()
    # Plot colour for each known label; labels ending in 'Plan' are brown and
    # anything else is black.
    colours = {
        'FHInit': 'yellow',
        'FHRun': 'red',
        'ConvolvePart4Run': 'green',
        'MirrorY': 'blue',
        'IRFFT': 'orange',
    }
    print('Some performance sampling points:')
    for i in range(0, len(rows)-1):
        row = rows[i]
        delta = np.array(rows[i+1][1:]).astype(np.double) - np.array(row[1:]).astype(np.double)
        deltas.append(delta)
        if (i < 20) or (delta[0] > thresh):
            print(' ', i, row[0], delta[0], delta[1], delta[2], delta[1]/delta[0])
        vals.append(delta[1]/delta[0])
        marker='.'
        c = colours.get(row[0], 'brown' if row[0].endswith('Plan') else 'black')
        if row[0] in categories:
            categories[row[0]] += delta
        else:
            # Store a copy: the in-place += above would otherwise modify the
            # very array that was just appended to deltas, replacing that
            # interval's values with the category running total.
            categories[row[0]] = delta.copy()

        # Only the first 10000 intervals are plotted
        if (i < 10000):
            x = delta[0]
            if (x > thresh):
                x = thresh
            plt.plot(x, delta[1]/delta[0], marker, color=c)
    plt.show()

# Transpose so that deltas[j] holds column j across all intervals
deltas = np.array(deltas).T

In [None]:
# Print the accumulated column-0 and column-1 totals for each category along
# with their ratio, then report the overall ratio weighted by column 0.
print('Performance broken down by category')
av = 0
weights = 0
for k, totals in categories.items():
    col0, col1 = totals[0], totals[1]
    print(' ', k, col0, col1, col1/col0)
    weights += col0
    # Per-category ratio (col1/col0) weighted by col0
    av += col0 * col1/col0
av /= weights
print('Average CPU load: %.2fx' % av)

In [None]:
# Sanity check on page faults: scatter the 'minflt' column (index 5) against
# the column-0 delta (clamped at thresh) to confirm that faults are not a
# major issue and do not correlate with run times.
fig, ax = plt.subplots()
clampedRunTimes = np.minimum(deltas[0], thresh)
ax.plot(clampedRunTimes, deltas[5], '.', label='minflt')
ax.legend()
plt.show()