Here, we compare the results of the fit quality and photometric accuracy metrics of the pad/no_pad PyTorchDIA tests.

In [1]:
# imports
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import norm
from astropy.stats import mad_std
import seaborn as sns
import pandas as pd
%matplotlib inline

In [2]:
# Load the results tables written by the padded and unpadded runs.
# np.genfromtxt with default arguments parses whitespace-delimited numeric text
# into a 2D float array; the shapes are printed as a quick sanity check.
data_pad = np.genfromtxt('PyTorch_Pad.txt')
data_nopad = np.genfromtxt('PyTorch_NoPad.txt')
print(data_pad.shape, data_nopad.shape)

# Column order of both data files, as given by the call that assembled them:
# data_file = np.vstack((np.sum(kernel), B0, MSE, MFB, MFV, F_measured, var_min,
#                            star_density, phi_r, sky, phi_k,
#                            SNR_ref, SNR_imag, F_frac, shift_x, shift_y, ll))
# i.e. by column index:
#    0: np.sum(kernel)   1: B0        2: MSE       3: MFB        4: MFV
#    5: F_measured       6: var_min   7: star_density            8: phi_r
#    9: sky             10: phi_k    11: SNR_ref  12: SNR_imag  13: F_frac
#   14: shift_x         15: shift_y  16: ll

(913, 17) (913, 17)


In [3]:
# clean data - make sure we're comparing identical sets of simulations
# the pad results are always computed first, so this is simply a case
# of removing some rows from the pad results file
def clean_data(pad, nopad):
    """Restrict the padded results to the simulations present in the unpadded results.

    A simulation is identified by its 'image parameter' values (columns 7-15
    of the results arrays), which are identical in both results files.

    Parameters
    ----------
    pad, nopad : np.ndarray
        2D results arrays, one row per simulation.

    Returns
    -------
    pad, nopad : np.ndarray
        ``pad`` with every row whose image parameters do not occur in
        ``nopad`` removed, and ``nopad`` unchanged.

    Notes
    -----
    Prints whether the two cleaned arrays agree row-for-row on the image
    parameters, followed by the final array shapes.
    """
    # 'image parameter' columns that will be identical to both data results files
    columns = [7, 8, 9, 10, 11, 12, 13, 14, 15]
    pad_image_params = pad[:, columns]
    nopad_image_params = nopad[:, columns]

    # Test membership of *whole rows*. `row in array_2d` evaluates
    # `(array_2d == row).any()`, which is True as soon as one single element
    # matches, so it cannot be used to decide whether a row appears in nopad.
    nopad_rows = set(map(tuple, nopad_image_params.tolist()))
    keep = np.array([tuple(row) in nopad_rows for row in pad_image_params.tolist()],
                    dtype=bool)

    # remove rows unique to pad file
    pad = pad[keep]

    # check we're truly consistent (np.array_equal is False on a shape
    # mismatch instead of broadcasting or raising)
    pad_image_params = pad[:, columns]
    if np.array_equal(pad_image_params, nopad_image_params):
        print("Saul good...")
    else:
        print("Something's gone wrong...")

    print('Final file shapes:')
    print(pad.shape, nopad.shape)

    return pad, nopad

data_pad, data_nopad = clean_data(data_pad, data_nopad)

Saul good...
Final file shapes:
(913, 17) (913, 17)


In [4]:
# any NaNs to deal with?
print(np.isnan(data_pad).any(), np.isnan(data_nopad).any())

False False


OK, as a separate exercise, it would be of interest to compute the likelihood ratios for the two approaches.
The log-likelihood of the data under the MLE model for each approach is available in the last column of the data results files.

In [5]:
pad_ll, nopad_ll = data_pad[:,-1], data_nopad[:,-1]

In [6]:
# Log-likelihood ratio: for log-probabilities the ratio is the difference,
# so lr > 0 wherever pad_ll > nopad_ll.
lr = pad_ll - nopad_ll
lr_median, lr_min, lr_max = np.median(lr), lr.min(), lr.max()
print('Median, minimum and maximum:', lr_median, lr_min, lr_max)
n_pad_better = np.count_nonzero(lr > 0)
print('Fraction where Pad peforms better:', n_pad_better / len(lr))

Median, minimum and maximum: 90.57977947662584 -468.82570407842286 15039.18148404616
Fraction where Pad peforms better: 0.9923329682365827


In [7]:
# create pandas dataframe - we'll need this to plot w. Seaborn later on

def _tag_regime(df, column, labelled_masks):
    """Return a copy of `df` restricted to rows matched by a mask, with a categorical `column` of labels."""
    labels = pd.Series(None, index=df.index, dtype=object)
    for mask, label in labelled_masks:
        labels.loc[mask] = label
    matched = labels.notna()
    # .copy() gives an independent frame, so adding the column is a plain
    # assignment rather than a write to a slice of another DataFrame.
    tagged = df.loc[matched].copy()
    tagged[column] = labels.loc[matched].astype('category')
    return tagged


def numpy_to_DataFrame(array):
    """Convert a results array into the DataFrame used for the Seaborn plots and metrics.

    Parameters
    ----------
    array : np.ndarray
        2D results array with 17 columns in the order of the column names
        assigned below.

    Returns
    -------
    pd.DataFrame
        Columns '$P$', 'MSE', 'MFB', 'MFV', the normalised flux residual
        (F_meas / sqrt(var_min)), and the categorical columns 'SNR regime' and
        'Sampling regime'. Rows that fall in none of the SNR regimes or none
        of the sampling regimes (all bounds are strict inequalities) are
        dropped; the original row index is preserved.
    """
    # Raw strings: the LaTeX backslashes are literal characters, not escapes.
    snr_col = r'$\mathrm{SNR}_{I}$'
    flux_frac_col = r'$\mathcal{F}_{\mathrm{Max}}/\mathcal{F}_{\mathrm{Total}}$'
    dx_col, dy_col = r'$\delta x$', r'$\delta y$'
    nr_col = r'$\mathcal{F}_{\mathrm{measured}}/\sigma_{\mathrm{min}}$'

    df = pd.DataFrame(array, columns=['$P$', 'B0', 'MSE', 'MFB', 'MFV',
                                      'F_meas', 'var_min', 'Stellar Density', 'phi_r',
                                      'sky', 'phi_k', 'SNR_ref', snr_col,
                                      flux_frac_col, dx_col, dy_col, 'll'])

    df[nr_col] = df['F_meas'] / np.sqrt(df['var_min'])

    # drop unnecessary columns
    df = df.drop(columns=['B0', 'F_meas', 'var_min', 'sky'])

    ## Define the SNR and sampling regimes ##
    # SNR: 3 categories
    snr = df[snr_col]
    df = _tag_regime(df, 'SNR regime', [
        ((8. < snr) & (snr < 40.), r'8 < $\mathrm{SNR}_{I}$ < 40'),
        ((40. < snr) & (snr < 200.), r'40 < $\mathrm{SNR}_{I}$ < 200'),
        ((200. < snr) & (snr < 1000.), r'200 < $\mathrm{SNR}_{I}$ < 1000'),
    ])

    # sampling: 4 categories
    # under-/over-sampled reference or kernel
    # a critically sampled gaussian PSF has sigma = 1 pixel (or equivalently, FWHM = 2.35 pixels)
    phi_r, phi_k = df['phi_r'], df['phi_k']
    df = _tag_regime(df, 'Sampling regime', [
        ((phi_r > 1) & (phi_k > 1), r'$\phi_R > 1, \phi_K > 1$'),
        ((phi_r > 1) & (phi_k < 1), r'$\phi_R > 1, \phi_K < 1$'),
        ((phi_r < 1) & (phi_k > 1), r'$\phi_R < 1, \phi_K > 1$'),
        ((phi_r < 1) & (phi_k < 1), r'$\phi_R < 1, \phi_K < 1$'),
    ])

    # drop more unnecessary columns
    df = df.drop(columns=['SNR_ref', snr_col, 'phi_k', 'phi_r',
                          flux_frac_col, 'Stellar Density', dx_col, dy_col, 'll'])

    return df

df_pad = numpy_to_DataFrame(data_pad)
df_nopad = numpy_to_DataFrame(data_nopad)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
df_pad

Unnamed: 0,$P$,MSE,MFB,MFV,$\mathcal{F}_{\mathrm{measured}}/\sigma_{\mathrm{min}}$,SNR regime,Sampling regime
3,0.977430,841.488958,-0.000387,0.997873,0.753407,40 < $\mathrm{SNR}_{I}$ < 200,"$\phi_R < 1, \phi_K < 1$"
4,0.968664,1083.681635,-0.001116,1.026604,-2.140991,200 < $\mathrm{SNR}_{I}$ < 1000,"$\phi_R < 1, \phi_K > 1$"
5,1.072733,538.146119,-0.006863,1.123900,-12.438067,40 < $\mathrm{SNR}_{I}$ < 200,"$\phi_R < 1, \phi_K > 1$"
6,1.003307,639.155363,-0.005010,1.144066,-7.470446,200 < $\mathrm{SNR}_{I}$ < 1000,"$\phi_R < 1, \phi_K > 1$"
7,0.970104,79.401543,-0.000934,0.997328,-0.243574,8 < $\mathrm{SNR}_{I}$ < 40,"$\phi_R > 1, \phi_K > 1$"
...,...,...,...,...,...,...,...
907,0.988742,658.498546,0.000396,0.973296,-0.159092,8 < $\mathrm{SNR}_{I}$ < 40,"$\phi_R > 1, \phi_K > 1$"
908,1.012317,847.364930,-0.001704,0.957530,-1.052855,40 < $\mathrm{SNR}_{I}$ < 200,"$\phi_R > 1, \phi_K > 1$"
910,0.942318,540.521945,-0.000660,0.985257,-1.000044,8 < $\mathrm{SNR}_{I}$ < 40,"$\phi_R < 1, \phi_K > 1$"
911,1.016841,941.353094,0.000229,0.980971,0.441940,40 < $\mathrm{SNR}_{I}$ < 200,"$\phi_R < 1, \phi_K > 1$"


In [9]:
# model fit quality and photometric accuracy metrics for the 3 SNR regimes
# each of which is subdivided into 1 of 4 sampling regimes
# Regime labels; these must match the category strings stored in the
# 'SNR regime' and 'Sampling regime' columns of the DataFrames.
# Raw strings keep the LaTeX backslashes literal.
SNR_cats = [r'8 < $\mathrm{SNR}_{I}$ < 40', r'40 < $\mathrm{SNR}_{I}$ < 200', r'200 < $\mathrm{SNR}_{I}$ < 1000']
sampling_cats = [r'$\phi_R > 1, \phi_K > 1$', r'$\phi_R > 1, \phi_K < 1$',
                 r'$\phi_R < 1, \phi_K > 1$', r'$\phi_R < 1, \phi_K < 1$']
def metrics(df):
    """Print and return summary metrics for every (SNR regime, sampling regime) pair.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns '$P$', 'MSE', 'MFB', 'MFV', the normalised
        flux residual column, 'SNR regime' and 'Sampling regime'.

    Returns
    -------
    np.ndarray
        Array of shape (len(SNR_cats) * len(sampling_cats), 6), one row per
        regime pair (SNR regime varying slowest). Columns: the medians of
        P, MSE, MFB and MFV, followed by MPB (the mean of the outlier-clipped
        normalised residuals) and MPV (their sample variance, ddof = 1).
        MPB is NaN for a group left with no rows after clipping and MPV is
        NaN for a group left with fewer than two.

    Notes
    -----
    Calls np.set_printoptions(precision=4, suppress=True); this is a global
    NumPy setting, so it also affects how arrays are displayed afterwards.
    """
    colname = r'$\mathcal{F}_{\mathrm{measured}}/\sigma_{\mathrm{min}}$'

    output_metrics = np.zeros((len(SNR_cats) * len(sampling_cats), 6))

    np.set_printoptions(precision=4, suppress=True)

    i = 0

    for snr in SNR_cats:

        for sampling in sampling_cats:

            # SNR regime
            print('\nSNR regime:', snr)
            print('Sampling regime:', sampling)

            # rows belonging to this regime pair (selected once, reused below)
            subset = df[(df['SNR regime'] == snr) & (df['Sampling regime'] == sampling)]

            # fit quality metrics
            Ps = subset['$P$']
            MSEs = subset['MSE']
            MFBs = subset['MFB']
            MFVs = subset['MFV']

            print('Number of entries:', len(Ps))

            # photometric accuracy metrics
            nr = subset[colname]

            # remove bad outliers: keep values strictly within 5 robust sigma
            # of the median. Both bounds use the same centre and width,
            # computed from the unclipped sample, so the window is symmetric.
            centre = np.nanmedian(nr)
            sigma = mad_std(nr)
            nr = nr[(nr > centre - 5*sigma) & (nr < centre + 5*sigma)]

            # guard the divisions so sparse regimes give NaN rather than
            # raising ZeroDivisionError
            n_kept = len(nr)
            MPB = 1./n_kept * np.nansum(nr) if n_kept > 0 else np.nan
            MPV = 1./(n_kept - 1) * np.nansum((nr - MPB)**2) if n_kept > 1 else np.nan

            print('|   P    |   MSE   |   MFB   |   MFV |   MPB |  MPV  |')
            results = np.array([np.median(Ps), np.median(MSEs), np.median(MFBs), np.median(MFVs), MPB, MPV])
            print(results)

            output_metrics[i] = results

            i += 1


    return output_metrics

print('Padded')        
pad_metrics = metrics(df_pad)
print('\nNot Padded')
nopad_metrics = metrics(df_nopad)

Padded

SNR regime: 8 < $\mathrm{SNR}_{I}$ < 40
Sampling regime: $\phi_R > 1, \phi_K > 1$
Number of entries: 113
|   P    |   MSE   |   MFB   |   MFV |   MPB |  MPV  |
[  1.0001 506.9571   0.0002   0.9863  -0.1872   1.3508]

SNR regime: 8 < $\mathrm{SNR}_{I}$ < 40
Sampling regime: $\phi_R > 1, \phi_K < 1$
Number of entries: 33
|   P    |   MSE   |   MFB   |   MFV |   MPB |  MPV  |
[  0.9985 537.5381  -0.0002   0.9907   0.3101   0.959 ]

SNR regime: 8 < $\mathrm{SNR}_{I}$ < 40
Sampling regime: $\phi_R < 1, \phi_K > 1$
Number of entries: 43
|   P    |   MSE   |   MFB   |   MFV |   MPB |  MPV  |
[  1.0008 567.5243   0.0001   0.9863  -0.3937   1.637 ]

SNR regime: 8 < $\mathrm{SNR}_{I}$ < 40
Sampling regime: $\phi_R < 1, \phi_K < 1$
Number of entries: 9
|   P    |   MSE   |   MFB   |   MFV |   MPB |  MPV  |
[  0.9936 441.3856  -0.0006   0.9992   1.0025   0.664 ]

SNR regime: 40 < $\mathrm{SNR}_{I}$ < 200
Sampling regime: $\phi_R > 1, \phi_K > 1$
Number of entries: 129
|   P    |   MSE   | 

In [10]:
pad_metrics.shape, nopad_metrics.shape

((12, 6), (12, 6))

In [11]:
pad_metrics, nopad_metrics

(array([[   1.0001,  506.9571,    0.0002,    0.9863,   -0.1872,    1.3508],
        [   0.9985,  537.5381,   -0.0002,    0.9907,    0.3101,    0.959 ],
        [   1.0008,  567.5243,    0.0001,    0.9863,   -0.3937,    1.637 ],
        [   0.9936,  441.3856,   -0.0006,    0.9992,    1.0025,    0.664 ],
        [   0.9997,  513.1494,   -0.0009,    0.9916,   -1.1774,    2.0942],
        [   0.9997,  516.866 ,   -0.0007,    0.9935,   -0.0629,    1.4486],
        [   1.0043,  627.3164,   -0.0004,    0.9943,   -1.4648,    5.9417],
        [   0.9984,  700.8094,   -0.0007,    1.0078,   -1.1669,    1.6085],
        [   0.9956,  706.2045,   -0.0028,    1.0286,   -3.332 ,    6.7095],
        [   0.9971,  744.525 ,   -0.0019,    1.0132,   -0.9799,    1.9197],
        [   0.9995,  841.7449,   -0.003 ,    1.0752,   -4.2263,   14.2864],
        [   1.0015, 1077.1404,   -0.0021,    1.0952,   -2.6422,    2.8218]]),
 array([[   0.9935,  509.2872,    0.0016,    0.9904,    0.3841,    0.6666],
        [ 

In [12]:
np.median(pad_metrics, axis=0), np.median(nopad_metrics, axis=0), 

(array([  0.9996, 597.4203,  -0.0007,   0.9967,  -1.0734,   1.7784]),
 array([  0.9978, 608.0006,   0.0057,   1.0004,   0.1793,   0.7648]))

Focussing only on the fit quality metrics (we'll get to photometric accuracy in a bit), it appears that padding is the superior approach for these image pairs. In general, the photometric scale factor is closer to 1, the MSE is lower, the MFB is closer to 0 and the MFV is closer to 1.

However, the MPB and MPV are clearly worse when padding. This, however, is not surprising. The MPB and MPV apply only to the central, brightest star. We tend to overfit this star anyway, so when we reject even more border pixels, the overfitting becomes even worse, and so the MPV is almost always < 1 for the no padding approach. Indeed, for the case of 142x142 images and a 19x19 kernel, if we don't pad, the usable region shrinks to 124x124 (9 pixels are lost along each edge), so we are removing...

In [13]:
142**2 - 124**2

4788

4788 pixels, which is...

In [14]:
4788 / 142**2

0.23745288633207698

almost a quarter of the total original number of pixels, and so the relative influence of the bright central star is greater still.