# Create simulated datasets

For both the training and testing datasets.

The processing steps include:
- Domain randomisation (relrod, spot_spread)
- Multiple phases (6)
- Corruption of data (disabled for this example)
- Adding noise (S&P)
- Adding background intensity

In [1]:
# Packages
%matplotlib qt
import numpy as np
import hyperspy.api as hs
import pyxem as pxm
import diffpy.structure
from matplotlib import pyplot as plt
from tempfile import TemporaryFile
from diffsims.libraries.structure_library import StructureLibrary
from diffsims.generators.diffraction_generator import DiffractionGenerator
from diffsims.generators.library_generator import DiffractionLibraryGenerator, VectorLibraryGenerator
from pyxem.utils.sim_utils import sim_as_signal
import tqdm
import gc
import os



In [2]:
### Variables
# All tunable inputs of the notebook are collected in this cell.

# Paths
root = r'C:\Users\Sauron\Documents\jf631\SED_scripts'
structures_path = os.path.join(root, 'nn_models/crystal_phases')
# One CIF file per class; the part of the file name before the first dot becomes the phase name.
phase_files = ['cubic_fapbi_scaled.cif', 'pbi2.cif', 'pbbr2.cif', 'pb.cif', 'gratia_4h.cif', 'gratia_6h.cif']

# Calibration values
# Diffraction calibration applied to the simulated patterns
# (presumably in 1/Angstrom per pixel -- TODO confirm).
calibration = 0.0045

# Processing values
# Number of random orientations drawn by get_random_euler.
n_angle_points = 500

# Domain amplification
# Every combination of the three lists below is simulated for every phase.
simulated_direct_beam_bool = [False,]
relrod_list = [0.002, 0.02, 0.03]
spot_spread_list = [0.006, 0.014, 0.022]

# Simulation microscope values (for azimuthal integration)
detector_size = 515 #px
beam_energy = 200.0 #keV
wavelength = 2.5079e-12 #m
detector_pix_size = 55e-6 #m
from pyxem.detectors import Medipix515x515Detector
detector = Medipix515x515Detector()

# Data corruption
#corrupt_n_times = 2

# Noise addition values (do not change)
# If True, the noise-free signal is kept in the dataset alongside the noisy copies.
include_also_non_noisy_simulation = True
# Passed as `snr` to add_noise_to_simulation, where it is used as the
# probability that a pixel is left untouched by the spike step.
snrs = [0.9, 0.99]
# Value written into the randomly selected spike pixels (patterns are max-normalised to 1 first).
intensity_spikes = [0.25,]

# Cropping and post-processing
# Signal-axis range kept by crop_signal1D on the radially integrated data.
cropping_start = 0.11
cropping_stop = 1.30
sqrt_signal = False
cropped_signal_points = 182 # To rebin signal, if necessary

# Background parameterisation values (A: pre-exp factor, tau: decay constant in A*exp(-tau*q))
a_vals = [0.5, 1., 2., 5.]
tau_vals = [0.5, 1, 1.5]

In [3]:
# Rough estimate of how many 2D patterns are generated and how much memory they occupy.
# NOTE(review): the count multiplies by the number of noisy variants only; it does not
# include len(simulated_direct_beam_bool) nor the extra noise-free copy that is kept
# when include_also_non_noisy_simulation is True, so it can underestimate the total.
val = n_angle_points * len(phase_files) * len(relrod_list) * len(spot_spread_list) * len(snrs) * len(intensity_spikes)
print('Approx amount of 2D diffraction patterns that will be produced: {}'.format(val))
# detector_size**2 pixels per pattern, converted to gigabytes.
memory = detector_size**2 * val * 4 / 1e9  #4 bytes per float32 value
print('Approx memory needed: {} GB'.format(memory))

Approx amount of 2D diffraction patterns that will be produced: 54000
Approx memory needed: 57.2886 GB


## Simulate data for each phase

In [4]:
# Load every CIF file into a diffpy structure, keyed by the file name up to its first dot.
phase_dict = {
    cif_file.split(".")[0]: diffpy.structure.loadStructure(os.path.join(structures_path, cif_file))
    for cif_file in phase_files
}

print('n_phases = {}'.format(len(phase_dict)))

n_phases = 6


In [5]:
def get_random_euler(npoints):
    """Return a reproducible set of random Euler-angle triplets, in degrees.

    The global NumPy random generator is re-seeded with ``1`` on every call,
    so two calls with the same ``npoints`` return identical arrays (and the
    global random state is reset as a side effect).

    The second angle is ``arccos(u)`` with ``u`` drawn from the 201 evenly
    spaced values -1.00, -0.99, ..., 1.00, so it lies in [0, 180] degrees.
    The first and third angles are drawn uniformly from [0, 360) degrees.

    :param npoints: number of orientations to generate
    :return: array of shape ``(npoints, 3)`` holding (alpha, beta, gamma)
    """
    np.random.seed(1)

    # The draw order (u, alpha, gamma) is kept as is so that the generated
    # orientation set stays identical to the one produced historically.
    u = np.random.randint(-100, 100 + 1, size=(npoints,)) / 100
    alpha = 2 * np.pi * np.random.random(size=(npoints,))
    gamma = 2 * np.pi * np.random.random(size=(npoints,))

    # Polar angle of a point at height u on the unit sphere.
    beta = np.arccos(u)

    return np.rad2deg(np.array([alpha, beta, gamma])).T


def get_reciprocal_radius(detector_size, calibration):
    """Return the calibration multiplied by half the detector size.

    The half size is taken in whole pixels (``detector_size // 2``).

    :param detector_size: detector edge length in pixels
    :param calibration: scale factor applied per pixel
    :return: ``calibration * (detector_size // 2)``
    """
    return calibration * (detector_size // 2)


def create_diffraction_library(phase_dict, euler_list,
                               beam_energy, relrod_length,
                               calibration, detector_size,
                               with_direct_beam):
    """Build a diffraction library for all phases over one set of orientations.

    :param phase_dict: mapping of phase name to structure
    :param euler_list: orientations; the same list is used for every phase
    :param beam_energy: forwarded to ``DiffractionGenerator``
    :param relrod_length: forwarded to ``DiffractionGenerator``
    :param calibration: forwarded to ``get_diffraction_library`` and used to
        derive the reciprocal radius
    :param detector_size: detector edge length in pixels
    :param with_direct_beam: forwarded to ``get_diffraction_library``
    :return: the library returned by ``get_diffraction_library``
    """
    names = list(phase_dict)
    structures = [phase_dict[phase_name] for phase_name in names]
    # Every phase shares the very same orientation list object.
    orientations = [euler_list for _ in names]

    structure_library = StructureLibrary(names, structures, orientations)
    library_generator = DiffractionLibraryGenerator(
        DiffractionGenerator(beam_energy, relrod_length))

    half_px = detector_size // 2
    return library_generator.get_diffraction_library(
        structure_library,
        calibration=calibration,
        reciprocal_radius=get_reciprocal_radius(detector_size, calibration),
        half_shape=(half_px, half_px),
        with_direct_beam=with_direct_beam,
    )

In [6]:
%%capture
# Simulated patterns, collected per phase name in generation order.
data = {key: [] for key in phase_dict}

# Depends only on the detector geometry, so it is computed once instead of
# once per parameter combination.
reciprocal_radius = get_reciprocal_radius(detector_size, calibration)

for with_direct_beam in simulated_direct_beam_bool:
    for relrod_length in tqdm.tqdm(relrod_list):
        for spot_spread in spot_spread_list:

            # get_random_euler re-seeds the random generator on every call,
            # so each parameter combination uses the same orientation set.
            euler_list = get_random_euler(n_angle_points)

            library = create_diffraction_library(phase_dict, euler_list,
                                                 beam_energy, relrod_length,
                                                 calibration, detector_size,
                                                 with_direct_beam)

            # Render one 2D pattern per (orientation, phase) pair.
            for euler in euler_list:
                for phase in library.keys():
                    pattern = sim_as_signal(library.get_library_entry(phase=phase,
                                                                      angle=euler)['Sim'],
                                            detector_size, spot_spread, reciprocal_radius)

                    data[phase].append(pattern)

In [7]:
# Stack data
import dask.array as da

for i, value in enumerate(data.values()):
    list_data = da.from_array([x.data for x in value], chunks=(10, detector_size, detector_size))

    if i ==0:
        #list_data = np.expand_dims(list_data, 1)
        training_data = list_data
    else:
        #list_data = np.expand_dims(list_data, 1)
        training_data = da.vstack([training_data, list_data],)

del data
del library
del list_data
gc.collect()

shape = (len(phase_dict.keys()),
         n_angle_points*len(relrod_list)*len(spot_spread_list)*len(simulated_direct_beam_bool),
         detector_size,
         detector_size)

training_data = training_data.reshape(shape)
training_data = pxm.LazyElectronDiffraction2D(training_data)
training_data.set_diffraction_calibration(calibration)
print(training_data)

<LazyElectronDiffraction2D, title: , dimensions: (4500, 6|515, 515)>


## Recenter

In [8]:
# One (0.5, 0.5) shift per diffraction pattern, with both navigation axes flattened.
n_patterns = training_data.data.shape[0] * training_data.data.shape[1]
shiftList = np.full((n_patterns, 2), 0.5)

# Evaluate the lazy signal, then shift every pattern without cropping,
# filling the exposed border with zeros.
training_data.compute()
training_data.align2D(shifts=shiftList, crop=False, fill_value=0., parallel=True)

name = '2D_simulated_data_{}classes_{}neuler_domainrand_centered_{}cal.hspy'.format(
    training_data.data.shape[0], n_angle_points, calibration)
#training_data.save(os.path.join('2d_simulated_data', name))
print(training_data)



[########################################] | 100% Completed |  9.4s
<ElectronDiffraction2D, title: , dimensions: (4500, 6|515, 515)>


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=27000.0), HTML(value='')))

## Add noise

In two steps:
- S&P noise
- Poisson noise

In [9]:
def add_noise_to_simulation(simulation_arr, snr, int_salt,):
    """Return a noisy, max-normalised copy of a simulated pattern.

    Two steps are applied, in this order:

    1. A Poisson draw whose mean is the pixel value is added to every pixel,
       and the result is divided by its maximum.
    2. Every pixel is assigned one of three labels with probabilities
       ``snr``, ``(1 - snr) / 2`` and ``(1 - snr) / 2``. Pixels with the third
       label are overwritten with ``int_salt``; the other two labels leave the
       pixel unchanged.

    The input array is not modified. Random numbers come from the global
    NumPy generator.

    :param simulation_arr: array holding the simulated pattern
    :param snr: probability that a pixel receives the "unchanged" label
    :param int_salt: value written into the spiked pixels
    :return: new array with the same shape as ``simulation_arr``
    """
    import numpy as np

    # Work on a copy so the caller's data stays untouched.
    noisy = simulation_arr.copy()

    # Poisson step followed by normalisation to a maximum of 1.
    noisy += np.random.poisson(noisy)
    noisy = noisy / noisy.max()

    # Spike step: label 0 = keep, label 1 = keep (its low-intensity
    # replacement is disabled), label 2 = overwrite with int_salt.
    keep_probability = snr
    spike_probability = (1 - keep_probability) / 2.
    labels = np.random.choice(a=(0, 1, 2),
                              size=np.shape(noisy),
                              p=[keep_probability, spike_probability, spike_probability])
    noisy[labels == 2] = int_salt

    return noisy

In [10]:
# Map the noise addition function on signal

# Optionally start with the non-corrupted signal so it is part of the dataset too.
training_data_noisy = [training_data] if include_also_non_noisy_simulation else []

# One noisy copy of the whole signal per (snr, spike intensity) pair,
# with snr varying slowest.
training_data_noisy.extend(
    training_data.map(add_noise_to_simulation,
                      snr=snr, int_salt=int_spike,
                      inplace=False, parallel=True)
    for snr in snrs
    for int_spike in intensity_spikes
)

del training_data
gc.collect()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=27000.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=27000.0), HTML(value='')))

16

In [11]:
# Merge the list of clean/noisy signals into a single signal by stacking along axis 0.
training_data_noisy = hs.stack(training_data_noisy, axis=0)
print(training_data_noisy)

[########################################] | 100% Completed |  0.8s
<ElectronDiffraction2D, title: , dimensions: (13500, 6|515, 515)>


## Integrate radially

In [12]:
# Detector distance derived from pixel size, wavelength and calibration; the 1e10 factor
# converts the wavelength from metres to Angstrom (assumes calibration is in 1/Angstrom
# per pixel -- TODO confirm).
camera_length = detector_pix_size / (wavelength * calibration * 1e10)
training_data_noisy.unit = "k_A^-1"
training_data_noisy.set_experimental_parameters(beam_energy=beam_energy)
# Number of radial bins: roughly half the detector size, rounded up to an even number.
# NOTE(review): np.ceil returns a float (256.0 for detector_size = 515); confirm that
# npt_rad accepts a float or cast it to int.
radial_steps = np.ceil((int(detector_size/2) - 1)/2)*2
# Reduce every 2D pattern to a 1D profile around the geometric centre of the detector.
training_data_1D = training_data_noisy.get_azimuthal_integral1d(npt_rad=radial_steps,
                                                          center=([detector_size/2,detector_size/2]),
                                                          detector=detector,
                                                          detector_dist=camera_length,
                                                          map_kwargs={'parallel':True})
print(training_data_1D)

# The 2D data is no longer needed once the 1D profiles exist.
del training_data_noisy
gc.collect()

[0, 10.269911516467715]
<ElectronDiffraction1D, title: , dimensions: (13500, 6|256)>



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=81000.0), HTML(value='')))

1534

## Crop, normalise, sqrt, and rebin

In [13]:
# Crop
training_data_1D.crop_signal1D(cropping_start, cropping_stop)

# Rebin
scale_rebin = training_data_1D.data.shape[-1] / cropped_signal_points
scale_rebin
training_data_1D = training_data_1D.rebin(scale=(1,1,scale_rebin))

if sqrt_signal:
    training_data_1D.data = np.sqrt(training_data_1D.data)

dpmax = training_data_1D.data.max(2)
training_data_1D_norm = training_data_1D.data/dpmax[:,:,np.newaxis]

print(training_data_1D_norm.shape)

(6, 13500, 182)


  


## Add simulated background

Approximate the background as an exponential decay, $A \, e^{-\tau q}$, where $q$ is the value on the signal axis.

In [14]:
def add_background_to_signal1d_array(normalised_1d_sim_data_array, x_axis,
                                     a_val, tau_val, bkg_function='exp_decay'):
    """
    Add an analytic background curve to an array of 1D signals.

    The background is evaluated once on ``x_axis`` and added to the data by
    broadcasting, so the last axis of the data must match ``x_axis``.

    :param normalised_1d_sim_data_array:
        The normalised 1d signal array (nav axis should be (points, phases, q))
    :param x_axis: array of the actual q values
        (per the original note, the A and tau values are optimised for q
        magnitudes in 1/A)
    :param a_val: pre-factor ``A`` of the background
    :param tau_val: decay constant / exponent ``tau`` of the background
    :param bkg_function: ``'exp_decay'`` for ``A * exp(-tau * q)`` or
        ``'inv_q'`` for ``A * q**(-tau)``
    :return: new array equal to the input data plus the background
        (the input array is not modified)
    :raises ValueError: if ``bkg_function`` is not a supported name
    """
    def inv_q(x, A, tau):
        return A * x**(-tau)

    def exp_decay(x, A, tau):
        return A * np.exp(- tau * x)

    bkg_models = {'exp_decay': exp_decay, 'inv_q': inv_q}
    if bkg_function not in bkg_models:
        # Fail with a clear message instead of an unbound-variable error further down.
        raise ValueError(
            "Unknown bkg_function {!r}; expected one of {}".format(
                bkg_function, sorted(bkg_models)))

    bkg = bkg_models[bkg_function](x_axis, a_val, tau_val)
    return normalised_1d_sim_data_array + bkg


# Expand datasets by copying and adding bkg: the background-free spectra come
# first, followed by one copy per (A, tau) combination, all joined along axis 1.

# Get the x-axis values from which to calculate bkg
qs = training_data_1D.axes_manager.signal_axes[0].axis

# Add bkg to signal
# Collect every variant first and concatenate once: growing the array with
# np.hstack inside the loop re-copies all previously stacked data each time.
bkg_variants = [training_data_1D_norm]
bkg_variants.extend(
    add_background_to_signal1d_array(training_data_1D_norm, qs, a, tau)
    for a in a_vals
    for tau in tau_vals
)
training_data_1D_norm_bkg = np.concatenate(bkg_variants, axis=1)
del bkg_variants

# Renormalise data
# Each spectrum is divided by its own maximum along the last axis.
dpmax = training_data_1D_norm_bkg.max(-1)
training_data_1D_norm_bkg_norm = training_data_1D_norm_bkg/dpmax[:,:,np.newaxis]

In [15]:
training_data_1D_norm_bkg_norm.shape

(6, 175500, 182)

## NN requirements: reshape and labelling

In [16]:
# Collapse all leading axes so that every row is one spectrum; the length of the last axis is kept.
training_data_1D_norm_bkg_norm = training_data_1D_norm_bkg_norm.reshape(-1, training_data_1D_norm_bkg_norm.shape[-1])

print(training_data_1D_norm_bkg_norm.shape)

(1053000, 182)


In [17]:
# Create labels
n_phases = len(phase_dict)
labels = np.zeros((n_phases, int(training_data_1D_norm_bkg_norm.shape[0]/n_phases)))
for i in range(n_phases):
    labels[i,:] = i

training_labels = labels.flatten()
training_labels.shape

(1053000,)

In [18]:
# Check for outliers and nan values
where_nan = np.argwhere(np.isnan(training_data_1D_norm_bkg_norm))
training_data_1D_norm_bkg_norm = np.delete(training_data_1D_norm_bkg_norm, where_nan[:,0], axis = 0)
training_labels = np.delete(training_labels, where_nan[:,0], axis = 0)
print(training_data_1D_norm_bkg_norm.shape, training_labels.shape)

(1037907, 182) (1037907,)


In [19]:
# NOTE(review): store_train_data is opened here but is not used or closed anywhere
# in the visible part of the notebook -- confirm whether it is still needed.
store_train_data = TemporaryFile()
# Short aliases used as the array names in the saved archive.
x = training_data_1D_norm_bkg_norm
y = training_labels
# Phase names in dictionary order; the numeric labels index into this list.
phase_names = list(phase_dict.keys())

In [20]:
# Write spectra (x), labels (y) and phase names into one archive in the current
# working directory; the file name encodes calibration, class count and orientation count.
np.savez('1D_simulated_data_cal{}_{}classes_{}neuler_domainrand_noisy_bkg'.format(calibration,
                                                                              n_phases,
                                                                        n_angle_points,),
         x=x, y=y, phases=phase_names)

## Misc data

Radial integration in parallel