# Create Simulated Datasets of 2D Images (1D Integration)

## Coordinates Output
For both the training and testing datasets.
This will simplify the simulation to take a step back.

This will output diffraction simulation images and integrate into 1D.

## Initialization

In [1]:
# Packages
%matplotlib qt
import numpy as np
import pandas as pd
import hyperspy.api as hs
import pyxem as pxm
import diffpy.structure
from matplotlib import pyplot as plt
from tempfile import TemporaryFile
from diffsims.libraries.structure_library import StructureLibrary
from diffsims.generators.diffraction_generator import DiffractionGenerator
from diffsims.generators.library_generator import DiffractionLibraryGenerator, VectorLibraryGenerator
from diffsims.sims.diffraction_simulation import DiffractionSimulation
from diffsims.libraries.diffraction_library import DiffractionLibrary
from pyxem.generators.indexation_generator import VectorIndexationGenerator
from pyxem.generators.subpixelrefinement_generator import SubpixelrefinementGenerator
from pyxem.signals.diffraction_vectors import DiffractionVectors
import tqdm
import gc
import os



In [3]:
### Variables

# Paths
root = r'C:/Users/anish/Documents/GitHub/ml_pyxem/mini_2/'

# Phases
structures_path = os.path.join(root, 'crystal_phases')
phase_files = ['p4mbm_tetragonal.cif','2','3']
add_bkg_phase = False # Do you want to add a bkg/just noise phase at the end? If True, the final datasets will be phases + 1 shape.

# Calibration values
calibration = 0.00588 #To have multiple, create an array (i.e. calibrations = [0.00588]) and set for value in for loop

# Processing values
n_angle_points = 30000
multiplicity = 2

# Domain amplification
simulated_direct_beam_bool = [False,]
relrod_list = [0.02,]
spot_spread_list = [0.02,]
scattering_params = 'lobato'

# Noise addition values (do not change)
remove_peaks = True
add_noise = True
include_also_non_noisy_simulation = False # If add noise, do you want to also have the non-noisy data?
snrs = [0.9, 0.99]
intensity_spikes = [0.25,]

# Cropping and post-processing
cropping_start_k = 0.11 #k units
cropping_stop_k = 1.30 #k_units
cropped_signal_k_points = 147 # To rebin signal, if necessary (when using k_units)

cropping_start_px = 13.55 #pixels
cropping_stop_px = 160.55 #pixels
sqrt_signal = False

# Background parameterisation values (A: pre-exp factor, tau: decay time constant)
add_background_to = 'none' # Select from 'all', '1D_only', 'none'
a_vals = [1., 5.]
tau_vals = [0.5, 1.5]

# Simulation microscope values (for azimuthal integration)
detector_size = 515 #px
beam_energy = 200.0 #keV
wavelength = 2.5079e-12 #m
detector_pix_size = 55e-6 #m
from pyxem.detectors import Medipix515x515Detector
detector = Medipix515x515Detector()

# Debug (save and plot files)
save_hspy_files = False
plot_hspy_files = False

In [4]:
val = n_angle_points * multiplicity * (len(phase_files)) * len(relrod_list) * len(spot_spread_list) * len(snrs) * len(intensity_spikes)
print('Approx amount of 2D diffraction patterns that will be produced: {}'.format(val))
memory = detector_size**2 * val * 4 / 1e9  #4 bytes per float32 value
print('Approx memory needed: {} GB'.format(memory))

Approx amount of 2D diffraction patterns that will be produced: 720000
Approx memory needed: 763.848 GB


## Simulate Data

### Define Functions

In [47]:
phase_dict = {}
for phase in phase_files:
     name = phase.split(".")[0]
     phase_dict[name] = diffpy.structure.loadStructure(os.path.join('crystal_phases', phase))
     print('n_phases = {}'.format(len(phase_dict)))

n_phases = 1


In [48]:
def get_random_euler(npoints):
    radius = 1
    np.random.seed(1)
    u = np.random.randint(-100,100+1,size=(npoints,))/100 
    u2 = 2*np.pi*np.random.random(size=(npoints,))
    theta = 2*np.pi*np.random.random(size=(npoints,))
    x = radius*np.sqrt(1-u**2)*np.cos(theta)
    y = radius*np.sqrt(1-u**2)*np.sin(theta)
    z = radius*u 
    phi = np.arccos(z/radius)
    eulerAlpha = u2
    eulerBeta = phi
    eulerGamma = theta
    return np.array([np.rad2deg(eulerAlpha),np.rad2deg(eulerBeta),np.rad2deg(eulerGamma)]).T 

def get_reciprocal_radius(detector_size, calibration):
    half_pattern_size = detector_size // 2
    reciprocal_radius = calibration * half_pattern_size
    return reciprocal_radius


def create_diffraction_library(phase_dict, euler_list, beam_energy, scattering_params, relrod_length, calibration, detector_size, with_direct_beam):

    phase_names = list(phase_dict.keys())
    phases = list(phase_dict.values())
    euler_list_n = [euler_list, ] * len(phase_names)

    sample_lib = StructureLibrary(phase_names, phases, euler_list_n)
    ediff = DiffractionGenerator(beam_energy, scattering_params, relrod_length)
    diff_gen = DiffractionLibraryGenerator(ediff)

    reciprocal_radius = get_reciprocal_radius(detector_size, calibration)
    
    library = diff_gen.get_diffraction_library(sample_lib,
                                               calibration=calibration,
                                               reciprocal_radius=reciprocal_radius,
                                               half_shape=(detector_size//2, detector_size//2),
                                               with_direct_beam=with_direct_beam)
    return library

### Create Diffraction Patterns

In [49]:
%%capture

data = {}
for key, val in phase_dict.items():
    data[key] = []
    
for with_direct_beam in simulated_direct_beam_bool:
    for relrod_length in tqdm.tqdm(relrod_list):
        for spot_spread in spot_spread_list:

            euler_list = get_random_euler(n_angle_points)
            euler_list = np.tile(euler_list, (multiplicity, 1))

            library = create_diffraction_library(phase_dict, euler_list, beam_energy, scattering_params, relrod_length, calibration, detector_size, with_direct_beam)
            reciprocal_radius = get_reciprocal_radius(detector_size, calibration)

In [50]:
euler_list

array([[124.40186174, 129.05012254, 201.12833824],
       [142.83629072,  66.42182152,  50.53929789],
       [193.97402424, 106.26020471,  71.31653607],
       [150.91002519,  68.28438272, 288.26804472],
       [246.67902014,  70.73122451, 348.57416726],
       [ 73.6028099 , 102.12235224, 112.83270414],
       [316.1222771 ,  23.07391807, 249.23614164],
       [  9.85953355,  63.89611886, 315.50009483],
       [241.36830366,  73.14204398, 322.05839886],
       [150.22972885, 106.85795602,  30.61591609]])

In [51]:
#print(library)

### Plotting Check

In [52]:
# px_coords_array = library['p4mbm_tetragonal']['pixel_coords']
# px_coords_i_x = library['p4mbm_tetragonal']['pixel_coords'][0][:,0]
# px_coords_i_y = library['p4mbm_tetragonal']['pixel_coords'][0][:,1]
# plt.figure(1)
# plt.scatter(px_coords_i_x,px_coords_i_y) #shows diffraction pattern of a single configuration - based on Cartesian coordinates

#rec_coords_array = library['p4mbm_tetragonal']['rec_coords']
#rec_coords_i_x = library['p4mbm_tetragonal']['rec_coords'][0][:,0]
#rec_coords_i_y = library['p4mbm_tetragonal']['rec_coords'][0][:,1]
#plt.figure(2)
#plt.scatter(rec_coords_i_x,rec_coords_i_y) #shows diffraction pattern of a single configuration - based on reciprocal coordinates

## Data Augmentation

### Peak Removal

### Define Functions

In [53]:
def remove_random_peaks(phase_dict, n_patterns, size):
    import random
    
    phase_names = list(phase_dict.keys())
    augmented_library = library
    
    for phase_name in phase_names:
        for i in range(n_patterns):
                v = random.sample(range(len(augmented_library[phase_name]['rec_coords'][i])-1),size)
                augmented_library[phase_name]['rec_coords'][i][v] = 0
                augmented_library[phase_name]['pixel_coords'][i][v] = 0
                augmented_library[phase_name]['intensities'][i][v] = 0
    return augmented_library

In [54]:
if remove_peaks:
    augmented_library = remove_random_peaks(phase_dict=phase_dict, n_patterns=len(euler_list), size=1)
else:
    augmented_library = library
    
#print(augmented_library)

### Plotting Check

In [55]:
# for i in range(len(euler_list)):
#     px_coords_i_x = augmented_library['p4mbm_tetragonal']['pixel_coords'][i][:,0]
#     px_coords_i_y = augmented_library['p4mbm_tetragonal']['pixel_coords'][i][:,1]
#     plt.figure(i+1)
#     plt.scatter(px_coords_i_x,px_coords_i_y) #shows diffraction pattern of a single configuration - based on Cartesian coordinates

# px_coords_array = augmented_library['p4mbm_tetragonal']['pixel_coords']
# px_coords_i_x = augmented_library['p4mbm_tetragonal']['pixel_coords'][0][:,0]
# px_coords_i_y = augmented_library['p4mbm_tetragonal']['pixel_coords'][0][:,1]
# plt.figure(2)
# plt.scatter(px_coords_i_x,px_coords_i_y) #shows diffraction pattern of a single configuration - based on Cartesian coordinates

# px_coords_array = augmented_library['p4mbm_tetragonal']['pixel_coords']
# px_coords_i_x = augmented_library['p4mbm_tetragonal']['pixel_coords'][1][:,0]
# px_coords_i_y = augmented_library['p4mbm_tetragonal']['pixel_coords'][1][:,1]
# plt.figure(3)
# plt.scatter(px_coords_i_x,px_coords_i_y) #shows diffraction pattern of a single configuration - based on Cartesian coordinates

In [56]:
for euler in euler_list:
    for phase in library.keys():
        pattern = DiffractionSimulation.get_diffraction_pattern(augmented_library.get_library_entry(phase=phase,angle=euler)['Sim'])
        data[phase].append(pattern)
        # plt.figure()as
        # plt.imshow(pattern, cmap='viridis')

### Stack Data

In [58]:
import dask.array as da

for i, value in enumerate(data.values()):
    list_data = da.from_array([x.data for x in value], chunks=(10, 512, 512))

    if i ==0:
        #list_data = np.expand_dims(list_data, 1)
        training_data = list_data
    else:
        #list_data = np.expand_dims(list_data, 1)
        training_data = da.vstack([training_data, list_data],)


# del data
# del library
# del augmented_library
# del list_data
gc.collect()

shape = (len(phase_dict.keys()),
         n_angle_points * len(relrod_list) * len(spot_spread_list) * len(simulated_direct_beam_bool),
         512,
         512)       

# print(np.shape(data.values()))
#print(np.shape(list_data))
# print(np.shape(training_data))
# print(shape)

training_data = training_data.reshape(shape)
training_data = pxm.signals.LazyElectronDiffraction2D(training_data)
training_data.set_diffraction_calibration(calibration)
print(training_data)

<LazyElectronDiffraction2D, title: , dimensions: (10, 1|512, 512)>


In [59]:
training_data.plot()

[########################################] | 100% Completed |  0.2s


### Recenter

In [60]:
shiftList = np.zeros((np.size(training_data.data,0),
                      np.size(training_data.data,1), 2
                      )
                     )

shiftList[:,:,0]=0.5
shiftList[:,:,1]=0.5

#shiftList = shiftList.flatten() #Flatten the 2D navigation axis

#reshape(-1, shiftList.shape[-1]) # Flatten the 2D navigtion axis

training_data.compute()
training_data.align2D(shifts=shiftList,crop=False,fill_value=0., parallel=True)

[########################################] | 100% Completed |  0.1s




  0%|          | 0/10 [00:00<?, ?it/s]

In [61]:
if save_hspy_files:
    name = '2D_hspy_simdata_{}classes_{}neuler_{}cal_{}relrod_{}spotsize.hspy'.format(
        len(phase_dict), n_angle_points, calibration, relrod_list, spot_spread_list)

    training_data.save(name, overwrite=True)
if plot_hspy_files:
    training_data.plot(cmap='viridis')

print(training_data)

<ElectronDiffraction2D, title: , dimensions: (10, 1|512, 512)>


### Background Phase

In [62]:
# Only if `add_bkg_phase` is True. Otherwise no changes.
if add_bkg_phase:
    # Add phase in the dictionary
    phase_dict['bkg_phase'] = []

    # Create blank detector
    shape_blank = np.shape(training_data,)[1:]
    shape_blank = (1,) + shape_blank
    blank = pxm.signals.electron_diffraction2d.ElectronDiffraction2D(np.zeros(shape_blank))
    training_data = hs.stack([training_data, blank], axis=1)

print(len(phase_dict))
training_data.data.shape

1


(1, 10, 512, 512)

### Define Functions

In [63]:
def add_noise_to_simulation(simulation_arr, snr, int_salt,):
    import numpy as np
    # Salt and pepper
    def addsalt_pepper(dp_arr, snr, int_min = 0, int_max = int_salt,):
        p0 = snr
        # Add noise
        size = np.shape(dp_arr)
        mask = np.random.choice(a=(0, 1, 2),
                                size=size,
                                p=[p0, (1 - p0) / 2., (1 - p0) / 2.])

        im = dp_arr.copy()
        #im[mask == 1] = int_min # salt noise
        im[mask == 2] = int_max # pepper noise
        return im
    # Add poisson noise on sp noise and normalise
    im = simulation_arr.copy()
    im += np.random.poisson(im)
    max = im.max()
    if max == 0:
        im = im
    else:
        im = im / im.max()
    # Add bright spots randomly accross detector
    im_sp = addsalt_pepper(im, snr,)
    return im_sp

def add_background_to_signal_array(normalised_sim_data_array, x_axis,
                                     a_val, tau_val, bkg_function='exp_decay', dimensions=1):
    """
    :param normalised_sim_data_array:
        The normalised 1d signal array (nav axis should be (points, phases, q))
    :param x_axis: array of the actual q values
        The A and tau values are optimised for 1/A-1 magnitude
    :return: extended signal with new sets of sim data without and with bakgrounds
    """
    def inv_q(x, A, tau):
        return A * x**(-tau)

    def exp_decay(x, A, tau):
        return A * np.exp(- tau * x)

    if bkg_function == 'exp_decay':
        bkg = exp_decay(x_axis, a_val, tau_val)
    elif bkg_function == 'inv_q':
        bkg = inv_q(x_axis, a_val, tau_val)

    if dimensions == 1:
        return normalised_sim_data_array + bkg

    elif dimensions == 2:
        n = normalised_sim_data_array.shape[-1]
        bkg = np.tile(bkg, (n,1)).T
        return normalised_sim_data_array + bkg

### Salt & Pepper, Poisson Noise

In [64]:
# Map the noise addition function on signal
if add_noise:
    training_data_noisy = []

    # Include the non-corrupted data in the dataset?
    if include_also_non_noisy_simulation:
        training_data_noisy.append(training_data)

    # Append noisy data
    for snr in snrs:
        for int_spike in intensity_spikes:

            signal_noisy = training_data.map(add_noise_to_simulation,
                                             snr=snr, int_salt=int_spike,
                                             inplace=False, parallel=True)

            training_data_noisy.append(signal_noisy)

    del training_data
    del signal_noisy
    gc.collect()

    training_data_noisy = hs.stack(training_data_noisy, axis=0)

else:
    # No noise addition
    training_data_noisy = training_data

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[########################################] | 100% Completed |  0.1s


### Saving

In [65]:
if save_hspy_files and add_noise:
    name = '2D_hspy_simdata_{}classes_{}neuler_{}cal_{}relrod_{}spotsize_withNoise.hspy'.format(
        len(phase_dict), n_angle_points, calibration, relrod_list, spot_spread_list)

    training_data_noisy.save(name, overwrite=True)
if plot_hspy_files and add_noise:
    training_data_noisy.plot(cmap='viridis')

print(training_data_noisy)

<ElectronDiffraction2D, title: Stack of , dimensions: (20, 1|512, 512)>


## Integrate Radially 1D

In [66]:
camera_length = detector_pix_size / (wavelength * calibration * 1e10)
training_data_noisy.unit = "k_A^-1"
training_data_noisy.set_experimental_parameters(beam_energy=beam_energy)
radial_steps = int(np.ceil((int(detector_size/2) - 1)/2)*2)
training_data_noisy.set_ai(center=([detector_size/2,detector_size/2]))#, detector=detector, detector_dist=camera_length)
training_data_1D = training_data_noisy.get_azimuthal_integral1d(npt=radial_steps) #, map_kwargs={'parallel':True})
print(training_data_1D)

del training_data_noisy
gc.collect()

[                                        ] | 0% Completed |  0.0s



[########################################] | 100% Completed |  0.7s
<ElectronDiffraction1D, title: , dimensions: (20, 1|256)>


1736

### Saving

In [67]:
if save_hspy_files:
    name = '1D_hspy_simdata_{}classes_{}neuler_{}cal_{}relrod_{}spotsize.hspy'.format(
        len(phase_dict), n_angle_points, calibration, relrod_list, spot_spread_list)

    training_data_1D.save(name, overwrite=True)
if plot_hspy_files:
    training_data_1D.plot()

print(training_data_1D)

<ElectronDiffraction1D, title: , dimensions: (20, 1|256)>


### Normalise and Sqrt

In [68]:
# 1D dataset
# Sqrt signal (if wanted)
if sqrt_signal:
    training_data_1D.data = np.sqrt(training_data_1D.data)

# Normalise
dpmax = training_data_1D.data.max(2)
training_data_1D_norm = training_data_1D.data/dpmax[:,:,np.newaxis]

# Correct any nan value
nan_mask = np.isnan(training_data_1D_norm)
training_data_1D_norm[nan_mask] = 0

print(training_data_1D_norm.shape)

(1, 20, 256)


## Add Simulated Background

Approximate background as a $A*exp^{(-tau \: q)}$ value.

In [69]:
# For 1D dataset
# Expand datasets by copying and adding bkg
training_data_1D_norm_bkg = training_data_1D_norm

if add_background_to != 'none':
    # Get the x-axis values from which to calculate bkg
    qs = training_data_1D.axes_manager.signal_axes[0].axis
    qs

    # Add bkg to signal
    for a in a_vals:
        for tau in tau_vals:
            bkg_data = add_background_to_signal_array(training_data_1D_norm, qs, a, tau)
            training_data_1D_norm_bkg = np.hstack((training_data_1D_norm_bkg, bkg_data))

    del bkg_data
    gc.collect()

training_data_1D_norm_bkg.shape

(1, 20, 256)

## Crop, Rebin and Renormalise

Crop both in terms of q (rebin but no shift) and pixel values (shift but no rebin).

In [70]:
# 1D dataset
training_data_1D_norm_bkg = hs.signals.Signal1D(training_data_1D_norm_bkg)

training_data_1D_px = training_data_1D_norm_bkg.deepcopy()

# Recreate .hspy object to crop with k units
scale = training_data_1D.axes_manager.signal_axes[0].scale
offset = training_data_1D.axes_manager.signal_axes[0].offset
training_data_1D_norm_bkg.axes_manager.signal_axes[0].scale = scale
training_data_1D_norm_bkg.axes_manager.signal_axes[0].offset = offset

training_data_1D_q = training_data_1D_norm_bkg.deepcopy()

# del training_data_1D
# del training_data_1D_norm
# del training_data_1D_norm_bkg
gc.collect()

# In k units:
# Crop in k units
training_data_1D_q.crop_signal1D(cropping_start_k, cropping_stop_k)
# Rebin
scale_rebin = training_data_1D_q.data.shape[-1] / cropped_signal_k_points
scale_rebin
training_data_1D_q = training_data_1D_q.rebin(scale=(1,1,scale_rebin))
# Renormalise data
dpmax = training_data_1D_q.data.max(-1)
training_data_1D_q = training_data_1D_q.data/dpmax[:,:,np.newaxis]

# In pixel units:
# Crop in pixel units
training_data_1D_px.crop_signal1D(cropping_start_px, cropping_stop_px)
# Renormalise data
dpmax = training_data_1D_px.data.max(-1)
training_data_1D_px = training_data_1D_px.data/dpmax[:,:,np.newaxis]

print(training_data_1D_q.shape)
print(training_data_1D_px.shape)

(1, 20, 147)
(1, 20, 147)


## NN Requirements: Reshape and Labelling

In [71]:
phase_names = list(phase_dict.keys())

print(phase_names)

['p4mbm_tetragonal']


In [72]:
# 1D dataset
#training_data_1D_q = training_data_1D_q.reshape(-1, training_data_1D_q.shape[-1])
training_data_1D_px = training_data_1D_px.reshape(-1, training_data_1D_px.shape[-1])

print(training_data_1D_q.shape)
print(training_data_1D_px.shape)

(1, 20, 147)
(20, 147)


In [73]:
# Create labels for 1D
n_phases = len(phase_dict)
labels = np.zeros((n_phases, int(training_data_1D_q.shape[0]/n_phases)))
for i in range(n_phases):
    labels[i,:] = i

training_labels = labels.flatten()
training_labels.shape

(1,)

### Saving

In [74]:
# Check for outliers and nan values
where_nan_q = np.argwhere(np.isnan(training_data_1D_q))
where_nan_px = np.argwhere(np.isnan(training_data_1D_px))

training_data_1D_q = np.delete(training_data_1D_q, where_nan_q[:,0], axis = 0)
training_labels_q = np.delete(training_labels, where_nan_q[:,0], axis = 0)

training_data_1D_px = np.delete(training_data_1D_px, where_nan_px[:,0], axis = 0)
training_labels_px = np.delete(training_labels, where_nan_px[:,0], axis = 0)

print(training_data_1D_q.shape, training_labels_q.shape)
print(training_data_1D_px.shape, training_labels_px.shape)

(1, 20, 147) (1,)
(20, 147) (1,)


In [75]:
store_train_data = TemporaryFile()
x = training_data_1D_q
y = training_labels_q

np.savez('1D_simulated_data_cal{}_cropK_{}classes_{}neuler'.format(calibration,
                                                                              n_phases,
                                                                        n_angle_points,),
         x=x, y=y, phases=phase_names)

In [76]:
store_train_data = TemporaryFile()
x = training_data_1D_px
y = training_labels_px

np.savez('1D_simulated_data_cal{}_cropPX_{}classes_{}neuler'.format(calibration,
                                                                              n_phases,
                                                                        n_angle_points,),
         x=x, y=y, phases=phase_names)

### Plotting

In [77]:
i = 0
plt.figure()
#plt.plot(training_data_1D_px[i], label='px')
plt.plot(training_data_1D_q[i], label='q')
plt.legend()
#plt.savefig('1D_plt_compare_k_q_cropping_i{}.png'.format(i))

# del training_data_1D_px
# del training_data_1D_q

<matplotlib.legend.Legend at 0x7fbe2a425940>