In [2]:
import pandas as pd
import numpy as np
import glob, os
from util import *
from sklearn.preprocessing import MinMaxScaler
import pickle

In [1]:
def concatenate_arrays(*arrays):
    """Join all positional inputs end-to-end into a single flat array.

    ``axis=None`` makes NumPy flatten every input before joining, so inputs
    of different shapes can be mixed freely.
    """
    flattened = np.concatenate(arrays, axis=None)
    return flattened


def split_and_concatenate(fields, scaling_factors, normalize_intensity=True, normalize_phase=True):
    """Turn a sequence of fields into one flat feature vector.

    For every field the intensity, phase and energy are computed with
    ``get_intensity``, ``get_phase`` and ``calc_energy_expanded`` (defined
    outside this cell -- presumably pulled in by ``from util import *``).

    Parameters
    ----------
    fields : sequence
        The fields to process, in order.
    scaling_factors : dict
        Must provide ``"grid_spacing"`` (indexable, one entry per field, in the
        same order as ``fields``) and ``"beam_area"``; both are forwarded to
        ``calc_energy_expanded``.
    normalize_intensity : bool
        If true, each field's intensity is divided by its own maximum. Fields
        whose maximum is exactly 0 are left untouched to avoid dividing by zero.
    normalize_phase : bool
        If true, each field's phase is divided by pi.

    Returns
    -------
    numpy.ndarray
        Flat array laid out as
        ``[intensities of all fields | phases of all fields | energies of all fields]``.
    """
    energy_out = np.array([])
    intensity_out = np.array([])
    phase_out = np.array([])

    for idx, field in enumerate(fields):
        intensity = get_intensity(field)
        phase = get_phase(field)
        energy = calc_energy_expanded(
            field, scaling_factors["grid_spacing"][idx], scaling_factors["beam_area"]
        )
        energy_out = concatenate_arrays(energy_out, energy)

        int_factor = np.max(intensity)
        if normalize_intensity and int_factor != 0:
            intensity = intensity / int_factor
        intensity_out = concatenate_arrays(intensity_out, intensity)

        # Append the phase exactly once. Previously the normalized phase was
        # appended and then the raw phase was appended as well (missing `else`),
        # doubling the phase section whenever normalize_phase was true.
        if normalize_phase:
            phase = phase / np.pi
        phase_out = concatenate_arrays(phase_out, phase)

    return concatenate_arrays(intensity_out, phase_out, energy_out)

def _scale_split(scaler, X, y):
    """Apply an already-fitted scaler to one split of windows ``X`` and targets ``y``."""
    y_scaled = scaler.transform(y.reshape(y.shape[0], -1))
    X_scaled = np.copy(X)
    # The scaler works on 2-D input, so each window position is scaled separately.
    for step in range(X.shape[1]):
        X_scaled[:, step, :] = scaler.transform(X[:, step, :].reshape(X.shape[0], -1))
    return X_scaled, y_scaled


def sample_concat1(directory, output_dir, scaling_factors, k=10, n=20, save=True, normalize_intensity=True, normalize_phase=True):
    '''
    Build sliding-window training samples from the .npy files in a directory.

    The files are shuffled with a fixed seed and divided into ``n`` splits.
    Every row of every file is converted with ``split_and_concatenate``; for
    each file the targets are rows 1..end and the inputs are the ``k`` rows
    preceding each target (the first row is repeated to pad the earliest
    windows). A single MinMaxScaler is fitted incrementally over all files in
    a first pass and applied to every split in a second pass.

    Input:
    directory: directory containing the sample .npy files
    output_dir: directory that receives the generated files
    scaling_factors: dict forwarded to split_and_concatenate
    k: int, length of sliding window
    n: int, number of splits the files are divided into
    save: bool, if True write X_{i}.npy, y_{i}.npy, X_new_{i}.npy,
          y_new_{i}.npy and scaler.pkl to output_dir. If False these are not
          written and only the last split is scaled (it is the only one
          returned). param_{i}.npy and temp_shape_{i}.npy are written in
          either case.
    normalize_intensity, normalize_phase: bool, forwarded to
          split_and_concatenate

    Output:
    (X, y, X_new, y_new, scaler, param_path) where X/y are the unscaled and
    X_new/y_new the scaled arrays of the LAST split, scaler is the fitted
    MinMaxScaler and param_path holds the .pkl paths matching the files of the
    last split.

    Raises:
    ValueError if no .npy file is found or if n exceeds the number of files.
    '''
    # Slice boundaries of the three fields packed into each row.
    # NOTE(review): presumably two 1892-point fields at the front and one
    # 348-point field at the end -- confirm against the data producer.
    front_len = 1892
    tail_len = 348

    np.random.seed(42)
    # glob returns matches in arbitrary order; sorting first makes the seeded
    # shuffle (and therefore the split assignment) reproducible.
    files = sorted(glob.glob(os.path.join(directory, "*.npy")))
    if not files:
        raise ValueError(f"no .npy files found in {directory!r}")
    if n > len(files):
        raise ValueError(f"cannot divide {len(files)} files into n={n} non-empty splits")
    np.random.shuffle(files)

    scaler = MinMaxScaler(feature_range=(0, 1))
    for i, split in enumerate(np.array_split(files, n)):
        print(i)
        # Parameter file paths: same name as the sample, extension swapped to .pkl.
        param_path = np.array([os.path.splitext(path)[0] + ".pkl" for path in split])
        np.save(os.path.join(output_dir, f"param_{i}.npy"), param_path)
        X = []
        y = []
        for f in split:
            raw = np.load(f)
            arr = np.array([
                split_and_concatenate(
                    [row[:front_len], row[front_len:front_len * 2], row[-tail_len:]],
                    scaling_factors,
                    normalize_intensity=normalize_intensity,
                    normalize_phase=normalize_phase,
                )
                for row in raw
            ])
            # Overwritten for every file, so it ends up holding the last file's shape.
            np.save(os.path.join(output_dir, f"temp_shape_{i}.npy"), arr.shape)
            scaler.partial_fit(arr.reshape(arr.shape[0], -1))

            y.append(arr[1:])
            # Pad the front with k-1 copies of the first row so that every
            # target row has a full window of k predecessors.
            padded = np.concatenate((np.repeat([arr[0]], k - 1, axis=0), arr[:-1]), axis=0)
            windows = np.array(
                [padded[start:start + k] for start in range(len(padded) - k + 1)]
            ).reshape(-1, k, padded.shape[1])
            X.append(windows)

        X, y = np.concatenate(X), np.concatenate(y)

        if save:
            np.save(os.path.join(output_dir, f"X_{i}.npy"), X)
            np.save(os.path.join(output_dir, f"y_{i}.npy"), y)
            # Rewritten after every split; the final write holds the fully fitted scaler.
            with open(os.path.join(output_dir, 'scaler.pkl'), 'wb') as file:
                pickle.dump(scaler, file)

    print("scaling new")
    if save:
        # Scaling has to wait until the scaler has seen every file, so the
        # unscaled splits are read back from disk one at a time.
        for i in range(n):
            X = np.load(os.path.join(output_dir, f"X_{i}.npy"))
            y = np.load(os.path.join(output_dir, f"y_{i}.npy"))
            X_new, y_new = _scale_split(scaler, X, y)
            np.save(os.path.join(output_dir, f"X_new_{i}.npy"), X_new)
            np.save(os.path.join(output_dir, f"y_new_{i}.npy"), y_new)
    else:
        # Nothing was written, so there is nothing to read back. X and y still
        # hold the last split, which is the only one that is returned.
        X_new, y_new = _scale_split(scaler, X, y)

    return X, y, X_new, y_new, scaler, param_path


In [9]:
# Load the frequency grids and assemble the scaling-factor dictionary.
# The paths below may need adjusting for your operating system, file layout,
# and the directory the notebook is run from.
freq_vectors_shg1 = np.load("../Data/shg_freq_domain_ds.npy")
freq_vectors_sfg = np.load("../Data/sfg_freq_domain_ds.npy")
freq_vectors_shg2 = freq_vectors_shg1  # both SHG grids are identical here

# Spacing between the first two grid points, multiplied by 1e12 to get back to Hz.
domain_spacing_1, domain_spacing_2, domain_spacing_3 = (
    (freq[1] - freq[0]) * 1e12
    for freq in (freq_vectors_shg1, freq_vectors_shg2, freq_vectors_sfg)
)

# Circular beam with a 400 um radius.
factors_freq = {
    "beam_area": 400e-6 ** 2 * np.pi,
    "grid_spacing": [domain_spacing_1, domain_spacing_2, domain_spacing_3],
    "domain_spacing_1": domain_spacing_1,
    "domain_spacing_2": domain_spacing_2,
    "domain_spacing_3": domain_spacing_3,
}

# Where the data downloaded from the SDR repo lives.
data_directory = "/sdf/group/lcls/ds/scratch/s2e_scratch/Data/SFG_0504"
# Where the reformatted data will be stored.
output_dir = "/sdf/group/lcls/ds/scratch/s2e_scratch/Data/SFG_intPhEn/test"


In [None]:
# Reformat the raw data into scaled sliding-window samples (100 splits, window of 10).
X1, y1, X1_new, y1_new, scaler1, file1 = sample_concat1(
    data_directory, output_dir, scaling_factors=factors_freq, k=10, n=100, save=True
)

# Keep a second copy of the fitted scaler next to the generated data.
# `output_dir1` is not defined anywhere in this notebook and raised a NameError
# on a fresh kernel; the copy is written to `output_dir` instead.
with open(os.path.join(output_dir, 'scaler_bckkup.pkl'), 'wb') as file:
    pickle.dump(scaler1, file)