In [None]:
##################### Centroiding and correcting/filtering script for FLASH beamtime data #####################

# Step 1: Centroiding the data. Loads raw .bin files with respective settings files, and centroids the data using Cython. Cluster
#         data is examined via cluster-size and time-spread histograms. The centroided data is then saved to a list for further
#         use (as a dataframe).

# Step 2: Appending BAM, FEL pulse energies, BID, etc from h5 to the above centroided dataframes.

# Step 3: For checking if BID assignment in each settings file is correct; plots number of ions in shot (ie, BID 'intensity')
#         against pulse energy assigned to that shot. If a clear positive trend is seen, the settings files are good and don't 
#         need correcting. However, if no correlation is seen, the settings file needs to be corrected.

# Step 4: Plotting FEL pulse intensity (in arbitrary units, but close to uJ) histogram. This then allows us to define a suitable 
#         FEL energy/intensity range to filter by when plotting ion-yields as a function of pump-probe delay time.

# Step 5: Filtering BID-filtered data by FEL pulse intensity (removing rows whose FEL pulse intensity is more than ~ 2 sigma away
#         from the mean FEL pulse intensity, found roughly in the centre of the histogram above).

# Step 6: Plot time-of-flight spectra for each run on same plot; TOF calibration, and plotting in m/z domain to aid assignment.

# Step 7: Normalising delay bins by number of shots in each bin. For each delay, we want to find how many unique tIds there are 
# and normalise by that amount. This produces fully corrected ion-yield plots - no more processing needs to be done.

# Step 8: Saving filtered and processed data as .npy arrays

# Step 9: Loading separate .npy files and concatenating them to make one large dataframe which can be saved.

# Step 10: (Optional) Read in raw .bin files and settings files and make a concatenated uncentroided (but FEL-filtered) array
#          to save.

# (Cells labelled with 'JHM' correspond to code which I (James M) have either partially or fully developed or adapted. Code
#  taken from other sources is credited cell-by-cell.)



In [None]:
# 1a. Import modules and definitions for centroiding script.

# Note: cython requires Visual Studio Community 2019, install the native development tools option under the Python
# development workload (https://visualstudio.microsoft.com/).

# Adapted from: Centroiding (PImMS binary) - 200911.ipynb (with JHM edits to read in BID and delay values from settings files)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
%load_ext cython

# Creates PImMS x,y,t data from binary, sorts by t:
def read_bin_torder(filename):
    """Read one PImMS ``.bin`` run plus its settings file into a single hit array.

    The binary file is read frame by frame: each frame starts with two little-endian
    C ints ``(m, n)`` giving its shape, followed by ``m*n`` little-endian uint16 values.
    Three columns are appended to every non-empty frame, and each frame is then sorted
    by its column 2 (time-of-flight) before all frames are stacked.

    Parameters
    ----------
    filename : str or int
        Run identifier; ``<filename>.bin`` and ``<filename>-settings.txt`` are read
        from the hard-coded raw-data directory below.

    Returns
    -------
    numpy.ndarray (int64), shape (n_hits, n + 3)
        The frame columns (presumably x, y, t - TODO confirm against the PImMS format)
        followed by: shot counter (1 for the first frame in the file; empty frames still
        advance the counter), BID from settings column 0, and delay from settings
        column 3 multiplied by 1000 and truncated to int.

    Raises
    ------
    ValueError
        If the file contains no non-empty frame.

    Notes
    -----
    Side effect: changes the process working directory to the raw-data directory.
    """
    import os  # imported locally: this cell's import block does not provide os

    #os.chdir(r"/asap3/flash/gpfs/bl1/2023/data/11013421/raw/PImMS/")
    os.chdir(r'/asap3/flash/gpfs/bl1/2018/data/11003927/raw/pimms/') # 2018

    settings_file = str(filename)+'-settings.txt'
    # atleast_1d: np.loadtxt returns a 0-d array for a one-line settings file, which cannot be zipped.
    delays = np.atleast_1d(1000*np.loadtxt(settings_file, usecols=[3])).astype(int) # delay value per shot
    BID = np.atleast_1d(np.around(np.loadtxt(settings_file, usecols=[0]), 3)).astype(int) # BID per shot

    xyt_data = []
    lasershot = 0

    # The context manager guarantees the file is closed even if an unexpected error occurs.
    with open(str(filename)+'.bin', 'rb') as file:
        for d, b in zip(delays, BID):
            try:
                m, n = np.fromfile(file, dtype='<i', count=2)
                frame = np.reshape(np.fromfile(file, dtype='<u2', count=m*n), (m, n))
            except ValueError:
                # End of file (no header left to unpack) or a truncated final frame.
                break

            lasershot += 1
            if len(frame) > 0:
                # Build the three tag columns directly as int64. The previous int16 column plus a
                # Python int overflows under NumPy 2 once the shot counter exceeds 32767.
                tags = np.empty((len(frame), 3), dtype=np.int64)
                tags[:, 0] = lasershot  # counter-generated tag ID (starts at 1)
                tags[:, 1] = b          # settings-file BID
                tags[:, 2] = d          # delay value
                frame = np.hstack((frame.astype(np.int64), tags))
                frame = frame[frame[:, 2].argsort()] # orders by time-of-flight
                xyt_data.append(frame)

    if not xyt_data:
        raise ValueError(f"No non-empty frames could be read from {filename}.bin")
    return np.vstack(xyt_data)

In [None]:
%%cython --annotate
# 1b. Creates centroiding function.

# From: Centroiding (PImMS binary) - 200911.ipynb

import numpy as np
cimport numpy as np
cimport cython
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)   # Deactivate negative indexing
@cython.cdivision(True)
def centroid_shots2(int[:,:] input_arr, int time_win, int min_shot, int max_shot,
                  int min_t, int max_t, int min_cluster_size, int max_cluster_size):
    """Cluster PImMS hits shot by shot and reduce each cluster to one centroided event.

    Parameters
    ----------
    input_arr : 2-D C-int memoryview with at least 5 columns
        Columns read: 0 = x, 1 = y, 2 = t, 3 = shot number, 4 = "already counted" flag.
        Column 4 must be 0 on entry and is set to 1 IN PLACE for every hit assigned to a
        cluster. Rows must be grouped by ascending shot number, and ordered by
        time-of-flight within each shot.
    time_win : int
        Maximum time difference (in time bins) between the seed hit of a cluster and any
        other hit added to it.
    min_shot, max_shot : int
        Shots in ``range(min_shot, max_shot)`` are processed.
    min_t, max_t : int
        Inclusive time-of-flight window for seed hits.
    min_cluster_size, max_cluster_size : int
        Inclusive cluster-size range; clusters outside it are discarded.

    Returns
    -------
    numpy.ndarray (int32), shape (n_events, 6)
        Columns: [x, y, t, shot, cluster size, cluster timespread], where t is the time of
        the seed (earliest) hit of the cluster.

    Raises
    ------
    ValueError
        If ``input_arr`` has fewer than 5 columns.

    Notes
    -----
    Changes relative to the previous implementation:
    * the last hit of every shot, and the whole last shot in the array, were skipped
      (``nend = test-1`` combined with an exclusive ``range``); they are now processed;
    * the neighbour search is clamped to the 324 x 324 image, and hits with coordinates
      outside it are ignored (previously an out-of-bounds memory access);
    * the cluster buffer can no longer be overrun by very large clusters;
    * the event buffer is sized from the input (an event needs at least one hit) instead
      of a fixed 50,000,000 rows, and a copy of the used part is returned.
    """
    # Define variables, data types, and empty arrays:
    cdef int n_pix = 324                     # Side length of the image array
    cdef int hit_no, p, cluster_cap
    cdef int i, j, min_time, max_time, size, shot, test, j_row2, j_col2
    cdef int j_time, j_row, j_col, t, ct, i_row, i_col, k
    cdef int row_lo, row_hi, col_lo, col_hi
    cdef bint found_neighbour, found_j, shot_found
    cdef int nshots = 0
    cdef int nevent = 0
    cdef int nend = 0
    cdef int t_spread = 0
    cdef int cluster_min_time = 100000
    cdef int cluster_max_time = 0
    cdef int event_counter = 0
    cdef double row_sum, col_sum, time_sum, cx, cy, time
    cdef np.ndarray[dtype=np.int32_t, ndim=2] im           # Image array
    cdef np.ndarray[dtype=np.int32_t, ndim=2] cluster_arr  # Cluster array
    cdef np.ndarray[dtype=np.int32_t, ndim=2] event_array  # Centroided events array

    # Bounds checking is disabled, so validate the column count explicitly:
    if input_arr.shape[1] < 5:
        raise ValueError("input_arr needs at least 5 columns: x, y, t, shot, counted-flag")

    # Determine hit count from input array rows:
    hit_no = <int>input_arr.shape[0]

    # Only clusters of up to max_cluster_size hits are ever centroided, so only that many
    # hits need to be stored; larger clusters are still counted via 'size'.
    cluster_cap = max_cluster_size
    if cluster_cap < 1:
        cluster_cap = 1

    im = np.zeros((n_pix, n_pix), dtype=np.int32)
    cluster_arr = np.zeros((cluster_cap, 3), dtype=np.int32)
    event_array = np.zeros((hit_no, 6), dtype=np.int32)

    # Cycle through frames, determine hits per frame:
    for shot in range(min_shot, max_shot):
        # Find the first hit of this shot. Rows are grouped by ascending shot number, so
        # the search can stop as soon as a later shot is seen.
        shot_found = False
        for test in range(nshots, hit_no):
            if input_arr[test,3] == shot:
                nshots = test
                shot_found = True
                break
            if input_arr[test,3] > shot:
                break
        if not shot_found:
            continue

        # Find the (exclusive) end of this shot; defaults to the end of the array so that
        # the final shot is processed too.
        nend = hit_no
        for test in range(nshots, hit_no):
            if input_arr[test,3] > shot:
                nend = test
                break

        # Cycle through hits in a given frame, find unique pixel clusters in x,y,t:
        for i in range(nshots, nend):
            # Skip hits that have already been counted (1):
            if input_arr[i,4] != 0:
                continue
            t = input_arr[i,2]
            # Check that the hit is within the desired time range, based on user input:
            if t < min_t or t > max_t:
                continue
            i_row = input_arr[i,0]
            i_col = input_arr[i,1]
            # Ignore hits that do not lie on the image array:
            if i_row < 0 or i_row >= n_pix or i_col < 0 or i_col >= n_pix:
                continue

            size = 1                         # Cluster size counter
            nevent += 1                      # Event counter
            input_arr[i,4] = 1               # Count hit
            cluster_arr[0, 0] = i_row        # Add x to cluster array
            cluster_arr[0, 1] = i_col        # Add y to cluster array
            cluster_arr[0, 2] = t            # Add t to cluster array
            min_time = t                     # Establish initial cluster time
            max_time = t+time_win            # Establish allowed cluster time spread, based on user input
            im[i_row, i_col] = nevent        # Assign pixel coordinates to the event
            found_neighbour = True

            # Look for nearest neighbour pixels and add them to the cluster, continue while
            # found_neighbour = True or until the maximum number of passes
            # (maximum cluster size plus some leeway) is reached:
            for p in range(max_cluster_size+10):
                if not found_neighbour:
                    break
                found_neighbour = False
                # Cycle through the later hits in the frame again (hit i itself is counted):
                for j in range(i+1, nend):
                    # Check that the new hit is also uncounted:
                    if input_arr[j,4] != 0:
                        continue
                    j_time = input_arr[j,2]
                    # Check that the new hit is also within the cluster time window:
                    if j_time < min_time or j_time > max_time:
                        continue
                    j_row = input_arr[j,0]
                    j_col = input_arr[j,1]
                    if j_row < 0 or j_row >= n_pix or j_col < 0 or j_col >= n_pix:
                        continue

                    # Clamp the 3x3 neighbourhood to the image so edge pixels never index
                    # outside 'im' (bounds checking is off):
                    row_lo = j_row-1
                    if row_lo < 0:
                        row_lo = 0
                    row_hi = j_row+2
                    if row_hi > n_pix:
                        row_hi = n_pix
                    col_lo = j_col-1
                    if col_lo < 0:
                        col_lo = 0
                    col_hi = j_col+2
                    if col_hi > n_pix:
                        col_hi = n_pix

                    # Look for a pixel of the current event among the neighbours of the new
                    # hit, continue until found_j = True:
                    found_j = False
                    for j_row2 in range(row_lo, row_hi):
                        if found_j:
                            break
                        for j_col2 in range(col_lo, col_hi):
                            # If one of the neighbours is an assigned event, add the new hit to it:
                            if im[j_row2, j_col2] == nevent:
                                input_arr[j,4] = 1              # Count new hit
                                size += 1                       # Increase cluster size
                                im[j_row, j_col] = nevent       # Assign pixel coordinates to event
                                if size <= cluster_cap:
                                    cluster_arr[size-1, 0] = j_row  # Add new x to cluster array
                                    cluster_arr[size-1, 1] = j_col  # Add new y to cluster array
                                    cluster_arr[size-1, 2] = j_time # Add new t to cluster array
                                found_neighbour = True
                                found_j = True
                                break

            # Centroid cluster array into a single coordinate using a center-of-mass approach (see Slater, C. S.
            # Studies of photoinduced molecular dynamics using a fast imaging sensor; Springer, 2015; pp 54-62),
            # if the cluster size is within the set range:
            if min_cluster_size <= size <= max_cluster_size:
                row_sum = 0
                col_sum = 0
                time_sum = 0
                cluster_min_time = 100000
                cluster_max_time = 0
                # Cycle through hits assigned to the cluster array, until the cluster size is reached.
                # NB. The cluster array is not reset to zero from cluster to cluster, but is overwritten,
                # data from larger clusters will exist in rows from 'size' onward, but won't be counted.
                for k in range(size):
                    cx = cluster_arr[k, 0]
                    cy = cluster_arr[k, 1]
                    ct = cluster_arr[k, 2]
                    time = ct-t+1           # Zero cluster event times to the earliest hit in the cluster.
                    row_sum += cx/time      # Earlier hits carry more weight (1/time).
                    col_sum += cy/time
                    time_sum += 1/time

                    # Determine earliest and latest times in the cluster, to calculate the time spread:
                    if ct > cluster_max_time:
                        cluster_max_time = ct
                    if ct < cluster_min_time:
                        cluster_min_time = ct

                t_spread = cluster_max_time - cluster_min_time

                # Add centroided event to array (+0.5 rounds to the nearest pixel):
                event_array[event_counter, 0] = <int>(row_sum/time_sum+0.5)
                event_array[event_counter, 1] = <int>(col_sum/time_sum+0.5)
                event_array[event_counter, 2] = t
                event_array[event_counter, 3] = shot
                event_array[event_counter, 4] = size
                event_array[event_counter, 5] = t_spread
                event_counter += 1

        # Continue the search for the next shot directly after this one, and stop early once
        # every hit has been visited.
        nshots = nend
        if nshots >= hit_no:
            break

    # Return a copy so the (larger) working buffer can be freed.
    return event_array[0:event_counter, :].copy()

In [None]:
# 1c. Centroids files (supplied as a list); returns an [x, y, t, shot, cluster size, cluster timespread] array.

# Adapted from: Centroiding (PImMS binary) - 200911.ipynb (with JHM edits to loop over multiple run numbers and to save BID and
#               delay values to separate lists for future reappending)

import os

#file_list=['222','223','224','226','227'] # indene 2023
#file_list = ['147','148','150','151'] # fluorene 2023
#file_list = ['108','109','110','111','112','113'] # CPP, 2023

file_list = ['275','276','283','284','285','286','288','289','304']

#file_list = ['129','130']

# NOTE(review): 'read' must stay truthy; with read = 0 no file is loaded and the loop below has no
# valid data_array to work on.
read = 1
counter = 1

start_shots = 0 
end_shots = 1000000

min_time = 0 # minimum time-of-flight range
max_time = 5000 # maximum time-of-flight range

#min_time = 1831
#max_time = 1840
ion = 'total'

# 2023 beamtime centroiding parameters
timewin = 2 # for full data, approx. no clusters seen at time-windows greater than 6.
min_cluster_size = 2 # statistically unlikely that two noisy pixels will activate next to each other in a given timeframe
max_cluster_size = 10 # trade-off between size of data and not wanting to group lots of single pixel activations together

event_array_list = []           # one centroided [x, y, t, shot, size, spread] array per file
BID_delay_list_of_lists = []    # one [counter tag_id, BID, delay] lookup array per file

for f in file_list:
    
    # Read file, convert x,y,t data into input array:
    if read:
        start_time = time.time()
        print('Opening file: '+str(f))
        data_array = read_bin_torder(f)
        print("Reading took %s seconds" % round((time.time()-start_time),3))
    in_time_window = (data_array[:,2]>=min_time) & (data_array[:,2]<=max_time)
    data_array = data_array[in_time_window] # Filters data by time
    
    # Keep ONE [counter tag_id, BID, delay] row per shot. Every hit of a shot carries the same BID and
    # delay, and fancy indexing returns a copy, so the full hit array of each file is no longer kept
    # alive through a view stored in BID_delay_list_of_lists.
    _, first_hit_of_shot = np.unique(data_array[:,-3], return_index=True)
    BID_delay_list = data_array[first_hit_of_shot, -3:] # counter tag_id, BID, delay columns (in that order)
    BID_delay_list_of_lists.append(BID_delay_list) # saving for reappending later (by file)
    deleted_array = data_array[:, :-2] # data_array but without BID, delay
    
    data_array = deleted_array
    print(f"Shape of filtered data_array: {np.shape(data_array)}")
    
    # Two extra zero columns are appended; the centroiding routine uses column 4 as its
    # "hit already counted" flag.
    data_array2 = np.zeros((np.shape(data_array)[0], 6), dtype='int32')
    data_array2[:,:-2] = data_array # fills in data, returns data with two added columns of zeros

    # Centroid data:
    start_time = time.time()
    print(f"Starting centroiding for file: {f}")
    event_array = centroid_shots2(data_array2, timewin, start_shots, end_shots, min_time, max_time, min_cluster_size, max_cluster_size)
    event_array_list.append(event_array)
    print(f"Centroiding for file {f} took {round((time.time()-start_time),3)} seconds")

print("Centroiding calculations completed.")

In [None]:
# 1d. Examine cluster data. Plots cluster-size and time-spread histograms, and plots both together as a 2D histogram.

# Adapted from: Centroiding (PImMS binary) - 200911.ipynb

import pandas as pd

#directory=os.chdir(r"/home/merrickj/Documents/indene_centroided") # directory for saving centroided plots (indene)
#directory=os.chdir(r"/home/merrickj/Documents/fluorene_centroided") # directory for saving centroided plots (fluorene)
#directory=os.chdir(r"/home/merrickj/Documents/CPP_centroided") # directory for saving centroided plots (CPP)

directory = os.chdir(r"/home/merrickj/Documents/fluorene_2018_all")


def _style_cluster_axes(axes, title, xlabel, ylabel):
    """Apply the title, axis labels, tick label size and grids shared by all cluster plots."""
    axes.set_title(title, fontsize=16, y=1.03, pad = 10)
    axes.set_xlabel(xlabel, fontsize=12)
    axes.set_ylabel(ylabel, fontsize=12)
    for tick_axis in ('x', 'y'):
        axes.tick_params(axis=tick_axis, labelsize=12)
    for grid_kind in ('major', 'minor'):
        axes.grid(which=grid_kind)


def _add_settings_box(axes, x_position, text, box_style):
    """Place the centroiding-settings text box to the right of the axes."""
    axes.text(x_position, 0.50, text, transform=axes.transAxes, fontsize=14,
              verticalalignment='center', bbox=box_style)


for f, event_array in zip(file_list, event_array_list):

    # Text box describing the centroiding settings; identical for all three figures of a file.
    textstr = '\n'.join((f"File: {f}",f"Min. cluster size / pixels: {min_cluster_size}",f"Max. cluster size / pixels: {max_cluster_size}", f"Max. time spread / timebins: {timewin}"))
    props = {'boxstyle': 'round', 'facecolor': 'white', 'alpha': 0.5}

    # Cluster size histogram (column 4 of the event array):
    unique, counts = np.unique(event_array[:,4], return_counts=True)
    fig, ax = plt.subplots(figsize=(9,5))
    ax.bar(unique, counts, alpha=0.5)
    ax.set_xlim(min_cluster_size, max_cluster_size)
    _style_cluster_axes(ax, f"Cluster-size histogram: ({min_time},{max_time})",
                        'Cluster size / pixels', 'Counts')
    ax.minorticks_on()
    _add_settings_box(ax, 1.05, textstr, props)

    #fig.savefig(f"run_{f}_centroided_clustersize_histogram_{ion}.png",bbox_inches="tight")

    # Time spread histogram (column 5 of the event array):
    unique2, counts2 = np.unique(event_array[:,5], return_counts=True)
    fig2, ax2 = plt.subplots(figsize=(9,5))
    ax2.bar(unique2, counts2, alpha=0.5)
    ax2.set_xlim(0,timewin)
    _style_cluster_axes(ax2, f"Time-spread histogram: ({min_time},{max_time})",
                        'Time-bins', 'Counts')
    ax2.minorticks_on()
    _add_settings_box(ax2, 1.05, textstr, props)

    #fig2.savefig(f"run_{f}_centroided_timebinsize_histogram_{ion}.png",bbox_inches="tight")

    # 2D distribution: rows of cluster_dist are cluster sizes, columns are time spreads.
    cluster_dist, xedges, yedges = np.histogram2d(event_array[:,4], event_array[:,5], bins =[np.arange(0,61), np.arange(0,9)])
    fig3,ax3 = plt.subplots(figsize=(9,5))
    picture = ax3.imshow(cluster_dist, interpolation='nearest', aspect='auto', cmap='inferno', vmax=np.max(cluster_dist))
    ax3.set_xlim(0,7)
    ax3.set_ylim(min_cluster_size,max_cluster_size)
    _style_cluster_axes(ax3, f"Cluster-size / time-spread histogram: ({min_time},{max_time})",
                        'Time-bin', 'Cluster size / pixels')
    cbar = plt.colorbar(picture)
    cbar.set_label('Intensity / counts ', fontsize = 14)
    _add_settings_box(ax3, 1.30, textstr, props)

    #fig3.savefig(f"run_{f}_centroided_clustersize_timebinsize_2dhistogram_{ion}.png",bbox_inches="tight")


In [None]:
# 1e. Making a list of centroided dataframes

centroided_df_list = []

# Build one dataframe per file: centroided events with their BID and delay re-attached.
for f, event_array, BID_delay_list in zip(file_list, event_array_list, BID_delay_list_of_lists):

    print(f"File: {f}")

    df = pd.DataFrame(event_array, columns = ['x','y','ToF','counter_tagID','size','spread'])

    # Lookup table: counter tag ID -> (BID, delay), reduced to its distinct rows.
    df_to_append = pd.DataFrame(BID_delay_list, columns = ['counter_tagID', 'BID', 'delay']).drop_duplicates()

    total_df = (
        df.merge(df_to_append, on = 'counter_tagID')                           # reappends BID and delay to data
          .drop(columns = ['counter_tagID'])                                   # counter_tagID isn't needed after centroiding
          .reindex(columns=['x','y','ToF','BID','delay','size','spread'])      # reorders columns as wanted
    )

    centroided_df_list.append(total_df)
    print(total_df)

In [None]:
# 2. Appending BAM, FEL pulse energies, BID, etc from h5 to the above dataframes. (JU + JHM)

import h5py
import gc
import os
import glob

#file_list=['222','223','224','226','227'] # run numbers for indene 2023
#hdf_file_list=['44345','44346','44347','44348','44349'] # corresponding h5 run numbers for indene 2023

# file_list = ['147','148','150','151'] # fluorene 2023
# hdf_file_list = ['44249','44250','44251','44252'] # fluorene 2023

#file_list = ['129','130'] # CPP 2023
#hdf_file_list = ['44227','44228'] # CPP 2023

file_list = ['284'] # fluorene 2018
hdf_file_list = ['21608'] # fluorene 2018

BID_filtered_df_list= []
#os.chdir(r"/asap3/flash/gpfs/bl1/2023/data/11013421/raw/hdf/express-0/fl1user1/") # 2023
os.chdir(r"/asap3/flash/gpfs/bl1/2018/data/11003927/raw/hdf/online-2/fl1user1/") # 2018
h5_file_list = os.listdir() # lists ALL h5 files in this directory (for entire beamtime)

# centroided_df_list[i] is paired with file_list[i] / hdf_file_list[i] purely by position. If file_list
# is redefined here with a different length than the one used for centroiding, the wrong run would be
# merged silently, so fail early instead.
if len(hdf_file_list) != len(file_list):
    raise ValueError("file_list and hdf_file_list must have the same length.")
if len(centroided_df_list) != len(file_list):
    raise ValueError(
        f"centroided_df_list has {len(centroided_df_list)} entries but file_list has {len(file_list)}: "
        "re-run the centroiding cells with the same file_list so that both are in the same order."
    )


def _read_h5_dataset(hdf, path):
    """Return the full contents of dataset `path`, raising a descriptive KeyError if it is missing.

    Uses ``dataset[()]``; the ``Dataset.value`` attribute used previously no longer exists in h5py 3.
    """
    dataset = hdf.get(path)
    if dataset is None:
        raise KeyError(f"Dataset '{path}' not found in {hdf.filename}")
    return dataset[()]


for run_number, hdf_file in enumerate(hdf_file_list): # loop over run numbers
    
    print(f"Run number: {file_list[run_number]}")
    print(f"Length of df list: {len(centroided_df_list)}")
    print(f"Length of file list: {len(file_list)}")
    
    data_energy = []   # [energy, BID] pairs collected over all h5 files of this run
    data_BAM = []      # [BAM, BID] pairs collected over all h5 files of this run
    
    for file in h5_file_list: # looping over all h5 files in directory
        
        if f"run{hdf_file}" not in str(file): # filtering h5 files by run number
            continue

        with h5py.File(file, 'r') as hdf:

            #energy = np.array(hdf.get('FL1/Photon Diagnostic/GMD/Pulse resolved energy/energy BDA/value')) # 2023
            energy = _read_h5_dataset(hdf, 'Photon Diagnostic/GMD/Pulse resolved energy/energy BDA (raw) copy') # 2018

            #energy_BIDs = hdf.get('FL1/Photon Diagnostic/GMD/Pulse resolved energy/energy BDA/index')[()] # 2023
            # NOTE(review): this reads the SAME dataset as 'energy' above, so these are presumably not
            # bunch IDs - confirm the correct 2018 BID dataset path.
            energy_BIDs = _read_h5_dataset(hdf, 'Photon Diagnostic/GMD/Pulse resolved energy/energy BDA (raw) copy') # 2018
            print(energy_BIDs)

            #BAM = hdf.get('FL1/Electron Diagnostic/Bunch charge/before undulator/value') # 2023
            BAM = _read_h5_dataset(hdf, 'Electron Diagnostic/BAM/4DBC3/electron bunch arrival time (low charge)') # 2018
            print(BAM)

            #BAM_BIDs = hdf.get('FL1/Electron Diagnostic/Bunch charge/before undulator/index') # 2023
            # NOTE(review): this path has the 'FL1/' prefix used by the 2023 paths and a different BAM
            # (1SFELC) than the 2018 'BAM' line above - confirm it exists in the 2018 files.
            BAM_BIDs = _read_h5_dataset(hdf, 'FL1/Electron Diagnostic/BAM/1SFELC/electron bunch arrival time (low charge)/index')

            for x in range(len(energy)):
                test = np.zeros(2)
                test[0] = energy[x][0][0]
                test[1] = energy_BIDs[x]
                data_energy.append(test)
            for y in range(len(BAM)):
                BAM_values = np.zeros(2)
                BAM_values[0] = BAM[y][0]
                BAM_values[1] = BAM_BIDs[y]
                data_BAM.append(BAM_values)
            # No explicit hdf.close(): the 'with' block closes the file.

    if not data_energy or not data_BAM:
        raise ValueError(f"No HDF5 data found for run{hdf_file} in {os.getcwd()}")

    # Stack the per-file pairs and sort by BID (column 1).
    data_energy=np.vstack(data_energy)
    data_energy = data_energy[data_energy[:,1].argsort()]
    data_BAM=np.vstack(data_BAM)
    data_BAM = data_BAM[data_BAM[:,1].argsort()]
    gc.collect()

    df = centroided_df_list[run_number]
    df_energy = pd.DataFrame(data_energy,columns =['Energy','BID'])
    df_BAM = pd.DataFrame(data_BAM,columns =['BAM','BID'])
    
    # Inner merges: only BIDs present in the energy data, the BAM data AND the ion data survive.
    BID_filtered_df = df_energy.merge(df_BAM,on = 'BID').merge(df,on = 'BID')
    BID_filtered_df['delay'] = BID_filtered_df['delay'] * 0.001 # just converts delay to picoseconds (ps)
    BID_filtered_df['Jitter_delay'] = BID_filtered_df['delay'] - BID_filtered_df['BAM']
    #BID_filtered_df['Delay_round'] = BID_filtered_df.Jitter_delay.mul(2).round(1).div(2) # rounds BAM-corrected delay to nearest 0.05 ps (as given by delay step size in logbook) - indene, fluorene
    # Round to nearest 0.1 (for CPP). The previous expression, .mul(2).round(1).div(1), doubled every
    # delay value because the factor of 2 was never divided out again.
    BID_filtered_df['Delay_round'] = BID_filtered_df['Jitter_delay'].round(1)
    print(np.unique(BID_filtered_df['Delay_round']))

    # Getting rid of columns which aren't needed ('size' and 'spread' are kept)
    BID_filtered_df = BID_filtered_df.drop(columns=['BAM', 'delay', 'Jitter_delay'])

    print(BID_filtered_df)
    BID_filtered_df_list.append(BID_filtered_df)
 

In [None]:
# 3. (For checking if BID assignment in each settings file is correct): Plotting number of ions in shot (ie, BID 'intensity')
#    against pulse energy assigned to that shot. If a clear positive trend is seen, the settings files are good and don't need
#    correcting. However, if no correlation is seen, the settings file needs to be corrected (see below - eventually). (JHM)

# N.B: Now scatter plots as intensity heatmaps instead.

directory=os.chdir(r"/home/merrickj/Documents/indene_BID_filter_check_plots")
#directory=os.chdir(r"/home/merrickj/Documents/fluorene_BID_filter_check_plots")
#directory=os.chdir(r"/home/merrickj/Documents/CPP_BID_filter_check_plots")

import matplotlib.pyplot as plt 

for df,file in zip(BID_filtered_df_list,file_list):
    
    ions_in_shot_df = df.groupby(['BID'], sort=False).size().reset_index(name='ions_in_shot') # produces dataframe containing BID against ion count; preserves order of BIDs
    ions_in_shot_list = ions_in_shot_df['ions_in_shot'].to_list()
    BID_list = ions_in_shot_df['BID'].to_list()
    print(f"Length of ions_in_shot_list list: {len(ions_in_shot_list)}")
    
    # Energy of the first row of every BID, aligned to BID_list. A single pass over the dataframe
    # replaces one full-dataframe scan per BID.
    first_energy_per_BID = df.drop_duplicates(subset='BID').set_index('BID')['Energy']
    FEL_pulse_energy_per_shot = first_energy_per_BID.reindex(BID_list).to_list()
        
    print(f"Length of FEL_pulse_energy_per_shot list: {len(FEL_pulse_energy_per_shot)}")
    
    ions_in_shot_list_copy = list(ions_in_shot_list) # saves copies for loop iterations below
    FEL_pulse_energy_per_shot_copy = list(FEL_pulse_energy_per_shot)
    
    # Plotting code #
    fig, (ax1,ax2,ax3) = plt.subplots(nrows = 1, ncols = 3, figsize = (18,8), sharey = True)
    
    # Style every panel (plt.tick_params only acted on the last one).
    for panel in (ax1, ax2, ax3):
        panel.set_xlabel('FEL pulse energy / a.u.', fontsize=14)
        panel.tick_params(axis='both', labelsize=12)
        panel.grid(which='major')
        panel.grid(which='minor')
        panel.minorticks_on()
    ax1.set_ylabel('Ions in shot', fontsize=14) # y axis is shared between the panels
    
    fig.suptitle(f"Ions in shot versus FEL pulse energy for file: {file}", fontsize = 16) # title at top of graph
    
    BID_shift_list = [-1,0,1] # BID shifts list for subplot
    
    # hist2d takes (x, y): the pulse energy goes on x and the ion count on y so that the data match
    # the axis labels (they were passed the other way round before).
    for BID_shift in BID_shift_list:
        
        ions_in_shot_list = ions_in_shot_list_copy # resets list for second and third loop iterations
        FEL_pulse_energy_per_shot= FEL_pulse_energy_per_shot_copy # resets list for second and third loop iterations
        
        if BID_shift == -1:

            # for shifting BID by -1; deletes first entry in ions_in_shot_list and last entry in FEL_pulse_energy_per_shot list
            h1 = ax1.hist2d(FEL_pulse_energy_per_shot[:-1], ions_in_shot_list[1:], bins=(100, 100), cmap=plt.cm.Blues)
            
        elif BID_shift  == 0:
            
            # no BID shift, so no need to splice
            h2 = ax2.hist2d(FEL_pulse_energy_per_shot, ions_in_shot_list, bins=(100, 100), cmap=plt.cm.Blues)
        
        elif BID_shift == 1:

            # for shifting BID by +1; delete last entry in ions_in_shot_list and first entry in FEL_pulse_energy_per_shot list
            h3 = ax3.hist2d(FEL_pulse_energy_per_shot[1:], ions_in_shot_list[:-1], bins=(100, 100), cmap=plt.cm.Blues)
        
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.15)
    
    ax1.set_title(f" BID shift = {BID_shift_list[0]}", fontsize = 14, loc = 'left')
    ax2.set_title(f" BID shift = {BID_shift_list[1]}", fontsize = 14, loc = 'left')
    ax3.set_title(f" BID shift = +{BID_shift_list[2]}", fontsize = 14, loc = 'left')
    
    # hist2d returns (counts, xedges, yedges, image); element 3 is the mappable for the colorbar.
    cbar1 = plt.colorbar(h1[3], ax = ax1, orientation = 'horizontal')
    cbar1.set_label('Intensity / counts ', fontsize = 14)
    cbar2 = plt.colorbar(h2[3], ax = ax2, orientation = 'horizontal')
    cbar2.set_label('Intensity / counts ', fontsize = 14)
    cbar3 = plt.colorbar(h3[3], ax = ax3, orientation = 'horizontal')
    cbar3.set_label('Intensity / counts ', fontsize = 14)
    
    fig.savefig(f"run_{file}_2Dhistogram_BIDshifts.png",bbox_inches="tight")
    
    plt.show()

In [None]:
# 4. Plotting FEL pulse intensity (in arbitrary units, but close to uJ) histogram. This then allows us to define a suitable FEL
#    energy/intensity range to filter by when plotting ion-yields as a function of pump-probe delay time. (JHM)

import matplotlib.pyplot as plt

# Per-run statistics of the FEL pulse intensity and the mu +/- 2*sigma limits derived from them.
mu_list, std_list = [], []
FEL_lower_list, FEL_higher_list = [], []

for run_number, BID_filtered_df in enumerate(BID_filtered_df_list): # just looping over every BID-filtered dataframe in list

    # NOTE(review): BID_filtered_df appears to hold one row per ion, so shots with many ions are
    # counted several times in the histogram and in mu/sigma - confirm this weighting is intended.
    energies = BID_filtered_df['Energy'].to_numpy()

    fig, ax = plt.subplots(figsize = (10,8))
    ax.hist(energies, bins = 100)
    for grid_kind in ('major', 'minor'):
        ax.grid(which=grid_kind)
    ax.minorticks_on()
    ax.set_xlabel('FEL pulse intensity / a.u. ', fontsize=12)
    ax.set_ylabel('Abundance / a.u. ', fontsize=12)
    ax.set_title(f"FEL pulse intensity histogram (run number {file_list[run_number]})", fontsize = 16)

    directory=os.chdir(r"/home/merrickj/Documents/indene_BID_filter_check_plots")
    #directory=os.chdir(r"/home/merrickj/Documents/fluorene_BID_filter_check_plots")
    #directory=os.chdir(r"/home/merrickj/Documents/CPP_BID_filter_check_plots")

    # Both statistics are rounded to 2 d.p. for ease of comparison between runs.
    mu_FEL = round(energies.mean(), 2)
    print(f"Mean calculated for run number {file_list[run_number]}: {mu_FEL}")
    std_FEL = round(energies.std(), 2)
    print(f"Standard deviation calculated for run number {file_list[run_number]}: {std_FEL}")

    # Statistics box in the top-left corner of the histogram.
    textstr = '\n'.join((
    r'$\mu=%.2f$' % (mu_FEL, ),
    r'$\sigma=%.2f$' % (std_FEL, )))
    props = {'boxstyle': 'round', 'facecolor': 'white', 'alpha': 0.5}
    ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=14,
            verticalalignment='top', bbox=props)

    answer = input("Do you want to save the FEL histogram as a .png file (y/n)?")
    if answer in ('y', 'Y', 'yes', 'Yes', 'YES'):
        fig.savefig(f"FEL_histogram_{file_list[run_number]}.png", bbox_inches="tight")

    # Filter limits used by the FEL filtering step.
    FEL_lower = mu_FEL - (2*std_FEL) # mu - 2*sigma
    FEL_higher = mu_FEL + (2*std_FEL) # mu + 2*sigma

    mu_list.append(mu_FEL)
    std_list.append(std_FEL)
    FEL_lower_list.append(FEL_lower)
    FEL_higher_list.append(FEL_higher)


In [None]:
# 5. Filtering BID-filtered data by FEL pulse intensity (removing rows whose FEL pulse intensity is more than ~ 2 sigma away
#    from the mean FEL pulse intensity, found roughly in the centre of the histogram above). (JHM)

FEL_BID_filtered_df_list = []

# Keep, for every run, only the rows whose 'Energy' lies inside that run's [FEL_lower, FEL_higher] limits.
for run_number, BID_filtered_df in enumerate(BID_filtered_df_list): # just looping over every BID-filtered dataframe in list

    FEL_lower, FEL_higher = FEL_lower_list[run_number], FEL_higher_list[run_number]

    # Series.between includes both end points by default.
    within_FEL_limits = BID_filtered_df['Energy'].between(FEL_lower, FEL_higher)
    FEL_BID_filtered_df = BID_filtered_df[within_FEL_limits]
    FEL_BID_filtered_df_list.append(FEL_BID_filtered_df)

    print(FEL_BID_filtered_df)

In [None]:
# 6a. Plot time-of-flight spectra for each run on same plot
#    Adapted from: "on the fly ion images + delay + rdf NOT CENTROIDED v6 EMW C4H4Se.ipynb"

# One shared axes; each run contributes one line (ion counts per ToF value).
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_xlabel('Time-of-flight / a.u.', fontsize=14)
ax.tick_params(axis='x', labelsize=12)
ax.set_ylabel('Intensity / a.u.', fontsize=14)
ax.tick_params(axis='y', labelsize=12)
ax.set_xlim(2050,2500) # alter as necessary for molecule of interest
for grid_kind in ('major', 'minor'):
    ax.grid(which=grid_kind)
ax.minorticks_on()
tof_appended = []   # per-run ToF/intensity tables, in file_list order

#for df,file in zip(FEL_BID_filtered_df_list,file_list):
for df,file in zip(centroided_df_list,file_list):
    ax.set_title('Run-separated ToF spectrum')
    in_tof_window = df['ToF'].gt(2000) & df['ToF'].lt(2500)   # strict bounds on both sides
    df = df[in_tof_window]
    tof = df.groupby(['ToF']).size().reset_index(name='intensity')
    tof_times_list = tof['ToF'].to_list()
    tof_appended.append(tof)
    ax.plot(tof['ToF'], tof['intensity'], label=str(file))
    ax.legend(loc='upper right', fontsize=18)

plt.show()

#fig.savefig(f"indene_run_separated_tof_spectrum.png", bbox_inches="tight")
#fig.savefig(f"fluorene_run_separated_tof_spectrum.png", bbox_inches="tight")
#fig.savefig(f"CPP_run_separated_tof_spectrum.png", bbox_inches="tight")
fig.savefig(f"fluorene_run_separated_tof_spectrum.png", bbox_inches="tight")

In [None]:
# 6b. Calibration of TOF spectrum to convert from TOF-domain to m/z domain (user input for predicted m/z values and corresponding
#     TOF values from previous TOF spectrum assignment) via linear-regression. Add extra column to df for m/z values using linear
#     regression parameters obtained, and plot m/z-domain spectrum.
#     Adapted from: "on the fly ion images + delay + rdf NOT CENTROIDED v6 EMW C4H4Se.ipynb"

# For FLU - the uncommented calibration points work well so do not change!
# ToF_list = [(2275,2285),(2380,2400),(2423,2426),(2217,2223),(2248,2252),(2300,2310),(2458,2459)] # known TOF windows for ions seen in TOF spectrum above - more windows = better calib.!
# ion_list=['C4', 'C9', 'C11','C2','C3','C5','$C_{13}H_{10}^{+}$']
# mz_arr=np.array([51,116,140,27,39,64,166]) # approximate m/z values we would expect for above ions
# ToF_list = [(2085,2087),(2117,2219),(2190,2192),(2192,2194),(2195,2197),(2219,2221),(2249,2251),(2279,2281),(2289,2291),(2304,2306),(2317,2319),(2324,2326),(2339,2341),(2347,2349),(2366,2368),(2391,2393),(2422,2424),(2456.5,2457.5),(2458,2458.5),(2458,2460)]
# ion_list_total = ['$H^{+}$','$He^{+}$','$O^{+}$','$OH^{+}$','$H_{2}O^{+}$','C2','C3','C4', '$C_{13}H_{9}^{3+}$','C5','$C_{11}H_{7}^{2+}$','C6','$C_{13}H_{x}^{2+}$', 'C8', 'C9','C10', 'C11','$C_{13}H_{7}^{+}$','$C_{13}H_{9}^{+}$','$C_{13}H_{10}^{+}$']
# mz_arr = np.array([1,4,16,17,18,27,39,51,55,64,70,76,82.5,100,116,128,140,163,165,166])

# for fluorene, 2018 data
# ToF_list = [(2085,2087),(2458,2460)]
# ion_list = ['$H^{+}$','$C_{13}H_{10}^{+}$']
# mz_arr = np.array([1,166])

# for triphenylene, 2021 data 
#ToF_list = [(1009,1011),(1312,1314)]
#ion_list = ['$H^{+}$','$C_{18}H_{12}^{+}$']
#mz_arr = np.array([1,228])

# for fluorene, 2023 data
# ToF_list = [(1613, 1616),(1617, 1621),(1631, 1634),(1643, 1646),(1650, 1652), (1657, 1662),(1662,1671),(1670, 1673), (1679,1683),(1688,1695),(1699,1705), (1708,1714), (1716,1723), (1725,1735),(1734,1741),(1743,1750), (1749,1755),(1756,1765),(1770,1776),(1783,1793), (1794,1803), (1810,1816),(1831,1841)]
# ion_list = ['N(++)','O(++)','C2(++)','O(+)','water(+)','carbon_dioxide(++)','C2(+)','dinitrogen(+)','dioxygen(+)','C3(+)','carbon_dioxide(+)','C4(+)','PAH(+++)','C5(+)','C11(++)','C6(+)','PAH(++)','C7(+)','C8(+)','C9(+)','C10(+)','C11(+)','PAH-H(+)']
# mz_arr = np.array([7,8,12,16,18,22,26,28,32,38,44,50,55.5,63,70,75,83,88,100,116,127,141,165])

# for indene, 2023 data 
#ToF_list = [(1789,1791),(1760,1763),(1743,1746),(1728,1730),(1721,1723),(1711,1713),(1700,1702),(1691,1693)]
#ion_list = ['indene(+)','C7(+)','C6(+)','C5(+)','indene(2+)','C4(+)','C7(2+)','C3(+)']
#mz_arr = np.array([116.2,87,75,62,58.1,50,43.5,38])

# for fluorene, 2023 data
# ToF_list = [(1831,1840),(1820,1830),(1810,1818),(1798,1802),(1780,1792),(1770,1778),(1755,1765),(1750,1752),(1742,1749),(1735,1740),(1725,1733),(1720,1722),(1707,1715),(1699,1704),(1685,1698),(1662,1672),(1630,1633)]
# ion_list = ['FLU(+)','C12(+)','C11(+)','C10(+)','C9(+)','C8(+)','C7(+)','FLU(++)','C6(+)','C11(++)','C5(+)','FLU(+++)/C9(++)','C4(+)','C3(+)','C7(++)/FLU(++++)','C2(+)','C1(+)']
# mz_arr = np.array([166,150,137,125,113,100,87,83,75,68,62,55,50,38,42,26,13])

# for CPP, 2023 data
ToF_list = [(1631,1636),(1665,1671),(1686,1694),(1701,1707),(1710,1715),(1718,1722),(1725,1733),(1736,1740),(1742,1749),(1752,1754),(1757,1765),(1766,1770),(1772,1778),(1780,1791),(1795,1805),(1806,1815),(1820,1828),(1830,1838),(1842,1850),(1851,1860),(1864,1869)]
ion_list = ['C1','C2','C3','CPP(4+)','C4','C9(2+)','C5','C11(2+)','C6','C13(2+)','C7','CPP(2+)','C8','C9','C10','C11','C12','C13','C14','CPP(+)','[CPP-dinitrogen](+)']
mz_arr = np.array([12,26,38,47.56,50,56.5,62,68,74,81.5,87,95.12,98,113,125,136,151,163,176,190.24,204])

print("m/z array size: " + str(mz_arr.size))
print("ToF array size: " + str(len(ToF_list)))
print("ion array size: " + str(len(ion_list)))

# The three sequences are matched up by position, so a length mismatch means the
# calibration points are misaligned; stop here rather than fit against wrong pairs.
if not (mz_arr.size == len(ToF_list) == len(ion_list)):
    raise ValueError("ToF_list, ion_list and mz_arr must all have the same length")

# Linear regression of sqrt(m/z) against the centre of each ToF window
t_list = [(ti+tf)/2 for (ti,tf) in ToF_list]

fig,ax = plt.subplots(figsize=(10,4))
sqmz_arr = np.sqrt(mz_arr)
ax.scatter(t_list, sqmz_arr, marker='x', color='red', s=150)

fit = np.polyfit(t_list,  sqmz_arr, deg=1)  # sqrt(m/z) = fit[0]*ToF + fit[1]
fit2 = np.polyfit(sqmz_arr, t_list, deg=1)  # inverse relation: ToF = fit2[0]*sqrt(m/z) + fit2[1]
print('ToF fitted params:')
print(fit)
print(fit2)

# Draw the fitted line across the plotted ToF span only (two end points define a straight line).
tof_span = np.array([min(t_list)-100, max(t_list)+100])
ax.plot(tof_span, fit[0]*tof_span+fit[1], color='blue', linewidth=2)
ax.set_xlim(tof_span[0], tof_span[1])
ax.set_ylim(min(sqmz_arr)-1, max(sqmz_arr)+1)
ax.set_xlabel('ToF / a.u.', fontsize=14)
# Raw string: '\s' is not a valid Python escape sequence, so the mathtext must not be
# written in a normal string literal.
ax.set_ylabel(r'${\sqrt{m/z}}$', fontsize=14)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)

#plt.title('Indene - time-of-flight mass calibration', fontsize=16)
#plt.title('Fluorene - time-of-flight mass calibration', fontsize=16)
ax.set_title('CPP - time-of-flight mass calibration', fontsize=16)

#fig.savefig(f"indene_mass_calibration.png", bbox_inches="tight")
#fig.savefig(f"fluorene_mass_calibration.png", bbox_inches="tight")
fig.savefig(f"CPP_mass_calibration.png", bbox_inches="tight")

plt.show()

# Spot checks: calibrated m/z at a few hand-picked ToF values.
print(f"H+ calibrated m/z: {(fit[0]*1580 + fit[1])**2}")
print(f"He+ calibrated m/z: {(fit[0]*1602 + fit[1])**2}")
print(f"{(fit[0]*1856 + fit[1])**2}")

# Append a calibrated 'm_z' column (rounded to the nearest 0.5) to every filtered frame.
# assign() returns a new frame, which is stored back into the list; this avoids writing
# into a filtered slice and makes the cell safe to re-run.
for run_index, df in enumerate(FEL_BID_filtered_df_list):

    m_z = (fit[0]*df['ToF'] + fit[1])**2
    df = df.assign(m_z=(2*m_z).round()/2)
    FEL_BID_filtered_df_list[run_index] = df
    print(df)

In [None]:
# 6c. Plot time-of-flight spectra for each run on same plot, this time in m/z domain. This plot can then be used to identify
#    more fragments of interest (to then cross-reference their ToF values in initial ToF spectrum). (JHM)

# One figure holding the mass-calibrated spectrum (ion count per m/z value) of every run.
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title('Run-separated ToF spectrum (mass-calibrated)')
ax.set_xlabel('m/z', fontsize=14)
ax.set_ylabel('Intensity / a.u.', fontsize=14)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.set_xlim(0,210) # alter as necessary for molecule of interest
ax.grid(which='major')
ax.grid(which='minor')
ax.minorticks_on()
tof_appended = [] # per-run m/z histograms (columns: 'm_z', 'intensity')

for df,file in zip(FEL_BID_filtered_df_list,file_list):
    # Restrict to the plotted m/z range before histogramming.
    df = df[(df['m_z']>0) & (df['m_z']<210)]
    # Number of ions recorded at each distinct (0.5-rounded) m/z value.
    tof = df.groupby(['m_z']).size().reset_index(name='intensity')
    ax.plot(tof.m_z,tof.intensity,label=str(file))
    tof_appended.append(tof)

# Build the legend once, after all runs are drawn (and only if there is something to label).
if tof_appended:
    ax.legend(loc='upper right', fontsize=18)

plt.show()

#fig.savefig(f"indene_run_separated_tof_spectrum_m_z.png", bbox_inches="tight")
#fig.savefig(f"fluorene_run_separated_tof_spectrum_m_z.png", bbox_inches="tight")
fig.savefig(f"CPP_run_separated_tof_spectrum_m_z.png", bbox_inches="tight")

In [None]:
# 7. Normalising delay bins by number of shots in each bin. For each delay, we want to find how many unique tIds there are, and
#     normalise by that amount. This produces fully corrected ion-yield plots - no more processing needs to be done. (JHM)

# indene, 2023 data
#ToF_list = [(1632,1633),(1665,1675),(1692,1694),(1700,1702),(1710,1712),(1720,1722),(1728,1730),(1743,1746),(1760,1762),(1789,1791)]
#ion_list = ['C1','C2','C3','C7(2+)','C4','IND(2+)','C5','C6','C7','IND(+)']

# fluorene, 2023 data
# ToF_list = [(1831,1840),(1820,1830),(1810,1818),(1798,1802),(1780,1792),(1770,1778),(1755,1765),(1750,1752),(1742,1749),(1735,1740),(1725,1733),(1720,1722),(1707,1715),(1699,1704),(1685,1698),(1662,1672),(1630,1633)]
# ion_list = ['FLU(+)','C12(+)','C11(+)','C10(+)','C9(+)','C8(+)','C7(+)','FLU(++)','C6(+)','C11(++)','C5(+)','FLU(+++)/C9(++)','C4(+)','C3(+)','C7(++)/FLU(++++)','C2(+)','C1(+)']

# CPP, 2023 data
# Each entry pairs a ToF window (ti, tf) with the label of the ion assigned to it;
# the two parallel lists used by the cells below are derived from these pairs.
_CPP_2023_ION_WINDOWS = [
    ((1631,1636), 'C1'),
    ((1665,1671), 'C2'),
    ((1686,1694), 'C3'),
    ((1701,1707), 'CPP(4+)'),
    ((1710,1715), 'C4'),
    ((1718,1722), 'C9(2+)'),
    ((1725,1733), 'C5'),
    ((1736,1740), 'C11(2+)'),
    ((1742,1749), 'C6'),
    ((1752,1754), 'C13(2+)'),
    ((1757,1765), 'C7'),
    ((1766,1770), 'CPP(2+)'),
    ((1772,1778), 'C8'),
    ((1780,1791), 'C9'),
    ((1795,1805), 'C10'),
    ((1806,1815), 'C11'),
    ((1820,1828), 'C12'),
    ((1830,1838), 'C13'),
    ((1842,1850), 'C14'),
    ((1851,1860), 'CPP(+)'),
    ((1864,1869), '[CPP-dinitrogen](+)'),
]
ToF_list = [window for window, _label in _CPP_2023_ION_WINDOWS]
ion_list = [label for _window, label in _CPP_2023_ION_WINDOWS]

def TOF_filter(df, ti, tf): # function from "centroid files with delay.ipynb"
    """Return the rows of ``df`` whose 'ToF' value lies in the closed window [ti, tf].

    df: DataFrame with a 'ToF' column.
    ti, tf: lower and upper window edges; both edges are included.
    """
    inside_window = df['ToF'].between(ti, tf)  # inclusive at both ends by default
    return df[inside_window]

# Working directory for the ion-yield output; set once, as it does not depend on the run.
#directory=os.chdir(r"/home/merrickj/Documents/indene_ion_yields")
#directory=os.chdir(r"/home/merrickj/Documents/fluorene_ion_yields")
directory=os.chdir(r"/home/merrickj/Documents/CPP_ion_yields")

for run_number, FEL_BID_filtered_df in enumerate(FEL_BID_filtered_df_list): # loop over run numbers

    # Number of distinct BIDs (i.e. shots) recorded at each rounded delay value,
    # obtained in a single grouped pass; the result is ordered by delay.
    shots_per_delay = FEL_BID_filtered_df.groupby('Delay_round')['BID'].nunique()
    round_delay_array = shots_per_delay.index.tolist() # unique delay values in run number data
    no_shots_per_round_delay = shots_per_delay.tolist() # number of shots per delay value

    # print(round_delay_array)
    # print(no_shots_per_round_delay)
    # print(len(round_delay_array))
    # print(len(no_shots_per_round_delay))

    fig, ax = plt.subplots(figsize = (10,8))
    ax.bar(round_delay_array,no_shots_per_round_delay, width = 0.09, align = 'center') # for ALL ions
    ax.set_xlabel('Delay / ps')

    # This plot is made once per run, so the title names the run it belongs to.
    ax.set_title(f"Delay v.s. number of shots (file: {file_list[run_number]})", fontsize = 16)

    ax.set_ylabel('Number of shots')
    ax.grid(which='major')
    ax.grid(which='minor')
    ax.minorticks_on()
    #fig.savefig(f"indene_delay_intensity_bargraph",bbox_inches="tight")
    plt.show() # display now so the figure is released instead of accumulating

    # Lookup table used to normalise each delay bin by its number of shots.
    df_delay_vs_no_shots = pd.DataFrame()
    df_delay_vs_no_shots['Delay_round'] = round_delay_array
    df_delay_vs_no_shots['No_of_shots'] = no_shots_per_round_delay
    # print(df_delay_vs_no_shots)

    for i, (ti, tf) in enumerate(ToF_list): # plotting ion yields for separate run numbers
        ion = ion_list[i]
        FEL_BID_filtered_df_filt = TOF_filter(FEL_BID_filtered_df, ti, tf) # for dual-filtered data; filter by ToF
        # Raw ion count per delay, then divided by the number of shots at that delay.
        ion_yield = FEL_BID_filtered_df_filt.groupby(['Delay_round']).size().reset_index(name='ion_yield')
        ion_yield = ion_yield.merge(df_delay_vs_no_shots,on = 'Delay_round')
        ion_yield['delay_normalised_yield'] = ion_yield['ion_yield']/ion_yield['No_of_shots']
    #     print(ion_yield)

        fig, ax = plt.subplots(figsize = (10,8))
        ax.scatter(ion_yield.Delay_round, ion_yield.delay_normalised_yield)

        ax.set_xlabel('Delay / ps')
        ax.set_title(f"Ion-yield for {ion} (file: {file_list[run_number]})", fontsize=16)
        ax.set_ylabel('Ion-yield / a.u.')

        #fig.savefig(f"ion_yield_{i}_run_{file_list[run_number]}",bbox_inches="tight")
        plt.show() # one figure per ion per run: show and release each as it is made

    

In [None]:
# 8. Saving filtered and processed data as .npy arrays, and concatenating relevant run numbers to make a larger .npy file. (JHM)

#directory=os.chdir(r"/home/merrickj/Documents/indene_filtered_data_for_covariance")
#directory=os.chdir(r"/home/merrickj/Documents/fluorene_filtered_data_for_covariance")
#directory=os.chdir(r"/home/merrickj/Documents/CPP_filtered_data_for_covariance")

directory=os.chdir(r"/home/merrickj/Documents/fluorene_2018_all")

# Save every run as its own .npy file (named after the run) in the directory set above.
# NOTE: to_numpy() drops the column names; only the values are stored.
#for run_number, run_df in enumerate(FEL_BID_filtered_df_list):
for run_number, run_df in enumerate(centroided_df_list):
    print(run_df)
    np.save(f"{file_list[run_number]}.npy", run_df.to_numpy())
    print(f"Saved run number {file_list[run_number]} to numpy array")

# Built unconditionally: the next cell reads full_data_centroided, so it must exist
# even when the concatenated array is not written to disk.
#full_data_centroided = pd.concat(FEL_BID_filtered_df_list, axis=0) # makes one large dataframe
full_data_centroided = pd.concat(centroided_df_list, axis=0) # makes one large dataframe

concatenate_answer_centroided = input("Do you wish to concatenate all files to make one large .npy file (y/n)?")

if concatenate_answer_centroided in ['y', 'Y', 'yes', 'Yes', 'YES']:

    print(full_data_centroided)

    full_data_centroided_array = full_data_centroided.to_numpy()

    np.save(f"fluorene_full_data_centroided_uncorrected.npy",full_data_centroided_array)
    #np.save(f"fluorene_full_data_centroided_corrected.npy",full_data_centroided_array)
    #np.save(f"CPP_full_data_centroided_corrected.npy",full_data_centroided_array)

    print(f"Saved numpy array of concatenated data.")

In [None]:
# 9. From the concatenated data file, print run-concatenated ion-yield v.s. delay plots (JHM)

# Number of distinct BIDs (i.e. shots) at each rounded delay value in the concatenated
# data, obtained in a single grouped pass; the result is ordered by delay.
shots_per_delay = full_data_centroided.groupby('Delay_round')['BID'].nunique()
round_delay_array = shots_per_delay.index.tolist() # unique delay values in run number data
no_shots_per_round_delay = shots_per_delay.tolist() # number of shots per delay value

print(f"Total number of unique BIDs (total number of shots) in concatenated dataframe: {full_data_centroided['BID'].nunique()}")
for delay, number_of_shots in zip(round_delay_array, no_shots_per_round_delay):
    print(f"Number of shots for delay {delay}: {number_of_shots}")

# print(round_delay_array)
# print(no_shots_per_round_delay)
# print(len(round_delay_array))
# print(len(no_shots_per_round_delay))

fig, ax = plt.subplots(figsize = (10,8))
ax.bar(round_delay_array,no_shots_per_round_delay, width = 0.04, align = 'center') # for ALL ions
ax.set_xlabel('Delay / ps')

#ax.set_title(f"Delay v.s. number of shots (run-concatenated) - indene", fontsize = 16)
#ax.set_title(f"Delay v.s. number of shots (run-concatenated) - fluorene", fontsize = 16)
ax.set_title(f"Delay v.s. number of shots (run-concatenated) - CPP", fontsize = 16)

ax.set_ylabel('Number of shots')
ax.grid(which='major')
ax.grid(which='minor')
ax.minorticks_on()

# NOTE: this figure is written to whatever the working directory is at this point;
# the chdir to the ion-yield directory only happens further down.
#fig.savefig(f"indene_delay_intensity_bargraph.png",bbox_inches="tight")
#fig.savefig(f"fluorene_delay_intensity_bargraph.png",bbox_inches="tight")
fig.savefig(f"CPP_delay_intensity_bargraph.png",bbox_inches="tight")
plt.show() # display now so the figure is released before the per-ion figures are made

# Lookup table used to normalise each delay bin by its number of shots.
df_delay_vs_no_shots = pd.DataFrame()
df_delay_vs_no_shots['Delay_round'] = round_delay_array
df_delay_vs_no_shots['No_of_shots'] = no_shots_per_round_delay
# print(df_delay_vs_no_shots)

#directory=os.chdir(r"/home/merrickj/Documents/indene_ion_yields")
#directory=os.chdir(r"/home/merrickj/Documents/fluorene_ion_yields")
directory=os.chdir(r"/home/merrickj/Documents/CPP_ion_yields")

for i, (ti, tf) in enumerate(ToF_list): # one shot-normalised ion-yield plot per ToF window
    ion = ion_list[i]
    FEL_BID_filtered_df_filt = TOF_filter(full_data_centroided, ti, tf) # for dual-filtered data; filter by ToF
    # Raw ion count per delay, then divided by the number of shots at that delay.
    ion_yield = FEL_BID_filtered_df_filt.groupby(['Delay_round']).size().reset_index(name='ion_yield')
    ion_yield = ion_yield.merge(df_delay_vs_no_shots,on = 'Delay_round')
    ion_yield['delay_normalised_yield'] = ion_yield['ion_yield']/ion_yield['No_of_shots']

    delay_round_to_plot = ion_yield['Delay_round'].to_list()
    delay_normalised_yield_to_plot = ion_yield['delay_normalised_yield'].to_list()

    fig, ax = plt.subplots(figsize = (10,8))
    # NOTE(review): the two largest delay values are left out of the plot; the reason is
    # not recorded in this cell - confirm this is still wanted for the current data set.
    ax.scatter(delay_round_to_plot[:-2], delay_normalised_yield_to_plot[:-2])

    ax.set_xlabel('Delay / ps')

    #ax.set_title(f"Ion-yield v.s. pump-probe delay for {ion} (run-concatenated) - indene", fontsize=16)
    #ax.set_title(f"Ion-yield v.s. pump-probe delay for {ion} (run-concatenated) - fluorene", fontsize=16)
    ax.set_title(f"Ion-yield v.s. pump-probe delay for {ion} (run-concatenated) - CPP", fontsize=16)

    ax.set_ylabel('Ion-yield / a.u.')
    #ax.set_xlim(92.7,94.0)

    fig.savefig(f"ion_yield_{i}_runconcatenated_total",bbox_inches="tight")
    plt.show() # one figure per ion: show and release each as it is made



In [None]:
# 10a. (Optional) for saving concatenated uncentroided data (JHM)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import os

def read_bin_torder(filename):
    """Read one raw run (<filename>.bin plus <filename>-settings.txt) into a single array.

    The .bin file is read as a sequence of frames: two little-endian ints (m, n)
    followed by m*n little-endian uint16 values, reshaped to (m, n). Frames are paired,
    in order, with the rows of the settings file; reading stops when either runs out.

    Three columns are appended to every non-empty frame:
      * a running frame counter (1 for the first frame read; empty frames still
        advance the counter),
      * the BID taken from settings column 0,
      * the delay taken from settings column 3, multiplied by 1000 and cast to int.
    Each frame is then sorted by its third column (index 2), which the caller labels 'ToF'.

    Side effect: changes the working directory to the raw-data directory.

    Returns the vertically stacked frames as an int64 array.
    Raises ValueError if no non-empty frame could be read.
    """

    directory=os.chdir(r"/asap3/flash/gpfs/bl1/2023/data/11013421/raw/PImMS/")

    settings_path = str(filename)+'-settings.txt'
    # np.atleast_1d: loadtxt squeezes a single-row file down to a 0-d array, which
    # could not be iterated over below.
    # NOTE(review): astype(int) truncates, so a delay such as 93.3 may become 93299
    # rather than 93300 after the multiplication - confirm this is acceptable.
    delays = np.atleast_1d(1000*np.loadtxt(settings_path, usecols=[3])).astype(int) # delay value per shot
    BID = np.atleast_1d(np.around(np.loadtxt(settings_path, usecols=[0]),3)).astype(int) # BID value per shot

    xyt_data = []
    lasershot=0

    # 'with' guarantees the file is closed even if an unexpected error is raised.
    with open(str(filename)+'.bin', 'rb') as file:
        for d,b in zip(delays,BID):

            try:
                m, n = np.fromfile(file, dtype='<i', count=2)
                frame = np.reshape(np.fromfile(file, dtype='<u2', count=m*n), (m, n))
            except ValueError:
                # End of file (header could not be unpacked) or a truncated frame.
                break

            lasershot += 1
            if len(frame)>0:
                n_rows = len(frame)
                # int64 tag columns: an int16 column cannot hold counters above 32767
                # or typical BID values.
                tag_columns = np.column_stack((
                    np.full(n_rows, lasershot, dtype=np.int64), # counter-generated tag IDs (starting from 1)
                    np.full(n_rows, b, dtype=np.int64),         # settings file BIDs
                    np.full(n_rows, d, dtype=np.int64),         # delay values
                ))
                frame = np.hstack((frame.astype(np.int64), tag_columns))
                frame = frame[frame[:,2].argsort()] # orders by time-of-flight
                xyt_data.append(frame)

    if not xyt_data:
        raise ValueError(f"No non-empty frames could be read from {filename}.bin")
    return np.vstack(xyt_data)

#file_list=['222','223','224','226','227'] # run numbers for indene 2023
#file_list = ['147','148','150','151'] # run numbers for fluorene 2023
file_list = ['129','130'] # run numbers for CPP 2023

# NOTE(review): this list receives the raw (uncentroided) frames read below; it presumably
# reuses the name centroided_df_list so that the earlier cells can be re-run on this
# data unchanged - confirm.
centroided_df_list = []

for f in file_list:
    print('opening file '+ str(f))
    start_time = time.time()
    # One row per recorded event; the last three columns are the tags added by read_bin_torder.
    xyt_df = pd.DataFrame(read_bin_torder(f), columns=['x','y','ToF','counter_tagID','BID','delay'])
    print(xyt_df)
    centroided_df_list.append(xyt_df)
    print("Opening took %s seconds" % round((time.time()-start_time),3))

In [None]:
### Then run cells 2, 4 and 5 to get BID- and FEL-filtered data ###

In [None]:
# 10b. (Optional) for saving concatenated uncentroided data (after BID- and FEL-filtering) (JHM)

#directory=os.chdir(r"/home/merrickj/Documents/indene_filtered_data_for_covariance")
#directory=os.chdir(r"/home/merrickj/Documents/fluorene_filtered_data_for_covariance")
directory=os.chdir(r"/home/merrickj/Documents/CPP_filtered_data_for_covariance")

full_data = pd.concat(FEL_BID_filtered_df_list, axis=0) # makes one large dataframe
print(full_data)

answer2 = input("Do you want to save just one frame's worth of data (y/n)?")

if answer2 in ['y', 'Y', 'yes', 'Yes', 'YES']: # for just for one BID/tId frame saving

    # Keep only the rows that share the BID of the very first row.
    first_tId = full_data['BID'].iloc[0]
    df_test = full_data[(full_data['BID'] == first_tId)]
    print(df_test)

    full_data_array = df_test.to_numpy()

    #np.save(f"indene_full_data_uncentroided_corrected_justoneBID.npy",full_data_array)
    #np.save(f"fluorene_full_data_uncentroided_corrected_justoneBID.npy",full_data_array)
    np.save(f"CPP_full_data_uncentroided_corrected_justoneBID.npy",full_data_array)

    print(f"Saved numpy array of concatenated data.")

elif answer2 in ['n', 'N', 'no', 'No', 'NO']:

    full_data_array = full_data.to_numpy()

    #np.save(f"indene_full_data_uncentroided_corrected.npy",full_data_array)
    #np.save(f"fluorene_full_data_uncentroided_corrected.npy",full_data_array)
    np.save(f"CPP_full_data_uncentroided_corrected.npy",full_data_array)

    print(f"Saved numpy array of concatenated data.")

else:
    # Any other reply used to fall through silently with nothing written to disk.
    print(f"Unrecognised answer {answer2!r}; nothing was saved.")