In [1]:
import numpy as np
import json
import os
import pandas as pd
# note: pyarrow is also required for saving to parquet in pandas

# Root folder of the data store, given relative to the current working directory.
# The metadata file is written below it as <data_path>/metadata/<ID>.json.
data_path = '../../../tap_data'

In [2]:
def init_pulse_iteration() -> dict:
    """Return a fresh template describing one pulse iteration.

    A new dict is built on every call, so the caller can overwrite any entry
    without affecting other templates. The defaults are 'amu' 40, 'gain' 8,
    'injected_time' 0, 'probe_time' 0 and 'type' 'inert'.
    """
    return {
        'amu': 40,
        'gain': 8,
        'injected_time': 0,
        'probe_time': 0,
        'type': 'inert',
    }

def init_metadata() -> dict:
    """Return a fresh experiment-metadata template filled with placeholders.

    A new dict is built on every call. The 'pulse_iteration' entry is a list
    holding a single default pulse-iteration template; callers replace it
    (and any other placeholder) with the values of their experiment.
    """
    return {
        'catalyst': '',
        'catalyst_amt_mg': 1,
        'catalyst_percent_wt': 1,
        'catalyst_zone_length_cm': 1,
        'creator': '',
        'date_created': 'YYYY-MM-DD',
        'injection_amt_nmol': 1,
        'ID': '0001',
        'name': 'test',
        'paper_DOI': 'none',
        'preparation_notes': '',
        'pulse_iteration': [init_pulse_iteration()],
        'reactor_length_cm': 1,
        'support': 'SiO2',
        'time_delta_s': 0.001,
    }
    

# pulse_iteration, pulse_index, and time_index all start at 0.
# This is especially important when dealing with time.
# Note that pulse_index, time_index, and pulse_iteration are all stored as integers
# for faster data queries and better compression.
# If the data were not collected at evenly spaced times, pass the actual times via the `time` argument.

def matrix2table(x:pd.DataFrame, sig_figs = 5, pulse_iteration = 0, time = None):
    """Reshape a wide flux matrix into a long table with one row per sample.

    Parameters
    ----------
    x : pd.DataFrame or 2-D array-like
        Flux values with one row per time step and one column per pulse.
    sig_figs : int
        Number of decimal places the flux is rounded to. The value is passed
        straight to ``round``, so despite its name it is not a count of
        significant figures.
    pulse_iteration : int
        Value written to the 'pulse_iteration' column of every row.
    time : 1-D array-like, optional
        One time value per row of ``x``. When given, these values are stored
        in the 'time_index' column as they are (no integer cast); otherwise
        the column holds the integer row positions 0..n-1.

    Returns
    -------
    pd.DataFrame
        Columns 'pulse_iteration', 'pulse_index', 'time_index' and 'flux',
        in that order, with ``n * p`` rows laid out pulse by pulse.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional, or if ``time`` does not provide
        exactly one value per row of ``x``.
    """
    values = np.asarray(x)
    if values.ndim != 2:
        raise ValueError(
            "x must be two-dimensional (time steps x pulses), got %d dimension(s)" % values.ndim
        )
    n, p = values.shape

    if time is None:
        time_index = np.tile(np.arange(n), p).astype(int)
    else:
        time_values = np.atleast_1d(np.asarray(time))
        if time_values.ndim != 1 or time_values.shape[0] != n:
            raise ValueError(
                "time must hold one value per row of x (expected %d, got shape %s)"
                % (n, time_values.shape)
            )
        time_index = np.tile(time_values, p)

    # Columns are created directly in the order used for storage/compression.
    out = pd.DataFrame({
        'pulse_iteration': np.full(n * p, int(pulse_iteration), dtype=int),
        'pulse_index': np.repeat(np.arange(p), n).astype(int),
        'time_index': time_index,
        # Transposing first lays the samples out pulse by pulse, matching pulse_index.
        'flux': values.T.flatten().round(sig_figs),
    })

    return out

Create the metadata first

In [3]:
# Start from the placeholder template and fill in what is known about this experiment.
meta_data = init_metadata()
meta_data['catalyst'] = 'Pt'
meta_data['catalyst_amt_mg'] = 15.6
meta_data['creator'] = 'Zongtang Fang'
meta_data['name'] = 'irreversibleO2'
meta_data['paper_DOI'] = 'https://doi.org/10.1016/j.cattod.2022.02.010'
meta_data['date_created'] = '2021-12-16'
meta_data['time_delta_s'] = 0.001

# Reactor geometry taken from the preparation notes below
# (total reactor length 0.0564 m, catalyst zone 0.002 m), converted to cm.
meta_data['reactor_length_cm'] = 5.64
meta_data['catalyst_zone_length_cm'] = 0.2

# One entry per pulse iteration: index 0 is argon (the inert default), index 1 is oxygen.
argon_meta = init_pulse_iteration()
oxygen_meta = init_pulse_iteration()
oxygen_meta['amu'] = 32
oxygen_meta['type'] = 'reactant'
meta_data['pulse_iteration'] = [argon_meta, oxygen_meta]

# The notes are assembled from adjacent string literals because a quoted string
# cannot span several source lines; the paragraph break is written as an explicit "\n".
meta_data['preparation_notes'] = (
    'The Strong Electrostatic Adsorption (SEA) method was used for the synthesis of 1.0 wt% Pt/SiO2 catalyst.  '
    'A commercial silica (AEROSIL OX50, $50 m2/g) from EVONIK was chosen as the support and a precursor of tetraamine platinum (II) hydroxide (Pt(NH3)4(OH)2, 99%, from Aldrich) was employed to deposit the metal. '
    'The precursor was dissolved in deionized (DI) water and the initial pH was adjusted to 11.5 with NaOH. '
    'Silica was added to the solution and the contents were shaken for one hour.  '
    'The resulting mixture was washed with DI water, filtered, and dried overnight under vacuum.  '
    'The material was pressed and sieved, retaining the 250 - 300  mu m fraction.  '
    'Next, the catalyst was pretreated ex-situ in 50% oxygen and argon flow (30 mL/min) at 400C for $30 min$ followed by reduction in 4% hydrogen and argon flow (50 mL/min) at 400C for one hour. '
    'The ex-situ oxidation and reduction was performed with three cycles. '
    'Approximately 15.6 mg of pretreated catalyst with the particle size of 250 - 300 mu m was loaded between two zones of the same particle size quartz sand (Sigma Aldrich).  '
    'The total length of the reactor was 0.0564m, with a catalyst zone of 0.002 m, and a cross sectional area of 1.256X10-5  m2. '
    'The TAP reactor was evacuated at 300C to a pressure of 1X10-7 torr and the catalyst was subjected to at least three cycles of alternating pulses of 200 pulses of carbon monoxide and argon and 200 pulses of oxygen and argon to activate the platinum and reach a reproducible starting point for pulsing experiments. '
    'Prior to oxygen adsorption, the catalyst was again reduced at 300C by introducing a sequence of 50% carbon monoxide and argon pulses until no carbon dioxide formation was detected. '
    'The TAP reactor was subsequently heated to 500C and kept for 30 min to remove adsorbed carbon monoxide and then cooled to the desired temperature for testing oxidation. \n'
    'The adsorption of oxygen on the catalyst was recorded in separate experiments by pulsing a 1:1 oxygen and argon mixture at 300C with different pulsing intervals of 2.0, 2.5, 3.0, 3.5, and 4.0 s. '
    'The time evolution of three mass fragments was followed, namely argon (AMU 40), oxygen (AMU 32), and carbon dioxide (AMU 44). '
    'There was no carbon dioxide production detected at the beginning of each oxidation experiment.'
)


In [9]:
# Save the metadata as <data_path>/metadata/<ID>.json.
metadata_dir = os.path.join(data_path, 'metadata')
# open() does not create missing folders, so make sure the target folder exists first.
os.makedirs(metadata_dir, exist_ok=True)
tmp_path = os.path.join(metadata_dir, meta_data['ID'] + '.json')
with open(tmp_path, 'w') as f:
    json.dump(meta_data, f)

In [10]:
# Locations of the raw flux matrices.
argon_csv = "~/Documents/TAP/random_walk/data/argon.csv"
oxygen_csv = "~/Documents/TAP/random_walk/data/oxygen.csv"

# The first column of each file holds the time, so it is dropped and only the
# flux columns are reshaped into the long table format.
# Argon is stored as pulse iteration 0 and oxygen as pulse iteration 1.
argon = matrix2table(pd.read_csv(argon_csv).iloc[:, 1:], pulse_iteration=0)
oxygen = matrix2table(pd.read_csv(oxygen_csv).iloc[:, 1:], pulse_iteration=1)

In [11]:
# Stack both gases into one long table; the row labels are renumbered from 0.
dat = pd.concat((argon, oxygen), ignore_index=True)

In [12]:
# Save the time series as <data_path>/timeseries/<ID>.parquet so that it sits
# under the configured data store and shares its file name with the metadata.
timeseries_dir = os.path.join(data_path, 'timeseries')
# The folder is created on demand because to_parquet does not create it.
os.makedirs(timeseries_dir, exist_ok=True)
dat.to_parquet(os.path.join(timeseries_dir, meta_data['ID'] + '.parquet'), index=False)

# Size in CSV: ~ 50 MB
# Size in Parquet: ~ 7.5 MB

Example for TDMS file

In [13]:
import nptdms

# Example TDMS recording, given relative to the current working directory.
tmp_path = '../../../TAP/random_walk/data/0.5Pt_CO-25C-set1.tdms'

# Read the file and expose its contents as a pandas DataFrame.
tdms_file = nptdms.TdmsFile(tmp_path)
dat = tdms_file.as_dataframe()

In [None]:
# The TDMS DataFrame loaded in the previous cell is stored in `dat`.
base_file = dat

# Strip the quotes from every column label and split it on "/"; element 1 of
# each split label is used as the group name.
# NOTE(review): presumably the labels look like /'group'/'channel' - confirm.
tdms_names = base_file.keys()
tdms_names_split = [str(name).replace("'", "").split("/") for name in tdms_names]
group_info = [parts[1] for parts in tdms_names_split]
unique_group_info = list(np.unique(np.array(group_info)))
# removing secondary and meta data where the remainder are the individual measured masses
unique_group_info = unique_group_info[:len(unique_group_info) - 2]

# NOTE(review): `group_locs` was not defined anywhere in this notebook. Taking the
# columns of the first measured mass is an assumption - confirm which group is wanted.
selected_group = unique_group_info[0]
group_locs = [loc for loc, group in enumerate(group_info) if group == selected_group]

group_df = base_file.iloc[:, group_locs]
# Skip the first row and the first three columns of the group.
# NOTE(review): presumably these hold non-flux information - confirm.
flux_df = group_df.iloc[1:, 3:]

Pulling the data

In [None]:
import duckdb

# Query the Parquet file directly with SQL.
# pulse_iteration is zero-based: 0 is argon and 1 is oxygen, so this selects
# pulse 200 of the oxygen data.
test = duckdb.execute("SELECT * FROM '~/Documents/tap_data/timeseries/0001.parquet' WHERE pulse_index = 200 AND pulse_iteration = 1").df()
test.head()