In [11]:
import os
import pandas as pd
import os
import h5py
from tqdm import tqdm
from omegaconf import OmegaConf

In [None]:
# ---- CONFIG ----
# Input locations; the output CSV is written inside the model directory.
model_path = "data/t15_pretrained_rnn_baseline"
data_dir = "data/hdf5_data_final"
csv_path = "data/t15_copyTaskData_description.csv"
output_file = os.path.join(model_path, "phoneme_text_training_data_final.csv")
# ----------------

# Load metadata: the description CSV (used later to look up each trial's
# corpus by date and block number) and the arguments saved with the checkpoint
# (used later for the list of sessions).
_args_yaml_path = os.path.join(model_path, "checkpoint/args.yaml")
b2txt_csv_df = pd.read_csv(csv_path)
model_args = OmegaConf.load(_args_yaml_path)

def load_h5py_file(file_path, b2txt_csv_df):
    """Read every trial group of one HDF5 file into parallel per-trial lists.

    Args:
        file_path: Path of the HDF5 file to open read-only. Every top-level
            key in the file is treated as one trial group.
        b2txt_csv_df: DataFrame with 'Date', 'Block number' and 'Corpus'
            columns, used to look up the corpus name of each trial.

    Returns:
        A dict mapping each field name ('neural_features', 'n_time_steps',
        'seq_class_ids', 'seq_len', 'transcriptions', 'sentence_label',
        'session', 'block_num', 'trial_num', 'corpus') to a list holding one
        entry per trial, in the file's key order. 'seq_class_ids', 'seq_len',
        'transcriptions' and 'sentence_label' are None for trials that do not
        carry them.

    Raises:
        IndexError: If no row of ``b2txt_csv_df`` matches a trial's date and
            block number.
    """
    data = {
        'neural_features': [],
        'n_time_steps': [],
        'seq_class_ids': [],
        'seq_len': [],
        'transcriptions': [],
        'sentence_label': [],
        'session': [],
        'block_num': [],
        'trial_num': [],
        'corpus': [],
    }
    # Open the hdf5 file for that day
    with h5py.File(file_path, 'r') as f:

        # For each trial in the selected trials in that day
        for key in list(f.keys()):
            g = f[key]

            # Required fields: read unconditionally.
            neural_features = g['input_features'][:]
            n_time_steps = g.attrs['n_time_steps']
            session = g.attrs['session']
            block_num = g.attrs['block_num']
            trial_num = g.attrs['trial_num']

            # Optional fields: None when the trial does not provide them.
            seq_class_ids = g['seq_class_ids'][:] if 'seq_class_ids' in g else None
            seq_len = g.attrs['seq_len'] if 'seq_len' in g.attrs else None
            transcription = g['transcription'][:] if 'transcription' in g else None
            sentence_label = g.attrs['sentence_label'][:] if 'sentence_label' in g.attrs else None

            # match this trial up with the csv to get the corpus name;
            # the session string is split on '.' and the parts after the first
            # are taken as year, month, day to build the CSV's 'Date' value.
            year, month, day = session.split('.')[1:]
            date = f'{year}-{month}-{day}'
            corpus_matches = b2txt_csv_df.loc[
                (b2txt_csv_df['Date'] == date) & (b2txt_csv_df['Block number'] == block_num),
                'Corpus',
            ]
            if corpus_matches.empty:
                # Same exception type as the bare `.values[0]` lookup would
                # raise, but with enough context to find the missing CSV row.
                raise IndexError(
                    f"No row in the description CSV for session {session!r} "
                    f"(Date == {date!r}, Block number == {block_num!r}) "
                    f"while reading {file_path!r}"
                )
            corpus_name = corpus_matches.values[0]

            data['neural_features'].append(neural_features)
            data['n_time_steps'].append(n_time_steps)
            data['seq_class_ids'].append(seq_class_ids)
            data['seq_len'].append(seq_len)
            data['transcriptions'].append(transcription)
            data['sentence_label'].append(sentence_label)
            data['session'].append(session)
            data['block_num'].append(block_num)
            data['trial_num'].append(trial_num)
            data['corpus'].append(corpus_name)
    return data

# Lookup table from class id (list index) to phoneme symbol.
# Index 0 is the blank symbol and the last entry is the ' | ' separator.
LOGIT_TO_PHONEME = [
    "BLANK",
    "AA", "AE", "AH", "AO", "AW", "AY",
    "B", "CH", "D", "DH",
    "EH", "ER", "EY",
    "F", "G", "HH",
    "IH", "IY",
    "JH", "K", "L", "M", "N", "NG",
    "OW", "OY",
    "P", "R", "S", "SH", "T", "TH",
    "UH", "UW",
    "V", "W", "Y", "Z", "ZH",
    " | ",
]


def decode_phoneme_sequence(seq_ids, seq_len):
    """Turn a sequence of class ids into a space-separated phoneme string.

    Only the first ``seq_len`` ids are used (all of them when ``seq_len`` is
    None). Ids equal to 0 (blank) are dropped first, then runs of identical
    adjacent ids are collapsed to a single id, and the remaining ids are
    mapped through ``LOGIT_TO_PHONEME`` and joined with single spaces.
    """
    # NOTE(review): collapsing adjacent repeats also merges genuinely doubled
    # phonemes in a label sequence - confirm that this is intended.
    kept_ids = []
    for raw_id in seq_ids[:seq_len]:
        if raw_id == 0:
            continue  # blank
        phoneme_id = int(raw_id)
        if kept_ids and kept_ids[-1] == phoneme_id:
            continue  # repeat of the previous non-blank id
        kept_ids.append(phoneme_id)
    return " ".join(LOGIT_TO_PHONEME[phoneme_id] for phoneme_id in kept_ids)

# Gather all examples: one record per labelled trial, across both splits.
records = []

for split in ["train", "val"]:  # these contain ground truth phonemes and text
    print(f"\nProcessing split: {split}")

    for session in model_args["dataset"]["sessions"]:
        session_file = os.path.join(data_dir, session, f"data_{split}.hdf5")
        # Sessions that have no file for this split are skipped silently.
        if not os.path.exists(session_file):
            continue

        # Use helper from repo to extract HDF5 contents
        data = load_h5py_file(session_file, b2txt_csv_df)
        n_trials = len(data["neural_features"])
        print(f"  - {session}: {n_trials} trials")

        for i in tqdm(range(n_trials), desc=f"{session}", unit="trial"):
            seq_class_ids = data["seq_class_ids"][i]
            seq_len = data["seq_len"][i]
            sentence_label = data["sentence_label"][i]

            # skip if label missing
            if sentence_label is None or seq_class_ids is None:
                continue

            # decode phonemes
            phoneme_seq = decode_phoneme_sequence(seq_class_ids, seq_len)
            # The label may come back as bytes or as str; normalise to a
            # stripped, lower-cased str either way.
            text_label = sentence_label.decode("utf-8") if isinstance(sentence_label, bytes) else str(sentence_label)
            text_label = text_label.strip().lower()

            records.append({
                "split": split,
                "session": data["session"][i],
                "block_num": data["block_num"][i],
                "trial_num": data["trial_num"][i],
                "phoneme_sequence": phoneme_seq,
                "sentence": text_label
            })

# Save to CSV
df = pd.DataFrame(records)
df.to_csv(output_file, index=False)
print(f"\nSaved {len(df)} phoneme–text pairs to: {output_file}")

print("\nSample:")
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the sample size when fewer than 5 records were collected.
print(df.sample(min(5, len(df))))



Processing split: train
  - t15.2023.08.11: 288 trials


t15.2023.08.11: 100%|██████████| 288/288 [00:00<00:00, 169800.33trial/s]


  - t15.2023.08.13: 348 trials


t15.2023.08.13: 100%|██████████| 348/348 [00:00<00:00, 137557.04trial/s]


  - t15.2023.08.18: 197 trials


t15.2023.08.18: 100%|██████████| 197/197 [00:00<00:00, 149390.33trial/s]


  - t15.2023.08.20: 278 trials


t15.2023.08.20: 100%|██████████| 278/278 [00:00<00:00, 141868.42trial/s]


  - t15.2023.08.25: 88 trials


t15.2023.08.25: 100%|██████████| 88/88 [00:00<00:00, 95770.30trial/s]


  - t15.2023.08.27: 150 trials


t15.2023.08.27: 100%|██████████| 150/150 [00:00<00:00, 114933.43trial/s]


  - t15.2023.09.01: 297 trials


t15.2023.09.01: 100%|██████████| 297/297 [00:00<00:00, 157764.47trial/s]


  - t15.2023.09.03: 322 trials


t15.2023.09.03: 100%|██████████| 322/322 [00:00<00:00, 114883.11trial/s]


  - t15.2023.09.24: 245 trials


t15.2023.09.24: 100%|██████████| 245/245 [00:00<00:00, 130988.46trial/s]


  - t15.2023.09.29: 153 trials


t15.2023.09.29: 100%|██████████| 153/153 [00:00<00:00, 121793.23trial/s]


  - t15.2023.10.01: 218 trials


t15.2023.10.01: 100%|██████████| 218/218 [00:00<00:00, 150264.30trial/s]


  - t15.2023.10.06: 174 trials


t15.2023.10.06: 100%|██████████| 174/174 [00:00<00:00, 140753.89trial/s]


  - t15.2023.10.08: 284 trials


t15.2023.10.08: 100%|██████████| 284/284 [00:00<00:00, 151299.67trial/s]


  - t15.2023.10.13: 155 trials


t15.2023.10.13: 100%|██████████| 155/155 [00:00<00:00, 167125.22trial/s]


  - t15.2023.10.15: 239 trials


t15.2023.10.15: 100%|██████████| 239/239 [00:00<00:00, 107029.54trial/s]


  - t15.2023.10.20: 98 trials


t15.2023.10.20: 100%|██████████| 98/98 [00:00<00:00, 156826.32trial/s]


  - t15.2023.10.22: 134 trials


t15.2023.10.22: 100%|██████████| 134/134 [00:00<00:00, 118748.52trial/s]


  - t15.2023.11.03: 149 trials


t15.2023.11.03: 100%|██████████| 149/149 [00:00<00:00, 152501.54trial/s]


  - t15.2023.11.04: 80 trials


t15.2023.11.04: 100%|██████████| 80/80 [00:00<00:00, 150198.89trial/s]


  - t15.2023.11.17: 100 trials


t15.2023.11.17: 100%|██████████| 100/100 [00:00<00:00, 147013.81trial/s]


  - t15.2023.11.19: 60 trials


t15.2023.11.19: 100%|██████████| 60/60 [00:00<00:00, 143068.93trial/s]


  - t15.2023.11.26: 198 trials


t15.2023.11.26: 100%|██████████| 198/198 [00:00<00:00, 113499.00trial/s]


  - t15.2023.12.03: 228 trials


t15.2023.12.03: 100%|██████████| 228/228 [00:00<00:00, 152083.54trial/s]


  - t15.2023.12.08: 198 trials


t15.2023.12.08: 100%|██████████| 198/198 [00:00<00:00, 112912.60trial/s]


  - t15.2023.12.10: 131 trials


t15.2023.12.10: 100%|██████████| 131/131 [00:00<00:00, 141284.09trial/s]


  - t15.2023.12.17: 135 trials


t15.2023.12.17: 100%|██████████| 135/135 [00:00<00:00, 146920.35trial/s]


  - t15.2023.12.29: 198 trials


t15.2023.12.29: 100%|██████████| 198/198 [00:00<00:00, 146856.27trial/s]


  - t15.2024.02.25: 193 trials


t15.2024.02.25: 100%|██████████| 193/193 [00:00<00:00, 122875.03trial/s]


  - t15.2024.03.03: 219 trials


t15.2024.03.03: 100%|██████████| 219/219 [00:00<00:00, 191365.12trial/s]


  - t15.2024.03.08: 163 trials


t15.2024.03.08: 100%|██████████| 163/163 [00:00<00:00, 138255.12trial/s]


  - t15.2024.03.15: 239 trials


t15.2024.03.15: 100%|██████████| 239/239 [00:00<00:00, 137282.75trial/s]


  - t15.2024.03.17: 246 trials


t15.2024.03.17: 100%|██████████| 246/246 [00:00<00:00, 148310.88trial/s]


  - t15.2024.04.25: 364 trials


t15.2024.04.25: 100%|██████████| 364/364 [00:00<00:00, 193256.54trial/s]


  - t15.2024.04.28: 150 trials


t15.2024.04.28: 100%|██████████| 150/150 [00:00<00:00, 184554.30trial/s]


  - t15.2024.05.10: 110 trials


t15.2024.05.10: 100%|██████████| 110/110 [00:00<00:00, 149991.37trial/s]


  - t15.2024.06.14: 90 trials


t15.2024.06.14: 100%|██████████| 90/90 [00:00<00:00, 151723.22trial/s]


  - t15.2024.07.19: 169 trials


t15.2024.07.19: 100%|██████████| 169/169 [00:00<00:00, 143926.37trial/s]


  - t15.2024.07.21: 160 trials


t15.2024.07.21: 100%|██████████| 160/160 [00:00<00:00, 132809.94trial/s]


  - t15.2024.07.28: 161 trials


t15.2024.07.28: 100%|██████████| 161/161 [00:00<00:00, 152848.11trial/s]


  - t15.2025.01.10: 106 trials


t15.2025.01.10: 100%|██████████| 106/106 [00:00<00:00, 97392.38trial/s]


  - t15.2025.01.12: 163 trials


t15.2025.01.12: 100%|██████████| 163/163 [00:00<00:00, 94221.55trial/s]


  - t15.2025.03.14: 59 trials


t15.2025.03.14: 100%|██████████| 59/59 [00:00<00:00, 116016.85trial/s]


  - t15.2025.03.16: 101 trials


t15.2025.03.16: 100%|██████████| 101/101 [00:00<00:00, 108677.45trial/s]


  - t15.2025.03.30: 165 trials


t15.2025.03.30: 100%|██████████| 165/165 [00:00<00:00, 136285.97trial/s]


  - t15.2025.04.13: 69 trials


t15.2025.04.13: 100%|██████████| 69/69 [00:00<00:00, 124690.64trial/s]



Processing split: val
  - t15.2023.08.13: 35 trials


t15.2023.08.13: 100%|██████████| 35/35 [00:00<00:00, 132372.08trial/s]


  - t15.2023.08.18: 49 trials


t15.2023.08.18: 100%|██████████| 49/49 [00:00<00:00, 122919.20trial/s]


  - t15.2023.08.20: 48 trials


t15.2023.08.20: 100%|██████████| 48/48 [00:00<00:00, 124352.43trial/s]


  - t15.2023.08.25: 25 trials


t15.2023.08.25: 100%|██████████| 25/25 [00:00<00:00, 114723.85trial/s]


  - t15.2023.08.27: 25 trials


t15.2023.08.27: 100%|██████████| 25/25 [00:00<00:00, 56405.38trial/s]


  - t15.2023.09.01: 49 trials


t15.2023.09.01: 100%|██████████| 49/49 [00:00<00:00, 113988.30trial/s]


  - t15.2023.09.03: 34 trials


t15.2023.09.03: 100%|██████████| 34/34 [00:00<00:00, 108942.96trial/s]


  - t15.2023.09.24: 35 trials


t15.2023.09.24: 100%|██████████| 35/35 [00:00<00:00, 135549.99trial/s]


  - t15.2023.09.29: 48 trials


t15.2023.09.29: 100%|██████████| 48/48 [00:00<00:00, 110923.74trial/s]


  - t15.2023.10.01: 44 trials


t15.2023.10.01: 100%|██████████| 44/44 [00:00<00:00, 105819.60trial/s]


  - t15.2023.10.06: 36 trials


t15.2023.10.06: 100%|██████████| 36/36 [00:00<00:00, 73085.65trial/s]


  - t15.2023.10.08: 17 trials


t15.2023.10.08: 100%|██████████| 17/17 [00:00<00:00, 84783.79trial/s]


  - t15.2023.10.13: 44 trials


t15.2023.10.13: 100%|██████████| 44/44 [00:00<00:00, 128248.35trial/s]


  - t15.2023.10.15: 44 trials


t15.2023.10.15: 100%|██████████| 44/44 [00:00<00:00, 122218.13trial/s]


  - t15.2023.10.20: 9 trials


t15.2023.10.20: 100%|██████████| 9/9 [00:00<00:00, 74898.29trial/s]


  - t15.2023.10.22: 33 trials


t15.2023.10.22: 100%|██████████| 33/33 [00:00<00:00, 130208.87trial/s]


  - t15.2023.11.03: 50 trials


t15.2023.11.03: 100%|██████████| 50/50 [00:00<00:00, 127254.37trial/s]


  - t15.2023.11.04: 15 trials


t15.2023.11.04: 100%|██████████| 15/15 [00:00<00:00, 103991.01trial/s]


  - t15.2023.11.17: 25 trials


t15.2023.11.17: 100%|██████████| 25/25 [00:00<00:00, 113851.90trial/s]


  - t15.2023.11.19: 20 trials


t15.2023.11.19: 100%|██████████| 20/20 [00:00<00:00, 94148.24trial/s]


  - t15.2023.11.26: 44 trials


t15.2023.11.26: 100%|██████████| 44/44 [00:00<00:00, 107671.75trial/s]


  - t15.2023.12.03: 34 trials


t15.2023.12.03: 100%|██████████| 34/34 [00:00<00:00, 115658.02trial/s]


  - t15.2023.12.08: 50 trials


t15.2023.12.08: 100%|██████████| 50/50 [00:00<00:00, 128659.63trial/s]


  - t15.2023.12.10: 25 trials


t15.2023.12.10: 100%|██████████| 25/25 [00:00<00:00, 96111.46trial/s]


  - t15.2023.12.17: 30 trials


t15.2023.12.17: 100%|██████████| 30/30 [00:00<00:00, 109990.49trial/s]


  - t15.2023.12.29: 50 trials


t15.2023.12.29: 100%|██████████| 50/50 [00:00<00:00, 120664.67trial/s]


  - t15.2024.02.25: 23 trials


t15.2024.02.25: 100%|██████████| 23/23 [00:00<00:00, 100908.99trial/s]


  - t15.2024.03.08: 24 trials


t15.2024.03.08: 100%|██████████| 24/24 [00:00<00:00, 104639.60trial/s]


  - t15.2024.03.15: 48 trials


t15.2024.03.15: 100%|██████████| 48/48 [00:00<00:00, 89998.48trial/s]


  - t15.2024.03.17: 48 trials


t15.2024.03.17: 100%|██████████| 48/48 [00:00<00:00, 105517.08trial/s]


  - t15.2024.05.10: 25 trials


t15.2024.05.10: 100%|██████████| 25/25 [00:00<00:00, 117685.30trial/s]


  - t15.2024.06.14: 25 trials


t15.2024.06.14: 100%|██████████| 25/25 [00:00<00:00, 69905.07trial/s]


  - t15.2024.07.19: 48 trials


t15.2024.07.19: 100%|██████████| 48/48 [00:00<00:00, 119340.01trial/s]


  - t15.2024.07.21: 46 trials


t15.2024.07.21: 100%|██████████| 46/46 [00:00<00:00, 112500.28trial/s]


  - t15.2024.07.28: 48 trials


t15.2024.07.28: 100%|██████████| 48/48 [00:00<00:00, 124583.29trial/s]


  - t15.2025.01.10: 23 trials


t15.2025.01.10: 100%|██████████| 23/23 [00:00<00:00, 107068.80trial/s]


  - t15.2025.01.12: 47 trials


t15.2025.01.12: 100%|██████████| 47/47 [00:00<00:00, 131861.06trial/s]


  - t15.2025.03.14: 24 trials


t15.2025.03.14: 100%|██████████| 24/24 [00:00<00:00, 117050.34trial/s]


  - t15.2025.03.16: 24 trials


t15.2025.03.16: 100%|██████████| 24/24 [00:00<00:00, 106073.02trial/s]


  - t15.2025.03.30: 30 trials


t15.2025.03.30: 100%|██████████| 30/30 [00:00<00:00, 82836.81trial/s]


  - t15.2025.04.13: 25 trials


t15.2025.04.13: 100%|██████████| 25/25 [00:00<00:00, 101213.90trial/s]


Saved 9498 phoneme–text pairs to: data/t15_pretrained_rnn_baseline/phoneme_text_training_data_final.csv

Sample:
      split         session  block_num  trial_num  \
3562  train  t15.2023.10.22          2         12   
2344  train  t15.2023.09.29          9          3   
6697  train  t15.2024.04.28         10         28   
3165  train  t15.2023.10.13          4         18   
5406  train  t15.2024.03.03          3         49   

                                       phoneme_sequence  \
3562  G UH D  |  L AH K  |  AA N  |  Y AO R  |  G R ...   
2344  JH AH S T  |  AH B AW T  |  EH N IY  |  K AY N...   
6697             D UW  |  Y UW  |  HH AE V  |  IH T  |    
3165  AY  |  W AA Z  |  IH N  |  DH AH  |  S ER V AH...   
5406  AY  |  HH OW P  |  M AY  |  N ER S  |  IH Z  |...   

                             sentence  
3562    good luck on your graduation.  
2344    just about any kind of music.  
6697                  do you have it?  
3165            i was in the service.  
5406  i hope


