### Imports

In [1]:
import re 
import os
from praatio import textgrid 
from pydub import AudioSegment



### Helper function to clean transcriptions

In [2]:
def clean_transcription(transcript):
    """Normalise raw transcription labels into one clean, lower-case string.

    Args:
        transcript: Either a single label (``str``) or an iterable of label
            strings, e.g. the labels collected from one TextGrid segment.

    Returns:
        str: The labels joined by single spaces and lower-cased, with every
        marker listed in ``remove`` deleted, hyphens turned into spaces, and
        runs of whitespace collapsed. Empty input yields ``''``.
    """
    # A bare string would otherwise be iterated character by character
    # ("hello" -> "h e l l o"), so treat it as a single label.
    if isinstance(transcript, str):
        transcript = [transcript]

    transcript = ' '.join(line.strip() for line in transcript)
    # Lower-case first: every tag pattern below is written in lower case.
    transcript = transcript.lower()
    remove = [r'\(ppb\)|\(ppc\)|\(ppl\)|\(ppo\)', r'_', r'\[|\]', r'\(|\)', r'!', 
            r'#', r'<unk>', r'~', r'<s>', r'<z>', r'<nen>', r'<fil/>', r'<spk/>',
            r'\*', r'<non/>', r'<s/>', r'<c/>']
    replace = ['-']
    # Patterns are applied one after another in list order; an earlier removal
    # can expose a later match (e.g. dropping '_' inside a tag), so they are
    # deliberately not merged into a single alternation.
    for e in remove:
        transcript = re.sub(e, '', transcript)
    # Hyphens separate words, so they become spaces rather than vanishing.
    for e in replace:
        transcript = re.sub(e, ' ', transcript)
    # Collapse the gaps left behind by the removals above.
    transcript = re.sub(r'\s+', ' ', transcript).strip()
    return transcript

### Create 30-second audio segments and cleaned transcriptions from an audio/TextGrid file pair

```
- imda_nsc_p3.tar.gz
    - imda_nsc_p3.tar
        - train
            - prompts.txt: Contains transcriptions for all the .wav files
            - waves
                - 3000-1.tar
                    - 3000-1_1.wav
                    - 3000-1_2.wav
                    - 3000-1_3.wav
- prompts-train.txt.gz
    - prompts-train.txt: Contains transcriptions for all the train .wav files
```

In [None]:
# Input paths
audio_filename = '3000-1'
audio_path = os.path.join(os.getcwd(), 'dataset', 'part3', 'simple_example', f'{audio_filename}.wav')
textgrid_path = os.path.join(os.getcwd(), 'dataset', 'part3', 'simple_example', f'{audio_filename}.TextGrid')

# Output paths: one directory of segment .wav files per source recording,
# plus a single prompts.txt holding one "<segment id> <text>" line per segment.
output_dir_train_wav = os.path.join(os.getcwd(), 'dataset', 'imda_nsc_prototype', 'train', 'waves', f'{audio_filename}')
os.makedirs(output_dir_train_wav, exist_ok=True)
output_dir_train_text = os.path.join(os.getcwd(), 'dataset', 'imda_nsc_prototype', 'train', 'prompts.txt')

# https://github.com/jiaaro/pydub
# https://github.com/timmahrt/praatIO
# https://timmahrt.github.io/praatIO/praatio.html
# Extract the audio and text grid
audio = AudioSegment.from_wav(audio_path)
tg = textgrid.openTextgrid(textgrid_path, False) # do not include intervals and points with empty labels

# pydub does things in milliseconds
segment_duration_ms = 30 * 1000  

# Get total duration of the audio in milliseconds
audio_duration = len(audio)

# Initialize start time and segment index (segment files are numbered from 1)
start_time = 0
segment_index = 1

while start_time < audio_duration:
    # End of this segment, clamped so the last segment stops at the end of the audio
    end_time = min(start_time + segment_duration_ms, audio_duration)

    # Extract audio segment given the current start and end timing
    audio_segment = audio[start_time:end_time]

    # Save the audio segment
    audio_segment_path = os.path.join(output_dir_train_wav, f'{audio_filename}_{segment_index}.wav')
    audio_segment.export(audio_segment_path, format="wav")

    # Extract the corresponding TextGrid segment; crop() is called with
    # seconds, hence the division of the millisecond bounds by 1000.
    # https://timmahrt.github.io/praatIO/praatio/data_classes/textgrid.html
    tg_segment = tg.crop(start_time / 1000, end_time / 1000, mode="truncated", rebaseToZero=False)

    # Collect transcriptions from the TextGrid segment.
    # Labels are gathered tier by tier, so with several tiers the text is
    # ordered by tier first and only then by time within each tier.
    transcriptions = []
    for tier_name in tg_segment.tierNames: # For each tier (in order) in the TextGrid segment
        tier = tg_segment.getTier(tier_name) # Get the tier
        for entry in tier.entries: # For each of its entries, extract the labels 
            if entry.label.strip():  # Only include non-empty transcriptions -> but should be handled above already
                transcriptions.append(entry.label)

    # Clean the transcriptions
    transcriptions_clean = clean_transcription(transcriptions)

    # Save the transcriptions to a text file, append mode.
    # UTF-8 is set explicitly so labels containing non-ASCII characters are
    # written the same way on every platform instead of depending on the
    # locale's default encoding.
    # NOTE: because of append mode, re-running this cell adds duplicate lines
    # for the same segments; remove prompts.txt before a full re-run.
    with open(output_dir_train_text, 'a', encoding='utf-8') as f:
        f.write(f'{audio_filename}_{segment_index} {transcriptions_clean}\n')

    start_time+=segment_duration_ms
    segment_index+=1