In [1]:
from docx import Document
from pathlib import Path
import json
from datetime import datetime

In [12]:
# Resolve the input and output folders relative to the current directory
parent_dir = Path('./')
working_dir = parent_dir.joinpath('data')
output_dir = parent_dir.joinpath('output')

# Make sure both folders are present before anything is read or written
for folder in (working_dir, output_dir):
    folder.mkdir(exist_ok=True)

In [13]:
def iter_segments(json_obj, interval_length=5 * 60):
    """Group transcript chunks into fixed-length, interval-aligned time windows.

    Parameters
    ----------
    json_obj : dict
        Parsed transcription output. Its optional ``'results'`` list holds
        dicts with a ``'resultEndTime'`` string (a number followed by one
        trailing character, which is dropped before parsing) and an optional
        ``'alternatives'`` list of dicts that may carry a ``'transcript'``.
    interval_length : int or float, optional
        Window length in seconds. Defaults to five minutes.

    Yields
    ------
    tuple
        ``(segment_text, start_range, end_range)`` where ``segment_text`` is
        the list of non-empty chunk texts whose end time falls inside the
        window ``[start_range, end_range)``.

    Raises
    ------
    ValueError
        If ``interval_length`` is not positive.

    Notes
    -----
    The segment being accumulated is always emitted when a later chunk ends
    at or past its window, and once more at the end of the input, so the
    first and last yielded segments may be empty. Windows in between that
    contain no text at all are skipped rather than yielded.
    """
    if interval_length <= 0:
        raise ValueError('interval_length must be positive')

    start_range = 0
    end_range = interval_length
    segment_text = []

    for result in json_obj.get('results', []):
        # Drop the one-character unit suffix before parsing the number
        end_time = float(result['resultEndTime'][:-1])
        text = ''.join(
            alternative.get('transcript', '')
            for alternative in result.get('alternatives', [])
        )

        # Ignore empty text
        if not text:
            continue

        if end_time >= end_range:
            # Close the current window ...
            yield segment_text, start_range, end_range

            # ... and jump straight to the window that contains this chunk.
            # Advancing by a single interval would mislabel the chunk (and
            # every following one) whenever a gap spans several intervals.
            intervals_ahead = int((end_time - start_range) // interval_length)
            start_range += intervals_ahead * interval_length
            end_range = start_range + interval_length
            segment_text = []

        segment_text.append(text)

    # Yield last segment
    yield segment_text, start_range, end_range

In [14]:
def trim_time(t, divisor=60):
    """Return how many whole ``divisor`` units fit in ``t``, wrapped modulo ``divisor``.

    With the default divisor of 60 and ``t`` in seconds this is the
    minutes-of-the-hour component.
    """
    whole_units, _remainder = divmod(t, divisor)
    return whole_units % divisor

In [15]:
def to_time_str(time_in_sec):
    """Format a second offset as a zero-padded ``HH:MM:SS`` string.

    Parameters
    ----------
    time_in_sec : int or float
        Non-negative offset in seconds. Fractional seconds are truncated.

    Returns
    -------
    str
        The offset as ``HH:MM:SS``. The hour field is not wrapped, so an
        offset of 24 hours or more keeps counting (e.g. ``'25:00:00'``)
        instead of raising.

    Raises
    ------
    ValueError
        If ``time_in_sec`` is negative.
    """
    if time_in_sec < 0:
        raise ValueError('time_in_sec must be non-negative')

    # Pure arithmetic avoids datetime's hour < 24 and integer-only limits
    total_minutes, seconds = divmod(int(time_in_sec), 60)
    hours, minutes = divmod(total_minutes, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

In [18]:
# Build one Word document per transcript JSON file in the working directory.
# sorted() keeps the processing order stable regardless of filesystem order.
for transcript_path in sorted(working_dir.glob('*.m4a-[0-9]*.json')):
    print(transcript_path)
    doc = Document()
    doc.add_heading(transcript_path.stem, 0)

    # Read as UTF-8 explicitly instead of relying on the locale's default
    json_obj = json.loads(transcript_path.read_text(encoding='utf-8'))
    for segment_text, start_range, end_range in iter_segments(json_obj):

        # Every segment starts on its own page
        doc.add_page_break()

        # Add time stamp as section header
        doc.add_heading(to_time_str(start_range), 1)

        # Metadata on volunteer/status of segment. Bold is a property of
        # runs, not paragraphs, so each label gets its own explicit run.
        h2 = doc.add_heading('', 2)
        h2.add_run('Volunteer:').bold = True
        h2.add_run(' Unassigned, ').bold = False
        h2.add_run('Status:').bold = True
        h2.add_run(' Incomplete').bold = False

        # Add each chunk from segment as a new paragraph
        for text in segment_text:
            doc.add_paragraph(text)

    # Save once per transcript, after all segments have been added
    doc.save(output_dir / f'{transcript_path.stem}.docx')


../data/transcripts_Chip.ZFS-Server-Installation-&-Configuration.Part01.m4a-20220506014835.json
../data/transcripts_Chip.Active-Directory-Group-Management.m4a-20220506032909.json
../data/transcripts_Chip.Puppet-deploy-walkthrough.m4a-20220506024121.json
