In [None]:
import logging
import os
from datetime import datetime

# Name of this notebook; used for the log folder and for the output data folder
current_file_name = "7_2_Elaborations_Transcripts_From_Chunks"

# One log file per run, named after the start timestamp
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away and fails if its folder is missing,
# so make sure the folder exists first
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

import argparse
import io

from google.cloud import speech

import grpc

from openai import OpenAI

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.utils import *

In [None]:
# Show up to 500 columns when a DataFrame is displayed
pd.options.display.max_columns = 500

In [None]:
# Read the API key from the local token file, dropping trailing whitespace/newlines
with open("tokens/openai_key.txt") as key_file:
    OPENAI_API_KEY = key_file.read().rstrip()

# Publish the key as an environment variable, then create the client without explicit arguments
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})
client = OpenAI()

In [None]:
# Label of the transcription backend; echoed in the progress messages of get_transcript
GLOBAL_MODE = "openai"

In [None]:
# Only the OpenAI backend (transcribe_onprem_openai) is implemented in this notebook,
# so fail fast if another mode is configured
if GLOBAL_MODE != "openai":
    raise ValueError(f"Unsupported GLOBAL_MODE: {GLOBAL_MODE!r}")

# File-name suffix of the audio chunks that are collected and transcribed
GLOBAL_FORMAT = ".wav"

In [None]:
def get_dict_of_paths(root_path, audio_format=None):
    """Collect the audio chunk paths stored below ``root_path``.

    Expected layout (one folder per respondent, one sub-folder per elaboration)::

        FG
            respondent_104
                elaboration_1_1
                    elaboration_1_1_chunk_0.wav
                    elaboration_1_1_chunk_1.wav

    Args:
        root_path: Folder that contains the respondent folders.
        audio_format: File-name suffix to keep (e.g. ".wav"). When omitted, the
            notebook-wide ``GLOBAL_FORMAT`` is used.

    Returns:
        Nested dict ``{respondent: {elaboration: [audio file paths]}}``. Each path
        is ``root_path/respondent/elaboration/file`` joined with forward slashes;
        folders and files are visited in sorted order.
    """
    if audio_format is None:
        audio_format = GLOBAL_FORMAT

    dict_of_paths = {}
    for respondent in sorted(os.listdir(root_path)):
        respondent_dir = f"{root_path}/{respondent}"
        # Stray files next to the respondent folders (e.g. .DS_Store) cannot be listed; skip them
        if not os.path.isdir(respondent_dir):
            continue

        dict_of_paths[respondent] = {}
        for elaboration in sorted(os.listdir(respondent_dir)):
            elaboration_dir = f"{respondent_dir}/{elaboration}"
            if not os.path.isdir(elaboration_dir):
                continue

            # Keep the "/"-joined form: callers derive output locations from these strings
            dict_of_paths[respondent][elaboration] = [
                f"{elaboration_dir}/{audio_file}"
                for audio_file in sorted(os.listdir(elaboration_dir))
                if audio_file.endswith(audio_format)
            ]

    return dict_of_paths

In [None]:
# Root folders that are scanned with get_dict_of_paths (one for the "FG" and one for the "H" recordings)
extracted_recordings_fg_path = "data/6_2_Remove_Pauses/FG"
extracted_recordings_h_path = "data/6_2_Remove_Pauses/H"

In [None]:
# Index the audio chunks of both recording sets: {respondent: {elaboration: [paths]}}
fg_paths = get_dict_of_paths(root_path=extracted_recordings_fg_path)
h_paths = get_dict_of_paths(root_path=extracted_recordings_h_path)

In [None]:
@timer
def transcribe_onprem_openai(local_file_path: str):
    """Transcribe one local audio file with OpenAI's ``whisper-1`` model.

    Args:
        local_file_path: Path of the audio file to upload.

    Returns:
        Tuple ``(transcript, transcript.text)``: the response object returned by
        ``client.audio.transcriptions.create`` and its ``text`` attribute.
    """
    logging.info(f"Transcribing {local_file_path}")

    # Context manager so the file handle is released even if the API call fails
    with open(local_file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            language="en",
            response_format="verbose_json",
            temperature=0.0,
            timestamp_granularities=["word", "segment"],
            # Prompt made of filler words — presumably to encourage the model to keep
            # hesitation markers in the transcript (NOTE(review): confirm intent)
            prompt="Umm, let me think like, hmm... Okay, here's what I'm, like, thinking. Uh. Um. Well. Er. Ah. You know, like. Erm."
        )

    logging.info(f"Transcription of {local_file_path} complete")

    return transcript, transcript.text

In [None]:
def get_transcript(path_dict, variant, fixing=None):
    """Transcribe every audio chunk in ``path_dict`` and write the results to disk.

    For each audio file two files are written into
    ``data/<current_file_name>/<variant>/<respondent>/<elaboration>/``:

    * ``<chunk>.txt`` — the transcript text with U+FFFD characters removed
    * ``<chunk>_response.json`` — the JSON dump of the full response object

    Args:
        path_dict: Nested mapping ``{respondent: {elaboration: [audio paths]}}``
            (the structure returned by ``get_dict_of_paths``).
        variant: Name of the output sub-folder below ``data/<current_file_name>``.
        fixing: Not used by this function; accepted so existing calls keep working.
    """
    for respondent, paths in path_dict.items():
        logging.info(f"Transcribing {respondent}")
        # Built with os.path so the folder layout is the same on every platform
        respondent_path = os.path.join("data", current_file_name, variant, respondent)

        if os.path.exists(respondent_path):
            logging.info(f"Folder {respondent_path} already exists")
        else:
            os.makedirs(respondent_path, exist_ok=True)

        for elaboration, audio_files in paths.items():
            output_dir = os.path.join(respondent_path, elaboration)

            for path in audio_files:
                logging.info(f"Transcribing {path} using {GLOBAL_MODE}")
                print(f"Transcribing {path} using {GLOBAL_MODE}")

                response, transcript = transcribe_onprem_openai(path)

                # Accept both "/" and "\" as separators when extracting the chunk name,
                # and drop only the file extension (not every match inside the path)
                chunk_name = os.path.basename(path.replace("\\", "/"))
                chunk_stem = os.path.splitext(chunk_name)[0]

                transcript_path = os.path.join(output_dir, f"{chunk_stem}.txt")
                response_path = os.path.join(output_dir, f"{chunk_stem}_response.json")

                # Create the output folder right before writing into it
                os.makedirs(output_dir, exist_ok=True)

                # NOTE(review): files are written with the platform default encoding, as
                # before; consider an explicit encoding if all readers can agree on one.
                with open(transcript_path, "w") as f:
                    # Sanitize transcript to remove \ufffd
                    transcript = transcript.replace("\ufffd", "")
                    f.write(transcript)

                dump = response.model_dump_json()
                with open(response_path, "w") as f:
                    try:
                        f.write(dump)
                    except UnicodeEncodeError:
                        # The replacement character is not encodable in some default
                        # encodings; retry without it
                        f.write(dump.replace("\ufffd", ""))

In [None]:
# Transcribe all chunks of the "FG" recordings
get_transcript(path_dict=fg_paths, variant="FG")

In [None]:
# Transcribe all chunks of the "H" recordings
get_transcript(path_dict=h_paths, variant="H")