## Gemini chaptering experiments
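* This notebook demonstrates speech-to-text (STT) transcription with different models
* and is intended to generate chapters from the resulting transcript (the chaptering step is still a TODO at the end of the notebook).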

In [None]:
!pip install --upgrade datasets nltk evaluate tokenizers seqeval sequence-evaluate sentence-transformers rouge jiwer pydub google-cloud-aiplatform google-cloud-aiplatform[all]


In [128]:
# Google Cloud project used for vertexai.init() below and for the storage client in write_file_to_gcs().
PROJECT_ID = "cloud-llm-preview1"  # @param {type:"string"}

# Bucket and region settings (Colab form parameters).
BUCKET_NAME = "julien-us" # @param {type:"string"}
REGION = "us-central1" # @param {type:"string"}
BQ_REGION = "us" # @param {type:"string"}
# NOTE(review): table_id is commented out here, yet process_transcriptions() passes
# `table_id` to save_results_df_bq(); it must be defined before that cell is run.
#table_id = "julienmiquel_us.stt_v10" # @param {type:"string"}

# When True, the helpers and cells below print verbose progress information.
debug = True  # @param {type:"boolean"}
import vertexai

# Bind the Vertex AI SDK to the project and region configured above.
vertexai.init(project=PROJECT_ID, location=REGION)

from google.api_core import retry
import datetime

import os
import json
import base64
import vertexai

from vertexai.preview.generative_models import GenerativeModel
from vertexai.generative_models import (
    GenerativeModel,
    HarmCategory,
    HarmBlockThreshold,
    Part,
    SafetySetting,
    FinishReason
)
from vertexai.preview import caching


In [129]:
wav_files = !gsutil ls gs://julien-us/stt_synthetic_tests_data/*.wav

In [None]:
# Each ground-truth transcript shares the path of its audio file, with a .txt extension.
text_files = [wav_uri.replace('.wav', '.txt') for wav_uri in wav_files]

# zip() returns a one-shot iterator: it can be consumed only once.
wav_text_arr = zip(wav_files, text_files)
len(wav_files)

## Code Utils

### GCS code utils

In [131]:
from google.cloud import storage
import re

def split_gcs_uri(gcs_uri):
  """Break a ``gs://bucket/path`` URI into its bucket and blob parts.

  Args:
    gcs_uri: URI of the form ``gs://<bucket>/<blob path>``.

  Returns:
    A ``(bucket_name, blob_path)`` tuple of strings.

  Raises:
    ValueError: If ``gcs_uri`` does not start with ``gs://<bucket>/`` followed
      by a non-empty blob path.
  """
  parsed = re.match(r"gs://([^/]+)/(.+)", gcs_uri)
  if parsed is None:
    raise ValueError("Invalid GCS URI: {}".format(gcs_uri))

  bucket_name, blob_path = parsed.groups()
  return bucket_name, blob_path

def write_file_to_gcs(gcs_bucket_name, gcs_file_name, local_file_path, tags=None, verbose=False):
    """Upload a local file to a Cloud Storage bucket.

    Args:
        gcs_bucket_name: Name of the destination bucket.
        gcs_file_name: Object (blob) name to create inside the bucket.
        local_file_path: Path of the local file to upload.
        tags: Optional value assigned to the blob's ``metadata`` before the
            upload; skipped when ``None``.
        verbose: When True, print the arguments and the upload step.

    Returns:
        The blob object the file was uploaded to.
    """
    if verbose:
        print(f"local_file_path = {local_file_path} - gcs_bucket_name = {gcs_bucket_name} - gcs_file_name = {gcs_file_name}")

    client = storage.Client(project=PROJECT_ID)
    target_blob = client.bucket(gcs_bucket_name).blob(gcs_file_name)

    # Metadata has to be set before the upload call that creates the object.
    if tags is not None:
        target_blob.metadata = tags

    if verbose:
        print(f"upload_from_filename : local_file_path = {local_file_path}")
    target_blob.upload_from_filename(local_file_path)

    return target_blob


def store_temp_file_from_gcs(bucket_name, file_name, localfile):
    """Download a Cloud Storage object into the current working directory.

    Args:
        bucket_name: Name of the bucket holding the object.
        file_name: Object (blob) name inside the bucket.
        localfile: File name to create, relative to the current working
            directory.

    Returns:
        The path of the local file that was written.
    """
    import os

    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(file_name)

    # The file is written next to the notebook rather than in the system temp dir.
    temp_path = os.path.join(os.getcwd(), localfile)

    # The context manager closes the handle, so the data is fully flushed to
    # disk before callers re-open the file.
    with open(temp_path, 'wb') as fp:
        fp.write(blob.download_as_bytes())

    return temp_path

# Listen to the first element in the dataset.
if debug:
  # Download only the first audio file: the loop exits after one iteration.
  for wav_file, text_file in wav_text_arr:
    bucket, file = split_gcs_uri(wav_file)
    store_temp_file_from_gcs(bucket, file, "temp.wav")

    break

  # Needed imports
  import numpy as np
  from IPython.display import Audio, display
  from scipy.io import wavfile

  # Generate a player for the downloaded sound.
  # A bare expression nested inside an `if` block is not auto-rendered by the
  # notebook (only a top-level last expression is), so display it explicitly.
  display(Audio("temp.wav"))

### Evaluate results with metrics
- wer
- semantic_textual_similarity

In [None]:
from seq_eval import SeqEval
import evaluate

# Word-error-rate metric obtained through evaluate.load(); used by evaluate_data().
wer_metric = evaluate.load("wer")

# Sequence evaluator; evaluate_data() reads its 'semantic_textual_similarity' score.
evaluator = SeqEval()

def evaluate_data(predictions, references, verbose= False):
    """Score predicted transcripts against reference transcripts.

    Empty strings are removed from both lists first. If the two lists then
    differ in length, both are cut to the length of the shorter one so the
    metrics receive the same number of items.

    NOTE(review): empty strings are filtered from each list independently, so
    the remaining items can end up paired with a different partner than
    before; confirm this is acceptable for multi-item inputs.

    Args:
        predictions: List of generated transcript strings.
        references: List of ground-truth transcript strings.
        verbose: Forwarded to ``evaluator.evaluate`` and, when True, the full
            score dictionary is printed.

    Returns:
        A ``(wer, semantic_textual_similarity)`` tuple: the word error rate as
        a percentage rounded to two decimals, and the
        ``'semantic_textual_similarity'`` entry of the evaluator's scores.
    """
    references = [text for text in references if text != '']
    predictions = [text for text in predictions if text != '']

    if len(references) != len(predictions):
        # Use the shorter of the two lists. Comparing len(references) with
        # itself left `predictions` shorter than `references` untouched.
        min_arr = min(len(references), len(predictions))
        print(f"Reduce size to {min_arr}")
        predictions = predictions[:min_arr]
        references = references[:min_arr]

    scores = evaluator.evaluate(predictions, references, verbose=verbose)

    if verbose: print(scores)

    wer = wer_metric.compute(references=references, predictions=predictions)
    wer = round(100 * wer, 2)
    print("WER:", wer ,end='\n')
    print("semantic_textual_similarity:",scores['semantic_textual_similarity'],end='\n')
    return wer, scores['semantic_textual_similarity']




## Strategy max_token

In [None]:
# Get the transcription with the max-token strategy:
# when generation stops with the MAX_TOKENS finish reason, ask the model to continue using the prompt_continue prompt.


# This code processes a Part object (built from a URI or from raw data).
# If the max-token limit is reached, a continuation prompt is applied to continue the generation.
@retry.Retry(timeout=3000.0)
def transcribe_with_gemini_from_part(prompt, audio, model_name, top_p):
  """Transcribe an audio ``Part`` with Gemini, resuming after MAX_TOKENS stops.

  The model is first called with ``[audio, prompt]``. Each time a response
  ends with ``FinishReason.MAX_TOKENS`` the model is called again with a
  continuation prompt, and the text of all rounds is concatenated.

  Two optional notebook-level globals are honoured when they exist:
  ``system_instruction`` (passed to the model) and ``prompt_continue`` (used
  as the continuation prompt). When ``prompt_continue`` is not defined, a
  default continuation prompt that embeds the text generated so far is used.

  Args:
    prompt: Text instruction sent with the audio on the first call.
    audio: ``Part`` holding the audio (built from a URI or from raw data).
    model_name: Model name (or endpoint resource path) given to
      ``GenerativeModel``.
    top_p: ``top_p`` value of the generation config.

  Returns:
    The concatenated text of every round with Markdown code fences removed.
    The text can be partial or empty when the model keeps returning blocked
    or unreadable responses.
  """
  # Both globals may be assigned in a later cell (or never); look them up
  # defensively so a missing one does not raise NameError mid-transcription.
  system_instruction_text = globals().get("system_instruction")
  continue_prompt = globals().get("prompt_continue")

  if system_instruction_text is not None:
    model = GenerativeModel(
        model_name,
        system_instruction=[system_instruction_text]
    )
  else:
    model = GenerativeModel(model_name)

  generation_config = {
      "max_output_tokens": 8192,
      "temperature": 0.0,
      "top_p": top_p,
  }
  safety_settings = {
      HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
      HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
      HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
      HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
  }

  # With temperature 0.0 a blocked or unreadable response is likely to repeat,
  # so consecutive attempts that add no text are capped instead of looping
  # forever.
  max_unproductive_attempts = 3
  unproductive_attempts = 0

  result = []

  while True:
    if debug: print("Generating...")

    if result:
      previous_text = "".join(result)
      if debug: print(previous_text)

      if continue_prompt is not None:
        prompts = [audio, continue_prompt]
      else:
        # Default continuation: repeat the task and show the model where the
        # previous round stopped.
        prompts = [
            audio,
            f"{prompt}\n\n"
            "The transcription below was cut off because the output limit was reached. "
            "Continue exactly where it stops and do not repeat any text that is already transcribed.\n\n"
            f"Transcription so far:\n{previous_text}",
        ]
      if debug: print(80*"*+")
    else:
      prompts = [audio, prompt]
      if debug: print("First prompt")

    response = model.generate_content(
        contents=prompts,
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )

    candidates = response.candidates
    finish_reason = candidates[0].finish_reason if candidates else None

    blocked = (
        not candidates
        or finish_reason == FinishReason.RECITATION
        or finish_reason == FinishReason.PROHIBITED_CONTENT
    )

    got_text = False
    if blocked:
      print(finish_reason, end="\n")
    else:
      try:
        value = response.text
        if debug: print(value, end="\n")

        # Drop Markdown code fences the model may wrap around its answer.
        value = value.replace("```tsv", "").replace("```json", "").replace("```", "")

        result.append(value)
        got_text = True
      except (ValueError, AttributeError) as e:
        print("ERROR get the result")
        print(e, end="\n")

      if debug: print(finish_reason)

      # Only a MAX_TOKENS stop requires another round.
      if finish_reason != FinishReason.MAX_TOKENS:
        if debug: print("isFinished")
        break

    if got_text:
      unproductive_attempts = 0
    else:
      unproductive_attempts += 1
      if unproductive_attempts >= max_unproductive_attempts:
        print(f"Giving up after {unproductive_attempts} attempts without usable text")
        break

  if debug: print("Done")
  return "".join(result)


# @retry.Retry(timeout=3000.0)
# def transcribe_with_gemini(prompt, mime_type="audio/wav", audio_path=None, data=None, model_name=None, top_p=0.0):

#   if model_name is None:
#     raise Exception("Model name is required")

#   # data or URI
#   if audio_path is not None:
#     audio1 = Part.from_uri(
#       mime_type=mime_type,
#       uri=audio_path)
#   else:
#     audio1 = Part.from_data(data,
#         mime_type=mime_type)

#   return transcribe_with_gemini_from_part(prompt, audio1, model_name, top_p)

@retry.Retry(timeout=3000.0)
def transcribe_with_gemini_data(prompt, data=None, mime_type="audio/wav", model_name=None, top_p=0.0):
  """Transcribe audio supplied as raw data.

  Wraps ``data`` in a ``Part`` of the given MIME type and delegates to
  ``transcribe_with_gemini_from_part``, whose result is returned unchanged.
  """
  return transcribe_with_gemini_from_part(
      prompt,
      Part.from_data(data, mime_type=mime_type),
      model_name,
      top_p,
  )

@retry.Retry(timeout=3000.0)
def transcribe_with_gemini_from_uri(prompt, audio_path, mime_type="audio/wav", model_name=None, top_p=0.0):
  """Transcribe audio referenced by a URI.

  Builds a ``Part`` from ``audio_path`` with the given MIME type and delegates
  to ``transcribe_with_gemini_from_part``, whose result is returned unchanged.
  """
  return transcribe_with_gemini_from_part(
      prompt,
      Part.from_uri(uri=audio_path, mime_type=mime_type),
      model_name,
      top_p,
  )

if debug:
  # Smoke test: transcribe the first listed audio file with gemini-1.5-flash and print the text.
  # NOTE(review): `prompt` is only assigned in a later cell of this notebook, so this
  # cell presumably relies on that cell having been run first; confirm.
  # uri= 'gs://julien-us/stt_synthetic_tests_data/stt-synthetic-data-rate14-28.wav'
  uri=wav_files[0]
  result = transcribe_with_gemini_from_uri(prompt, audio_path=uri, model_name="gemini-1.5-flash")
  print(result)


In [None]:
%%time

import os
import json


# def generate_gemini_result(wav_text_arr, model_name):
#   print(f"Model: {model_name}")
#   i = 0
#   for wav_file, text_file in wav_text_arr:
#     print(f"Processing {i}")
#     bucket, file_wav = split_gcs_uri(wav_file)
#     bucket, file_txt = split_gcs_uri(text_file)
#     #local_file = f"{i}-temp.wav"
#     local_file_txt = f"{i}-ground-truth.txt"
#     #store_temp_file_from_gcs(bucket, file_wav, local_file)
#     store_temp_file_from_gcs(bucket, file_txt, local_file_txt)

#     result = transcribe_with_gemini(prompt, wav_file, model_name=model_name)
#     result= "".join(result)

#     if not "gemini" in model_name :
#       model_id = model_name.split("/")[-1]
#     else:
#       model_id = model_name
#     gemini_file = f"{i}-gemini_{model_id}_result.json"
#     with open(gemini_file, "w", encoding="UTF8") as f:
#         f.write(result)

#     print(80*"-")
#     print(result)
#     print(80*"-")

#     tags = { "model_name": model_name,
#             "file": file_wav,
#             "ground-truth": file_txt,
#             }
#     write_file_to_gcs(bucket,  text_file.replace("stt_synthetic_tests_data", "stt_synthetic_results").replace(".txt","") + f"-gemini-{model_id}.txt",
#                       gemini_file, tags )


#     with open(local_file_txt, 'r') as f:
#       ground_truth = f.read()

#     wer, semantic_textual_similarity = evaluate_data([result], [ground_truth])
#     print(f"Results:{wav_file}, WER: {wer}, semantic_textual_similarity: {semantic_textual_similarity}")

#     if system_instruction:
#       prompt_log = "system_instruction:" + system_instruction + "\nprompt:" +prompt
#     else:
#       prompt_log = prompt

#     data = {
#       "input_file": wav_file,
#       "ground_truth": ground_truth,
#       "model_name": "gemini_max_token:"+ model_name,
#       "prompt": prompt_log,
#       #"system_instruction": system_instruction,
#       "wer": wer,
#       "semantic_textual_similarity": semantic_textual_similarity,
#       "generated_file": gemini_file,
#       "generated_text": result
#     }
#     import pandas as pd
#     df = pd.DataFrame( data = [data], columns = ["input_file","ground_truth", "wer", "semantic_textual_similarity","generated_file","generated_text" , "model_name", "prompt"])
#     # return df
#     save_results_df_bq(df, table_id, truncate=False)

#     i += 1


# models_names = [
#   # "gemini-1.5-flash",
#   # "gemini-1.5-flash-001",
#   # "gemini-1.5-flash-002",

#   # "gemini-1.5-pro",
#   # "gemini-1.5-pro-001",
#   # "gemini-1.5-pro-002",
#         "projects/801452371447/locations/us-central1/endpoints/3103157164630343680",

#   ]


# for models_name in models_names:
#   generate_gemini_result(wav_text_arr, model_name=models_name)
# # df = generate_gemini_result(wav_text_arr, model_name)

## Strategy split by silence
 - Split the audio file into segments to avoid hitting the max-output-token finish reason

In [137]:
# utils function to truncate audio file
import io
import pydub
from pydub import AudioSegment

# Directory settings, both pointing at the current working directory.
root_dir = '.'
output_dir = '.'


def splitAudio(root_dir, file, start, stop, output_dir):
    """Export the ``[start:stop]`` slice of an audio file as a WAV file.

    Args:
        root_dir: Directory containing the input file.
        file: Name of the input audio file inside ``root_dir``.
        start: Slice start (pydub slices audio in milliseconds).
        stop: Slice end (pydub slices audio in milliseconds).
        output_dir: Directory where the segment is written.

    Returns:
        The path of the exported segment,
        ``<output_dir>/<file>-<start>-<stop>.wav``.
    """
    import os

    # os.path.join inserts the separator that plain concatenation dropped
    # (root_dir='.' used to yield '.file'). from_file() does not force the MP3
    # decoder, so WAV and other formats load as well.
    sound = AudioSegment.from_file(os.path.join(root_dir, file))

    sound = sound[start:stop]
    if debug:
      print(f"file = {file}")
      print(f"duration_seconds = {sound.duration_seconds}")
      print(f"sample_width = {sound.sample_width}")
      print(f"channels = {sound.channels}")
      print(f"frame_rate = {sound.frame_rate}")

    file_segment = os.path.join(output_dir, f"{file}-{start}-{stop}.wav")

    sound.export(file_segment, format="wav")
    return file_segment



In [138]:
# generate sequence from audio file split by silences
def get_audio_sequence_split_by_silences(file, min_silence_len=500):
    """Split an audio file into non-silent segments.

    Segments are detected with ``silence.detect_nonsilent`` using a silence
    threshold 20 dB below the file's average loudness. While any segment is
    longer than 59 seconds, the minimum silence length is lowered in 100 ms
    steps (never below 100 ms) and the detection is repeated, so shorter
    pauses also become split points.

    Args:
        file: Path of the audio file to analyse.
        min_silence_len: Initial minimum silence length, in milliseconds, that
            separates two segments.

    Returns:
        The list of ``[start, stop]`` pairs (milliseconds) returned by the last
        detection. Segments longer than 59 seconds can remain when even the
        100 ms floor does not split them.
    """
    from pydub import AudioSegment, silence

    max_segment_ms = 59000        # longest segment considered acceptable
    min_silence_floor_ms = 100    # never search for silences shorter than this
    silence_step_ms = 100         # how much the minimum silence shrinks per retry

    # from_file() does not force the MP3 decoder, so WAV files load as well.
    myaudio = AudioSegment.from_file(file)
    silence_thresh = myaudio.dBFS - 20

    def detect(silence_len):
        return silence.detect_nonsilent(
            myaudio,
            min_silence_len=silence_len,
            silence_thresh=silence_thresh,
            seek_step=10,
        )

    def too_long(sequences):
        return [(start, stop) for start, stop in sequences if stop - start > max_segment_ms]

    speak_sequences = detect(min_silence_len)
    speak_sequences_too_big = too_long(speak_sequences)

    # Lower the threshold gradually and stop at the floor: a minimum silence
    # length of 0 would no longer describe a real pause.
    while speak_sequences_too_big and min_silence_len > min_silence_floor_ms:
        min_silence_len = max(min_silence_len - silence_step_ms, min_silence_floor_ms)
        speak_sequences = detect(min_silence_len)

        speak_sequences_too_big = too_long(speak_sequences)
        print("Sequence more than 59s : ", len(speak_sequences_too_big))
        print(f"min_silence_len = {min_silence_len}")

    return speak_sequences


In [None]:
# generate sequence from audio file split by hard split defined by the increment variable
def get_audio_sequence_hard_split(file, INCREMENT=59*1000):
    """Split an audio file into consecutive fixed-length windows.

    Args:
        file: Path of the audio file to measure.
        INCREMENT: Window length in milliseconds; must be positive.

    Returns:
        A list of ``(start, stop)`` tuples in milliseconds covering the whole
        file; the last window is shortened to end at the file's duration.

    Raises:
        ValueError: If ``INCREMENT`` is not positive.
    """
    from pydub import AudioSegment
    import math

    if INCREMENT <= 0:
        raise ValueError(f"INCREMENT must be a positive number of milliseconds, got {INCREMENT}")

    # from_file() does not force the MP3 decoder, so WAV files load as well.
    sound = AudioSegment.from_file(file)

    if debug: print(f"duration_seconds = {sound.duration_seconds}")
    # Round up so a trailing fraction of a millisecond is still covered.
    duration_ms = math.ceil(sound.duration_seconds * 1000)
    if debug: print(f"duration_ms = {duration_ms}")
    return [(start, min(start + INCREMENT, duration_ms))
            for start in range(0, duration_ms, INCREMENT)]

    # speak_sequences = []
    # while(finish == False):
    #     stop = start+(INCREMENT)
    #     if stop > duration_ms:
    #         stop = duration_ms
    #         finish = True
    #     speak_sequences.append((start, stop))
    # return speak_sequences
    # Instead of a while loop, use a more concise list comprehension


# generate one full sequence from audio file
def get_one_full_sequence(file):
    """Return a single segment that spans the whole audio file.

    Args:
        file: Path of the audio file to measure.

    Returns:
        A one-element list ``[(0, duration_ms)]`` where ``duration_ms`` is the
        file's duration in milliseconds, rounded up.
    """
    from pydub import AudioSegment
    import math

    # from_file() does not force the MP3 decoder, so WAV files load as well.
    sound = AudioSegment.from_file(file)

    if debug: print(f"duration_seconds = {sound.duration_seconds}")

    duration_ms = math.ceil(sound.duration_seconds * 1000)
    if debug: print(f"duration_ms = {duration_ms}")

    return [(0, duration_ms)]

if debug:
  # Smoke-run both sequence helpers on a local file; the returned lists are discarded.
  # NOTE(review): "1-temp.wav" matches the name process_transcriptions() gives the audio
  # file with index 1, so this presumably requires a previous run to have created it; confirm.
  get_audio_sequence_hard_split("1-temp.wav")
  get_one_full_sequence("1-temp.wav")

In [140]:
# Process a local audio file chunk by chunk: each segment produced by the split strategy is transcribed by the _stt function with the given prompt and Gemini model.
def process_local_file_by_chunk(file_name, _stt, _split_sequence_strategy, prompt, model_name):
    """Transcribe a local audio file segment by segment.

    Args:
        file_name: Path of the local audio file.
        _stt: Callable invoked as ``_stt(data=..., prompt=..., model_name=...)``
            with the WAV bytes of one segment; returns that segment's text.
        _split_sequence_strategy: Callable that takes ``file_name`` and returns
            an iterable of ``(start, stop)`` positions in milliseconds.
        prompt: Prompt forwarded to ``_stt``.
        model_name: Model name forwarded to ``_stt``.

    Returns:
        The segment transcripts concatenated in order, each followed by one
        space.
    """
    # from_file() does not force the MP3 decoder, so WAV files load as well.
    sound = AudioSegment.from_file(file_name)
    if debug:
      print(f"duration_seconds = {sound.duration_seconds}")
      print(f"sample_width = {sound.sample_width}")
      print(f"channels = {sound.channels}")
      print(f"frame_rate = {sound.frame_rate}")

    results = []

    for (start, stop) in _split_sequence_strategy(file_name):
        # Encode the segment as WAV in memory; getvalue() returns the whole
        # buffer regardless of the current stream position.
        buffer = io.BytesIO()
        sound[start:stop].export(buffer, format="wav")

        batch_result = _stt(data=buffer.getvalue(), prompt=prompt, model_name=model_name)
        # No-op for a string; also accepts an iterable of text fragments.
        batch_result = "".join(batch_result)

        print(f"start = {start} - stop = {stop}")

        # append() stores one item per segment (extend() used to add the text
        # character by character).
        results.append(batch_result + " ")

    return "".join(results)

In [141]:
import os
import json
import time

def gemini_stt(data, prompt, model_name, uri=None):
  """Speech-to-text adapter for raw audio data.

  Forwards to ``transcribe_with_gemini_data``; ``uri`` is accepted but not used.
  """
  return transcribe_with_gemini_data(prompt, data=data, model_name=model_name)


def gemini_stt_gcs(uri, prompt, model_name, data=None):
  """Speech-to-text adapter for audio referenced by a URI.

  Forwards to ``transcribe_with_gemini_from_uri``; ``data`` is accepted but not used.
  """
  return transcribe_with_gemini_from_uri(prompt, audio_path=uri, model_name=model_name)


def process_transcriptions(prompt, audio_extention = '.wav'):
  """Run every split strategy / model pair over every audio file and record the scores.

  For each entry of ``split_strategies_dic``, each entry of ``models_dic`` and
  each URI in ``wav_files`` this function:

  1. downloads the ground-truth text (same URI with a ``.txt`` extension);
  2. transcribes the audio, either chunk by chunk from a local copy (strategy
     is a function) or directly from its GCS URI (strategy is ``None``);
  3. writes the transcript to a local file and uploads it to GCS;
  4. scores it with ``evaluate_data``;
  5. hands a one-row DataFrame to ``save_results_df_bq``.

  NOTE(review): ``table_id`` and ``save_results_df_bq`` are not defined in the
  visible part of this notebook; they must exist before this function runs.

  Args:
    prompt: Prompt sent to the model for every transcription.
    audio_extention: Extension of the audio files, replaced by ``.txt`` to
      locate the ground truth and used to name the local audio copy.
  """
  import pandas as pd

  # system_instruction is assigned in a later cell; read it defensively so an
  # earlier call does not fail with NameError after the transcription is done.
  system_instruction_text = globals().get("system_instruction")

  for split_strategy, _split_strategy in split_strategies_dic.items():
    if debug: print(f"Split strategy: {split_strategy}")

    for model_name, _stt in models_dic.items():
      if debug: print(f"Model: {model_name}")
      text_files = [wav_uri.replace(audio_extention, '.txt') for wav_uri in wav_files]

      for i, (wav_file, text_file) in enumerate(zip(wav_files, text_files)):
        if debug: print(f"Processing {i}")
        # Store text file locally
        bucket, file_txt = split_gcs_uri(text_file)
        local_file_txt = f"{i}-ground-truth.txt"
        store_temp_file_from_gcs(bucket, file_txt, local_file_txt)

        start_time = time.perf_counter()
        if _split_strategy is not None:
          # Store the audio file locally so it can be cut into segments.
          bucket, file_wav = split_gcs_uri(wav_file)
          local_file = f"{i}-temp{audio_extention}"
          store_temp_file_from_gcs(bucket, file_wav, local_file)

          result = process_local_file_by_chunk(local_file,
                                _stt=_stt,
                                _split_sequence_strategy=_split_strategy,
                                prompt=prompt,
                                model_name=model_name)
        else:
          # No split strategy: let the model read the audio straight from GCS.
          result = transcribe_with_gemini_from_uri(audio_path=wav_file, prompt=prompt, model_name=model_name)

        elapsed_time = time.perf_counter() - start_time

        if debug: print(f"Elapsed time: {elapsed_time} seconds")
        # Endpoint resource paths are shortened to their last path component.
        if "gemini" not in model_name:
          model_id = model_name.split("/")[-1]
        else:
          model_id = model_name

        gemini_file = f"{i}-gemini_{split_strategy}_{model_id}_result.txt"
        with open(gemini_file, "w", encoding="UTF8") as f:
            f.write(result)

        tags = { "model_name": model_name,
                "file": wav_file,
                "ground-truth": file_txt,
                }

        #TODO: #FixMe ugly specific code
        # Build the object name from the blob path, not from the full
        # gs://bucket/... URI, which would end up inside the object name.
        result_blob_name = (
            file_txt.replace("stt_synthetic_tests_data", "stt_synthetic_results").replace(".txt","")
            + f"-gemini_{split_strategy}_{model_id}.txt"
        )
        write_file_to_gcs(bucket, result_blob_name, gemini_file, tags)

        # Read with an explicit encoding so accented text decodes the same way
        # on every platform.
        # NOTE(review): assumes the ground-truth files are UTF-8; confirm.
        with open(local_file_txt, 'r', encoding="utf-8") as f:
          ground_truth = f.read()
        ground_truth = ground_truth.replace("\n", " ")

        wer, semantic_textual_similarity = evaluate_data([result], [ground_truth])
        if debug: print(f"Results:{wav_file}, WER: {wer}, semantic_textual_similarity: {semantic_textual_similarity}")

        if system_instruction_text:
          prompt_log = "system_instruction:" + system_instruction_text + "\nprompt:" +prompt
        else:
          prompt_log = prompt

        data = {
          "input_file": wav_file,
          "ground_truth": ground_truth,
          "model_name": split_strategy+model_name,
          "prompt": prompt_log,
          "wer": wer,
          "processing_time": elapsed_time,
          "semantic_textual_similarity": semantic_textual_similarity,
          "generated_file": gemini_file,
          "generated_text": result
        }
        df = pd.DataFrame( data = [data], columns = ["input_file","ground_truth", "wer", "semantic_textual_similarity","generated_file","generated_text" , "model_name", "prompt", "processing_time"])

        save_results_df_bq(df, table_id, truncate=False)



In [None]:
%%time

# Maps each model name (or endpoint resource path) to the STT callable that
# process_transcriptions() uses when the audio is transcribed chunk by chunk.
models_dic = {
  "gemini-1.5-pro": gemini_stt,
  "gemini-1.5-pro-001": gemini_stt,
  "gemini-1.5-pro-002": gemini_stt,
  "gemini-1.5-flash-002": gemini_stt,
  "gemini-1.5-flash": gemini_stt,
  "projects/801452371447/locations/us-central1/endpoints/3103157164630343680": gemini_stt,
}

# Maps a strategy label to the function that returns the (start, stop) segments
# of a local audio file. A value of None makes process_transcriptions() send
# the GCS URI directly to transcribe_with_gemini_from_uri().
# The label is also used as a prefix of the logged model name and inside file names.
split_strategies_dic = {

  "no_split:"         : get_one_full_sequence,
  "gcs_max_token:"    : None,
  "split_by_silences:": get_audio_sequence_split_by_silences,
  "hard_split:"       : get_audio_sequence_hard_split,
}

# NOTE(review): `prompt` is assigned in the next cell, so this call presumably
# relies on that cell having been executed beforehand; confirm.
process_transcriptions(prompt)

In [None]:
%%time

# System instruction read as a global by transcribe_with_gemini_from_part() and
# process_transcriptions(). The text is sent to the model as-is.
system_instruction = """<ai_role>
  You are an AI transcriptionist specializing in interviews.
  Your primary function is to convert spoken language from audio files into accurate, well-formatted text. Audio files are in french, transcriptions are in french.
  Ensure the transcribed text is clear and readable.
  Add ponctuation like comma, question mark, exclamation mark, etc.
  Ignore background audio.
</ai_role>
<answer_format>
Output full word only.
Do not generate any other text.
Do not truncate words.
</answer_format>
"""

# User prompt sent together with the audio for every transcription.
prompt = """Generate a transcription in French of the audio, only extract speech and ignore background audio.
Transcribe spoken words.
"""

# Run the full evaluation with the system instruction and prompt defined above.
process_transcriptions(prompt)

In [144]:


# %%time

# import os
# import json

# text_files = [string.replace('.wav', '.txt') for string in wav_files]

# wav_text_arr = zip(wav_files, text_files)
# # prompt = "transcribe spoken words"
# # prompt = """Generate a transcription in French of the audio, only extract speech and ignore background audio.
# # """

# def generate_gemini_result(wav_text_arr, model_name):
#   print(f"Model: {model_name}")
#   i = 0
#   for wav_file, text_file in wav_text_arr:
#     print(f"Processing {i}")
#     bucket, file_wav = split_gcs_uri(wav_file)
#     bucket, file_txt = split_gcs_uri(text_file)
#     local_file = f"{i}-temp.wav"
#     local_file_txt = f"{i}-ground-truth.txt"
#     store_temp_file_from_gcs(bucket, file_wav, local_file)
#     store_temp_file_from_gcs(bucket, file_txt, local_file_txt)

#     result = transcribe_with_gemini(prompt, wav_file)
#     result= "".join(result)

#     gemini_file = f"{i}-gemini_{model_name}_result.json"
#     with open(gemini_file, "w", encoding="UTF8") as f:
#         f.write(result)

#     print(80*"-")
#     print(result)
#     print(80*"-")

#     tags = { "model_name": model_name,
#             "file": file_wav,
#             "ground-truth": file_txt,
#             }
#     write_file_to_gcs(bucket,  text_file.replace("stt_synthetic_tests_data", "stt_synthetic_results").replace(".txt","") + f"-gemini-{model_name}.txt",
#                       gemini_file, tags )


#     with open(local_file_txt, 'r') as f:
#       ground_truth = f.read()

#     wer, semantic_textual_similarity = evaluate_data([result], [ground_truth])
#     print(f"Results:{wav_file}, WER: {wer}, semantic_textual_similarity: {semantic_textual_similarity}")

#     data = {
#       "input_file": wav_file,
#       "ground_truth": ground_truth,
#       "model_name": #"gemini:"+
#                     model_name,
#       "prompt": "system_instruction:" + system_instruction + "\nprompt:" +prompt,
#       #"system_instruction": system_instruction,
#       "wer": wer,
#       "semantic_textual_similarity": semantic_textual_similarity,
#       "generated_file": gemini_file,
#       "generated_text": result
#     }
#     import pandas as pd
#     df = pd.DataFrame( data = [data], columns = ["input_file","ground_truth", "wer", "semantic_textual_similarity","generated_file","generated_text" , "model_name", "prompt"])
#     # return df
#     save_results_df_bq(df, table_id, truncate=False)

#     i += 1





# # df = generate_gemini_result(wav_text_arr, model_name)


In [145]:
# import jiwer

# def process_words(df, idx):
#   out = jiwer.process_words(
#       [df['ground_truth'][idx]],
#       [df['generated_text'][idx]],
#   )

#   print(jiwer.visualize_alignment(out))

# def process_words_str(ground_truth, generated_text):
#   out = jiwer.process_words(
#       [ground_truth],
#       [generated_text],
#   )

#   print(jiwer.visualize_alignment(out))

# def process_words_list(ground_truth, generated_text):
#   out = jiwer.process_words(
#       ground_truth,
#       generated_text,
#   )

#   print(jiwer.visualize_alignment(out))

# process_words(df, 0)

In [146]:
# idx = 0
# ground_truths = df['ground_truth'][idx].split("\n")
# generated_texts = df['generated_text'][idx].split("\n")
# for ground_truth, generated_text in zip(ground_truths, generated_texts):
#   if len(generated_text) > 0:
#     process_words_str(ground_truth.replace("  "," "), generated_text.replace("  "," "))

In [147]:
# idx = 0
# ground_truths = df['ground_truth'][idx].replace("\n"," ")
# generated_texts = df['generated_text'][idx].replace("\n"," ")
# process_words_str(ground_truths.replace("  "," "), generated_texts.replace("  "," "))

# # for ground_truth, generated_text in zip(ground_truths, generated_texts):
# #   if len(generated_text) > 0:
# #     process_words_str(ground_truth, generated_text)

In [148]:
# df['generated_text'][idx].split("\n")

In [149]:
# df['ground_truth'][idx].split("\n")

In [150]:
# df['generated_text'][idx].split("\n")

# And generate chapter based on this transcript. 
## TODO: Implement prompt based on result of previous STT

#TODO