## Google Speech API

In [None]:
!pip install --upgrade datasets nltk evaluate tokenizers seqeval sequence-evaluate sentence-transformers rouge jiwer google-cloud-aiplatform google-cloud-aiplatform[all] google-cloud-speech librosa jiwer protobuf pydub  google-cloud-storage

In [27]:
PROJECT_ID = "cloud-llm-preview1"  # @param {type:"string"}

BUCKET_NAME = "julien-us" # @param {type:"string"}
REGION = "us-central1" # @param {type:"string"}
BQ_REGION = "us" # @param {type:"string"}
table_id = "julienmiquel_us.stt_speech2" # @param {type:"string"}

import vertexai

vertexai.init(project=PROJECT_ID, location=REGION)


In [28]:
# GCS location of the audio files and their ground-truth transcripts
wav_files = !gsutil ls gs://julien-us/stt_synthetic_tests_data/*.wav


In [None]:
text_files = [string.replace('.wav', '.txt') for string in wav_files]

wav_text_arr = zip(wav_files, text_files)
len(wav_files)

In [29]:
from google.cloud import storage
import re

def split_gcs_uri(gcs_uri):
  """Break a ``gs://bucket/path`` URI into its bucket and object path.

  Args:
    gcs_uri: URI of the form ``gs://<bucket>/<blob path>``.

  Returns:
    A ``(bucket_name, blob_path)`` tuple of strings.

  Raises:
    ValueError: If ``gcs_uri`` does not match the expected form.
  """
  parsed = re.match(r"gs://([^/]+)/(.+)", gcs_uri)
  # Guard clause: anything that is not gs://<bucket>/<non-empty path> is rejected.
  if parsed is None:
    raise ValueError("Invalid GCS URI: {}".format(gcs_uri))

  bucket_name, blob_path = parsed.groups()
  return bucket_name, blob_path

def write_file_to_gcs(gcs_bucket_name,  gcs_file_name, local_file_path, tags = None, verbose= False):
    """Upload a local file to a Cloud Storage object.

    Args:
        gcs_bucket_name: Name of the destination bucket.
        gcs_file_name: Object name (path inside the bucket) to write.
        local_file_path: Path of the local file to upload.
        tags: Optional value assigned to the blob's ``metadata`` before the
            upload; nothing is assigned when it is ``None``.
        verbose: When true, print the arguments and an upload trace line.

    Returns:
        The blob object that was uploaded (not a path string).
    """
    if verbose:
        print(f"local_file_path = {local_file_path} - gcs_bucket_name = {gcs_bucket_name} - gcs_file_name = {gcs_file_name}")

    # The client is created for the notebook-level PROJECT_ID.
    gcs_client = storage.Client(project=PROJECT_ID)
    target_blob = gcs_client.bucket(gcs_bucket_name).blob(gcs_file_name)

    if tags is not None:
        target_blob.metadata = tags

    if verbose:
        print(f"upload_from_filename : local_file_path = {local_file_path}")
    target_blob.upload_from_filename(local_file_path)

    return target_blob


def store_temp_video_from_gcs(bucket_name, file_name, localfile):
    """Download a Cloud Storage object into the current working directory.

    Despite the name, nothing here is video-specific: the object's bytes are
    written unchanged to a local file.

    Args:
        bucket_name: Name of the bucket holding the object.
        file_name: Object name (path inside the bucket) to download.
        localfile: File name to write; it is joined onto ``os.getcwd()``.

    Returns:
        The path of the local file that was written.
    """
    import os

    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(file_name)
    bytes_data = blob.download_as_bytes()

    temp_path = os.path.join(os.getcwd(), localfile)
    # The context manager flushes and closes the handle before the path is
    # handed back; previously the file object was left open.
    with open(temp_path, "wb") as fp:
        fp.write(bytes_data)

    return temp_path



In [None]:
from seq_eval import SeqEval
import evaluate

wer_metric = evaluate.load("wer")

evaluator = SeqEval()

def evaluate_data(predictions, references, verbose=False, cleanup = False):
    """Score predicted transcripts against reference transcripts.

    Args:
        predictions: List of predicted texts.
        references: List of reference (ground-truth) texts.
        verbose: Forwarded to ``evaluator.evaluate``; also prints the raw
            scores it returns.
        cleanup: When true, drop empty strings from both lists before scoring.

    Returns:
        A ``(wer, semantic_textual_similarity)`` tuple, where ``wer`` is the
        value from ``wer_metric.compute`` multiplied by 100 and rounded to two
        decimals.
    """
    if cleanup:
        # NOTE(review): each list is filtered on its own, so pairs can shift
        # when only one side of a pair is empty — confirm this is intended.
        references = [x for x in references if x != '']
        predictions = [x for x in predictions if x != '']

    if len(references) != len(predictions):
        # Truncate both lists to the shorter one. Both lengths must take part
        # in min(); comparing len(references) with itself left the lists
        # unequal whenever there were more references than predictions.
        min_arr = min(len(references), len(predictions))
        print(f"Reduce size to {min_arr}")
        predictions = predictions[0:min_arr]
        references = references[0:min_arr]

    scores = evaluator.evaluate(predictions, references, verbose=verbose)
    if verbose:
        print(scores)

    wer = wer_metric.compute(references=references, predictions=predictions)
    wer = round(100 * wer, 2)
    print("WER:", wer ,end='\n')
    print("semantic_textual_similarity:",scores['semantic_textual_similarity'],end='\n')
    return wer, scores['semantic_textual_similarity']

evaluate_data(["Hello"], ["Hello"])


In [31]:
from google.cloud import bigquery

client = bigquery.Client(project=PROJECT_ID, location=BQ_REGION)

def save_results_df_bq(df, table_id, truncate = True):
    """Load a results DataFrame into a BigQuery table and wait for the job.

    Args:
        df: DataFrame to load; the schema below names the expected columns.
        table_id: Destination table passed to ``load_table_from_dataframe``.
        truncate: When true, the job's write disposition is set to
            ``WRITE_TRUNCATE``; otherwise it is left unset.
    """
    # (column name, BigQuery type) in schema order; every field is NULLABLE.
    # The schema is partial: it only assists data type definitions.
    column_types = [
        ("input_file", "STRING"),
        ("ground_truth", "STRING"),
        ("model_name", "STRING"),
        ("prompt", "STRING"),
        ("wer", "FLOAT"),
        ("semantic_textual_similarity", "FLOAT"),
        ("generated_file", "STRING"),
        ("generated_text", "STRING"),
    ]
    load_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField(column, kind, mode="NULLABLE")
            for column, kind in column_types
        ],
    )

    if truncate:
        # WRITE_TRUNCATE replaces the table contents with the loaded data.
        print('truncate table: ' + table_id)
        load_config.write_disposition = "WRITE_TRUNCATE"

    load_job = client.load_table_from_dataframe(df, table_id, job_config=load_config)
    load_job.result()  # Block until the load job has finished.



In [None]:
text_files = [string.replace('.wav', '.txt') for string in wav_files]

wav_text_arr = zip(wav_files, text_files)

for wav_file, text_file in wav_text_arr:
  print(wav_file)


### One-time initialization

In [None]:
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
from google.api_core.client_options import ClientOptions

def create_recognizer(language_code = "fr-FR", model_name = "latest_long", location = "global") -> "str | None":
    """Create a Speech-to-Text v2 recognizer, tolerating one that already exists.

    Args:
        language_code: Language code stored in the recognizer's default config.
        model_name: Model stored in the recognizer's default config.
        location: ``"global"`` uses the default client; any other value uses
            the ``<location>-speech.googleapis.com`` endpoint.

    Returns:
        The recognizer id (built by ``get_recognizer``) when the recognizer
        was created or already existed; ``None`` when creation failed for any
        other reason.
    """
    # Catch the typed exception instead of comparing the class name as text.
    from google.api_core.exceptions import AlreadyExists

    # Instantiates a client
    if location == "global":
        client = SpeechClient()
    else:
        client = SpeechClient(
            client_options=ClientOptions(api_endpoint=f"{location}-speech.googleapis.com")
        )

    recognizer_id = get_recognizer(language_code, model_name, location)

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{PROJECT_ID}/locations/{location}",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            default_recognition_config=cloud_speech.RecognitionConfig(
                language_codes=[language_code], model=model_name
            ),
        ),
    )
    try:
        operation = client.create_recognizer(request=request)
        recognizer = operation.result()

        print("Created Recognizer:", recognizer.name)
    except AlreadyExists as e:
        # An existing recognizer with this id is fine: reuse it.
        print("Recognizer already exists:", e)
        return recognizer_id
    except Exception as e:
        # Best effort: report the failure and signal it with None.
        print("Failed to create recognizer:", e)
        return None
    return recognizer_id

def get_recognizer(language_code, model_name, location):
    """Build the recognizer id for a model / language / location triple.

    The parts are joined as ``<model>-<language lower-cased>-<location>`` and
    every underscore in the joined text is then replaced by a hyphen.
    """
    joined = "{}-{}-{}".format(model_name, language_code.lower(), location)
    return joined.replace("_", "-")


recognizer_chirp2 = create_recognizer("fr-FR", "chirp_2", "us-central1")
recognizer_long = create_recognizer("fr-FR", "long", "global")
recognizer_short = create_recognizer("fr-FR", "short", "global")
recognizer_latest_long = create_recognizer("fr-FR", "latest_long", "global")


In [None]:
import os
from google.api_core import retry


os.environ["TOKENIZERS_PARALLELISM"] = "true"

def stt_chirp(audio_content, channels, frame_rate):
    """Transcribe audio through ``stt_googleapi`` with the ``chirp_2`` model."""
    chosen_model = "chirp_2"
    return stt_googleapi(audio_content, channels, frame_rate, model_name=chosen_model)


def stt_latest_long(audio_content, channels, frame_rate):
    """Transcribe audio through ``stt_googleapi`` with the ``latest_long`` model."""
    chosen_model = "latest_long"
    return stt_googleapi(audio_content, channels, frame_rate, model_name=chosen_model)

@retry.Retry(timeout=3000.0)
def stt_googleapi(audio_content, channels, frame_rate, model_name="default", long_operation = False, verbose = False):
    """Transcribe French audio bytes with the Speech-to-Text v1 client.

    The call is wrapped in ``retry.Retry(timeout=3000.0)``.

    Args:
        audio_content: Audio bytes sent as the recognition content.
        channels: Value passed as ``audio_channel_count``.
        frame_rate: Value passed as ``sample_rate_hertz``.
        model_name: Recognition model name.
        long_operation: When true, use ``long_running_recognize`` and wait for
            its result with ``timeout=90``; otherwise use ``recognize``.
        verbose: When true, print the raw response and each transcript.

    Returns:
        A list with the first alternative's transcript of every result.
    """
    # Import the Speech-to-Text client library
    from google.cloud import speech

    # Instantiates a client
    client = speech.SpeechClient()

    # transcribe speech
    audio = speech.RecognitionAudio(content=audio_content)

    # The language is fixed to French; encoding is left unset.
    config = speech.RecognitionConfig(
        sample_rate_hertz=frame_rate,
        language_code="fr-FR",
        model=model_name,
        audio_channel_count=channels,
    )

    if long_operation:
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=90)
    else:
        # The v1 recognize call takes only config and audio. `recognizer` is a
        # v2 request field; passing it here raised a TypeError for an
        # unexpected keyword argument.
        response = client.recognize(config=config, audio=audio)

    if verbose:
        print(response)

    results = []
    for result in response.results:
        if verbose:
            print("Transcript: {}".format(result.alternatives[0].transcript))
        results.append(result.alternatives[0].transcript)

    return results


@retry.Retry(timeout=3000.0)
def stt_googleapi_v2(audio_content, channels, frame_rate, model_name="default", verbose = False):
    """Transcribe French audio bytes with the Speech-to-Text v2 client.

    Decoding parameters are auto-detected and the request goes to the
    project's ``global`` default recognizer (``recognizers/_``).

    Args:
        audio_content: Audio bytes sent as the request content.
        channels: Accepted for signature parity; not used by this function.
        frame_rate: Accepted for signature parity; not used by this function.
        model_name: Recognition model name.
        verbose: When true, print the raw response and each transcript.

    Returns:
        A list with the first alternative's transcript of every result.
    """
    from google.cloud.speech_v2 import SpeechClient
    from google.cloud.speech_v2.types import cloud_speech

    speech_client = SpeechClient()

    recognition_config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=["fr-FR"],
        model=model_name,
    )
    default_recognizer = f"projects/{PROJECT_ID}/locations/global/recognizers/_"

    response = speech_client.recognize(
        recognizer=default_recognizer,
        config=recognition_config,
        content=audio_content,
    )
    if verbose:
        print(response)

    transcripts = []
    for item in response.results:
        top_transcript = item.alternatives[0].transcript
        if verbose:
            print("Transcript: {}".format(top_transcript))
        transcripts.append(top_transcript)

    return transcripts





In [None]:
def get_audio_sequence(file, min_silence_len=500):
    """Return the non-silent ranges that pydub detects in an audio file.

    Args:
        file: Audio file handed to ``AudioSegment.from_mp3``.
        min_silence_len: Forwarded to ``silence.detect_nonsilent``.

    Returns:
        The value returned by ``silence.detect_nonsilent``; callers in this
        notebook unpack each entry as a ``(start, stop)`` pair.
    """
    from pydub import AudioSegment, silence

    # NOTE(review): from_mp3 is used even though the notebook feeds .wav
    # files — confirm pydub handles that as intended.
    segment = AudioSegment.from_mp3(file)

    # The silence threshold sits 20 below the clip's own dBFS level.
    threshold = segment.dBFS - 20

    return silence.detect_nonsilent(
        segment,
        min_silence_len=min_silence_len,
        silence_thresh=threshold,
        seek_step=10,
    )

def splitAudio(root_dir, file, start, stop, output_dir, verbose = False):
    """Cut ``[start:stop]`` out of an audio file and export the slice as WAV.

    Args:
        root_dir: Directory containing ``file``.
        file: Name of the source audio file.
        start: Start of the slice (used as the slice's lower bound).
        stop: End of the slice (used as the slice's upper bound).
        output_dir: Directory that receives the exported segment.
        verbose: When true, print the slice's basic audio properties.

    Returns:
        Path of the exported file, ``<output_dir>/<file>-<start>-<stop>.wav``.
    """
    import os
    # Imported here because no earlier cell brings AudioSegment into module
    # scope, which made this function fail on a fresh kernel.
    from pydub import AudioSegment

    # os.path.join inserts the separator that bare concatenation dropped
    # (e.g. '.' + 'a.wav' produced '.a.wav').
    sound = AudioSegment.from_mp3(os.path.join(root_dir, file))

    sound = sound[start:stop]
    if verbose:
        print(80*"__")
        print(f"file = {file}")
        print(f"duration_seconds = {sound.duration_seconds}")
        print(f"sample_width = {sound.sample_width}")
        print(f"channels = {sound.channels}")
        print(f"frame_rate = {sound.frame_rate}")

    file_segment = os.path.join(output_dir, f"{file}-{start}-{stop}.wav")

    sound.export(file_segment, format="wav")
    return file_segment

In [34]:
from google.api_core.client_options import ClientOptions

def stt_api_v2(content, recognizer, channels , frame_rate,model_name, language_code = "fr-FR", location = "us-central1"):
    """Transcribe audio bytes with the Speech-to-Text v2 ``recognize`` call.

    Args:
        content: Audio bytes sent as the request content.
        recognizer: Accepted but not used; the request always targets the
            location's default recognizer (``recognizers/_``).
        channels: Accepted but not used (decoding is auto-detected).
        frame_rate: Accepted but not used (decoding is auto-detected).
        model_name: Recognition model name.
        language_code: Single language code placed in ``language_codes``.
        location: ``"global"`` uses the default client; any other value uses
            the ``<location>-speech.googleapis.com`` endpoint.

    Returns:
        The first-alternative transcripts, each followed by one space, joined
        into a single string. Any exception is printed and whatever was
        collected so far (possibly the empty string) is returned.
    """
    if location != "global":
        speech_client = SpeechClient(
            client_options=ClientOptions(api_endpoint=f"{location}-speech.googleapis.com")
        )
    else:
        speech_client = SpeechClient()

    recognition_features = cloud_speech.RecognitionFeatures(
        enable_automatic_punctuation=True, enable_word_time_offsets=True
    )
    recognition_config = cloud_speech.RecognitionConfig(
        features=recognition_features,
        auto_decoding_config={},
        language_codes=[language_code],
        model=model_name,
    )
    recognize_request = cloud_speech.RecognizeRequest(
        recognizer=f"projects/{PROJECT_ID}/locations/{location}/recognizers/_",
        config=recognition_config,
        content=content,
    )

    pieces = []
    try:
        response = speech_client.recognize(request=recognize_request)
        for item in response.results:
            text = item.alternatives[0].transcript
            print(f"Transcript: {text}")
            pieces.append(text + " ")
    except Exception as e:
        # Best effort: report the failure and keep what was transcribed.
        print("Exception: ",e)
    return "".join(pieces)



def stt_api_v2_chirp2(content, channels , frame_rate):
    """Transcribe French audio with ``chirp_2`` in ``us-central1`` via ``stt_api_v2``."""
    return stt_api_v2(
        content=content,
        recognizer=recognizer_chirp2,
        channels=channels,
        frame_rate=frame_rate,
        model_name="chirp_2",
        language_code="fr-FR",
        location="us-central1",
    )

def stt_api_v2_long(content, channels , frame_rate):
    """Transcribe French audio with the ``long`` model at ``global`` via ``stt_api_v2``."""
    return stt_api_v2(
        content=content,
        recognizer=recognizer_long,
        channels=channels,
        frame_rate=frame_rate,
        model_name="long",
        language_code="fr-FR",
        location="global",
    )

def stt_api_v2_short(content, channels , frame_rate):
    """Transcribe French audio with the ``short`` model at ``global`` via ``stt_api_v2``."""
    return stt_api_v2(
        content=content,
        recognizer=recognizer_short,
        channels=channels,
        frame_rate=frame_rate,
        model_name="short",
        language_code="fr-FR",
        location="global",
    )

def stt_api_v2_latest_long(content, channels , frame_rate):
    """Transcribe French audio with the ``latest_long`` model at ``global`` via ``stt_api_v2``.

    Passes ``recognizer_latest_long`` so the recognizer matches the model;
    the previous code passed ``recognizer_short`` (copy-paste slip).
    """
    return stt_api_v2(content, recognizer_latest_long, channels , frame_rate, "latest_long","fr-FR" , location = "global")



# Label -> transcription callable; the evaluation loop iterates over every entry.
# NOTE(review): the leading "#" in "#Speech:short" is part of the key text, so
# that entry is still active — confirm whether it was meant to be disabled.
models_v2 = {
    "Speech_split:chirp_2": stt_api_v2_chirp2,
    "Speech:long": stt_api_v2_long,
    "#Speech:short": stt_api_v2_short,
    "Speech:latest_long": stt_api_v2_latest_long,
}


In [35]:
import io
import pydub
from pydub import AudioSegment


def process_file(file_name, _stt, verbose=False, chunk_ms=59 * 1000):
    """Transcribe an audio file in fixed-length chunks and join the results.

    Args:
        file_name: Path of the audio file to load.
        _stt: Callable invoked for every chunk as
            ``_stt(wav_bytes, channels=..., frame_rate=...)``; it must return
            a string.
        verbose: When true, print the audio's basic properties.
        chunk_ms: Chunk length in milliseconds (defaults to 59 seconds).

    Returns:
        The chunk transcripts concatenated, each preceded by one space.

    Raises:
        ValueError: If ``chunk_ms`` is not positive.
    """
    if chunk_ms <= 0:
        raise ValueError("chunk_ms must be positive")

    sound = AudioSegment.from_mp3(file_name)
    if verbose:
        print(f"duration_seconds = {sound.duration_seconds}")
        print(f"sample_width = {sound.sample_width}")
        print(f"channels = {sound.channels}")
        print(f"frame_rate = {sound.frame_rate}")
        # The former sound.get_sample_slice() call was dropped: its return
        # value was discarded.

    total_ms = sound.duration_seconds * 1000
    results = []
    start = 0

    # At least one chunk is always sent, as before.
    while True:
        stop = start + chunk_ms
        buffer = io.BytesIO()
        sound[start:stop].export(buffer, format="wav")
        batch_result = _stt(buffer.read(), channels=sound.channels, frame_rate=sound.frame_rate)
        print(f"start = {start} - stop = {stop}")

        results.append(" " + batch_result)

        # `>=` rather than `>`: when the audio length is an exact multiple of
        # the chunk size, `>` ran one more iteration on an empty slice.
        if stop >= total_ms:
            break
        start = stop

    return "".join(results)



In [36]:
import io
import pydub
from pydub import AudioSegment

root_dir = '.'
output_dir = '.'

def process_fileV2(file_name, _stt, verbose=False):
    """Transcribe an audio file one non-silent range at a time.

    Non-silent ranges come from ``get_audio_sequence``. While any range is
    longer than 59 s, detection is repeated with a minimum-silence length
    reduced by 100 ms per attempt, down to a floor of 100 ms. Ranges that are
    still too long after that are cut into windows of at most 59 s.

    Args:
        file_name: Path of the audio file to load.
        _stt: Callable invoked for every segment as
            ``_stt(wav_bytes, channels=..., frame_rate=...)``; it must return
            a string.
        verbose: When true, print the audio's basic properties.

    Returns:
        The segment transcripts concatenated, each followed by one space.
    """
    max_segment_ms = 59000
    min_silence_floor = 100

    sound = AudioSegment.from_mp3(file_name)
    if verbose:
        print(f"duration_seconds = {sound.duration_seconds}")
        print(f"sample_width = {sound.sample_width}")
        print(f"channels = {sound.channels}")
        print(f"frame_rate = {sound.frame_rate}")

    min_silence_len = 600
    speak_sequences = get_audio_sequence(file_name, min_silence_len)
    speak_sequences_too_big = [
        (start, stop) for start, stop in speak_sequences if stop - start > max_segment_ms
    ]

    while speak_sequences_too_big and min_silence_len > min_silence_floor:
        # max(), not min(): step down gradually and clamp at the floor. With
        # min() the value jumped straight to 100 on the first retry.
        min_silence_len = max(min_silence_len - 100, min_silence_floor)
        speak_sequences = get_audio_sequence(file_name, min_silence_len)
        speak_sequences_too_big = [
            (start, stop) for start, stop in speak_sequences if stop - start > max_segment_ms
        ]
        # f-string instead of str + int, which raised TypeError.
        print(f"Sequence more than 59s : {len(speak_sequences_too_big)}")
        print(f"min_silence_len = {min_silence_len}")

    results = []
    for seq_start, seq_stop in speak_sequences:
        # A range within the limit yields exactly one window equal to the
        # range; a longer one is split into consecutive windows.
        for start in range(seq_start, seq_stop, max_segment_ms):
            stop = min(start + max_segment_ms, seq_stop)

            buffer = io.BytesIO()
            sound[start:stop].export(buffer, format="wav")
            batch_result = _stt(buffer.read(), channels=sound.channels, frame_rate=sound.frame_rate)
            print(f"start = {start} - stop = {stop}")

            results.append(batch_result + " ")

    return "".join(results)




In [None]:
import os
import json
import pandas as pd

# The (audio, ground-truth) URI pairs are the same for every model, so they
# are built once; each ground-truth file shares the audio's name with a .txt
# extension.
wav_text_pairs = [(wav_uri, wav_uri.replace('.wav', '.txt')) for wav_uri in wav_files]

for model_name in models_v2:
  print(f"Model: {model_name}")

  for i, (wav_file, text_file) in enumerate(wav_text_pairs):
    print(f"Processing {i}")
    bucket, file_wav = split_gcs_uri(wav_file)
    bucket, file_txt = split_gcs_uri(text_file)

    # Download the audio and its reference transcript into the working directory.
    local_file = f"{i}-temp.wav"
    local_file_txt = f"{i}-ground-truth.txt"
    store_temp_video_from_gcs(bucket, file_wav, local_file)
    store_temp_video_from_gcs(bucket, file_txt, local_file_txt)

    result = process_file(local_file,models_v2[model_name])

    # NOTE(review): this file holds the plain transcript text despite the
    # .json suffix and the "gemini" naming — confirm before renaming, since
    # the name is also stored in BigQuery and GCS.
    gemini_file = f"{i}-speech_{model_name}_result.json"
    with open(gemini_file, "w", encoding="UTF8") as f:
        f.write(result)

    tags = { "model_name": model_name,
            "file": file_wav,
            "ground-truth": file_txt,
            }
    write_file_to_gcs(bucket,  text_file.replace("stt_synthetic_tests_data", "stt_synthetic_results").replace(".txt","") + f"-gemini-{model_name}.txt",
                      gemini_file, tags )

    # Read with the same explicit encoding used for the result file instead
    # of the platform default.
    # Assumes the ground-truth objects are UTF-8 encoded — TODO confirm.
    with open(local_file_txt, 'r', encoding="UTF8") as f:
      ground_truth = f.read()
    ground_truth = ground_truth.replace("\n", " ")

    wer, semantic_textual_similarity = evaluate_data([result], [ground_truth])
    print(f"Results:{wav_file}, WER: {wer}, semantic_textual_similarity: {semantic_textual_similarity}")

    data = {
      "input_file": wav_file,
      "ground_truth": ground_truth,
      "model_name": model_name,
      "prompt": "",
      "wer": wer,
      "semantic_textual_similarity": semantic_textual_similarity,
      "generated_file": gemini_file,
      "generated_text": result
    }

    df = pd.DataFrame( data = [data], columns = ["input_file","ground_truth", "wer", "semantic_textual_similarity","generated_file","generated_text" , "model_name", "prompt"])

    # One row is appended per (model, file) so partial runs keep their results.
    save_results_df_bq(df, table_id, truncate=False)


