In [1]:
import json
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
from ibm_watsonx_ai.foundation_models import Model
import numpy as np

# Transcription Process

This notebook walks through creating a clean transcription for use in the summarization step. We will transcribe an audio file and pass the output through an LLM to add punctuation. Finally, we will save the output to a text file.

In [2]:
# Build the Speech to Text client, authenticated with an IAM API key.
authenticator = IAMAuthenticator('< Your STT API KEY >')
speech_to_text = SpeechToTextV1(authenticator=authenticator)

# Point the client at your Speech to Text service instance.
speech_to_text.set_service_url('< Your STT service URL >')

## Speech to Text

We will use the chosen base model together with the custom language model to generate a transcription. This process can take 5 to 10 minutes, depending on the length of the audio.

In [3]:
# Transcription settings
base_model = '< Your base STT Model >'  # en-US_Telephony_LSM or en-US_Multimedia_LSM
lm_id = "< Your custom language ID >"
audio_name = 'audio.mp3'

# Options sent along with the audio stream to the recognize call.
recognize_options = dict(
    content_type='audio/mp3',  # Change if needed
    model=base_model,
    smart_formatting=True,
    language_customization_id=lm_id,
)

# Send the audio file to the service and keep the parsed result payload.
with open(audio_name, 'rb') as audio_file:
    stt_response = speech_to_text.recognize(audio=audio_file, **recognize_options)
    speech_recognition_results = stt_response.get_result()

In [4]:
# Collapse every alternative of every recognition result into one transcript string.
transcript = "".join(
    alternative['transcript']
    for result in speech_recognition_results['results']
    for alternative in result['alternatives']
)

Here is the complete transcript:

In [5]:
# Print the label and the raw transcript on separate lines.
print("Transcription: ", transcript, sep="\n")

Transcription: 
ok okay where one steve are you there i think you're muted yes it's yeah my bad was all right so visual history analysis of the nba final project let's start let's go with the abstracts let's see ok is the objective still temporal visualization yeah yeah it's still still the same that's that's okay ok perfect and what all the keywords so keywords we have only these 4 i think i think we should add more specific visualizations into the keyword so let's add dendrogram animated bubbles animated lines bars and lines as well ok sounds good let's go with that and or so now let's move on to the introduction tell me a little bit about it okay so let's go paragraph by paragraph i think this first one it's okay you think these are the correct references that we are using this date is correct ok i think i think that that is good 3.revolution okay yeah i'll double check that but i think that that is fine as well this dupree okay i think i think this second one it's also ok let's jum

## Punctuation Prompt

Since the Speech to Text service doesn't add punctuation, we need to use an LLM to generate it. This is necessary to improve the odds of generating the best possible summarization.

In [6]:
# IBM Cloud credentials used to reach watsonx.ai: the service endpoint and an API key.
# Replace the placeholder before running; avoid committing a real key with the notebook.
my_credentials = dict(
    url="https://us-south.ml.cloud.ibm.com",
    apikey='< Your IBM Cloud API KEY >',
)

In [15]:
# This prompt template is an example for LLAMA2. You will need to tune it for your needs.
def punctuations_prompt_llama2(transcript):
    """Build the Llama 2 few-shot prompt that asks the model to punctuate `transcript`.

    The prompt consists of a system section (instructions plus two worked
    examples) wrapped in ``<<SYS>>`` tags, followed by the text to punctuate
    and a trailing ``Output:`` cue.

    Args:
        transcript: Unpunctuated text to insert into the prompt.

    Returns:
        The complete prompt string.
    """
    # Instructions that open the system section.
    instructions = (
        "[INST]<<SYS>>\nYou are a helpful, and honest assistant. Always answer as helpfully and honestly as possible. You are asked to add the punctuation signs to the given text.\n\n"
        "For example:\n\n"
    )
    # First worked example: raw text followed by its punctuated form.
    first_example = (
        "Text:\n"
        "i wanted to share an update on project X today project X will be completed at the end of the week that's great i heard from customer Y today and they agreed to buy our product Customer Z said they will too great news all around\n\n"
        "Output:\n"
        "I wanted to share an update on project X today. Project X will be completed at the end of the week. That's great! I heard from customer Y today, and they agreed to buy our product. Customer Z said they will too. Great news, all around.\n\n"
    )
    # Second worked example; it also closes the system section.
    second_example = (
        "Text:\n"
        "the goal today is to agree on a design solution i think we should consider choice 1 i agree choice 2 has the advantage that it will take less time actually that's a good point so what should we do i'm good with choice 2 me too Done\n\n"
        "Output:\n"
        "The goal today is to agree on a design solution. I think we should consider choice 1. I agree. Choice 2 has the advantage that it will take less time. Actually, that's a good point. So, what should we do? I'm good with choice 2. Me too. Done!\n<</SYS>>\n\n"
    )
    # The actual request, closed by the instruction end tag.
    request = f"Text:\n{transcript}\n[/INST]\n"
    return "".join([instructions, first_example, second_example, request, "Output:\n"])

In [7]:
# This prompt template is an example for LLAMA3. You will need to tune it for your needs.
def punctuations_prompt(transcript):
    """Build the Llama 3 few-shot prompt that asks the model to punctuate `transcript`.

    The prompt has a system turn (instructions plus two worked examples), a
    user turn carrying the text to punctuate, and an opened assistant turn
    ending in an ``Output:`` cue.

    Args:
        transcript: Unpunctuated text to insert into the user turn.

    Returns:
        The complete prompt string.
    """
    # (raw text, punctuated text) pairs shown to the model as worked examples.
    few_shot_pairs = [
        (
            "i wanted to share an update on project X today project X will be completed at the end of the week that's great i heard from customer Y today and they agreed to buy our product Customer Z said they will too great news all around",
            "I wanted to share an update on project X today. Project X will be completed at the end of the week. That's great! I heard from customer Y today, and they agreed to buy our product. Customer Z said they will too. Great news, all around.",
        ),
        (
            "the goal today is to agree on a design solution i think we should consider choice 1 i agree choice 2 has the advantage that it will take less time actually that's a good point so what should we do i'm good with choice 2 me too Done",
            "The goal today is to agree on a design solution. I think we should consider choice 1. I agree. Choice 2 has the advantage that it will take less time. Actually, that's a good point. So, what should we do? I'm good with choice 2. Me too. Done!",
        ),
    ]
    # Examples are separated by a blank line; the last one is followed by a single newline.
    shots = "\n\n".join(
        f"Text:\n{raw}\n\nOutput:\n{punctuated}" for raw, punctuated in few_shot_pairs
    )
    # NOTE: the prompt wording (including the spelling "writting") is kept verbatim.
    system_turn = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        "You are a writting AI assistant. Always answer as honestly as possible. You are asked to add the punctuation signs to the given text.\n\n"
        "For example:\n\n"
        f"{shots}\n"
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    )
    user_turn = f"Text:\n{transcript}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    return system_turn + user_turn + "Output:\n"

In [8]:
# Foundation model used for the punctuation step.
model_id    = ModelTypes.LLAMA_3_70B_INSTRUCT
gen_parms   = None  # no default generation parameters; they are passed on each generate_text call
project_id  = '< Your watsonx.ai project ID >'
space_id    = None
# Keep TLS certificate verification enabled when talking to the service endpoint.
# If your environment needs a custom trust store, set this to the CA bundle path
# instead of disabling verification.
verify      = True

# Keyword arguments make it explicit which value feeds which constructor parameter.
model = Model(
    model_id=model_id,
    credentials=my_credentials,
    params=gen_parms,
    project_id=project_id,
    space_id=space_id,
    verify=verify,
)

We will keep the watsonx.ai generation parameters fixed. You may want to double-check the minimum number of new tokens: if it is too high, the model might hallucinate to meet the token demand.

In [10]:
# Generation settings passed to model.generate_text in the generation cell.
parameters = {
    # Lower bound on the number of generated tokens. Keep it at or below the
    # expected output length: a value that is too high can push the model to
    # invent text just to reach the minimum.
    GenParams.MIN_NEW_TOKENS: 1000,
    # Upper bound on the number of generated tokens; the same 4095 figure is
    # used as the size threshold when deciding whether to split the transcript.
    GenParams.MAX_NEW_TOKENS: 4095,
    # Greedy decoding method.
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
    # NOTE(review): presumably 1 means "no repetition penalty" — confirm against the watsonx.ai docs.
    GenParams.REPETITION_PENALTY: 1
}

In [11]:
# Size threshold (in tokens) used to decide whether the transcript fits in one call.
TOKEN_BUDGET = 4095
# Heuristic multiplier applied to the transcript's token count in the size check.
TRANSCRIPT_TOKEN_FACTOR = 2.5


def _count_tokens(text):
    """Return the token count the model's tokenizer reports for `text`."""
    return int(model.tokenize(prompt=text, return_tokens=True)['result']['token_count'])


def _split_transcript(text, n_chunks):
    """Split `text` on spaces into `n_chunks` contiguous, near-equal word groups.

    Boundaries are placed at floor(i * len(words) / n_chunks), so two chunks
    split exactly at the midpoint. Whitespace-only chunks are dropped.
    """
    words = text.split(' ')
    bounds = [(i * len(words)) // n_chunks for i in range(n_chunks + 1)]
    chunks = [" ".join(words[start:stop]) for start, stop in zip(bounds[:-1], bounds[1:])]
    return [chunk for chunk in chunks if chunk.strip()]


def _punctuate(text, min_new_tokens=None):
    """Return the LLM-punctuated version of `text`.

    The minimum number of new tokens is tied to the input's own token count so
    the model is never asked for more text than the input can justify. A copy
    of `parameters` is used so the shared dict is left untouched and the cell
    gives the same result when it is re-run.
    """
    call_parameters = dict(parameters)
    if min_new_tokens is None:
        min_new_tokens = _count_tokens(text)
    call_parameters[GenParams.MIN_NEW_TOKENS] = int(min_new_tokens)
    return model.generate_text(punctuations_prompt(text), call_parameters)


tokenized_base_prompt = model.tokenize(prompt=punctuations_prompt(''), return_tokens=True)
tokenized_transcript = model.tokenize(prompt=transcript, return_tokens=True)
ntokens_prompt = tokenized_base_prompt['result']['token_count']
ntokens_transcript = tokenized_transcript['result']['token_count']

estimated_transcript_tokens = ntokens_transcript * TRANSCRIPT_TOKEN_FACTOR
if ntokens_prompt + estimated_transcript_tokens >= TOKEN_BUDGET:
    # Too large for one call: use as many chunks as needed for each chunk's
    # estimate to stay below the room left after the prompt template.
    # Chunks are cut by word count, so token counts per chunk are approximate.
    room_per_call = TOKEN_BUDGET - ntokens_prompt
    if room_per_call > 0:
        n_chunks = int(estimated_transcript_tokens // room_per_call) + 1
    else:
        # The template alone reaches the budget; fall back to a plain two-way split.
        n_chunks = 2
    generated_response = " ".join(
        _punctuate(chunk) for chunk in _split_transcript(transcript, n_chunks)
    )
else:
    # Fits in a single call; reuse the transcript token count already measured.
    generated_response = _punctuate(transcript, ntokens_transcript)


In [12]:
# Show the punctuated transcript produced by the LLM.
print(generated_response)

Ok, okay. Where one, Steve, are you there? I think you're muted. Yes, it's yeah, my bad. Was all right. So, visual history analysis of the NBA final project. Let's start. Let's go with the abstracts. Let's see. Ok, is the objective still temporal visualization? Yeah, yeah, it's still still the same. That's that's okay. Ok, perfect. And what all the keywords? So, keywords, we have only these 4. I think I think we should add more specific visualizations into the keyword. So, let's add dendrogram, animated bubbles, animated lines, bars, and lines as well. Ok, sounds good. Let's go with that. And or so, now let's move on to the introduction. Tell me a little bit about it. Ok, so let's go paragraph by paragraph. I think this first one, it's okay. You think these are the correct references that we are using? This date is correct. Ok, I think I think that that is good. 3. Revolution, okay. Yeah, I'll double-check that, but I think that that is fine as well. This Dupree, I think I think this s

## Save to File

In [13]:
# Write the punctuated transcript to disk. The context manager closes the file
# even if the write fails, and UTF-8 keeps any non-ASCII characters produced by
# the LLM from failing on platforms whose default encoding is narrower.
with open('transcript_generated.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(generated_response)

In [None]:
# Saving RAW Output (Optional)
# Dump the full Speech to Text response as indented JSON; the context manager
# closes the file even if serialization fails.
with open('transcript_raw.txt', 'w', encoding='utf-8') as raw_file:
    json.dump(speech_recognition_results, raw_file, indent=2)