# [Speech to Text] Create Custom Speech Model

This sample demonstrates how to create a Custom Speech model by calling the REST API.

> ✨ **_Note_** <br>
> Please check the custom speech support for each language before you get started - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#:~:text=Custom%20speech%20support

## Prerequisites

Configure a Python virtual environment for 3.10 or later:

1.  Open the Command Palette (Ctrl+Shift+P).
1.  Search for "Python: Create Environment".
1.  Select Venv / Conda and choose where to create the new environment.
1.  Select the Python interpreter version. Create the environment with version 3.10 or later.


## 1. Check the synthetic dataset created by TTS in Azure AI Speech


In [None]:
import azure.cognitiveservices.speech as speechsdk
import os
import json
from openai import AzureOpenAI
import requests
from dotenv import load_dotenv
from utils.common import *

load_dotenv()

SPEECH_KEY = os.getenv("AZURE_AI_SPEECH_API_KEY")
SPEECH_REGION = os.getenv("AZURE_AI_SPEECH_REGION")
CUSTOM_SPEECH_LANG = os.getenv("CUSTOM_SPEECH_LANG")
CUSTOM_SPEECH_LOCALE = os.getenv("CUSTOM_SPEECH_LOCALE")
TTS_FOR_TRAIN = os.getenv("TTS_FOR_TRAIN")
TTS_FOR_EVAL = os.getenv("TTS_FOR_eval")

train_dataset_path = ""
%store -r train_dataset_path
eval_dataset_path = ""
%store -r eval_dataset_path
try:
    train_dataset_path
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the previous notebook again.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

train_dataset_path

In [None]:
import requests
import time
import json

# Request headers: the Speech resource key for authentication, plus a JSON
# content type.
headers = {
    "Ocp-Apim-Subscription-Key": SPEECH_KEY,
    "Content-Type": "application/json",
}

# Region-specific root URL of the Speech to Text REST API.
base_url = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext"

In [None]:
def speech_recognition_from_file(file_path: str, lang:str):
    """Run a single-shot speech recognition on an audio file and return its text.

    Args:
        file_path: Path of the audio file given to ``speechsdk.AudioConfig``.
        lang: Value passed as ``speech_recognition_language`` to the
            ``speechsdk.SpeechConfig``.

    Returns:
        The ``text`` attribute of the recognition result.
    """
    # Service settings: credentials, region and recognition language.
    config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION,
        speech_recognition_language=lang,
    )
    # Audio input is read from the given file.
    audio_input = speechsdk.AudioConfig(filename=file_path)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=config,
        audio_config=audio_input,
    )

    # Start the recognition and block until its result is available.
    result = recognizer.recognize_once_async().get()
    return result.text

### Get the sorted wav files from the dataset folder


In [None]:
import os
from IPython.display import Audio, display

output_folder = 'synthetic_data'
files = os.listdir(output_folder)

# Keep only the .wav entries, ordered ascending by the integer that precedes
# the first underscore in each file name.
wav_files = sorted(
    (name for name in files if name.endswith('.wav')),
    key=lambda name: int(name.split('_')[0]),
)
wav_files

In [None]:
# Transcribe the first three recordings and print the recognized text.
for wav_file in wav_files[:3]:
    wav_path = os.path.join(output_folder, wav_file)
    print(speech_recognition_from_file(wav_path, CUSTOM_SPEECH_LOCALE))

## 2. Upload training datasets

-   You can upload datasets for training, qualitative inspection, and quantitative measurement.
-   This lab covers two types (Acoustic and Plain text) of training and testing data that you can use for custom speech.
-   Check the other options on this link - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-custom-speech-test-and-train


### Create a project


In [None]:
import time
import pytz
from datetime import datetime
# Current UTC time, formatted so it can be embedded in the project name.
utc = pytz.timezone('UTC')
date = datetime.now(tz=utc).strftime("%Y-%m-%d_%H-%M-%S")

# Human-readable name and description of the new project.
display_name = f"[{CUSTOM_SPEECH_LANG}] My Custom Speech Project ({date})[UTC]"
description = f"Project for training and evaluating the {CUSTOM_SPEECH_LANG} base model"

# create_project comes from utils.common; its return value is kept as the
# project identifier used by the following cells.
project_id = create_project(
    base_url,
    headers,
    display_name,
    description,
    CUSTOM_SPEECH_LOCALE,
)

In [None]:
# Persist project_id with IPython's %store magic so that it can be restored
# later (for example with `%store -r project_id`).
%store project_id

### Upload the acoustic dataset to storage (zip files)


In [None]:
# Storage account settings are read from environment variables.
container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")

# Local folder that holds the acoustic training data.
data_folder = "train_dataset"

# upload_dataset_to_storage comes from utils.common; the results are used by
# the dataset-creation cell (uploaded_files is iterated, url is indexed by
# each entry).
uploaded_files, url = upload_dataset_to_storage(
    data_folder, container_name, account_name, account_key
)

### Create datasets with the uploaded acoustic dataset


In [None]:
# Dataset kind passed to create_dataset for the acoustic data.
kind="Acoustic"
# NOTE: this value is never used as written; the loop below rebinds
# display_name to each uploaded file name before create_dataset is called.
display_name = "acoustic dataset(zip) for training"
description = f"[training] Dataset for fine-tuning the {CUSTOM_SPEECH_LANG} base model"

# Maps each uploaded file name to the value returned by create_dataset
# (presumably the dataset ID; confirm in utils.common).
zip_dataset_dict = {}

# One dataset per uploaded file: the file name serves as the dictionary key,
# the lookup key into url, and the dataset's display name.
for display_name in uploaded_files:
    zip_dataset_dict[display_name] = create_dataset(base_url, headers, project_id, url[display_name], kind, display_name, description, CUSTOM_SPEECH_LOCALE)

### Upload the plain text dataset to storage (text files)


In [None]:
# Storage account settings are read from environment variables.
container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")

# Local folder that holds the plain text training data.
data_folder = "plain_text"

# upload_dataset_to_storage comes from utils.common; the results are used by
# the dataset-creation cell (uploaded_files is iterated, url is indexed by
# each entry).
uploaded_files, url = upload_dataset_to_storage(
    data_folder, container_name, account_name, account_key
)

### Create datasets with the uploaded plain text dataset


In [None]:
# Dataset kind passed to create_dataset for the plain text data.
kind="Language"
# NOTE: this value is never used as written; the loop below rebinds
# display_name to each uploaded file name before create_dataset is called.
display_name = "plain text dataset for training"
description = f"[training] Dataset for fine-tuning the {CUSTOM_SPEECH_LANG} base model"

# Maps each uploaded file name to the value returned by create_dataset
# (presumably the dataset ID; confirm in utils.common).
plain_dataset_dict = {}

# One dataset per uploaded file: the file name serves as the dictionary key,
# the lookup key into url, and the dataset's display name.
for display_name in uploaded_files:
    plain_dataset_dict[display_name] = create_dataset(base_url, headers, project_id, url[display_name], kind, display_name, description, CUSTOM_SPEECH_LOCALE)

## 3. Train Custom Speech Models with the uploaded datasets


> ✨ **_Note_** <br>
> Please check which adaptations each base model version supports in the base model information. <br>
> For example, the Italian base model 20230111 supports 'Language', 'LanguageMarkdown', 'Pronunciation', and 'OutputFormatting' adaptation.<br>
> Check that 'Language' is listed in the `supportsAdaptationsWith` feature of the base_model object.<br>
> If you don't specify the baseModel, the default base model for the locale is used.<br>
> The base model IDs vary by language. <br>
> If you want to select a specific base model, check the model ID on the "Train a new model" page (UI) in Azure Speech Studio. <br>

-   Italian 2e5e70f1-960b-4509-a7c5-102b29227c0b
-   Vietnamese 8066b5fb-0114-4837-90b6-0c245928a896


In [None]:
import json

# Option 1: take the model ID from the "Train a new model" page (UI) in Azure Speech Studio.
# This value is also the fallback when Option 2 finds no suitable model.
base_model_id = "8066b5fb-0114-4837-90b6-0c245928a896"  # Vietnamese base model id

# Option 2: look the model ID up through the REST API.
base_model = get_latest_base_model(base_url, headers, f"locale eq '{CUSTOM_SPEECH_LOCALE}' and status eq 'Succeeded'")

# Keep only the base models that list 'Language' among their supported
# adaptations. Chained .get() calls skip entries that lack 'properties' or
# 'features' instead of raising KeyError.
filtered_models = [
    model
    for model in base_model['values']
    if 'Language' in model.get('properties', {}).get('features', {}).get('supportsAdaptationsWith', [])
]

if filtered_models:
    # Pick the most recently created model among the candidates.
    latest_model = max(filtered_models, key=lambda x: x['createdDateTime'])
    print("Latest model supporting 'Language' adaptations:")
    print(latest_model)
    # Check if you are charged for training this model.
    # Here is the reference document of the Charge for adaptation: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/migrate-v3-1-to-v3-2#charge-for-adaptation
    print("Charge for Adaptation:", latest_model['properties'].get('chargeForAdaptation'))
    # The model ID is the last path segment of the 'self' link, for example
    # 8066b5fb-0114-4837-90b6-0c245928a896 in
    # 'https://swedencentral.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/8066b5fb-0114-4837-90b6-0c245928a896'
    base_model_id = latest_model['self'].split('/')[-1]
else:
    # Bind latest_model so later references do not raise NameError, and keep
    # the Option 1 ID instead of crashing.
    latest_model = None
    print("No models found that support 'Language' adaptations.")
    print(f"[WARNING] Keeping the manually configured base model id; make sure it matches the locale '{CUSTOM_SPEECH_LOCALE}'.")
print(base_model_id)

### Train the custom speech model with plain text datasets (txt)


In [None]:
# Name and description shown for the model trained on the plain text datasets.
display_name = f"[{CUSTOM_SPEECH_LOCALE}] custom_model_with_plain_text"
description = f"{CUSTOM_SPEECH_LANG} Custom model training with plain text dataset"

# Train on every value collected in plain_dataset_dict; the returned value is
# kept as the identifier of the new custom model.
custom_model_with_plain_id = create_custom_model(
    base_url,
    headers,
    project_id,
    base_model_id,
    list(plain_dataset_dict.values()),
    display_name,
    description,
    CUSTOM_SPEECH_LOCALE,
)

### Train the custom speech model with acoustic datasets (zip)


In [None]:
# Name and description shown for the model trained on the acoustic datasets.
# The display name previously misspelled "acoustic" as "aocustic".
display_name = f"[{CUSTOM_SPEECH_LOCALE}] custom_model_with_acoustic_dataset"
description = f"{CUSTOM_SPEECH_LANG} Custom model training with acoustic dataset"

# Train on every value collected in zip_dataset_dict; the returned value is
# kept as the identifier of the new custom model.
custom_model_with_acoustic_id = create_custom_model(
    base_url,
    headers,
    project_id,
    base_model_id,
    list(zip_dataset_dict.values()),
    display_name,
    description,
    CUSTOM_SPEECH_LOCALE,
)

In [None]:
from tqdm import tqdm

# Monitor the status of a custom model training job
def monitor_training_status(custom_model_id, poll_interval=10):
    """Poll a custom model's training status until it succeeds or fails.

    A three-step tqdm progress bar follows the status values
    ("NotStarted" -> "Running" -> finished). The status is fetched with
    ``get_custom_model_status`` using the notebook-level ``base_url`` and
    ``headers``.

    Args:
        custom_model_id: Identifier of the custom model to monitor.
        poll_interval: Seconds to wait between two status requests.

    Returns:
        The final status string, either "Succeeded" or "Failed".

    Note:
        There is no timeout: the loop keeps polling for as long as the
        service reports a status other than "Succeeded" or "Failed".
    """
    terminal_states = ("Succeeded", "Failed")
    with tqdm(total=3, desc="Running Status", unit="step") as pbar:
        status = get_custom_model_status(base_url, headers, custom_model_id)
        if status == "NotStarted":
            pbar.update(1)
        while status not in terminal_states:
            # Advance the bar at most to step 2 while the job is running.
            if status == "Running" and pbar.n < 2:
                pbar.update(1)
            print(f"Current Status: {status}")
            time.sleep(poll_interval)
            status = get_custom_model_status(base_url, headers, custom_model_id)
        # Fill whatever is left of the bar once a terminal state is reached.
        while pbar.n < 3:
            pbar.update(1)
        # Report failure explicitly instead of always claiming completion.
        if status == "Succeeded":
            print("Training Completed")
        else:
            print(f"Training Failed (model id: {custom_model_id})")
    return status

### Monitor the training status for each job


In [None]:
# Wait for the two training jobs one after the other: plain text first, then acoustic.
for trained_model_id in (custom_model_with_plain_id, custom_model_with_acoustic_id):
    monitor_training_status(trained_model_id)

In [None]:
# Persist both custom model IDs with IPython's %store magic so that they can
# be restored later (for example with `%store -r <name>`).
%store custom_model_with_plain_id
%store custom_model_with_acoustic_id