# WhisPeriment
Experimenting with the Whisper model
- Record an audio sequence in the defined language
- Allow replaying it
- Provide the transcribed text in the defined language

## Environment Set-up
Copy-paste the commands from the cell below into a terminal to create the environment, and install ffmpeg if it is not already on your computer. Once done, select this environment as the kernel of the Jupyter notebook (in VS Code the kernel selector is in the top-right corner of the window).

In [None]:
# Shell commands that create the conda environment this notebook runs in.
# They sit inside a bare triple-quoted string, so executing this cell does NOT
# run them: copy-paste them into a terminal instead.
# NOTE(review): `pip install -U ffmpeg` presumably installs a Python package,
# not the ffmpeg executable; the system-level install described further down
# is still required. Confirm.
"""
conda deactivate
conda env remove -n hugg
conda create --name hugg python=3.11 pyaudio -y
conda activate hugg
pip install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
pip install -U transformers[torch]
pip install -U numpy scipy ipywidgets ipykernel 
pip install -U accelerate 
pip install -U sentencepiece
pip install -U pvrecorder -q
pip install -U ffmpeg
pip install -U webrtcvad

"""
# To use ROCm (AMD GPUs), replace the torch install command above with the one below (untested).
# See these pages for further install instructions: https://rocmdocs.amd.com/en/latest/ and https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.1/page/How_to_Install_ROCm.html
#
# pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2


# ffmpeg itself must be installed on your computer. On Linux:
"""
sudo apt install ffmpeg
"""
# For macOS and Windows, download it from:
"""
https://ffmpeg.org/download.html
"""


## Import the libraries

In [1]:
import struct
import sys
import wave
from pathlib import Path

import numpy as np
import torch

from scipy.io        import wavfile
from pvrecorder      import PvRecorder
from transformers    import WhisperProcessor, WhisperForConditionalGeneration, __version__ 
from IPython.display import Audio, display

# Report the interpreter and library versions so a run can be reproduced later.
# The environment name is taken from the last component of sys.prefix (the root
# directory of the active conda env / virtualenv). Splitting sys.executable on
# '/' breaks on Windows, where paths use '\' and the interpreter is not
# located in a `bin/` sub-directory.
env_name = Path(sys.prefix).name

print(f"Env Name     : {env_name}")
print(f"Python       : {                  sys.version}")
print(f"wave         : Nov version infos available"    )
print(f"torch        : {            torch.__version__}")
print(f"numpy        : {               np.__version__}")
print(f"transformers : {                 __version__ }")

Env Name     : hugg
Python       : 3.11.0 | packaged by conda-forge | (main, Oct 25 2022, 06:24:40) [GCC 10.4.0]
wave         : Nov version infos available
torch        : 1.13.1+cu117
numpy        : 1.24.1
transformers : 4.25.1


## Write the Whisper model documentation to "model_WhisperProcessor_Help.txt"
Use Python's built-in `help()` to learn about the methods available on the Whisper processor, and write the result to a file.

In [29]:
import contextlib

# help() prints to stdout; redirect stdout into the text file for the duration
# of the call so the whole documentation ends up on disk.
with open("model_WhisperProcessor_Help.txt", "w") as help_file, contextlib.redirect_stdout(help_file):
    help(WhisperProcessor)


## Identify your microphone devices

In [2]:
# List the capture devices PvRecorder can see; the number in brackets is the
# value to use for `device_index` in the parameter cell.
# The loop variable is deliberately NOT called `device`: the parameter cell
# stores the torch device under that global name, and re-running this cell
# afterwards would silently replace it with a device-name string.
for index, device_name in enumerate(PvRecorder.get_audio_devices()):
    print(f"[{index}] {device_name}")

[0] Monitor of Family 17h (Models 10h-1fh) HD Audio Controller Analog Stereo
[1] Family 17h (Models 10h-1fh) HD Audio Controller Analog Stereo


# Parameter Set-up 
Choose the model depending on the execution time and the available GPU memory:
- large  : 3.1GB FP16 / 6.2GB FP32 (thanks to the large-v2 model, the large model has been improved)
- medium : 1.5GB FP16 / 3.1GB FP32

In [23]:

# --- Audio source -------------------------------------------------------------
device_index      = 1                          # the device where your microphone is
t_record_s        = 10                         # the duration to record, in seconds

#prerecorded_audio = None                       # set this to None to record your audio
#prerecorded_audio = "test_record_english.wav"  # test of an English pre-recorded audio
prerecorded_audio = "test_record_french.wav"   # test of a French pre-recorded audio

# --- Task ---------------------------------------------------------------------
language          = 'fr'                       # spoken language; if not set, translation to English is done even if transcribe is set
#language          = 'es'                       # spoken language (optional)
task_name         = "transcribe"               # transcribe the text in its spoken language
#task_name         = "translate"                # translate only towards English

# --- Model --------------------------------------------------------------------
model_name        = "openai/whisper-large-v2"  # Only one working for non-English
#model_name        = "openai/whisper-medium"

# --- Audio format -------------------------------------------------------------
channels          = 1
target_rate       = 16000                      # Whisper model was trained at 16 kHz
frame_length      = 512                        # a frame of 512 samples is 32 ms at 16 kHz
audio_file        = 'record.wav'               # working audio file

# --- Compute device and precision ---------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Running on device: {device}")

if device.type == 'cuda':
    # bfloat16 is the fastest option but is not available on every GPU;
    # fall back to float16 instead of failing later at model load / inference.
    model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    model_dtype = torch.float32                # float32 for running on CPU
print(f"Datatype used is : {model_dtype}")


Running on device: cuda
Datatype used is : torch.bfloat16


## Load the pretrained Whisper model and weights
Using Huggingface transformers

In [5]:
# The processor bundles the feature extractor (audio -> log-mel features) and
# the tokenizer (token ids -> text) that belong to the chosen checkpoint.
processor = WhisperProcessor.from_pretrained(model_name)

# Load the weights directly in the selected precision, then move them to the
# compute device.
model = WhisperForConditionalGeneration.from_pretrained(model_name, torch_dtype=model_dtype)
model = model.to(device)

# Footprint is reported in bytes (buffers included); show it in MB.
mem_use    = model.get_memory_footprint(return_buffers=True)
mem_use_mb = mem_use / 1000000
print(f"Model Memory useage: {mem_use_mb:6.1f} MB")


Model Memory useage: 3086.6 MB


## Configure the model
- Configure the output language
- Configure the task (Translation or transcription)

In [24]:
# Special tokens of the tokenizer; language tokens have the form "<|fr|>".
languages = processor.tokenizer.additional_special_tokens

# `language` is optional: the parameter cell may leave it commented out.
# Look it up explicitly instead of relying on a bare `except:` that would also
# hide genuine errors raised while building the decoder prompt.
requested_language = globals().get("language")

if not requested_language:
    # No language requested: only force the task and let the model pick the language.
    prompt_kwargs = {"task": task_name}
elif f"<|{requested_language}|>" in languages:
    prompt_kwargs = {"language": requested_language, "task": task_name}
else:
    print(f"language [{requested_language}] not in model available languages leting the model on it's own")
    prompt_kwargs = {"task": task_name}

# Tokens forced at the start of decoding (language and/or task).
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(**prompt_kwargs)

## Record some audio or use a pre-recorded file

In [25]:
if prerecorded_audio is not None:
    # Analyse the pre-recorded file instead of recording from the microphone.
    audio_file = prerecorded_audio

else:
    # Number of frames needed to cover t_record_s seconds (rounded up by one frame).
    n_frames = int(1 + t_record_s * target_rate / frame_length)
    recorder = PvRecorder(device_index=device_index, frame_length=frame_length)  # (32 milliseconds of 16 kHz audio)
    audio    = []

    try:
        recorder.start()                    # Record the audio from the device
        for _ in range(n_frames):
            audio.extend(recorder.read())
        recorder.stop()
    finally:
        # Always release the native recorder, even if reading from the device failed.
        recorder.delete()

    # Write the recorded audio to a file at the target sample rate:
    # mono, 2 bytes per sample, uncompressed PCM.
    with wave.open(audio_file, 'wb') as f:
        f.setparams((1, 2, target_rate, len(audio), "NONE", "NONE"))
        # WAV stores PCM samples little-endian, so pack explicitly with "<"
        # rather than in the machine's native byte order.
        f.writeframes(struct.pack(f"<{len(audio)}h", *audio))

## Play the Audio that will be analysed

In [26]:
# Build the audio player for the file that will be analysed, then render it;
# playback starts automatically.
audio_player = Audio(audio_file, autoplay=True, rate=target_rate)
display(audio_player)


## Loading and pre process
- load the audio file to be analysed
- Preprocess the audio to get the input features


In [27]:
sample_rate, input_speech = wavfile.read(audio_file)
print(f"Audio file recorded at: {sample_rate} samples per seconds")

# wavfile.read returns the samples in the file's storage type (e.g. int16 PCM).
# Convert integer PCM to a float32 waveform scaled to [-1, 1] before feature
# extraction, so the log-mel features are not shifted by the PCM scale factor.
pcm_dtype  = input_speech.dtype
speech_f32 = input_speech.astype(np.float32)
if pcm_dtype == np.uint8:
    # 8-bit WAV is unsigned with the zero level at 128.
    speech_f32 = (speech_f32 - 128.0) / 128.0
elif np.issubdtype(pcm_dtype, np.signedinteger):
    speech_f32 = speech_f32 / -float(np.iinfo(pcm_dtype).min)

# A pre-recorded file may be stereo (samples x channels): down-mix to mono.
if speech_f32.ndim > 1:
    speech_f32 = speech_f32.mean(axis=1)

input_features = processor(speech_f32, return_tensors="pt", sampling_rate=sample_rate).input_features
# Cast once, straight to the model's dtype. Going through a hard-coded bfloat16
# first would throw away precision when the model runs in float32 or float16.
input_features = input_features.to(device=device, dtype=model_dtype)


Audio file recorded at: 16000 samples per seconds


## Run the model
- Actually run the model to get the predicted tokens
- Decode the tokens through a dictionary to get actual words

In [33]:
# Let the model produce token ids for the prepared features (at most 100 new tokens).
predicted_ids = model.generate(input_features, max_new_tokens=100)

# Turn the token ids back into text, dropping the special tokens; the batch
# holds a single recording, so keep its first (only) entry.
decoded_batch = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription = decoded_batch[0]
print(transcription)


 Test d'enregistrement en français pour la reconnaissance vocale avec Whisper


# DUMPSTER

In [None]:
# Enumerate the devices of host API 0 through PyAudio and print those that
# expose at least one input channel (i.e. devices that can record).
import pyaudio

p          = pyaudio.PyAudio()
info       = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')

for dev_id in range(numdevices):
    dev_info = p.get_device_info_by_host_api_device_index(0, dev_id)
    if dev_info.get('maxInputChannels') <= 0:
        continue  # playback-only device
    print( "Input Device id ", dev_id, " - ", dev_info.get('name'))


In [None]:
def rms(frame, normalize=1.0 / 32768.0):
  """Return the RMS level of a frame of 16-bit PCM audio, multiplied by 1000.

  Args:
      frame (bytes)    : Raw audio made of native-endian signed 16-bit samples.
      normalize (float): Factor applied to every sample before squaring. The
                         default maps the int16 range onto [-1, 1); it replaces
                         the `short_normalize` global the notebook never defined.
  Return:
      float: sqrt(mean(sample^2)) * 1000, or 0.0 when the frame holds no
             complete sample.
  """
  import math  # local import: `math` is not part of the notebook's import cell

  sample_width = struct.calcsize("h")       # 2 bytes, fixed by the "h" format below
  count        = len(frame) // sample_width
  if count == 0:
      return 0.0                            # avoid dividing by zero on an empty frame

  # Ignore a trailing partial sample so unpack always gets whole int16 values.
  shorts      = struct.unpack(f"{count}h", frame[:count * sample_width])
  sum_squares = 0.0
  for sample in shorts:
      n            = sample * normalize
      sum_squares += n * n

  return math.sqrt(sum_squares / count) * 1000

def resample(audio, input_rate, output_rate):
  """
  Resample a buffer of 16-bit PCM audio from input_rate to output_rate.

  Needed because ALSA only supports 44100 or 48000 sampling rates while the
  model expects a different one.

  Args:
      audio (binary)   : Input audio stream (native-endian int16 samples)
      input_rate (int) : Audio rate to resample from
      output_rate (int): Audio rate to resample to
  Return:
      a numpy array of int16 resampled at the proper sample rate
      (empty when the input holds no samples)
  """
  from scipy import signal  # local import: the notebook only imports scipy.io.wavfile

  audio_i16     = np.frombuffer(buffer=audio, dtype=np.int16)
  # Multiply before dividing so the length is not off by one through float rounding.
  resample_size = int(len(audio_i16) * output_rate // input_rate)
  if resample_size <= 0:
      return np.zeros(0, dtype=np.int16)

  resampled = signal.resample(audio_i16, resample_size)

  # The resampled signal is float and can overshoot the int16 range near sharp
  # transitions. Round and clip before casting, otherwise out-of-range values
  # wrap around into loud clicks.
  limits  = np.iinfo(np.int16)
  out_i16 = np.clip(np.rint(resampled), limits.min, limits.max).astype(np.int16)

  return out_i16

In [None]:
# Record t_record_s seconds from the microphone through PyAudio, resample every
# chunk to target_rate and write the result to audio_file.
# NOTE(review): `chunk`, `format` and `frames_per_buffer` are not defined in any
# visible cell, and `sample_rate` is only set by the cell that reads the wav
# file. Define them before running this cell.
n_chunks = int(t_record_s * sample_rate / chunk)
p        = pyaudio.PyAudio()
frames   = []
rms_val  = 0.0                                  # stays 0.0 if no chunk is read

try:
    stream = p.open(
        format   = format,  channels          = channels,          rate = sample_rate,
        input    = True,    frames_per_buffer = frames_per_buffer, input_device_index=device_index)
    try:
        print(f"-----Now Recording for {t_record_s} s-----")
        # Read exactly the number of chunks that covers the requested duration
        # (previously a hard-coded 400 iterations, unrelated to t_record_s).
        for _ in range(n_chunks):
            audio_data = stream.read(chunk, exception_on_overflow = False)
            rms_val    = rms(audio_data)
            frames.append(resample(audio_data, sample_rate, target_rate))

        print(f'-----End Recording----- Last RMS: {rms_val:6.2f} ')
    finally:
        stream.stop_stream()    # Stop Audio Recording  IMPORTANT
        stream.close()          # Close Audio Recording IMPORTANT

    sample_width = p.get_sample_size(format)
finally:
    p.terminate()               # Release the PyAudio instance, even on failure

# The context manager closes the file even if writing fails.
with wave.open(audio_file, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(sample_width)
    wf.setframerate(target_rate)
    wf.writeframes(b''.join(frames))