# WhisPeriment
Experimenting with the Whisper model
- Record an audio sequence in the defined language
- Allow replaying it
- Provide the text in the defined language

In [None]:
# Things to execute to have the right environment to run this
"""
conda env remove -n hugg
conda create --name hugg python=3.10 -y
conda activate hugg
pip install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
pip install transformers[torch]
pip install -U numpy scipy ipywidgets ipykernel 
pip install -U accelerate 
pip install -U sentencepiece
pip install -U pvrecorder -q
pip install -U ffmpeg
"""

# ffmpeg needs to be installed on your computer with
"""
sudo apt install ffmpeg
"""


In [1]:
import struct
import wave

import numpy as np

from scipy.io        import wavfile
from pvrecorder      import PvRecorder
from transformers    import WhisperProcessor, WhisperForConditionalGeneration, __version__ 
#from transformers    import pipeline, WhisperTokenizer, WhisperModel, WhisperFeatureExtractor, __version__ 
from IPython.display import Audio, display

#print(f"pyaudio      : {      pyaudio.__version__}")
# Print one aligned "name : version" line per library used by this notebook
version_report = (
    ("wave", "Nov version infos available"),
    ("numpy", np.__version__),
    ("transformers", __version__),
)
for library_name, library_version in version_report:
    print(f"{library_name:<13}: {library_version}")

wave         : Nov version infos available
numpy        : 1.24.1
transformers : 4.25.1


# Load the Whisper model
Choose the model depending on the execution time and the available GPU memory:
- 12G -> large (the large-v2 model improves on the original large model)
-  6G -> medium

In [2]:
# Show every audio input device reported by PvRecorder, prefixed with its index

input_devices = PvRecorder.get_audio_devices()
for slot, device_name in zip(range(len(input_devices)), input_devices):
    print(f"[{slot}] {device_name}")

[0] Monitor of Starship/Matisse HD Audio Controller Analog Stereo
[1] Starship/Matisse HD Audio Controller Analog Stereo
[2] Monitor of TU106 High Definition Audio Controller Digital Stereo (HDMI)
[3] WH-1000XM3
[4] Monitor of WH-1000XM3


In [3]:
# --- Recording ---
device_index = 3      # index of the input device handed to PvRecorder
t_record_s = 15       # recording duration, in seconds

# --- Audio format ---
channels = 1
target_rate = 16000   # samples per second of the saved WAV file
frame_length = 512    # frame length handed to PvRecorder (32 ms at 16 kHz)

# --- Transcription / output ---
language = 'fr'       # language passed to the Whisper decoder prompt
audio_file = 'record.wav'


In [4]:
# Load the processor and the generation model from the same large-v2 checkpoint
whisper_checkpoint = "openai/whisper-large-v2"
processor, model = (
    WhisperProcessor.from_pretrained(whisper_checkpoint),
    WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint),
)

# Force the decoder prompt to "transcribe" in the configured language
decoder_prompt_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
model.config.forced_decoder_ids = decoder_prompt_ids

In [18]:
# Record roughly `t_record_s` seconds from the selected input device and save
# the samples as a 16-bit PCM WAV file.
n_frames = int(1 + t_record_s * target_rate / frame_length)   # number of read() calls to cover the duration
recorder = PvRecorder(device_index=device_index, frame_length=frame_length) #(32 milliseconds of 16 kHz audio)
audio    = []

# Always release the recorder, even if a read fails part-way through;
# otherwise the audio device stays held by the kernel process.
try:
    recorder.start()
    try:
        for _ in range(n_frames):
            audio.extend(recorder.read())
    finally:
        recorder.stop()
finally:
    recorder.delete()

with wave.open(audio_file, 'wb') as f:
    f.setnchannels(1)            # single channel, as in the original header
    f.setsampwidth(2)            # 2 bytes per sample (signed 16-bit)
    f.setframerate(target_rate)
    # The frame count is left to the wave module instead of being forced to
    # `frame_length`, which is not the number of recorded samples.
    # WAV data is little-endian, so pack explicitly with "<" rather than in
    # the machine's native byte order.
    f.writeframes(struct.pack(f"<{len(audio)}h", *audio))

In [6]:
# Build an auto-playing audio widget for the recorded file and render it
playback_widget = Audio(audio_file, autoplay=True, rate=target_rate)
display(playback_widget)


In [20]:

# Read the recording back from disk: wavfile.read yields (sample rate, samples)
wav_contents = wavfile.read(audio_file)
sample_rate, input_speech = wav_contents
print(f"Audio file recorded at: {sample_rate} samples per seconds")


Audio file recorded at: 16000 samples per seconds


In [21]:
# The Whisper feature extractor expects a floating-point waveform scaled to
# [-1, 1], while wavfile.read() returns the raw integer PCM samples of a
# 16-bit WAV file. Rescale integer input before extracting features.
speech = np.asarray(input_speech)
if np.issubdtype(speech.dtype, np.signedinteger):
    full_scale = float(np.iinfo(speech.dtype).max) + 1.0   # 32768 for int16
    speech = speech.astype(np.float32) / full_scale

# Features -> generated token ids -> decoded text (special tokens stripped)
input_features = processor(speech, return_tensors="pt", sampling_rate=sample_rate).input_features
predicted_ids  = model.generate(input_features, max_new_tokens=100)
transcription  = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)


[" Ceci est un enregistrement d'essai de 15 secondes pour tester les fonctionnalités de Whisper et voir si la transcription est bien faite. Donc au bout de 15 secondes cet enregistrement s'arrêtera."]


# DUMPSTER

In [None]:
# NOTE(review): `whisper` is not assigned or imported in any cell of this
# notebook as shown — confirm where it is supposed to come from.
help(whisper)

In [None]:
# Call `whisper` on the recorded file with the generation settings below and print the result
whisper_options = dict(
    sample_rate=target_rate,
    framework="pt",
    max_new_tokens=100,
    max_length=100,
)
text = whisper(audio_file, **whisper_options)
print(text)


In [None]:


import ipywidgets as widgets

languages = {"af_za": "Afrikaans", "am_et": "Amharic", "ar_eg": "Arabic", "as_in": "Assamese", "az_az": "Azerbaijani", "be_by": "Belarusian", "bg_bg": "Bulgarian", "bn_in": "Bengali", "bs_ba": "Bosnian", "ca_es": "Catalan", "cmn_hans_cn": "Chinese", "cs_cz": "Czech", "cy_gb": "Welsh", "da_dk": "Danish", "de_de": "German", "el_gr": "Greek", "en_us": "English", "es_419": "Spanish", "et_ee": "Estonian", "fa_ir": "Persian", "fi_fi": "Finnish", "fil_ph": "Tagalog", "fr_fr": "French", "gl_es": "Galician", "gu_in": "Gujarati", "ha_ng": "Hausa", "he_il": "Hebrew", "hi_in": "Hindi", "hr_hr": "Croatian", "hu_hu": "Hungarian", "hy_am": "Armenian", "id_id": "Indonesian", "is_is": "Icelandic", "it_it": "Italian", "ja_jp": "Japanese", "jv_id": "Javanese", "ka_ge": "Georgian", "kk_kz": "Kazakh", "km_kh": "Khmer", "kn_in": "Kannada", "ko_kr": "Korean", "lb_lu": "Luxembourgish", "ln_cd": "Lingala", "lo_la": "Lao", "lt_lt": "Lithuanian", "lv_lv": "Latvian", "mi_nz": "Maori", "mk_mk": "Macedonian", "ml_in": "Malayalam", "mn_mn": "Mongolian", "mr_in": "Marathi", "ms_my": "Malay", "mt_mt": "Maltese", "my_mm": "Myanmar", "nb_no": "Norwegian", "ne_np": "Nepali", "nl_nl": "Dutch", "oc_fr": "Occitan", "pa_in": "Punjabi", "pl_pl": "Polish", "ps_af": "Pashto", "pt_br": "Portuguese", "ro_ro": "Romanian", "ru_ru": "Russian", "sd_in": "Sindhi", "sk_sk": "Slovak", "sl_si": "Slovenian", "sn_zw": "Shona", "so_so": "Somali", "sr_rs": "Serbian", "sv_se": "Swedish", "sw_ke": "Swahili", "ta_in": "Tamil", "te_in": "Telugu", "tg_tj": "Tajik", "th_th": "Thai", "tr_tr": "Turkish", "uk_ua": "Ukrainian", "ur_pk": "Urdu", "uz_uz": "Uzbek", "vi_vn": "Vietnamese", "yo_ng": "Yoruba"}
# Dropdown entries: two placeholder rows first, then every language as
# "Name (code)" sorted by that label, each carrying its code as the value.
placeholder_entries = [("Select language", None), ("----------", None)]
language_entries = sorted((f"{name} ({code})", code) for code, name in languages.items())

selection = widgets.Dropdown(
    options=placeholder_entries + language_entries,
    value="ko_kr",
    description='Language:',
    disabled=False,
)

selection



In [None]:


# Read the language code currently selected in the dropdown.
lang = selection.value

# Validate before the lookup: the placeholder rows carry None, and
# languages[None] would raise a KeyError before this message could be shown.
assert lang is not None, "Please select a language"

# NOTE(review): this rebinds `language` (set to a language code in the
# configuration cell) to a display name — confirm that is intended.
language = languages[lang]
print(f"Selected language: {language} ({lang})")



In [None]:
#whis      = WhisperProcessor.from_pretrained('openai/whisper-medium', language="French")
#tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="French")

# Load the processor and generation model from the "base" checkpoint
processor         = WhisperProcessor.from_pretrained("openai/whisper-base")
model             = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
#model             = WhisperModel.from_pretrained("openai/whisper-base")
# WhisperFeatureExtractor is not imported by this notebook (its import line is
# commented out), so reuse the extractor the processor already holds instead
# of loading a second copy.
feature_extractor = processor.feature_extractor

# Force the decoder prompt to French transcription
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language = "fr", task = "transcribe")
configuration     = model.config
#print(configuration)

In [None]:
# NOTE(review): `frames` is only assigned by the PyAudio recording cell further
# down in this notebook — that cell has to run before this one.
input_speech   = frames
# The stray extra comma in the call was a syntax error. `frames` holds audio
# resampled to `target_rate`, so that rate is passed to the processor.
input_features = processor(input_speech, return_tensors="pt", sampling_rate=target_rate).input_features
predicted_ids  = model.generate(input_features)
transcription  = processor.batch_decode(predicted_ids)
print(transcription)

In [None]:
# Display the docstring attached to the currently loaded `model` object
model.__doc__

In [None]:
# NOTE(review): this cell uses the API of the standalone `whisper` package
# (load_audio, pad_or_trim, log_mel_spectrogram, detect_language). Neither the
# `whisper` module nor a model loaded through it is created anywhere in this
# notebook — confirm the setup before running.
# The dangling `model.ge` line and the malformed `WhisperModel. .load_audio`
# call were syntax errors; the calls now use `whisper.` like the rest of the cell.

# load the audio, then pad or trim it
audio = whisper.load_audio(audio_file)
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

In [None]:

# NOTE(review): `whis` is only assigned in a commented-out line of an earlier
# cell, and `frames` comes from the PyAudio recording cell further down —
# confirm both exist before running this cell.
text = whis(frames, sampling_rate=target_rate)
print(text)

In [None]:
# instantiate the tokenizer and set the prefix token
#tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="French")

# now switch the prefix token from Spanish to French
#tokenizer.set_prefix_tokens(language="french")

In [None]:
# Identify the audio input devices exposed by host API 0
import pyaudio

p = pyaudio.PyAudio()
try:
    info       = p.get_host_api_info_by_index(0)
    numdevices = info.get('deviceCount')
    for i in range(numdevices):
        # Look the device up once and reuse the result for the test and the print
        device_info = p.get_device_info_by_host_api_device_index(0, i)
        if device_info.get('maxInputChannels') > 0:
            print( "Input Device id ", i, " - ", device_info.get('name'))
finally:
    # Release PortAudio once the listing is done; the recording cell creates
    # its own PyAudio instance.
    p.terminate()


# TESTS
https://github.com/amrrs/pyaudiorec/blob/main/audio_rec.ipynb

!pip3 install pvrecorder -q #install pvrecorder

In [None]:
#load PvRecorder

from pvrecorder import PvRecorder
import wave, struct 

In [None]:
# Print each audio input device known to PvRecorder next to its index

detected_devices = PvRecorder.get_audio_devices()
for position in range(len(detected_devices)):
    print(f"[{position}] {detected_devices[position]}")

In [None]:
def rms(frame, swidth=2, short_normalize=1.0 / 32768.0):
    """Return the RMS value of the frame content, scaled by 1000.

    Args:
        frame (bytes): Raw audio bytes, unpacked as native signed 16-bit samples.
        swidth (int): Bytes per sample. Must stay 2 to match the "h" unpack format.
        short_normalize (float): Factor applied to each sample before squaring
            (default 1/32768).

    Returns:
        float: sqrt(mean((sample * short_normalize) ** 2)) * 1000, or 0.0 when
        the frame does not contain a single complete sample.
    """
    import math  # local import: the notebook's import cell does not import math

    # Integer division: a float count breaks slicing and hides partial samples
    count = len(frame) // swidth
    if count == 0:
        return 0.0   # avoids a division by zero on an empty frame

    # Drop a trailing partial sample so the buffer size matches the format
    shorts      = struct.unpack("%dh" % count, frame[:count * swidth])
    sum_squares = sum((sample * short_normalize) ** 2 for sample in shorts)

    return math.sqrt(sum_squares / count) * 1000

def resample(audio, input_rate, output_rate):
    """
    Resample a raw int16 audio buffer from input_rate to output_rate.

    Args:
        audio (binary)   : Input audio stream, read as int16 samples
        input_rate (int) : Audio rate to resample from
        output_rate (int): Audio rate to resample to
    Return:
        a numpy array of int16 resampled at the proper sample rate
        (empty when the input yields no output sample)
    """
    from scipy import signal  # local import: only scipy.io.wavfile is imported at the top of the notebook

    audio_i16     = np.frombuffer(buffer=audio, dtype=np.int16)
    resample_size = int(len(audio_i16) / input_rate * output_rate)
    if resample_size <= 0:
        return np.zeros(0, dtype=np.int16)

    resampled = signal.resample(audio_i16, resample_size)
    # Resampling can overshoot the int16 range; clip first so the cast
    # saturates instead of wrapping around.
    limits    = np.iinfo(np.int16)
    out_i16   = np.clip(resampled, limits.min, limits.max).astype(np.int16)

    return out_i16

In [None]:
# NOTE(review): `chunk`, `format` and `frames_per_buffer` are not assigned in any
# cell of this notebook, and `sample_rate` is only set by the cell that reads the
# WAV file back — define them before running this cell.
n_chunks = int(t_record_s * sample_rate / chunk)   # reads needed to cover t_record_s seconds
p        = pyaudio.PyAudio()
frames   = []
rms_val  = 0.0   # reported as-is when no chunk gets recorded

try:
    # Query the sample width while the PyAudio instance is still alive
    sample_width = p.get_sample_size(format)
    stream = p.open(
        format = format,  channels          = channels,          rate = sample_rate,
        input  = True,    frames_per_buffer = frames_per_buffer, input_device_index=device_index)
    try:
        print(f"-----Now Recording for {t_record_s} s-----")
        # Loop over the computed chunk count rather than a hard-coded 400 reads
        for _ in range(n_chunks):
            audio_data = stream.read(chunk, exception_on_overflow = False)
            rms_val    = rms(audio_data)
            frames.append(resample(audio_data, sample_rate, target_rate))

        print(f'-----End Recording----- Last RMS: {rms_val:6.2f} ')
    finally:
        stream.stop_stream()    # Stop Audio Recording  IMPORTANT
        stream.close()          # Close Audio Recording IMPORTANT
finally:
    p.terminate()               # Release PortAudio even when recording fails
#print(frames)


# Save the resampled chunks as one WAV file at the target rate
with wave.open(audio_file, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(sample_width)
    wf.setframerate(target_rate)
    wf.writeframes(b''.join(frames))

In [None]:
import transformers
# Print the built-in help for the Whisper feature-extraction module
help(transformers.models.whisper.feature_extraction_whisper)
