In [1]:
!pip install moshi

Collecting moshi
  Downloading moshi-0.2.11-py3-none-any.whl.metadata (8.2 kB)
Collecting safetensors<0.6,>=0.4.0 (from moshi)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting huggingface-hub<0.34,>=0.24 (from moshi)
  Downloading huggingface_hub-0.33.5-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes<0.46,>=0.45 (from moshi)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting sentencepiece==0.2 (from moshi)
  Downloading sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting sounddevice==0.5 (from moshi)
  Downloading sounddevice-0.5.0-py3-none-any.whl.metadata (1.4 kB)
Collecting sphn<0.2.0,>=0.1.4 (from moshi)
  Downloading sphn-0.1.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting torch<2.8,>=2.2.0 (from moshi)
  Downloading torch-2.7.1-cp312-cp312-manylinux_2_28_x86_

In [3]:
import torch
import librosa
import soundfile as sf
import subprocess
import os
import sys
from IPython.display import display, Audio
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Report whether torch can see a CUDA device in this session.
device = 'cpu'
if torch.cuda.is_available():
  device = "cuda"
print(device)

cuda


In [5]:
# Sanity check that the moshi package is importable in this kernel.
try:
  import moshi
except ImportError:
  # Only a failed import means "not installed"; any other error should surface.
  print("moshi is not installed")
else:
  print("moshi is installed")
  # getattr keeps a missing version attribute from being reported as "not installed".
  print(f"Moshi Version: {getattr(moshi, '__version__', 'unknown')}")

moshi is installed
Moshi Version: 0.2.11


In [6]:
def download_sample_files():
  """Make sure the demo audio clips exist locally and report which are usable.

  Each clip is fetched with ``wget`` only when it is not already present in the
  working directory.

  Returns:
    list[str]: names of the clips that are on disk after the call, in the order
    they are listed below. Clips that were already present are included, so
    re-running the cell does not hide them from the cells that consume the list.
  """
  sample_files = [
      ("https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3", "bria.mp3"),
      ("https://github.com/kyutai-labs/moshi/raw/refs/heads/main/data/sample_fr_hibiki_crepes.mp3", "sample_fr_hibiki_crepes.mp3"),
  ]
  available_files = []
  for url, filename in sample_files:
    if os.path.exists(filename):
      print(f"{filename} already exists, skipping download")
      available_files.append(filename)
      continue

    print(f"Downloading {filename} from {url}")
    try:
      exit_code = subprocess.call(["wget", url, "-O", filename])
    except OSError as e:
      # Raised when the wget executable itself cannot be started.
      print(f"Failed to download {filename}: {e}")
      continue

    if exit_code != 0:
      print(f"Failed to download {filename} (wget exit code {exit_code})")
      # A failed `wget -O` can leave an empty or partial target behind; remove it
      # so the next run retries instead of treating it as a finished download.
      if os.path.exists(filename):
        os.remove(filename)
      continue

    print(f"Downloaded {filename}")
    available_files.append(filename)
  return available_files


In [7]:
# Fetch the demo clips; the returned list gates which samples the STT test cell runs.
sample_files = download_sample_files()

Downloading bria.mp3 from https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3
Downloaded bria.mp3
Downloading sample_fr_hibiki_crepes.mp3 from https://github.com/kyutai-labs/moshi/raw/refs/heads/main/data/sample_fr_hibiki_crepes.mp3
Downloaded sample_fr_hibiki_crepes.mp3


In [11]:
def run_kyutai_stt(audio_file,model="kyutai/stt-1b-en_fr",show_output=True,force_cpu=None):
  """Transcribe an audio file by running ``moshi.run_inference`` in a subprocess.

  Args:
    audio_file: path of the audio file handed to the inference script.
    model: Hugging Face repo id passed as ``--hf-repo``.
    show_output: when true, print the captured stdout under a "Transcript" header.
    force_cpu: ``True`` forces ``--device cpu``, ``False`` forces ``--device cuda``;
      ``None`` (default) uses CUDA when ``torch.cuda.is_available()``.

  Returns:
    The stripped stdout of the inference process, or ``None`` when the process
    fails (after a single CPU retry for CUDA-related failures) or an unexpected
    error occurs.

  NOTE(review): the runs recorded in this notebook show "[Info]" progress lines
  inside the captured stdout, so the returned text may contain them ahead of
  the transcript -- confirm before parsing the result.
  """
  use_cpu = None
  try:
    print(f"Processiing {audio_file}")
    print(f"Using Model {model}")

    if force_cpu is None:
      use_cpu = not torch.cuda.is_available()
    else:
      use_cpu = bool(force_cpu)
    # Report the device on every call, not only when force_cpu was given.
    print("Using CPU" if use_cpu else "Using GPU")

    cmd = [
        sys.executable,
        "-m",
        "moshi.run_inference",
        "--hf-repo",
        model,
        "--device", "cpu" if use_cpu else "cuda",
        audio_file
    ]
    result = subprocess.run(cmd, capture_output=True, text=True,check=True)
    if show_output:
      print("Transcript")
      print(result.stdout)
    return result.stdout.strip()

  except subprocess.CalledProcessError as e:
    print(f"Error: {e}")
    print(f"Error output: {e.stderr}")
    stderr_text = str(e.stderr)
    cuda_failure = "NVIDIA driver" in stderr_text or "cuda" in stderr_text.lower()
    # Retry only if the failed attempt really ran on the GPU, and force the CPU
    # for the retry; otherwise the same failing command would be re-run forever.
    if cuda_failure and use_cpu is False:
      print("CUDA Failed, trying with CPU")
      return run_kyutai_stt(audio_file,model,show_output,force_cpu=True)
    return None
  except Exception as e:
    print(f"Unexpected error: {e}")
    return None

In [12]:
def run_kyutai_stt_with_timestamps(audio_file,model="kyutai/stt-1b-en_fr"):
  """Run ``run_kyutai_stt`` quietly and print the transcript it returns.

  Despite the name, this wrapper adds no timestamp information: it delegates
  to ``run_kyutai_stt`` with ``show_output=False`` and echoes a non-empty result.

  Returns:
    Whatever ``run_kyutai_stt`` returned, or ``None`` if an exception was raised.
  """
  print(f"Processing with timestamp: {audio_file}")
  try:
    transcript = run_kyutai_stt(audio_file, model, show_output=False)
    if not transcript:
      # Nothing to echo; hand the empty/None result straight back.
      return transcript
    print("Transcription")
    print(transcript)
    return transcript
  except Exception as err:
    print(f"Error: {err}")
    return None

In [13]:
# Smoke test: transcribe each bundled sample that is listed in `sample_files`.
print("\n" + "="*60, "Testing KYUTAI STT", "="*60, sep="\n")

# English sample
if "bria.mp3" in sample_files:
  print("\n Testing English audio (bria.mp3):")
  display(Audio("bria.mp3"))
  english_result = run_kyutai_stt("bria.mp3", model="kyutai/stt-1b-en_fr")

# French sample (same model as the English run)
if "sample_fr_hibiki_crepes.mp3" in sample_files:
  print("\n Testing French audio (sample_fr_hibiki_crepes.mp3):")
  display(Audio("sample_fr_hibiki_crepes.mp3"))
  french_result = run_kyutai_stt("sample_fr_hibiki_crepes.mp3", model="kyutai/stt-1b-en_fr")


Testing KYUTAI STT

 Testing English audio (bria.mp3):


Processiing bria.mp3
Using Model kyutai/stt-1b-en_fr
Transcript
[1;34m[Info][0m retrieving checkpoint
[1;34m[Info][0m loading mimi
[1;34m[Info][0m mimi loaded
[1;34m[Info][0m loading moshi
[1;34m[Info][0m moshi loaded
[1;34m[Info][0m loading input file bria.mp3
 In the heart of an ancient forest where the trees whispered secrets of the past, there lived a peculiar rabbit named Luna. Unlike any other rabbit, Luna was born with wings, a rare gift that she had yet to understand the purpose of. Each night, under the glow of the moon, she would gaze up at the stars, wondering if there was more to her existence. One evening, as the forest bathed in silvery moonlight, Luna discovered a clearing she had never seen before. In the center stood a crystal clear pond that mirrored the night sky. Drawn to its beauty, Luna approached the pond and, for the first time, unfolded her wings. As she touched the water's surface with her paw, the pond rippled, and the reflection of the stars bega

Processiing sample_fr_hibiki_crepes.mp3
Using Model kyutai/stt-1b-en_fr
Transcript
[1;34m[Info][0m retrieving checkpoint
[1;34m[Info][0m loading mimi
[1;34m[Info][0m mimi loaded
[1;34m[Info][0m loading moshi
[1;34m[Info][0m moshi loaded
[1;34m[Info][0m loading input file sample_fr_hibiki_crepes.mp3
 Bonjour, aujourd'hui, nous allons préparer des crêpes. Pour cela, il vous fera de la farine, des œufs, du lait, une pincée de sel, du sucre et du beurre. Pour commencer, mettez la farine dans un saladier avec le sel et le sucre. Faites un puits au milieu et versez-y les œufs. Commencez à mélanger doucement. Quand le mélange devient épais, ajoutez le lait froid petit à petit. Quand tout le lait est mélangé, la pâte de tétras est fluide. Si elle vous paraît trop épaisse, rajoutez un peu de lait. Ajoutez ensuite le beurre fondu refroidi, mélangez bien. Faites cuire les crêpes dans une poêle chaude, versez une petite couche de pâte dans la poêle, faites un mouvement de rotation pour

Hell_nah
None


In [14]:
def upload_and_transcribe_kyutai():
  """Upload audio through the Colab file picker and transcribe every file.

  For each uploaded file: try to show an audio preview, run ``run_kyutai_stt``
  with the ``kyutai/stt-1b-en_fr`` model, write a ``<stem>_transcription.txt``
  report when a transcript came back, then delete the uploaded audio from the
  working directory.

  Relies on ``google.colab.files`` for the upload dialog.
  """
  # Imported lazily; presumably only importable on a Colab runtime.
  from google.colab import files
  print("#Utility to upload your file")
  uploaded_files = files.upload()

  model = "kyutai/stt-1b-en_fr"
  for filename in uploaded_files.keys():
    print(f"Processing {filename}")
    try:
      display(Audio(filename))
    except Exception as e:
      # The preview is cosmetic: report it and keep going instead of exiting,
      # which would also have skipped the clean-up of the uploaded file.
      print(f"Failed to display {filename} ({e})")

    try:
      result = run_kyutai_stt(filename,model)
      if result:
        output_filename = f"{os.path.splitext(filename)[0]}_transcription.txt"
        with open(output_filename, "w",encoding='utf-8') as f:
          f.write(f"File: {filename}\n")
          f.write(f"Model: {model}\n")
          f.write(f"Transcription: {result}\n")
        print(f"Saved Transcription to: {output_filename}")
    finally:
      # Remove the uploaded audio even if writing the report failed.
      if os.path.exists(filename):
        os.remove(filename)


In [15]:
# Interactive step: opens the Colab upload dialog and transcribes the chosen files.
upload_and_transcribe_kyutai()

#Utility to upload your file


Saving harvard.wav to harvard.wav
Processing harvard.wav


Processiing harvard.wav
Using Model kyutai/stt-1b-en_fr
Transcript
[1;34m[Info][0m retrieving checkpoint
[1;34m[Info][0m loading mimi
[1;34m[Info][0m mimi loaded
[1;34m[Info][0m loading moshi
[1;34m[Info][0m moshi loaded
[1;34m[Info][0m loading input file harvard.wav
 The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite. A zestful food is the hot cross bun.
Saved Transcription to: harvard_transcription.txt


In [17]:
def run_kyutai_stt_with_vad(audio_file,model="kyutai/stt-1b-en_fr",show_vad=True,show_timestamps=True,force_cpu=None):
  """Transcribe a file with ``moshi.run_inference`` and print heuristic speech stats.

  Args:
    audio_file: path of the audio file handed to the inference script.
    model: Hugging Face repo id; anything other than ``kyutai/stt-1b-en_fr`` is
      replaced by that model after printing a notice.
    show_vad: when true, call ``simulate_vad_from_transcription`` on the result.
    show_timestamps: when true and the transcript is non-empty, call
      ``analyze_speech_patterns`` on it.
    force_cpu: ``True`` forces ``--device cpu``, ``False`` forces ``--device cuda``;
      ``None`` (default) uses CUDA when ``torch.cuda.is_available()``.

  Returns:
    dict with keys ``transcript``, ``model``, ``device`` and ``vad_enabled``
    (always ``True``), or ``None`` when inference fails (after a single CPU
    retry for CUDA-related failures) or an unexpected error occurs.
  """
  if model != "kyutai/stt-1b-en_fr":
    print("VAD is not supported for this model")
    model = "kyutai/stt-1b-en_fr"
  device_str = None
  try:
    print(f"Processiing {audio_file}")
    print(f"Using Model {model}")
    if force_cpu is None:
      use_cpu = not torch.cuda.is_available()
    else:
      use_cpu = bool(force_cpu)
    device_str = "cpu" if use_cpu else "cuda"
    print(f"Using Device: {device_str}")

    cmd = [
        sys.executable,
        "-m",
        "moshi.run_inference",
        "--hf-repo",
        model,
        "--device", device_str,
        audio_file
    ]
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        check=True
    )
    transcript= result.stdout.strip()
    print("Transcription:",transcript)

    if show_timestamps and transcript:
      print("Processing:")
      analyze_speech_patterns(transcript)

    if show_vad:
      print("VAD Processing:")
      simulate_vad_from_transcription(audio_file,transcript)

    return {
      "transcript": transcript,
      "model" : model,
      "device" : device_str,
      "vad_enabled" : True
    }
  except subprocess.CalledProcessError as e:
    print(f"Error: {e}")
    print(f"Error output: {e.stderr}")

    stderr_text = str(e.stderr)
    cuda_failure = "NVIDIA driver" in stderr_text or "cuda" in stderr_text.lower()
    # Retry once on the CPU, and only if the failed attempt used the GPU.
    # Re-calling with the same auto-detected device would recurse without end.
    if cuda_failure and device_str == "cuda":
      print("CUDA Failed, trying with CPU")
      return run_kyutai_stt_with_vad(audio_file,model,show_vad,show_timestamps,force_cpu=True)
    return None
  except Exception as e:
    print(f"Unexpected error: {e}")
    return None

def analyze_speech_patterns(transcript):
  """Print rough word-count statistics for a transcript.

  Prints the word count, a speaking-time estimate (0.4 seconds per word), the
  number of ``. , ? !`` characters as a pause count, and a density label
  ("High" above 50 words, "Medium" above 20, otherwise "Low").
  Nothing is printed for an empty or ``None`` transcript. Returns ``None``.
  """
  if not transcript:
    return

  word_count = len(transcript.split())
  print(f"Total Words: {word_count}")
  print(f"Speech segments detected: {word_count} words")
  print(f"Estimated speaking time: ~{word_count * 0.4:.1f} seconds")

  # Punctuation is used as a stand-in for pauses.
  pause_indicator = sum(transcript.count(mark) for mark in ".,?!")
  print(f"Pauses: {pause_indicator}")

  if word_count>0:
    # Resolved outside the f-string: reusing the same quote character inside a
    # replacement field is a SyntaxError on Python versions before 3.12.
    if word_count>50:
      density = "High"
    elif word_count>20:
      density = "Medium"
    else:
      density = "Low"
    print(f"Speech Density: {density}")

import numpy as np
def simulate_vad_from_transcription(audio_file,transcript):
  """Print energy-based voice/silence estimates for an audio file.

  The file is loaded with librosa (requested at 16 kHz) and per-frame RMS
  energy is compared against two thresholds derived from the mean RMS:
  frames above 50% of the mean count as voice, frames below 10% as silence.
  When a transcript is given, a words-per-second figure is printed as well.

  Any exception is printed as ``Error: ...``. Always returns ``None``.
  """
  try:
    samples, sample_rate = librosa.load(audio_file, sr=16000)
    clip_seconds = len(samples) / sample_rate
    frame_rms = librosa.feature.rms(y=samples)[0]
    mean_rms = np.mean(frame_rms)

    # Share of frames louder than half the mean energy.
    voiced_share = np.mean(frame_rms > mean_rms * 0.5) * 100

    print(f"Audio duration: {clip_seconds:.2f} seconds")
    print(f"Estimated voice activity: {voiced_share:.1f}% of audio")
    ratio_label = 'Good' if 40 < voiced_share < 60 else 'Bad'
    print(f"Speech/Silence Ratio: {ratio_label}")

    # Share of frames quieter than a tenth of the mean energy.
    quiet_share = np.mean(frame_rms < mean_rms * 0.1) * 100
    print(f"Estimated silence: {quiet_share:.1f}% of audio")

    if transcript:
      speaking_rate = len(transcript.split()) / clip_seconds
      print(f"Estimated words per second: {speaking_rate:.1f}")

  except Exception as err:
    print(f"Error: {err}")
  return None

In [24]:
def real_time_vad_simulation(audio_file, chunk_duration=2.0, energy_threshold=0.01):
    """Print a chunk-by-chunk speech/silence report based on RMS energy.

    The file is loaded with librosa (requested at 16 kHz), cut into consecutive
    chunks of ``chunk_duration`` seconds, and each chunk is labelled "Speech"
    when its RMS energy exceeds ``energy_threshold``, otherwise "Silence".

    Args:
        audio_file: path of the audio file to analyse.
        chunk_duration: chunk length in seconds; must cover at least one sample.
        energy_threshold: RMS level above which a chunk counts as speech.

    Any exception (including an unusable ``chunk_duration``) is printed as
    ``Error: ...``. Returns ``None``.
    """
    print(f"Simulating real-time VAD for {audio_file}")
    try:
        audio, sr = librosa.load(audio_file, sr=16000)
        total_duration = len(audio) / sr
        chunk_samples = int(chunk_duration * sr)
        if chunk_samples <= 0:
            raise ValueError(
                f"chunk_duration={chunk_duration} is shorter than one sample at {sr} Hz"
            )
        # Count chunks from the truncated sample count actually used for slicing
        # (ceiling division); deriving it from the float durations can disagree
        # with the loop below and print totals such as "[4/3]".
        num_chunks = -(-len(audio) // chunk_samples)

        print(f"Total Duration: {total_duration:.2f} seconds")
        print("Real time VAD Simulation")

        for i, start_sample in enumerate(range(0, len(audio), chunk_samples)):
            end_sample = min(start_sample + chunk_samples, len(audio))
            chunk = audio[start_sample:end_sample]

            rms = np.sqrt(np.mean(chunk**2))
            is_speech = rms > energy_threshold
            start_time = start_sample / sr
            end_time = end_sample / sr

            status = "Speech" if is_speech else "Silence"
            print(f"[{i+1}/{num_chunks}] {start_time:.2f}s - {end_time:.2f}s: {status} (Energy: {rms:.4f})")

    except Exception as e:
        print(f"Error: {e}")

def batch_vad_analysis(audio_files):
    """Run ``run_kyutai_stt_with_vad`` over several files and collect the results.

    Paths that do not exist are reported and skipped. Existing files are
    processed with ``show_vad=True`` and ``show_timestamps=False``.

    Returns:
        list of the values returned for the existing files, in input order.
    """
    print("Batch VAD Analysis")
    collected = []
    for path in audio_files:
        print(f"Processing {path}")
        if not os.path.exists(path):
            print(f"{path} not found")
            continue
        outcome = run_kyutai_stt_with_vad(path, show_vad=True, show_timestamps=False)
        collected.append(outcome)
    return collected

In [19]:
def test_vad_features():
  """Demo run: transcribe the bundled samples and print the simulated VAD report.

  Looks for ``bria.mp3`` and ``sample_fr_hibiki_crepes.mp3`` in the working
  directory; for each one found it shows an audio player, calls
  ``run_kyutai_stt_with_vad`` and then ``real_time_vad_simulation``.
  Prints "No files" and returns when neither sample is present.
  """
  print("VAD-Semantic")

  candidates = ('bria.mp3', 'sample_fr_hibiki_crepes.mp3')
  available_files = list(filter(os.path.exists, candidates))

  if len(available_files) == 0:
    print("No files")
    return

  for clip in available_files:
    print(f"\nProcessing {clip}")
    display(Audio(clip))
    # The returned summary is not used here; the call is made for its printed output.
    run_kyutai_stt_with_vad(clip)
    print("Real time VAD Simulation:")
    real_time_vad_simulation(clip)

In [20]:
def upload_and_analyze_vad():
    """Upload audio through the Colab file picker and save a VAD report per file.

    For each uploaded file: try to show an audio preview, run
    ``run_kyutai_stt_with_vad`` and ``real_time_vad_simulation`` (1.5 s chunks),
    write a ``<stem>_vad_analysis.txt`` report when a result came back, then
    delete the uploaded audio from the working directory.

    Relies on ``google.colab.files`` for the upload dialog.
    """
    # Imported lazily; presumably only importable on a Colab runtime.
    from google.colab import files

    print("Upload audio files for VAD analysis:")
    uploaded_files = files.upload()

    for filename in uploaded_files.keys():
        print(f"\nVAD Analysis for: {filename}")
        try:
            display(Audio(filename))
        except Exception:
            # The preview is cosmetic; keep going without it.
            print("Could not display audio preview")

        try:
            result = run_kyutai_stt_with_vad(filename, show_vad=True, show_timestamps=True)

            real_time_vad_simulation(filename, chunk_duration=1.5)

            if result:
                output_filename = f"{os.path.splitext(filename)[0]}_vad_analysis.txt"
                with open(output_filename, 'w', encoding='utf-8') as f:
                    f.write("VAD Analysis Results\n")
                    f.write(f"{'='*20}\n\n")
                    f.write(f"File: {filename}\n")
                    f.write(f"Model: {result['model']}\n")
                    f.write(f"Device: {result['device']}\n")
                    f.write(f"VAD Enabled: {result['vad_enabled']}\n\n")
                    # The result dict stores the text under "transcript";
                    # reading "transcription" raised KeyError.
                    f.write(f"Transcription:\n{result['transcript']}\n")

                print(f"Saved VAD analysis to: {output_filename}")
        finally:
            # Remove the uploaded audio even if writing the report failed.
            if os.path.exists(filename):
                os.remove(filename)

In [26]:
# Run the STT + simulated-VAD demo on whichever bundled sample clips are on disk.
test_vad_features()

VAD-Semantic

Processing bria.mp3


Processiing bria.mp3
Using Model kyutai/stt-1b-en_fr
Using Device: cuda
Transcription: [1;34m[Info][0m retrieving checkpoint
[1;34m[Info][0m loading mimi
[1;34m[Info][0m mimi loaded
[1;34m[Info][0m loading moshi
[1;34m[Info][0m moshi loaded
[1;34m[Info][0m loading input file bria.mp3
 In the heart of an ancient forest where the trees whispered secrets of the past, there lived a peculiar rabbit named Luna. Unlike any other rabbit, Luna was born with wings, a rare gift that she had yet to understand the purpose of. Each night, under the glow of the moon, she would gaze up at the stars, wondering if there was more to her existence. One evening, as the forest bathed in silvery moonlight, Luna discovered a clearing she had never seen before. In the center stood a crystal clear pond that mirrored the night sky. Drawn to its beauty, Luna approached the pond and, for the first time, unfolded her wings. As she touched the water's surface with her paw, the pond rippled, and the refle

Processiing sample_fr_hibiki_crepes.mp3
Using Model kyutai/stt-1b-en_fr
Using Device: cuda
Transcription: [1;34m[Info][0m retrieving checkpoint
[1;34m[Info][0m loading mimi
[1;34m[Info][0m mimi loaded
[1;34m[Info][0m loading moshi
[1;34m[Info][0m moshi loaded
[1;34m[Info][0m loading input file sample_fr_hibiki_crepes.mp3
 Bonjour, aujourd'hui, nous allons préparer des crêpes. Pour cela, il vous fera de la farine, des œufs, du lait, une pincée de sel, du sucre et du beurre. Pour commencer, mettez la farine dans un saladier avec le sel et le sucre. Faites un puits au milieu et versez-y les œufs. Commencez à mélanger doucement. Quand le mélange devient épais, ajoutez le lait froid petit à petit. Quand tout le lait est mélangé, la pâte de tétras est fluide. Si elle vous paraît trop épaisse, rajoutez un peu de lait. Ajoutez ensuite le beurre fondu refroidi, mélangez bien. Faites cuire les crêpes dans une poêle chaude, versez une petite couche de pâte dans la poêle, faites un mou

In [25]:
# Chunk-by-chunk energy report for the English sample, using 2-second chunks.
real_time_vad_simulation("bria.mp3", chunk_duration=2.0)

Simulating real-time VAD for bria.mp3
Total Duration: 44.85 seconds
Real time VAD Simulation
[1/23] 0.00s - 2.00s: Speech (Energy: 0.1309)
[2/23] 2.00s - 4.00s: Speech (Energy: 0.0881)
[3/23] 4.00s - 6.00s: Speech (Energy: 0.1221)
[4/23] 6.00s - 8.00s: Speech (Energy: 0.1137)
[5/23] 8.00s - 10.00s: Speech (Energy: 0.1188)
[6/23] 10.00s - 12.00s: Speech (Energy: 0.1012)
[7/23] 12.00s - 14.00s: Speech (Energy: 0.0728)
[8/23] 14.00s - 16.00s: Speech (Energy: 0.0952)
[9/23] 16.00s - 18.00s: Speech (Energy: 0.1179)
[10/23] 18.00s - 20.00s: Speech (Energy: 0.1117)
[11/23] 20.00s - 22.00s: Speech (Energy: 0.0624)
[12/23] 22.00s - 24.00s: Speech (Energy: 0.1133)
[13/23] 24.00s - 26.00s: Speech (Energy: 0.1185)
[14/23] 26.00s - 28.00s: Speech (Energy: 0.1130)
[15/23] 28.00s - 30.00s: Speech (Energy: 0.0850)
[16/23] 30.00s - 32.00s: Speech (Energy: 0.1041)
[17/23] 32.00s - 34.00s: Speech (Energy: 0.0868)
[18/23] 34.00s - 36.00s: Speech (Energy: 0.1043)
[19/23] 36.00s - 38.00s: Speech (Energy: 0.