In [9]:
import io
import json
import os
import threading
import time
import wave

import pyaudio
from IPython.display import clear_output
from websockets.sync.client import connect

# Soniox API key, read from the SONIOX_API_KEY environment variable so the
# secret never lives in the notebook or its version history.
# NOTE: the key that used to be hardcoded on this line has been exposed to
# everyone with access to this file and should be revoked and replaced.
api_key = os.environ.get("SONIOX_API_KEY", "")
if not api_key:
    print("⚠️ SONIOX_API_KEY is not set; export it before starting a transcription session.")
websocket_url = "wss://stt-rt.soniox.com/transcribe-websocket"

# Audio settings for Device 1 (Realtek)
CHUNK = 4096  # Frames per microphone read (also used as frames_per_buffer)
FORMAT = pyaudio.paInt16  # 16-bit samples; matches the 16 bits advertised in the WAV header
CHANNELS = 1  # Mono
RATE = 16000  # Standard rate for speech recognition
DEVICE_INDEX = 1  # Your Realtek microphone

class SimpleTranscriber:
    """Stream microphone audio to the Soniox WebSocket endpoint and print the transcript.

    The instance owns a PyAudio handle (``self.p``) that ``start_transcription``
    terminates before returning, so an instance is meant for a single session;
    create a new ``SimpleTranscriber`` to start another one.
    """

    def __init__(self):
        # Flag shared between the receive loop and the capture thread; clearing
        # it asks both to stop.
        self.is_recording = False
        self.p = pyaudio.PyAudio()

    def create_wav_header(self, sample_rate, channels, bits_per_sample):
        """Build the 44-byte RIFF/WAVE PCM header sent ahead of the raw audio.

        Args:
            sample_rate: Samples per second.
            channels: Number of interleaved channels.
            bits_per_sample: Bits per sample (16 for ``paInt16``).

        Returns:
            bytes: The header. The RIFF size field is the fixed value 36 and the
            data size field is 0 because the total length of a live stream is
            not known when the header is sent.
        """
        byte_rate = sample_rate * channels * bits_per_sample // 8
        block_align = channels * bits_per_sample // 8

        header = b'RIFF'
        header += (36).to_bytes(4, 'little')  # File size - 8 (placeholder)
        header += b'WAVE'
        header += b'fmt '
        header += (16).to_bytes(4, 'little')  # Format chunk size
        header += (1).to_bytes(2, 'little')   # PCM format
        header += channels.to_bytes(2, 'little')
        header += sample_rate.to_bytes(4, 'little')
        header += byte_rate.to_bytes(4, 'little')
        header += block_align.to_bytes(2, 'little')
        header += bits_per_sample.to_bytes(2, 'little')
        header += b'data'
        header += (0).to_bytes(4, 'little')   # Data size (unknown for a stream)

        return header

    def _pump_audio(self, stream, ws):
        """Capture-thread body: read microphone chunks and forward them to the socket."""
        try:
            while self.is_recording:
                data = stream.read(CHUNK, exception_on_overflow=False)
                # Re-check the flag: it may have been cleared while read() blocked.
                if self.is_recording:
                    ws.send(data)
                time.sleep(0.01)  # Small delay to prevent overwhelming
        except Exception as e:
            print(f"Recording error: {e}")
            # Without audio there is nothing left to transcribe; stop the
            # receive loop instead of letting it spin on timeouts.
            self.is_recording = False

    def _receive_results(self, ws):
        """Read result messages until stopped, an error arrives, or the server reports it finished."""
        final_text = ""

        while self.is_recording:
            try:
                message = ws.recv(timeout=2)
            except TimeoutError:
                # No message within the timeout; poll the stop flag again.
                continue

            result = json.loads(message)

            if result.get("error_code"):
                print(f"❌ Error: {result['error_code']} - {result.get('error_message', '')}")
                break

            # Final tokens are appended to the running transcript; non-final
            # tokens are only shown for the current refresh.
            tokens = [token for token in result.get("tokens", []) if token.get("text")]
            current_text = "".join(t["text"] for t in tokens if t.get("is_final"))
            provisional_text = "".join(t["text"] for t in tokens if not t.get("is_final"))

            if current_text or provisional_text:
                final_text += current_text

                clear_output(wait=True)
                print("🎯 LIVE MALAY/ENGLISH TRANSCRIPTION")
                print("=" * 60)
                print(final_text, end="")
                if provisional_text:
                    # ANSI bright blue marks text that may still change.
                    print(f"\033[94m{provisional_text}\033[0m", end="")
                print("\n" + "=" * 60)
                print("🎤 Listening... (Ctrl+C to stop)")

            if result.get("finished"):
                break

    def _shutdown(self, stream, ws, audio_thread):
        """Stop capture, release the microphone stream and send the end-of-audio frame.

        Each step is attempted even if an earlier one fails, so the stream is
        closed and the end signal is sent whenever possible.
        """
        self.is_recording = False

        # Wait for the capture thread so it is not inside stream.read() when the
        # stream closes and cannot send audio after the end signal.
        if audio_thread is not None:
            audio_thread.join(timeout=2)

        for step in (stream.stop_stream, stream.close):
            try:
                step()
            except Exception as e:
                print(f"⚠️ Cleanup error: {e}")

        try:
            ws.send(b"")  # End signal
        except Exception as e:
            print(f"⚠️ Cleanup error: {e}")

    def start_transcription(self):
        """Run one live transcription session until interrupted or finished.

        Connects to ``websocket_url``, sends the configuration, opens the input
        device ``DEVICE_INDEX``, streams audio from a background thread and
        prints results as they arrive. Failures are reported with ``print``
        rather than raised. The PyAudio handle is terminated on every exit path.
        """
        print("🎤 Starting Real-time Malay/English Transcription")
        print("=" * 60)

        try:
            with connect(websocket_url, close_timeout=2) as ws:
                print("✓ Connected to Soniox")

                # Send simple configuration using auto-detect
                config = {
                    "api_key": api_key,
                    "audio_format": "auto",  # Let Soniox auto-detect WAV format
                    "model": "stt-rt-preview",
                    "language_hints": ["ms", "en"],  # Malay and English
                    "enable_dictation": True,
                    "enable_punctuation": True,
                }

                ws.send(json.dumps(config))
                print("✓ Configuration sent")

                # Wait for confirmation
                try:
                    response = ws.recv(timeout=5)
                    data = json.loads(response)
                    if data.get("error_code"):
                        print(f"❌ Error: {data['error_code']} - {data.get('error_message', '')}")
                        return
                    print("✓ Ready for transcription")
                except Exception as e:
                    print(f"❌ Setup error: {e}")
                    return

                # Start recording
                print("🎤 Opening microphone...")

                try:
                    stream = self.p.open(
                        format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        input_device_index=DEVICE_INDEX,
                        frames_per_buffer=CHUNK
                    )
                except Exception as e:
                    print(f"❌ Microphone error: {e}")
                    return

                print("✓ Microphone ready")

                # From here on the stream is open, so every exit path must go
                # through _shutdown.
                audio_thread = None
                try:
                    self.is_recording = True

                    # Send WAV header first
                    ws.send(self.create_wav_header(RATE, CHANNELS, 16))

                    print("\n🎯 LIVE TRANSCRIPTION ACTIVE")
                    print("Speak in Malay or English...")
                    print("Press Ctrl+C to stop")
                    print("=" * 60)

                    audio_thread = threading.Thread(
                        target=self._pump_audio, args=(stream, ws), daemon=True
                    )
                    audio_thread.start()

                    self._receive_results(ws)

                except KeyboardInterrupt:
                    print("\n⏹️ Stopping transcription...")
                except Exception as e:
                    print(f"❌ Transcription error: {e}")
                finally:
                    self._shutdown(stream, ws, audio_thread)

        except Exception as e:
            print(f"❌ Connection error: {e}")
        finally:
            self.p.terminate()

In [12]:
def test_microphone():
    """Test if we can access the microphone"""
    p = pyaudio.PyAudio()
    try:
        print("Testing microphone access...")
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            input_device_index=DEVICE_INDEX,
            frames_per_buffer=CHUNK
        )
        
        print("✓ Microphone accessible")
        
        # Test recording for 2 seconds
        print("Recording 2 second test...")
        for i in range(int(RATE / CHUNK * 2)):
            data = stream.read(CHUNK)
        
        print("✓ Recording test successful")
        stream.stop_stream()
        stream.close()
        
    except Exception as e:
        print(f"❌ Microphone test failed: {e}")
        
        # Show available devices
        print("\\nAvailable devices:")
        for i in range(p.get_device_count()):
            info = p.get_device_info_by_index(i)
            if info['maxInputChannels'] > 0:
                print(f"Device {i}: {info['name']}")
                
    finally:
        p.terminate()

# Run the microphone test first: it confirms that input device DEVICE_INDEX can
# be opened and read before a transcription session is started.
test_microphone()

Testing microphone access...
✓ Microphone accessible
Recording 2 second test...
✓ Recording test successful


In [13]:
# Print the start banner (blank line, rule, title, rule), then run one live
# session; start_transcription() blocks until the session ends.
print("\n" + "=" * 60, "STARTING REAL-TIME TRANSCRIPTION", "=" * 60, sep="\n")

transcriber = SimpleTranscriber()
transcriber.start_transcription()

🎯 LIVE MALAY/ENGLISH TRANSCRIPTION
🎤 Listening... (Ctrl+C to stop)
\n⏹️ Stopping transcription...


<h1>API-KAN this project (turn it into an API)</h1>

In [1]:
pip install fastapi uvicorn websockets pyaudio


Collecting fastapi
  Downloading fastapi-0.116.1-py3-none-any.whl (95 kB)
     ---------------------------------------- 95.6/95.6 kB 2.8 MB/s eta 0:00:00
Collecting uvicorn
  Downloading uvicorn-0.35.0-py3-none-any.whl (66 kB)
     ---------------------------------------- 66.4/66.4 kB 3.5 MB/s eta 0:00:00
Collecting starlette<0.48.0,>=0.40.0
  Downloading starlette-0.47.2-py3-none-any.whl (72 kB)
     ---------------------------------------- 73.0/73.0 kB 3.9 MB/s eta 0:00:00
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4
  Downloading pydantic-2.11.7-py3-none-any.whl (444 kB)
     -------------------------------------- 444.8/444.8 kB 3.1 MB/s eta 0:00:00
Collecting annotated-types>=0.6.0
  Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Collecting pydantic-core==2.33.2
  Downloading pydantic_core-2.33.2-cp311-cp311-win_amd64.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 5.2 MB/s eta 0:00:00
Collecting typing-inspection>=0.4.0



[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
