In [2]:
# Record live audio 
from pyaudio import PyAudio, paInt16
import wave
import os
import time
import sys
import soundfile as sf
import sounddevice as sd



# Capture a short mono clip from the default input device and store it as a WAV file
fs = 44100  # Sample rate in Hz
duration = 5  # Clip length in seconds
filename = "output.wav"

num_frames = int(duration * fs)
print(f"Recording for {duration} seconds...")
recording = sd.rec(num_frames, samplerate=fs, channels=1, dtype='int16')
sd.wait()  # Block here until the whole clip has been captured
sf.write(filename, recording, fs)
print(f"Recording saved to {filename}")

Recording for 5 seconds...
Recording saved to output.wav


In [7]:
!pip install --upgrade tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.40.0
    Uninstalling tqdm-4.40.0:
      Successfully uninstalled tqdm-4.40.0
Successfully installed tqdm-4.67.1


In [16]:
# Initialize the Whisper model
# The import lives in this cell so it also works on a fresh kernel; without it the
# cell only runs if some other cell happened to import faster_whisper first.
import faster_whisper

model = faster_whisper.WhisperModel("small", compute_type="int8")

In [None]:
import faster_whisper
import sounddevice as sd
import numpy as np

import queue  # stdlib; imported here so this cell is self-contained

WHISPER_SAMPLE_RATE = 16000  # faster-whisper expects 16 kHz mono float32 waveforms
audio_queue = queue.Queue()  # hands recorded chunks from the audio thread to this cell


# Define the audio stream callback
def audio_callback(indata, frames, time, status):
    """Receive one block from the input stream and queue it for transcription.

    The callback runs on the audio thread, so it only copies the samples out.
    Transcription is slow and happens in the main thread instead; doing it here
    would stall the stream and drop audio.

    Args:
        indata: Recorded block; only the first channel (column 0) is used.
        frames: Number of frames in the block (unused).
        time: Stream timing info supplied by sounddevice (unused).
        status: Stream status flags; printed when set (e.g. on overflow).
    """
    if status:
        print(f"Status: {status}")
    # Copy because the buffer behind `indata` is only valid during the callback
    audio_queue.put(indata[:, 0].copy())


def _to_whisper_audio(samples, source_rate):
    """Convert int16 samples to float32 in [-1, 1) resampled to WHISPER_SAMPLE_RATE."""
    audio = samples.astype(np.float32) / 32768.0  # Normalize int16 to float32
    if source_rate == WHISPER_SAMPLE_RATE or audio.size == 0:
        return audio
    # Plain linear interpolation (no anti-aliasing filter): good enough for a demo.
    # Recording directly at 16 kHz avoids this step if the input device supports it.
    n_out = int(round(audio.size * WHISPER_SAMPLE_RATE / source_rate))
    positions = np.linspace(0.0, audio.size - 1, num=n_out)
    return np.interp(positions, np.arange(audio.size), audio).astype(np.float32)


def _transcribe_chunk(samples):
    """Transcribe one recorded chunk and print the text of every segment."""
    audio_data = _to_whisper_audio(samples, fs)
    segments, _ = model.transcribe(audio_data, beam_size=5, without_timestamps=True)
    for segment in segments:
        print(f"Text: {segment.text}")


# Start the audio stream
chunk_duration = 1  # seconds
chunk_frames = int(chunk_duration * fs)
num_chunks = int(duration / chunk_duration)
print(f"Recording and transcribing in {chunk_duration}-second chunks for {duration} seconds...")
with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, dtype='int16', blocksize=chunk_frames):
    # Keep the stream open until `duration` seconds of audio have been processed
    for _ in range(num_chunks):
        try:
            chunk = audio_queue.get(timeout=chunk_duration * 5)
        except queue.Empty:
            print("No audio received from the input stream; stopping early.")
            break
        _transcribe_chunk(chunk)
print("Finished.")

Recording and transcribing in 1-second chunks for 5 seconds...
Text:  ვვ ვვ ვვ ვვ ვვ ვვ ვ ზ ი ვ ჵ ოვ ი მ ზე


KeyboardInterrupt: 

In [None]:
# Record audio, load faster-whisper (speech-to-text), and do text-to-speech with gTTS
import faster_whisper
import sounddevice as sd
import numpy as np
import torch
from faster_whisper import WhisperModel
import soundfile as sf
import time
import os
import tempfile
import shutil
from gtts import gTTS


# Record live audio 
from pyaudio import PyAudio, paInt16
import wave
import os
import time
import sys
import soundfile as sf
import sounddevice as sd



# Capture a short mono clip from the default input device and save it to disk
fs = 44100  # Sample rate in Hz
duration = 5  # Clip length in seconds
filename = "output.wav"

num_frames = int(duration * fs)
print(f"Recording for {duration} seconds...")
recording = sd.rec(num_frames, samplerate=fs, channels=1, dtype='int16')
sd.wait()  # Block until the whole clip has been captured
sf.write(filename, recording, fs)
print(f"Recording saved to {filename}")

# Load the speech-to-text model.
# faster-whisper only transcribes audio (speech-to-text); the speech synthesis
# further down is handled by a dedicated TTS library, gTTS (Google Text-to-Speech).
model = WhisperModel("small", compute_type="int8")


def text_to_speech(text, output_file="tts_output.mp3"):
    """Synthesize `text` with gTTS, write it to `output_file`, then play it back.

    Args:
        text: Text to speak; synthesized with the English ('en') voice.
        output_file: Path the generated audio is saved to.
    """
    gTTS(text=text, lang='en').save(output_file)
    print(f"Text-to-speech saved to {output_file}")

    # Read the saved file back and play it, waiting for playback to finish.
    # NOTE(review): decoding MP3 through soundfile presumably needs an MP3-capable
    # libsndfile build - confirm on the target machine.
    samples, rate = sf.read(output_file)
    sd.play(samples, rate)
    sd.wait()

# Example usage: synthesize a fixed sentence, save it to the default output file
# ("tts_output.mp3") and play it back.
sample_text = "This is a demonstration of text to speech conversion using Google's TTS service."
text_to_speech(sample_text)

# The same helper can also speak transcribed text,
# for example after transcribing with faster-whisper:
# text_to_speech(transcription_result)


In [None]:

import sounddevice as sd
import soundfile as sf
import sys
# Add the Torch-KWT directory to the module search path.
# NOTE(review): this is a hard-coded absolute path that looks Colab-specific -
# confirm it exists (and is still needed) in the environment this notebook runs in.
sys.path.append('/content/Torch-KWT')

SAMPLE_RATE = 16000  # sample rate in Hz used for recording and for the features built later
DURATION = 1  # seconds

def record_clip(filename):
    """Record a DURATION-second mono clip at SAMPLE_RATE and write it to `filename`."""
    num_frames = int(DURATION * SAMPLE_RATE)
    print(f"Recording {filename} for {DURATION}s…")
    clip = sd.rec(num_frames, samplerate=SAMPLE_RATE, channels=1)
    sd.wait()  # block until the whole clip has been captured
    sf.write(filename, clip, SAMPLE_RATE)

# Record five clips per class: the "yes" samples first, then the "not_yes" samples
for prefix in ("yes", "not_yes"):
    for clip_number in range(1, 6):
        record_clip(f"{prefix}_{clip_number}.wav")

Recording yes_1.wav for 1s…
Recording yes_2.wav for 1s…
Recording yes_3.wav for 1s…
Recording yes_4.wav for 1s…
Recording yes_5.wav for 1s…
Recording not_yes_1.wav for 1s…
Recording not_yes_2.wav for 1s…
Recording not_yes_3.wav for 1s…
Recording not_yes_4.wav for 1s…
Recording not_yes_5.wav for 1s…


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.transforms as T

class YesNoDataset(Dataset):
    """Audio files that share one class label, served as log-mel spectrograms.

    Each item is a ``(log_mel, label)`` pair. ``log_mel`` has shape (1, 40, 98):
    one channel, 40 mel bands, and 98 time frames after padding/cropping.

    Args:
        file_list: Paths of the audio files in this dataset.
        label: Integer class label returned with every item.
        transform: Optional callable applied to the finished log-mel tensor
            (e.g. augmentation); pass None to skip it.
    """

    def __init__(self, file_list, label, transform):
        self.files = file_list
        self.label = label
        self.transform = transform
        # Built once here instead of on every __getitem__ call
        self.mel_spectrogram = T.MelSpectrogram(
            sample_rate=SAMPLE_RATE, n_fft=1024, win_length=640,
            hop_length=160, n_mels=40
        )

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        wav, sr = torchaudio.load(self.files[idx])
        if wav.shape[0] > 1:
            # Downmix multi-channel audio so the output always has one channel
            wav = wav.mean(dim=0, keepdim=True)
        if sr != SAMPLE_RATE:
            wav = T.Resample(sr, SAMPLE_RATE)(wav)  # resample if needed
        spec = self.mel_spectrogram(wav)  # (1, 40, T)
        # Pad/crop to 98 frames. Padding is done before the log so the padded
        # frames become log(1e-6), i.e. silence, rather than 0 in the log domain.
        missing = 98 - spec.shape[-1]
        if missing > 0:
            spec = F.pad(spec, (0, missing))
        log_mel = torch.log(spec[:, :, :98] + 1e-6)
        if self.transform is not None:
            log_mel = self.transform(log_mel)
        return log_mel, torch.tensor(self.label)

# Assemble datasets: label 1 for the "yes" clips, label 0 for the "not_yes" clips
transform = None  # feature extraction already happens inside YesNoDataset
yes_files = [f"yes_{i}.wav" for i in range(1, 6)]
no_files = [f"not_yes_{i}.wav" for i in range(1, 6)]

labelled_file_groups = [(yes_files, 1), (no_files, 0)]
dataset = torch.utils.data.ConcatDataset(
    [YesNoDataset(group_files, class_label, transform)
     for group_files, class_label in labelled_file_groups]
)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [None]:
import torch
from kwt.models.kwt import kwt_from_name, KWT

# 1) Load the original 35-class model to grab its pretrained weights
base_model = kwt_from_name("kwt-1")  # default num_classes=35
ckpt = torch.load("kwt/kwt1_v01.pth", map_location="cpu")  # your downloaded weights
# Checkpoints are often saved as {"model_state_dict": ..., ...}; unwrap that case.
# With strict=False a wrapped checkpoint would otherwise load nothing without any error.
pretrained_state = ckpt.get("model_state_dict", ckpt) if isinstance(ckpt, dict) else ckpt
load_report = base_model.load_state_dict(pretrained_state, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print(f"Warning: checkpoint does not fully match kwt-1: {load_report}")

# 2) Create a new model for 2 classes
# NOTE(review): presumably mirrors the kwt-1 configuration apart from num_classes - verify
cfg = {
    "input_res": [40, 98], "patch_res": [40, 1], "num_classes": 2,
    "mlp_dim": 256, "dim": 64, "heads": 1, "depth": 12,
    "dropout": 0.0, "emb_dropout": 0.1, "pre_norm": False
}
model = KWT(**cfg)

# 3) Transfer matching weights (all but mlp_head, whose shape depends on num_classes)
state_dict = {
    key: value
    for key, value in base_model.state_dict().items()
    if not key.startswith("mlp_head")
}
# strict=False because the new head is intentionally missing from state_dict;
# the returned report (shown as the cell output) should list only mlp_head keys.
model.load_state_dict(state_dict, strict=False)

_IncompatibleKeys(missing_keys=['mlp_head.0.weight', 'mlp_head.0.bias', 'mlp_head.1.weight', 'mlp_head.1.bias'], unexpected_keys=[])

In [None]:
import torch.nn as nn

# Fine-tune on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).train()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 10
for epoch in range(EPOCHS):
    batch_losses = []
    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(features), targets)  # forward pass + loss
        loss.backward()                             # backprop
        optimizer.step()                            # update weights
        batch_losses.append(loss.item())
    # Mean loss over the batches of this epoch
    avg = sum(batch_losses) / len(loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg:.4f}")

# Persist only the weights (state dict), not the whole model object
torch.save(model.state_dict(), "yesno_trained_model.pth")

Epoch 1/10, Loss: 0.7088
Epoch 2/10, Loss: 0.6596
Epoch 3/10, Loss: 0.7274
Epoch 4/10, Loss: 0.7196
Epoch 5/10, Loss: 0.5524
Epoch 6/10, Loss: 0.6404
Epoch 7/10, Loss: 0.5123
Epoch 8/10, Loss: 0.5973
Epoch 9/10, Loss: 0.4313
Epoch 10/10, Loss: 0.3963


In [None]:
# Take the first item of the dataset wrapped by the DataLoader (index 0, no shuffling involved)
sample = loader.dataset[0]

In [None]:
# Unpack the item; despite its name, `audio` is the log-mel tensor returned by the dataset
audio, label = sample

In [None]:
# Display the class label of the selected sample
label

tensor(1)

In [None]:

# Run the trained model on the single sample and display its raw class scores (logits)
model.eval()  # switch off dropout for inference
with torch.no_grad():  # inference only: do not build an autograd graph
    logits = model(audio.unsqueeze(0).to(device))  # unsqueeze adds the batch dimension
logits

tensor([[-0.5854,  1.0904]], grad_fn=<AddmmBackward0>)

In [None]:
import pyttsx3
import platform
import time

def _find_voice_id(voices, hint):
    """Return the id of the first voice whose id or language tags contain `hint`, else None."""
    needle = hint.lower()
    for voice in voices or []:
        tags = [voice.id] + list(getattr(voice, 'languages', None) or [])
        for tag in tags:
            if isinstance(tag, bytes):  # some drivers report language tags as bytes
                tag = tag.decode('utf-8', errors='ignore')
            if needle in str(tag).lower():
                return voice.id
    return None


def speak_text(text, rate_scale=1.0, voice_hint='en-us'):
    """Speak `text` aloud with the offline pyttsx3 engine.

    Args:
        text: Text to speak.
        rate_scale: Multiplier for the engine's default speaking rate. 1.0 keeps
            the default; values below 1.0 slow the speech down.
        voice_hint: Case-insensitive substring matched against each installed
            voice's id and language tags. The first match is selected; if none
            matches, the engine's default voice is kept.

    Returns:
        True if the text was spoken, False if any step raised an exception
        (the error is printed, not re-raised).
    """
    try:
        # Initialize the TTS engine
        engine = pyttsx3.init()

        # Only touch the rate when a change was requested
        if rate_scale != 1.0:
            default_rate = engine.getProperty('rate')
            engine.setProperty('rate', int(default_rate * rate_scale))

        # Set volume
        engine.setProperty('volume', 1.0)

        # Choose a voice. The 'voice' property takes the id of an installed voice,
        # so look the id up instead of passing a bare language tag.
        voice_id = _find_voice_id(engine.getProperty('voices'), voice_hint)
        if voice_id is not None:
            engine.setProperty('voice', voice_id)

        # Speak the text
        engine.say(text)

        # Blocks until the queued speech has been processed
        engine.runAndWait()

        # Add a small delay to ensure processing is complete
        time.sleep(0.5)

        # Explicitly stop the engine
        engine.stop()

        return True
    except Exception as e:
        print(f"Error in text-to-speech: {e}")
        return False

# Speak `text`, reusing the value from an earlier cell when one exists.
# On a fresh kernel no cell defines `text`, so fall back to the demo sentence
# instead of failing with a NameError.
try:
    text
except NameError:
    text = "Hello, this is an offline text-to-speech demonstration using pyttsx3."

print(f"Speaking: {text}")
speak_result = speak_text(text)
print(f"Speech completed successfully: {speak_result}")


Speaking: Hello, this is an offline text-to-speech demonstration using pyttsx3.
Voice 0: Afrikaans - ['af']
Voice 1: Amharic - ['am']
Voice 2: Aragonese - ['an']
Voice 3: Arabic - ['ar']
Voice 4: Assamese - ['as']
Voice 5: Azerbaijani - ['az']
Voice 6: Bashkir - ['ba']
Voice 7: Bulgarian - ['bg']
Voice 8: Bengali - ['bn']
Voice 9: Bishnupriya Manipuri - ['bpy']
Voice 10: Bosnian - ['bs']
Voice 11: Catalan - ['ca']
Voice 12: Chinese (Mandarin) - ['cmn']
Voice 13: Czech - ['cs']
Voice 14: Welsh - ['cy']
Voice 15: Danish - ['da']
Voice 16: German - ['de']
Voice 17: Greek - ['el']
Voice 18: English (Caribbean) - ['en-029']
Voice 19: English (Great Britain) - ['en-gb']
Voice 20: English (Scotland) - ['en-gb-scotland']
Voice 21: English (Lancaster) - ['en-gb-x-gbclan']
Voice 22: English (West Midlands) - ['en-gb-x-gbcwmd']
Voice 23: English (Received Pronunciation) - ['en-gb-x-rp']
Voice 24: English (America) - ['en-us']
Voice 25: Esperanto - ['eo']
Voice 26: Spanish (Spain) - ['es']
Voice 2

In [2]:
import cv2
import time 
import os

# Capture one frame from the default webcam (device 0)
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open camera 0")
try:
    # Give camera time to adjust to lighting conditions
    print("Allowing camera to adjust...")
    for _ in range(10):  # Capture and discard frames to let camera adjust
        cap.read()
        time.sleep(0.1)  # Short delay between frames

    # Now capture the actual frame we want to use
    ret, frame = cap.read()
finally:
    cap.release()  # always free the camera, even if a read fails

# Check that a frame was actually captured
if not ret or frame is None:
    raise RuntimeError("Failed to capture a frame from the camera")

# cv2.imwrite does not create directories and only signals failure via its return value
os.makedirs("data", exist_ok=True)
if not cv2.imwrite("data/user_image.jpg", frame):
    raise IOError("Could not write data/user_image.jpg")

# Work on a copy so `frame` keeps the unblurred capture
image = frame.copy()
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, minNeighbors=3)
print(f"Detected {len(faces)} face(s) for blurring")

# Replace every detected face rectangle with a heavily blurred version of itself
for (x, y, w, h) in faces:
    face_region = image[y:y+h, x:x+w]
    blurred_face = cv2.GaussianBlur(face_region, (99, 99), 30)
    image[y:y+h, x:x+w] = blurred_face

# Save the image with blurred faces
if not cv2.imwrite('data/user_image_blurred.png', image):
    raise IOError("Could not write data/user_image_blurred.png")
print("Saved image with blurred faces as 'data/user_image_blurred.png'")

Allowing camera to adjust...
Detected 3 face(s) for blurring
Saved image with blurred faces as 'blurred_image.png'
