## Speech enhancement

In [2]:
import numpy as np
# Sanity check: show which NumPy version this kernel has loaded
print(np.__version__)

1.24.1


In [5]:
import numpy as np
import noisereduce as nr
import librosa
import soundfile as sf

def enhance_speech(audio_file_path, output_file_path):
    """Denoise an audio file and write the cleaned signal to disk.

    Args:
        audio_file_path: Path of the recording to read.
        output_file_path: Destination path for the noise-reduced audio.

    The recording is loaded with ``sr=None``, run through
    ``noisereduce.reduce_noise`` with its default settings, and written out
    with ``soundfile`` using the sample rate returned by the loader.
    Nothing is returned.
    """
    # Load the samples together with the sample rate reported by librosa
    samples, sample_rate = librosa.load(audio_file_path, sr=None)

    # Noise reduction on the loaded samples
    denoised = nr.reduce_noise(y=samples, sr=sample_rate)

    # Written via soundfile rather than librosa.output.write_wav
    sf.write(output_file_path, denoised, sample_rate)

# Example: denoise the recorded clip and write the result into the same folder
enhance_speech(
    audio_file_path='../data/recorded_audio.wav',
    output_file_path='../data/recorded_audio_nr.wav',
)

In [17]:
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import io

def is_speech_present(audio_file_path, silence_thresh=-50, min_silence_len=500):
    """Report whether an audio file contains speech that Google can recognise.

    Args:
        audio_file_path: Path of the audio file to inspect.
        silence_thresh: Silence threshold, passed unchanged to pydub's
            ``detect_nonsilent``.
        min_silence_len: Minimum silence length, passed unchanged to pydub's
            ``detect_nonsilent``.

    Returns:
        True as soon as one non-silent chunk is successfully transcribed by
        ``recognize_google``. False when there are no non-silent chunks, when
        no chunk can be transcribed, when the recognition request fails, or
        when any other exception occurs (the error is printed, not raised).
    """
    try:
        recording = AudioSegment.from_file(audio_file_path)

        # Each entry is a (start, end) pair used below to slice the recording
        voiced_ranges = detect_nonsilent(
            recording,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
        )
        if not voiced_ranges:
            return False

        recognizer = sr.Recognizer()
        for begin, finish in voiced_ranges:
            # Render the chunk as WAV into memory so no temp file is needed
            wav_buffer = io.BytesIO()
            recording[begin:finish].export(wav_buffer, format="wav")
            wav_buffer.seek(0)

            with sr.AudioFile(wav_buffer) as source:
                captured = recognizer.record(source)
                try:
                    recognizer.recognize_google(captured)
                except sr.UnknownValueError:
                    # Nothing intelligible in this chunk; try the next one
                    continue
                except sr.RequestError as e:
                    print(f"Could not request results from Google Speech Recognition service; {e}")
                    return False
                else:
                    # A transcription came back, so speech is present
                    return True

        return False

    except Exception as e:
        # Best-effort check: any failure is reported and treated as "no speech"
        print(f"Error in checking speech presence: {e}")
        return False


# Example usage: run the detector on the noise.wav sample and report the verdict
audio_file = "../data/noise.wav"
speech_present = is_speech_present(audio_file)
print("Speech signal detected." if speech_present else "No speech signal detected.")


No speech signal detected.


In [43]:
from df import enhance, init_df
import librosa
import torchaudio
import torchaudio.transforms as T
import torchaudio.functional as F

def add_gain(waveform, gain_db):
    """Scale a waveform by a gain expressed in decibels.

    Args:
        waveform: Signal to scale; any value that supports multiplication
            by a scalar.
        gain_db: Gain in dB, converted to a linear factor as
            ``10 ** (gain_db / 20)``.

    Returns:
        ``waveform`` multiplied by the linear gain factor.
    """
    linear_factor = pow(10, gain_db / 20)
    return waveform * linear_factor

def apply_filters(waveform, sample_rate, highpass_cutoff=80, lowpass_cutoff=8000):
    """Band-limit a waveform with a high-pass followed by a low-pass biquad.

    Args:
        waveform: Audio tensor accepted by torchaudio's biquad filters.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        highpass_cutoff: High-pass cutoff frequency in Hz. Defaults to 80,
            the value that was previously hard-coded.
        lowpass_cutoff: Low-pass cutoff frequency in Hz. Defaults to 8000,
            the value that was previously hard-coded.

    Returns:
        The filtered waveform.

    The low-pass stage is skipped when ``lowpass_cutoff`` is at or above the
    Nyquist frequency (``sample_rate / 2``): the signal holds no content
    above Nyquist, and a biquad designed at or beyond it is degenerate
    (for example an 8 kHz cutoff on 16 kHz audio).
    """
    filtered = F.highpass_biquad(waveform, sample_rate, cutoff_freq=highpass_cutoff)

    nyquist = sample_rate / 2
    if lowpass_cutoff < nyquist:
        filtered = F.lowpass_biquad(filtered, sample_rate, cutoff_freq=lowpass_cutoff)
    return filtered

# Load the default DeepFilterNet model and its processing state
mode_df, df_state, _ = init_df()

gain_db = 10
in_file_path = "../data/kaegan.wav"
waveform, sample_rate = torchaudio.load(in_file_path)

# DeepFilterNet expects audio at the model's own sample rate (df_state.sr()).
# torchaudio.load returns the file's native rate, so resample when they differ.
model_sr = df_state.sr()
needs_resample = sample_rate != model_sr
model_input = F.resample(waveform, sample_rate, model_sr) if needs_resample else waveform

enhanced_model_rate = enhance(mode_df, df_state, model_input)

# Go back to the file's original rate so filtering and the saved file use
# the same sample rate as the input recording.
if needs_resample:
    enhanced_audio = F.resample(enhanced_model_rate, model_sr, sample_rate)
else:
    enhanced_audio = enhanced_model_rate

filtered_waveform = apply_filters(enhanced_audio, sample_rate)
# Optional level boost before saving: add_gain(filtered_waveform, gain_db)
torchaudio.save('../data/kaegan_df.wav', filtered_waveform, sample_rate)

[32m2024-06-28 16:10:59[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2024-06-28 16:10:59[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /home/keagan/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2024-06-28 16:10:59[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2024-06-28 16:10:59[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /home/keagan/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2024-06-28 16:10:59[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2024-06-28 16:10:59[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


## Speech to text

In [2]:
import whisper

# Load the "large-v3" Whisper checkpoint used for the transcription cells below
model = whisper.load_model("large-v3")

In [3]:
# Transcribe the original kaegan.wav recording (not the kaegan_df.wav output)
result = model.transcribe("../data/kaegan.wav")

In [4]:
# Show only the transcript string from the result dict
print(result["text"])

 Hello this is a test for audio customer acquisition on Myntra So my name is Keegan William Denise Can you please use this message as a test


In [5]:
# Inspect the full result dict: overall text plus per-segment timing, tokens and scores
result

{'text': ' Hello this is a test for audio customer acquisition on Myntra So my name is Keegan William Denise Can you please use this message as a test',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 7.0,
   'text': ' Hello this is a test for audio customer acquisition on Myntra',
   'tokens': [50365,
    2425,
    341,
    307,
    257,
    1500,
    337,
    6278,
    5474,
    21668,
    322,
    1222,
    580,
    424,
    50715],
   'temperature': 0.0,
   'avg_logprob': -0.3989851152574694,
   'compression_ratio': 1.2636363636363637,
   'no_speech_prob': 0.012981951236724854},
  {'id': 1,
   'seek': 0,
   'start': 7.0,
   'end': 10.0,
   'text': ' So my name is Keegan William Denise',
   'tokens': [50715, 407, 452, 1315, 307, 3189, 43118, 6740, 38133, 50865],
   'temperature': 0.0,
   'avg_logprob': -0.3989851152574694,
   'compression_ratio': 1.2636363636363637,
   'no_speech_prob': 0.012981951236724854},
  {'id': 2,
   'seek': 0,
   'start': 10.0,
   'end': 15.

In [None]:
#  Hello, this is Kigal, we need to use the text for external acquisition for the trap. Please use this on the equipment.
# Hello this is Ken from the RG 2014 M<|ml|> Symbols Of Trap
#  Hello, this is Kenan. We're going to use the text for customer information for Kinect. Please use this for the peer-to-peer chat.
