<a href="https://colab.research.google.com/github/inachenyx/SpeechSNN/blob/main/MFCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocessing - Feature scaling (MFCC feature extraction)
#### sklearn.preprocessing

In [None]:
from sklearn import preprocessing

def extract_features(audio,rate):
    """Return scaled MFCC features stacked side by side with their deltas.

    Twenty cepstral coefficients are requested per frame (``numcep=20``),
    passed through ``preprocessing.scale``, and then horizontally
    concatenated with the output of ``calculate_delta``.

    NOTE(review): ``mfcc``, ``np`` and ``calculate_delta`` are not defined in
    this cell. ``mfcc`` is presumably ``python_speech_features`` imported
    under that alias, and ``calculate_delta`` a helper defined elsewhere --
    confirm before running.
    """
    scaled_cepstra = preprocessing.scale(
        mfcc.mfcc(
            audio,
            rate,
            winlen=0.020,
            preemph=0.95,
            numcep=20,
            nfft=1024,
            ceplifter=15,
            highfreq=6000,
            nfilt=55,
            appendEnergy=False,
        )
    )
    # Deltas are derived from the already-scaled coefficients.
    return np.hstack((scaled_cepstra, calculate_delta(scaled_cepstra)))

### Preprocessing - Stereo to mono format
#### pydub

In [None]:
from pydub import AudioSegment

# Load the stereo recording and downmix it to a single (mono) channel.
mysound = AudioSegment.from_wav("stereo_infile.wav").set_channels(1)
# Write the mono audio to a new WAV file.
mysound.export("mono_outfile.wav", format="wav")

### Voice Activity Detection (VAD)
#### PyTorch

In [None]:
import torch

# Load the Silero VAD model and its helper utilities through torch.hub
# (force_reload=False reuses a previously downloaded copy when available).
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False)

# The helpers are unpacked by position, so the local names are only labels.
# NOTE(review): `get_speech_ts_adaptive` is the name used by older silero-vad
# releases; confirm the order and length of `utils` against the installed
# version.
(get_speech_ts_adaptive, save_audio, read_audio, VADIterator, collect_chunks) = utils

# Read the raw recording that will be scanned for speech.
audio = read_audio('raw_voice.wav')


# Get the time chunks that contain voice. The detector must be given the
# waveform loaded above (`audio`); no variable named `wav` exists in this cell.
speech_timestamps = get_speech_ts_adaptive(audio, model)


# Gather the detected speech chunks from the same waveform and save them
# to a file.
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, audio))

### Noise Reduction
#### noisereduce, SciPy

In [None]:
import noisereduce as nr
from scipy.io import wavfile

# Read the noisy recording; scipy.io.wavfile.read returns the sample rate
# followed by the sample data.
rate, data = wavfile.read("voice_with_noise.wav")

# Run noisereduce on the samples at their native sample rate. The denoised
# signal is only kept in `reduced_noise`; this cell does not write it to disk.
reduced_noise = nr.reduce_noise(y=data, sr=rate)

### Feature Extraction
#### Example: delta MFCC combined with regular MFCC (NumPy, scikit-learn, python_speech_features)

In [None]:
import numpy as np
from sklearn import preprocessing
from python_speech_features import mfcc, delta

def extract_features(audio, rate):
    """Extract a 40-dimensional MFCC + delta feature matrix from a signal.

    Twenty MFCCs are computed per frame, each coefficient is standardized
    across frames with ``preprocessing.scale`` (mean and variance
    normalization, not mean subtraction alone), and the delta features of
    the scaled coefficients are appended column-wise.

    Args:
        audio: Audio samples to analyse (the signal itself, not a file path).
        rate: Sample rate of ``audio`` in Hz. Because ``highfreq=6000`` is
            requested, the rate is expected to be at least 12000 Hz.

    Returns:
        A 2-D array with one row per frame and 40 columns: the 20 scaled
        MFCCs followed by their 20 delta coefficients.
    """
    # `mfcc` is imported directly as a function
    # (`from python_speech_features import mfcc, delta`), so it is called
    # as `mfcc(...)`; `mfcc.mfcc(...)` would raise AttributeError.
    mfcc_feature = mfcc(
        audio,
        rate,
        winlen=0.020,
        preemph=0.95,
        numcep=20,
        nfft=1024,
        ceplifter=15,
        highfreq=6000,
        nfilt=55,
        appendEnergy=False,
    )

    # Feature scaling: standardize every coefficient across frames.
    mfcc_feature = preprocessing.scale(mfcc_feature)
    # Delta features computed over a window of 2 frames on either side.
    delta_feature = delta(mfcc_feature, 2)
    # Stack the delta features next to the static features.
    combined_features = np.hstack((mfcc_feature, delta_feature))
    return combined_features

### Train with Gaussian Mixture Model
#### sklearn.mixture

In [None]:
# Names this cell relies on: a WAV reader and scikit-learn's Gaussian mixture
# model, bound to the short alias `GMM` used below.
from scipy.io.wavfile import read
from sklearn.mixture import GaussianMixture as GMM

# Load the denoised, speech-only recording.
sample_rate, data = read('denoised_vad_voice.wav')

# Extract 40-dimensional MFCC & delta MFCC features from the samples that
# were just read, using their own sample rate.
features = extract_features(data, sample_rate)

# 16 diagonal-covariance components, randomly initialized, at most 200 EM
# iterations, a single initialization run.
gmm = GMM(n_components=16,max_iter=200,covariance_type='diag',n_init=1, init_params='random')
gmm.fit(features)  # gmm training