<a href="https://colab.research.google.com/github/inachenyx/SpeechSNN/blob/main/MFCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocessing - Feature scaling (MFCC feature extraction)
#### sklearn.preprocessing

In [None]:
from sklearn import preprocessing

def extract_features(audio,rate):
    """Return standardized MFCC features stacked with their delta features.

    Parameters
    ----------
    audio : array-like
        Audio signal passed straight to ``python_speech_features.mfcc``.
    rate : int
        Sampling rate of ``audio``.

    Returns
    -------
    numpy.ndarray
        The scaled MFCC matrix with the delta matrix appended column-wise
        (``numcep=20`` cepstra plus as many delta columns per frame).
    """
    # Imported locally so this cell runs on its own: nothing above it in the
    # notebook binds ``np`` or the python_speech_features helpers.
    import numpy as np
    from python_speech_features import delta, mfcc

    mfcc_feature = mfcc(audio, rate, winlen=0.020, preemph=0.95, numcep=20,
                        nfft=1024, ceplifter=15, highfreq=6000, nfilt=55,
                        appendEnergy=False)

    # Feature scaling: standardize every cepstral coefficient column.
    mfcc_feature = preprocessing.scale(mfcc_feature)

    # The original called an undefined ``calculate_delta`` helper; use the
    # library ``delta`` with N=2, as the later "Feature Extraction" cell does.
    # NOTE(review): confirm N=2 matches the intended delta computation.
    delta_feature = delta(mfcc_feature, 2)

    return np.hstack((mfcc_feature, delta_feature))

### Preprocessing - Stereo to mono format
#### pydub

In [None]:
from pydub import AudioSegment

# Load the stereo WAV and immediately down-mix it to a single (mono) channel.
mysound = AudioSegment.from_wav("stereo_infile.wav").set_channels(1)
# Write the mono audio back out as a WAV file.
mysound.export("mono_outfile.wav", format="wav")

### VAD Voice Activity Detection
#### PyTorch

In [None]:
import torch

# Load the Silero VAD model together with its audio helper utilities.
# force_reload=False asks torch.hub not to force a fresh download.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False)

# The helpers come back as a tuple and are unpacked by position.
# NOTE(review): the tuple contents depend on the silero-vad release that
# torch.hub resolves -- confirm the first entry is the timestamp detector.
(get_speech_ts_adaptive, save_audio, read_audio, VADIterator, collect_chunks) = utils

# Read the raw recording to analyse.
audio = read_audio('raw_voice.wav')


# Get the time chunks that contain voice. The waveform loaded above is
# ``audio``; the original passed an undefined name ``wav`` here.
speech_timestamps = get_speech_ts_adaptive(audio, model)


# Gather the voiced chunks and save them to a file.
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, audio))

### Noise Reduction
#### noisereduce, SciPy

In [None]:
import noisereduce as nr
from scipy.io import wavfile

# Read the noisy recording; the result is unpacked as (sample rate, samples).
rate, data = wavfile.read("voice_with_noise.wav")

# Run noisereduce on the samples, passing the sample rate they were read with.
reduced_noise = nr.reduce_noise(sr=rate, y=data)

### Feature Extraction
#### For example, Delta MFCC combined with regular MFCC
#### numpy, scikit-learn, python_speech_features

In [None]:
import numpy as np
from sklearn import preprocessing
from python_speech_features import mfcc, delta

def extract_features(audio,rate):
    """Extract scaled MFCC features and append their delta features.

    Computes 20 cepstral coefficients per frame (``numcep=20``), standardizes
    them with ``preprocessing.scale``, computes delta features with ``N=2``
    and stacks both column-wise, giving 40 values per frame.

    Parameters
    ----------
    audio : array-like
        Audio signal handed to ``python_speech_features.mfcc``.
    rate : int
        Sampling rate of ``audio``.

    Returns
    -------
    numpy.ndarray
        Scaled MFCC features with the delta features appended as columns.
    """
    # ``mfcc`` is imported as a function in this cell, so it is called
    # directly; ``mfcc.mfcc`` would raise AttributeError.
    mfcc_feature = mfcc(audio, rate, winlen=0.020, preemph=0.95, numcep=20,
                        nfft=1024, ceplifter=15, highfreq=6000, nfilt=55,
                        appendEnergy=False)

    # Feature scaling.
    mfcc_feature = preprocessing.scale(mfcc_feature)
    # Delta features computed from the scaled MFCCs.
    delta_feature = delta(mfcc_feature, 2)
    # Stack the delta features next to the base features.
    combined_features = np.hstack((mfcc_feature, delta_feature))
    return combined_features

### Train with Gaussian Mixture Model
#### sklearn.mixture

In [None]:
# Names this cell needs that no earlier cell brings into scope.
from scipy.io.wavfile import read
from sklearn.mixture import GaussianMixture as GMM

# Load the denoised, VAD-trimmed recording.
sample_rate, data = read('denoised_vad_voice.wav')

# Extract 40 dimensional MFCC & delta MFCC features from the samples that were
# just read (the original passed unrelated names ``audio`` and ``sr``).
features = extract_features(data, sample_rate)

# Diagonal-covariance mixture with 16 components, randomly initialised.
gmm = GMM(n_components=16, max_iter=200, covariance_type='diag', n_init=1, init_params='random')
gmm.fit(features)  # gmm training

### GMM+UBM Universal Background Model

In [None]:
# Train a universal background model (UBM) on `features` via bob.kaldi.
# The call passes the path 'ubm_vad.h5', 4 threads, 2048 Gaussians and
# 100 iterations.
# NOTE(review): `bob` is not imported anywhere in the visible notebook, and
# the role of the path argument (presumably where the model is stored) should
# be confirmed against the bob.kaldi documentation.
ubm = bob.kaldi.ubm_train(features, r'ubm_vad.h5', num_threads=4, num_gauss=2048, num_iters=100)


# Enroll a per-user GMM from `features`, using the UBM trained above.
user_model = bob.kaldi.ubm_enroll(features, ubm)


# Score `features` with the user's model and the UBM. This is the last
# expression of the cell and its value is not assigned to a name.
bob.kaldi.gmm_score(features, user_model, ubm)

### Deep Learning Models

In [None]:
# load model
def deep_model(model: str = 'speakernet', quantized: bool = False, **kwargs):
    """Signature and documentation for loading a Speaker2Vec model.

    NOTE: this definition contains only this docstring, so calling it does
    nothing and returns ``None``. The cell obtains the actual model from
    ``malaya_speech.speaker_vector.deep_model``.

    Parameters
    ----------
    model : str, optional (default='speakernet')
        Name of the model architecture. Documented values:

        * ``'vggvox-v1'`` - VGGVox V1, embedding size 1024
        * ``'vggvox-v2'`` - VGGVox V2, embedding size 512
        * ``'deep-speaker'`` - Deep Speaker, embedding size 512
        * ``'speakernet'`` - SpeakerNet, embedding size 7205

    quantized : bool, optional (default=False)
        When True, the 8-bit quantized model is requested. A quantized model
        isn't necessarily faster; that depends entirely on the machine.

    Returns
    -------
    result : malaya_speech.supervised.classification.load function
        (As documented for the library function; this stub returns ``None``.)
    """

# NOTE(review): `malaya_speech` and `Pipeline` are not imported anywhere in
# the visible notebook; they must be in scope before this cell runs.
model = malaya_speech.speaker_vector.deep_model('speakernet')

from glob import glob

# Recordings to compare (the second entry was misspelled 'vocie_1.wav').
speakers = ['voice_0.wav', 'voice_1.wav', 'voice_2.wav']

# pipeline
def load_wav(file):
    """Load `file` with malaya_speech and return the first item of the result."""
    return malaya_speech.load(file)[0]

# Load every file with `load_wav`, then feed the results to the speaker model.
p = Pipeline()
frame = p.foreach_map(load_wav).map(model)
r = p.emit(speakers)

# calculate similarity
from scipy.spatial.distance import cdist

# Pairwise cosine similarity between speaker vectors (1 - cosine distance).
1 - cdist(r['speaker-vector'], r['speaker-vector'], metric = 'cosine')

### Custom Transformer

In [None]:
import numpy as np
import warnings
from python_speech_features import mfcc, delta
from sklearn import preprocessing
from sklearn.utils.validation import check_is_fitted

warnings.filterwarnings('ignore')
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that holds MFCC extraction settings.

    NOTE(review): the ``class`` statement was missing from this cell, which
    left ``__init__`` at module level above indented methods, so the cell
    could not be parsed. The class name and base classes used here are
    inferred from the section heading and the imports in the preceding cell;
    confirm them against the intended design.
    """

    def __init__(self, winlen=0.020, preemph=0.95, numcep=20, nfft=1024,
                 ceplifter=15, highfreq=6000, nfilt=55, appendEnergy=False):
        """Store the MFCC settings unchanged under the same attribute names."""
        self.winlen = winlen
        self.preemph = preemph
        self.numcep = numcep
        self.nfft = nfft
        self.ceplifter = ceplifter
        self.highfreq = highfreq
        self.nfilt = nfilt
        self.appendEnergy = appendEnergy

    def transform(self, x):
        """Validate the input width and convert it with ``signal_to_mfcc``.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
            The input samples; must expose ``shape``.

        Returns
        -------
        The value returned by ``self.signal_to_mfcc(x)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called (``n_features_`` is missing).
        ValueError
            If ``x`` has a different number of columns than seen in ``fit``.
        """
        # Check that fit has been called.
        check_is_fitted(self, 'n_features_')

        # Check that the input has the same width as the data passed to fit.
        if x.shape[1] != self.n_features_:
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')
        # NOTE(review): ``signal_to_mfcc`` is not defined in the visible part
        # of this class; it must be provided for ``transform`` to work.
        return self.signal_to_mfcc(x)

    def fit(self, x, y=None):
        """Record the number of input columns; nothing else is learned.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
            The training input samples; must expose ``shape``.
        y : None
            Unused; accepted because the pipeline API passes it.

        Returns
        -------
        self : object
            Returns self.
        """
        self.n_features_ = x.shape[1]
        return self

### Custom classifier class