### Visualizing The Audio File Spectrum

In [None]:
from scipy.fft import rfft, rfftfreq
import librosa
from matplotlib import pyplot as plt
import numpy as np
# Absolute path of the utterance analysed throughout this notebook.
filepath = 'C:\\Users\\kipli\\Cedric\\speech_processing\\paper review\\SI681.WAV.wav'
# Load the audio and resample it to 8000 Hz; `sr` is the resulting sample rate.
data, sr = librosa.load(filepath, sr=8000, res_type='fft')
# Number of samples per analysis frame (also the FFT length of each frame).
nfft = 256
# Step between consecutive frame starts: half a frame, kept as an integer
# because it is used as a range() step when splitting the signal.
overlap = nfft // 2
# First analysis frame, sized from nfft so the two can never drift apart.
frame_1 = data[:nfft]

# Reference formant frequencies in Hz, one row per vowel and one column per
# formant (first, second, third).
formants = np.array([[250, 2250, 2890],[400, 1920, 2560],[550, 1770, 2490],[690, 1660, 2490],
                     [710, 1100, 2540],[590, 880, 2540],[450, 1030, 2380],[310, 870, 2250]])

In [None]:
def plot_fft(audio_data):
    """Plot the peak-normalized magnitude spectrum of ``audio_data``.

    The real FFT of the samples is taken, its magnitude is divided by the
    largest magnitude, and the result is drawn as a bar chart against
    frequency in Hz. The frequency axis is derived from the module-level
    sample rate ``sr``.

    Args:
        audio_data: One-dimensional sequence of audio samples.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    magnitude = np.abs(rfft(audio_data))
    freqs = rfftfreq(len(audio_data), 1/sr)

    # An all-zero frame has no peak to normalize by; dividing would fill the
    # plot with NaN, so the (all-zero) magnitudes are plotted unchanged.
    peak = np.max(magnitude)
    heights = magnitude / peak if peak > 0 else magnitude

    plt.figure(figsize=(7,2))
    # Third positional argument is the bar width, in x-axis units (Hz).
    plt.bar(freqs, heights, 125)
    plt.ylabel("")
    plt.xlabel("Frequency (Hz)")
    plt.show()

In [None]:
# Plot the normalized magnitude spectrum of the first frame.
plot_fft(frame_1)

In [None]:
import IPython
# Wrap the loaded samples in an IPython Audio object at sample rate `sr`;
# as the last expression of a notebook cell it is rendered by the notebook.
IPython.display.Audio(data, rate=sr)

### Neighbour Search

In [None]:
def neighbour_peak_search(frame, sample_rate=None, formant_table=None, search_radius=4):
    """Locate the strongest spectral bin near each reference formant.

    For every reference formant frequency, the FFT bin closest to that
    frequency is found, and the bin with the largest magnitude within
    ``search_radius`` bins on either side of it is reported.

    Peaks are chosen on the magnitude spectrum. Complex FFT coefficients have
    no meaningful ordering (NumPy compares them by real part first), so taking
    the maximum of the raw coefficients does not pick the strongest bin.

    Args:
        frame: One-dimensional sequence of audio samples.
        sample_rate: Sample rate in Hz used to build the frequency axis.
            Defaults to the module-level ``sr``.
        formant_table: Iterable of rows, each row holding the reference
            formant frequencies (Hz) of one vowel. Defaults to the
            module-level ``formants``.
        search_radius: Number of bins searched on each side of the nearest
            bin. The window is symmetric and is clipped at bin 0.

    Returns:
        A list with one entry per row of ``formant_table``; each entry is a
        list of integer FFT bin indices, one per formant in that row.
    """
    if sample_rate is None:
        sample_rate = sr
    if formant_table is None:
        formant_table = formants

    magnitudes = np.abs(rfft(frame))
    frequencies = rfftfreq(len(frame), 1/sample_rate)

    v_peaks = []
    for vowel_formants in formant_table:
        fmt_peaks = []
        for freq in vowel_formants:
            # FFT bin whose centre frequency is closest to the reference formant.
            base_index = int(np.argmin(np.abs(frequencies - freq)))
            # Clip the lower edge: a negative slice start would wrap around
            # and produce an empty window. The upper edge is clipped by
            # slicing itself.
            lo = max(base_index - search_radius, 0)
            hi = base_index + search_radius + 1
            # argmax inside the window keeps the result local even when the
            # same magnitude also occurs elsewhere in the spectrum.
            fmt_peaks.append(lo + int(np.argmax(magnitudes[lo:hi])))
        v_peaks.append(fmt_peaks)

    return v_peaks

In [None]:
# FFT bin indices of the spectral peaks found near each reference formant
# in the first frame (one list per vowel).
neighbour_peak_search(frame_1)

### Dividing the audio file into frames

In [None]:
def frame_splits(data, framesize, overlap):
    """Split ``data`` into fixed-length frames taken at a regular step.

    Only complete frames are returned. A trailing partial frame is dropped,
    but a final frame that ends exactly at the end of ``data`` is kept.

    Args:
        data: Sliceable sequence of samples.
        framesize: Number of samples in each frame.
        overlap: Step, in samples, between the starts of consecutive frames.
            It is truncated to an integer. With a step of half the frame
            size this equals the number of overlapping samples.

    Returns:
        A list of slices of ``data``, each ``framesize`` samples long.

    Raises:
        ValueError: If the step truncates to less than one sample.
    """
    hop = int(overlap)
    if hop < 1:
        raise ValueError("overlap must be at least 1 sample")

    # Last index at which a complete frame still fits; the +1 makes it an
    # inclusive bound for range().
    last_start = len(data) - framesize
    return [data[start:start + framesize] for start in range(0, last_start + 1, hop)]

In [None]:
# total number of frames in the audio file
no_of_frames = len(frame_splits(data, nfft, overlap))
print(f"Number of frames: {no_of_frames}")
# number of samples per frame
no_of_samples = len(frame_splits(data, nfft, overlap)[0])
print(f"Number of samples per frame: {no_of_samples}")

### Peak Neighbour Difference Algorithm

In [None]:
def peak_neighbour_difference(audio_data):
    """Compute the peak-neighbour difference (PND) for every frame.

    The signal is split into frames with the module-level ``nfft`` and
    ``overlap`` settings. For each frame, the spectral peak bins returned by
    ``neighbour_peak_search`` are compared with their two adjacent bins:
    ``abs(X[k] - X[k + 1] - X[k - 1])``, where ``X`` is the frame's real FFT.

    Args:
        audio_data: One-dimensional sequence of audio samples.

    Returns:
        A nested list indexed as ``[frame][vowel][formant]`` holding the PND
        value of each peak.
    """
    section_vowel_PND = []  # PND results for every frame

    for frame in frame_splits(audio_data, nfft, overlap):
        spectrum = rfft(frame)

        section_formant_PND = []
        for vowel_peak in neighbour_peak_search(frame):
            peak_bins = np.asarray(vowel_peak, dtype=int)
            # NOTE(review): the subtraction is done on complex FFT
            # coefficients and only the final result is passed to np.abs;
            # confirm whether the magnitude spectrum was intended instead.
            # NOTE(review): a peak at bin 0 reads its "previous" neighbour
            # from the last bin (index -1), and a peak at the last bin raises
            # IndexError.
            pnd_val = np.abs(
                spectrum[peak_bins] - spectrum[peak_bins + 1] - spectrum[peak_bins - 1]
            )
            section_formant_PND.append(list(pnd_val))

        section_vowel_PND.append(section_formant_PND)

    return section_vowel_PND

In [None]:
# PND values for every frame of the loaded audio.
pnd = peak_neighbour_difference(data)

In [None]:
# Inspect the first frame's PND values (one list per vowel, one value per formant).
pnd[0]

### Weighted PND
Weights are applied to the computed PND values to emphasize certain formants. The reviewed paper uses the following weights:
1. w<sub>1</sub> = 2.5 for the first formant
2. w<sub>2</sub> = 1 for the second formant
3. w<sub>3</sub> = 1 for the third formant

In [None]:
vowels = ['vowel1', 'vowel2', 'vowel3', 'vowel4', 'vowel5', 'vowel6', 'vowel7', 'vowel8']
def weighted_formant_pnd(audio_data):
    """Return the highest weighted PND score, with its vowel label, per frame.

    Each vowel's three formant PND values are multiplied by the weights
    2.5, 1.0 and 1.0 and summed. The scores are paired with the labels in
    ``vowels`` and the largest ``(score, label)`` tuple of each frame is kept.

    Args:
        audio_data: One-dimensional sequence of audio samples.

    Returns:
        A list with one ``(score, label)`` tuple per frame.
    """
    formant_weights = np.array([2.5, 1.0, 1.0])

    best_per_frame = []
    for frame_pnd in peak_neighbour_difference(audio_data):
        scores = []
        for formant_pnd in frame_pnd:
            # Element-wise weighting followed by a plain sum over the formants.
            scores.append(sum(formant_pnd*formant_weights))
        # Tuples compare by score first, so max() selects the top-scoring vowel.
        best_per_frame.append(max(zip(scores, vowels)))

    return best_per_frame

In [None]:
# Best (weighted PND score, vowel label) pair for every frame of the audio.
weighted = weighted_formant_pnd(data)
# Show the pairs of the first 20 frames.
weighted[:20]

### Computing the Threshold Value

In [None]:
# Multiplier applied to the mean weighted PND score when deriving the threshold.
alpha = 1.7
# NOTE(review): beta is not referenced by any code visible in this notebook.
beta = 7000

def threshold(pairs=None, n_frames=20, scale=None):
    """Compute the detection threshold from the leading frames.

    The threshold is the mean weighted PND score of the first ``n_frames``
    entries, multiplied by ``scale``. The mean is taken over the number of
    entries actually available, so a recording shorter than ``n_frames``
    frames is not under-weighted.

    Args:
        pairs: Sequence of ``(score, label)`` tuples, one per frame.
            Defaults to the module-level ``weighted``.
        n_frames: Number of leading frames to average over.
        scale: Multiplier applied to the mean. Defaults to the module-level
            ``alpha``.

    Returns:
        The threshold value, or ``0.0`` when there are no frames to average.
    """
    if pairs is None:
        pairs = weighted
    if scale is None:
        scale = alpha

    leading = pairs[:n_frames]
    count = len(leading)
    if count == 0:
        return 0.0

    total = sum(pair[0] for pair in leading)
    return total / count * scale

In [None]:
# Threshold used below to decide which frames are kept as speech.
thres = threshold()
thres

### Speech Data Extraction

In [None]:
def extract_speech():
    """Return the frames whose weighted PND score reaches the threshold.

    The per-frame ``(score, label)`` pairs are recomputed from the
    module-level ``data``. A pair is kept when its score is greater than or
    equal to the module-level ``thres``.

    Returns:
        A list of the ``(score, label)`` pairs that passed, in frame order.
    """
    return [
        candidate
        for candidate in weighted_formant_pnd(data)
        if candidate[0] >= thres
    ]
        

In [None]:
# (score, label) pairs of the frames whose score is at or above the threshold.
speech = extract_speech()
speech

In [None]:
# Number of frames that passed the threshold.
len(speech)