# Оценяване на резонансните честоти (форманти) от речеви сигнали

#### Import required libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.io import wavfile
import librosa
import matplotlib.pyplot as plt

#### Extracting Formants from a Speech Signal

##### **Step 1. Load the Speech Signal**
Load the .wav file using `scipy.io.wavfile.read()`. If the signal is stereo, it is converted to mono by keeping only the first channel. <br>
***Tip:*** To open a WAV file you need to access its directory correctly: `speech_samples/FILE_NAME.wav`

##### **Step 2. Define Frame Parameters**
Define the parameters for processing the audio signal, including the `frame_length` (2048 samples) and `hop_length` (512 samples). These parameters dictate how the audio is segmented for analysis.

##### **Step 3. Extract Formants**
For each frame of the audio signal, perform the following:
- Compute the Fast Fourier Transform (FFT) to analyze the frequency spectrum of the frame.
- Identify the indices of the three highest peaks in the frequency spectrum, which correspond to the first three formant frequencies (F1, F2, F3).

##### **Step 4. Store Results**
Store the extracted formant frequencies along with their corresponding time (in milliseconds) in a list. After processing all frames, convert this list into a Pandas DataFrame for easier handling and analysis.
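##### **Step 5. Extract Formants from Polynomial Roots (alternative approach)**
In a linear-prediction (LPC) style analysis, the roots of the prediction polynomial are analyzed to determine the formants. Only the roots with positive imaginary parts are considered, as they represent the formants in the complex plane. Note that the code cell below estimates the formants from FFT peaks (Step 3) and does not perform this root-finding step.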

##### **Step 6. Plot Formants**
Visualize the extracted formants over time using `matplotlib`. Create line plots for each of the first three formants (F1, F2, F3) against time, allowing for a clear view of how these frequencies change throughout the speech signal.

In [None]:
# Load the audio file; wavfile.read yields the sample rate and the samples
file_path = 'speech_samples/speech_test_01.wav'  # Replace with your .wav file path
sr, y = wavfile.read(file_path)

# If stereo (more than one dimension), keep only the first channel
if y.ndim > 1:
    y = y[:, 0]

# Frame parameters
frame_length = 2048  # samples per analysis frame
hop_length = 512     # samples between the starts of consecutive frames

# Every analysed frame has exactly frame_length samples, so the FFT bin
# frequencies are the same for all frames and are computed only once.
freqs = np.fft.rfftfreq(frame_length, 1 / sr)

# Rows of [time in ms, F1, F2, F3], one per frame
formants = []

# Extract formants. The range stops early enough that only full frames are
# visited; a trailing partial frame is skipped.
for i in range(0, len(y) - frame_length + 1, hop_length):
    frame = y[i:i + frame_length]
    # Magnitude spectrum of the frame
    spectrum = np.abs(np.fft.rfft(frame))

    # The three largest-magnitude bins serve as crude formant estimates.
    # argsort orders them by magnitude, so the bin indices are sorted again
    # to label them by ascending frequency (F1 <= F2 <= F3).
    peaks = np.sort(np.argsort(spectrum)[-3:])
    formants.append([i / sr * 1000, *freqs[peaks]])

# Create a DataFrame to store results
formants_df = pd.DataFrame(formants, columns=['Time (ms)', 'F1', 'F2', 'F3'])

# Visualization of formants using matplotlib: one line per formant track
plt.figure(figsize=(12, 6))
for column, color in (('F1', 'blue'), ('F2', 'green'), ('F3', 'red')):
    plt.plot(formants_df['Time (ms)'], formants_df[column], label=column, color=color, linewidth=2)

plt.title('Formants Over Time')
plt.xlabel('Time (ms)')
plt.ylabel('Frequency (Hz)')
plt.legend()
plt.grid()
plt.show()


#### Comparing Formant Frequencies of Vowel Sounds

##### **Step 1. Load the Speech Signal**
Load the .mp3 file containing multiple vowels using librosa.load(). This function reads the audio file and returns two outputs: the audio time series (as a NumPy array) and the sample rate. <br>
***Tip:*** To open the audio file you need to reference its path correctly: `speech_samples/FILE_NAME.mp3`

##### **Step 2. Define Frame Parameters**
Define the parameters for processing the audio signal, including the `frame_length` (2048 samples) and `hop_length` (512 samples). These parameters dictate how the audio is segmented for analysis.

##### **Step 3. Initialize Storage for Formants**
Create a dictionary to store the extracted formants for each vowel. Also, define a list of vowel labels `('a', 'e', 'i', 'o', 'u')` that will be used to identify the formants.

##### **Step 4. Specify Vowel Intervals**
Specify the time intervals for each vowel within the audio file. These intervals define the sections of the audio that correspond to the individual vowels, given in seconds.

##### **Step 5.  Extract Formants for Each Vowel**
For each vowel interval, extract the formants:
1. Select the segment of audio corresponding to the vowel.
2. Process the segment in overlapping frames:
    - Compute the Fast Fourier Transform (FFT) of each frame to analyze its frequency spectrum.
    - Identify the indices of the three highest peaks in the frequency spectrum, corresponding to the first three formant frequencies (F1, F2, F3).
3. Store the time (in milliseconds) and the identified formants in a list, which is then converted to a Pandas DataFrame.

##### **Step 6. Plot Formants for Each Vowel**
For each vowel, plot the formant frequencies (F1, F2, F3) over time.

In [None]:
# Load the .mp3 file containing multiple vowels
# (sr=None asks librosa to keep the file's native sample rate)
file_path = 'speech_samples/vowels.mp3'  # Replace with your .mp3 file path
y, sr = librosa.load(file_path, sr=None)

# Define parameters
frame_length = 2048  # samples per analysis frame
hop_length = 512     # samples between the starts of consecutive frames

# Per-vowel results: label -> DataFrame of formant tracks
formants_dict = {}
vowel_labels = ['a', 'e', 'i', 'o', 'u']

# (start, end) of each vowel in seconds, in the same order as vowel_labels
vowel_intervals = [(0.7, 1.5), (2, 2.8), (3.5, 4.1), (5, 5.4), (6.4, 7)]  # Adjust the intervals as necessary

# Every analysed frame has exactly frame_length samples, so the FFT bin
# frequencies are the same for all frames and are computed only once.
freqs = np.fft.rfftfreq(frame_length, 1 / sr)

# Extract formants for each vowel
for vowel, (start, end) in zip(vowel_labels, vowel_intervals):
    # Select the samples belonging to this vowel
    segment = y[int(start * sr):int(end * sr)]
    formants = []

    # Visit full frames only; a trailing partial frame is skipped
    for j in range(0, len(segment) - frame_length + 1, hop_length):
        frame = segment[j:j + frame_length]
        spectrum = np.abs(np.fft.rfft(frame))

        # The three largest-magnitude bins serve as crude formant estimates.
        # argsort orders them by magnitude, so the bin indices are sorted
        # again to label them by ascending frequency (F1 <= F2 <= F3).
        peaks = np.sort(np.argsort(spectrum)[-3:])

        # Time is measured from the start of the vowel segment, not the file
        formants.append([j / sr * 1000, *freqs[peaks]])

    # Store formants in the dictionary
    formants_dict[vowel] = pd.DataFrame(formants, columns=['Time (ms)', 'F1', 'F2', 'F3'])

# Create one subplot per extracted vowel. squeeze=False always returns a 2-D
# array of axes, which is flattened so indexing works for any vowel count.
n_vowels = len(formants_dict)
fig, axs = plt.subplots(n_vowels, 1, figsize=(12, 4 * n_vowels), sharex=True, squeeze=False)
axs = axs.ravel()

# Plotting each vowel's formants
for ax, (vowel, vowel_df) in zip(axs, formants_dict.items()):
    ax.plot(vowel_df['Time (ms)'], vowel_df['F1'], label='F1', color='blue', linewidth=2)
    ax.plot(vowel_df['Time (ms)'], vowel_df['F2'], label='F2', linestyle='--', color='green')
    ax.plot(vowel_df['Time (ms)'], vowel_df['F3'], label='F3', linestyle=':', color='red')
    ax.set_title(f'Formants for Vowel {vowel}')
    ax.set_ylabel('Frequency (Hz)')
    ax.set_xlabel('Time (ms)')  # Set x-label for each subplot
    ax.grid(True)
    ax.legend()

plt.tight_layout()
plt.show()


#### Automatic Classification of Vowel Sounds Based on Formant Frequencies

##### **Step 1. Load the Speech Signal**
Load the .mp3 file containing multiple vowels using librosa.load(). This function reads the audio file and returns two outputs: the audio time series (as a NumPy array) and the sample rate. <br>
***Tip:*** To open the audio file you need to reference its path correctly: `speech_samples/FILE_NAME.mp3`

##### **Step 2. Define Frame Parameters**
Define the parameters for processing the audio signal, including the `frame_length` (2048 samples) and `hop_length` (512 samples). These parameters dictate how the audio is segmented for analysis.

##### **Step 3. Initialize Storage for Formants**

##### **Step 4. Specify Vowel Intervals & Storage for Formants**
Specify the time intervals for each vowel within the audio file. These intervals define the sections of the audio that correspond to the individual vowels, given in seconds.
Create a dictionary to store the extracted formants for each vowel. Also, define a list of vowel labels `('a', 'e', 'i', 'o', 'u')` that will be used to identify the formants.

##### **Step 5. Extract Formants and Classify Vowels**
For each vowel interval:
1. Extract the corresponding audio segment.
2. Analyze it in overlapping frames to calculate the formant frequencies.
3. Store the average formant frequencies for classification.
4. Use the average formants to classify each vowel based on the provided reference formant frequencies for the vowels.

##### **Step 6. Classify the Vowels**
For each vowel, compare its extracted average formants to the reference formants. Calculate the distance (e.g., Euclidean distance) from each vowel to each reference vowel, and classify it based on the closest match.

##### **Step 7. Display Results**
Display the classified vowels and their corresponding average formant frequencies.


NB! This model is NOT precise!!!

In [None]:
# Load the .mp3 file containing multiple vowels
# (sr=None asks librosa to keep the file's native sample rate)
file_path = 'speech_samples/vowels.mp3'  # Replace with your .mp3 file path
y, sr = librosa.load(file_path, sr=None)

# Define parameters
frame_length = 2048  # samples per analysis frame
hop_length = 512     # samples between the starts of consecutive frames

# (start, end) of each vowel in seconds, and the matching labels in the same order
vowel_intervals = [(0.7, 1.5), (2, 2.8), (3.5, 4.1), (5, 5.4), (6.4, 7)]
vowel_labels = ['a', 'e', 'i', 'o', 'u']

# Reference (F1, F2) pairs that each measured vowel is compared against
reference_formants = {
    'a': [180, 900],
    'e': [195, 205],
    'i': [134, 270],
    'o': [242, 320],
    'u': [133, 175],
}

# Every analysed frame has exactly frame_length samples, so the FFT bin
# frequencies are the same for all frames and are computed only once.
freqs = np.fft.rfftfreq(frame_length, 1 / sr)

# Label -> averaged first two formant estimates
extracted_formants = {}

# Extract formants for each vowel interval
for i, (start, end) in enumerate(vowel_intervals):
    segment = y[int(start * sr):int(end * sr)]
    formants = []

    # Visit full frames only; a trailing partial frame is skipped
    for j in range(0, len(segment) - frame_length + 1, hop_length):
        frame = segment[j:j + frame_length]
        spectrum = np.abs(np.fft.rfft(frame))
        # Indices of the three largest-magnitude bins.
        # NOTE(review): argsort leaves these ordered by ascending magnitude,
        # not by frequency, so the "F1"/"F2" columns averaged below are the
        # third- and second-strongest bins. Presumably reference_formants was
        # calibrated against this ordering - confirm before changing it.
        peaks = np.argsort(spectrum)[-3:]
        formants.append(list(freqs[peaks]))  # Store F1, F2, F3

    # Without at least one full frame np.mean would return a scalar NaN and
    # the slice below would fail with an unhelpful IndexError.
    if not formants:
        raise ValueError(
            f"Interval {start}-{end} s for vowel '{vowel_labels[i]}' is shorter "
            f"than one frame of {frame_length} samples"
        )

    # Average over frames and keep the first two columns (F1 and F2)
    avg_formants = np.mean(formants, axis=0)[:2]
    extracted_formants[vowel_labels[i]] = avg_formants

# Classification: pick the reference vowel with the smallest Euclidean
# distance in the (F1, F2) plane
classified_vowels = {}

for vowel, avg_formant in extracted_formants.items():
    distances = {
        ref_vowel: np.linalg.norm(np.array(avg_formant) - np.array(ref_formants[:2]))
        for ref_vowel, ref_formants in reference_formants.items()
    }
    classified_vowels[vowel] = min(distances, key=distances.get)  # Closest reference vowel

# Print out classified results
for vowel, classification in classified_vowels.items():
    print(f'Vowel {vowel} classified as {classification} with average formants: {extracted_formants[vowel]} Hz')
