In [None]:
import parselmouth
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
file_path = "/workspaces/voice-analysis/notebooks-pocs/audio/fast.wav"

In [None]:
sns.set_theme() # Use seaborn's default style to make attractive graphs

# Plot nice figures using Python's "standard" matplotlib library
snd = parselmouth.Sound(file_path)
plt.figure()
plt.plot(snd.xs(), snd.values.T)
plt.xlim([snd.xmin, snd.xmax])
plt.xlabel("time [s]")
plt.ylabel("amplitude")
plt.show() # or plt.savefig("sound.png"), or plt.savefig("sound.pdf")

In [None]:
dfXs = pd.DataFrame(snd.xs())
print(dfXs.describe())



In [None]:
dfValues = pd.DataFrame(snd.values.T)
print(dfValues.describe())


In [None]:
import librosa
import librosa.display

In [None]:

# Function to extract audio features
def extract_features(file_path):
    y, sr = librosa.load(file_path)
    #print(y)
    #print(sr)
    
    # Pitch (F0) and Pitch Variation
    snd = parselmouth.Sound(file_path)
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']
    pitch_values = pitch_values[pitch_values != 0]  # Remove unvoiced parts
    mean_pitch = np.mean(pitch_values)
    std_pitch = np.std(pitch_values)
    
    # Inflection (Pitch Range)
    min_pitch = np.min(pitch_values)
    max_pitch = np.max(pitch_values)
    pitch_range = max_pitch - min_pitch
    
    # Words Per Minute (WPM)
    duration = librosa.get_duration(y=y, sr=sr)
    words = librosa.effects.split(y)
    word_count = len(words)
    #print(words)
    wpm = (word_count / duration) * 60
    
    # Tone and Timbre (MFCCs)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    
    return {
        'min_pitch': min_pitch,        
        'mean_pitch': mean_pitch,
        'std_pitch': std_pitch,
        'max_pitch': max_pitch,
        'pitch_range': pitch_range,
        'wpm': wpm,
        'mfccs': mfccs,
        'duration': duration
    }

# Function to plot audio features
def plot_features(features):
    plt.figure(figsize=(14, 8))
    
    # Plot Pitch
    plt.subplot(2, 1, 1)
    plt.plot(features['mfccs'][0])
    plt.title('MFCCs')
    
    # Plot MFCCs
    plt.subplot(2, 1, 2)
    librosa.display.specshow(features['mfccs'], x_axis='time')
    plt.colorbar()
    plt.title('MFCC')
    
    plt.tight_layout()
    plt.show()

In [None]:
def show(file_path):
    features = extract_features(file_path)
    #plot_features(features)

    # Print extracted features
    print(f"Min Pitch: {features['min_pitch']}")
    print(f"Mean Pitch: {features['mean_pitch']}")
    print(f"Pitch Variation: {features['std_pitch']}")
    print(f"Max Pitch: {features['max_pitch']}")
    print(f"Pitch Range: {features['pitch_range']}")
    print(f"Words Per Minute: {features['wpm']}")

In [None]:
show("/workspaces/voice-analysis/notebooks-pocs/audio/fast.wav")

In [None]:
show("/workspaces/voice-analysis/notebooks-pocs/audio/slow.wav")

In [None]:
show("/workspaces/voice-analysis/notebooks-pocs/audio/tone-variety.wav")

In [53]:
print("***HIGH***")
show("/workspaces/voice-analysis/notebooks-pocs/audio/high.wav")

print("***LOW***")
show("/workspaces/voice-analysis/notebooks-pocs/audio/low.wav")

***HIGH***
Min Pitch: 293.30663065805834
Mean Pitch: 390.03246660734777
Pitch Variation: 26.84226478533146
Max Pitch: 446.65750082110793
Pitch Range: 153.3508701630496
Words Per Minute: 38.516405135520685
***LOW***
Min Pitch: 75.55221978702873
Mean Pitch: 112.77171178343373
Pitch Variation: 45.579179480323624
Max Pitch: 447.4033545486049
Pitch Range: 371.8511347615762
Words Per Minute: 48.56504037909632


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from scipy.signal import correlate
from scipy.stats import pearsonr

# Load the CSV file
data = pd.read_csv('/workspaces/voice-analysis/high_low.wav.csv')

# Separate the data based on labels
high = data[data['label'] == 'high']
low = data[data['label'] == 'low']

# Extract the frequency values
high_freq = high['frequency'].values
low_freq = low['frequency'].values

# Ensure both series have the same length for comparison
min_length = min(len(high_freq), len(low_freq))
high_freq = high_freq[:min_length]
low_freq = low_freq[:min_length]

# Dynamic Time Warping (DTW)
print(high_freq)
print(f"high_freq DIM: {np.asarray(high_freq, order='c').ndim}")
print(low_freq)
print(f"low_freq DIM: {np.asarray(low_freq, order='c').ndim}")
distance, path = fastdtw(high_freq, low_freq, dist=euclidean)

# Cross-Correlation
cross_corr = correlate(high_freq, low_freq)

# Euclidean Distance
euclidean_distance = np.linalg.norm(high_freq - low_freq)

# Statistical Measures: Pearson Correlation Coefficient
pearson_corr, _ = pearsonr(high_freq, low_freq)

# Plotting the results
plt.figure(figsize=(14, 10))

# Plot High and Low frequencies
plt.subplot(4, 1, 1)
plt.plot(high['seconds'], high_freq, label='High Frequency')
plt.plot(low['seconds'], low_freq, label='Low Frequency')
plt.title('High vs Low Frequency')
plt.xlabel('Time (seconds)')
plt.ylabel('Frequency (Hz)')
plt.legend()

# Plot DTW path
plt.subplot(4, 1, 2)
plt.plot(path)
plt.title(f'Dynamic Time Warping Path (Distance: {distance:.2f})')
plt.xlabel('Path Index')
plt.ylabel('Frequency Index')

# Plot Cross-Correlation
plt.subplot(4, 1, 3)
plt.plot(cross_corr)
plt.title('Cross-Correlation')
plt.xlabel('Lag')
plt.ylabel('Correlation')

# Plot Euclidean Distance and Pearson Correlation Coefficient
plt.subplot(4, 1, 4)
plt.bar(['Euclidean Distance', 'Pearson Correlation'], [euclidean_distance, pearson_corr])
plt.title('Euclidean Distance and Pearson Correlation Coefficient')
plt.ylabel('Value')

# Adjust layout and show plot
plt.tight_layout()
plt.show()

print(f"DTW Distance: {distance:.2f}")
print(f"Euclidean Distance: {euclidean_distance:.2f}")
print(f"Pearson Correlation Coefficient: {pearson_corr:.2f}")



[0. 0. 0. ... 0. 0. 0.]
high_freq DIM: 1
[0. 0. 0. ... 0. 0. 0.]
low_freq DIM: 1


ValueError: Input vector should be 1-D.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from dtaidistance import dtw
from scipy.signal import correlate

# Load the data from the CSV file
data = pd.read_csv('/workspaces/voice-analysis/high_low.wav.csv')

# Extract high and low frequency data
high_freq = data[data['label'] == 'high']['frequency'].values
low_freq = data[data['label'] == 'low']['frequency'].values

# Ensure both series are 1-D arrays
high_freq = np.asarray(high_freq, order='C')
low_freq = np.asarray(low_freq, order='C')

# Remove zero values from both series to avoid ValueError in DTW calculation
high_freq = high_freq[high_freq != 0]
low_freq = low_freq[low_freq != 0]

# Dynamic Time Warping (DTW)
distance, path = dtw.warping_paths(high_freq, low_freq)

# Cross-Correlation
cross_corr = correlate(high_freq, low_freq)

# Euclidean Distance
euclidean_distance = np.linalg.norm(high_freq - low_freq)

# Statistical Measures
mean_high = np.mean(high_freq)
mean_low = np.mean(low_freq)
std_high = np.std(high_freq)
std_low = np.std(low_freq)

# Plotting the results
plt.figure(figsize=(12, 8))

# Plot high and low frequencies
plt.subplot(3, 1, 1)
plt.plot(high_freq, label='High Frequency')
plt.plot(low_freq, label='Low Frequency')
plt.title('High vs Low Frequency')
plt.legend()

# Plot cross-correlation
plt.subplot(3, 1, 2)
plt.plot(cross_corr)
plt.title('Cross-Correlation')

# Plot DTW path
plt.subplot(3, 1, 3)
for (map_x, map_y) in path:
    plt.plot([map_x, map_y], [high_freq[map_x], low_freq[map_y]], color='gray')
plt.plot(high_freq, label='High Frequency')
plt.plot(low_freq, label='Low Frequency')
plt.title('DTW Path')
plt.legend()

plt.tight_layout()
plt.show()

print(f"DTW Distance: {distance}")
print(f"Euclidean Distance: {euclidean_distance}")
print(f"Mean High Frequency: {mean_high}")
print(f"Mean Low Frequency: {mean_low}")
print(f"Standard Deviation High Frequency: {std_high}")
print(f"Standard Deviation Low Frequency: {std_low}")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/local/python/3.10.15/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/python/3.10.15/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/vscode/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/vscode/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/

ValueError: operands could not be broadcast together with shapes (595,) (478,) 

## SPeech Tempo failed attemps


## Speech Tempo
The best measure I could find is Words per minute

In [None]:
sr = 2205000
y, sr = librosa.load(audio_filename, sr=sr)

### Undersetanding Librosa

Understanding the data Librosa has to offer
It looks like upon loading the audio file, it provides a relative amplitude time series

```python
y, sr = librosa.load(audio_filename, sr=sr)
snd = parselmouth.Sound(audio_filename)

y ==  snd.values.T
```

In [None]:

#duration = librosa.get_duration(y=y, sr=sr)
 #   words = librosa.effects.split(y)
  #  word_count = len(words)

#### Creating the time series array as Librosa doesn't provide one

In [None]:
time = np.arange(0, len(y)) / sr
time.size

125883450

: 

#### Plotting Librosa aplitude next to Parselmouth Sound Amplitude
Same information

In [None]:

fig, ax1 = plt.subplots(figsize=(20, 4), dpi=300)

# Plot the amplitude from snd parselmouth
ax1.plot(snd.xs(), snd.values.T, color='blue')
ax1.set_xlabel("time [s]")
ax1.set_ylabel("amplitude [Pa]", color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Plot amplitude frm librosa
ax2 = ax1.twinx()
ax2.plot(time,y, color='red')
ax2.set_ylabel("relative amplitude", color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Show the plot
fig.tight_layout()  # Adjust layout to prevent overlap
plt.show()

: 

: 

#### Getting Words per minute

In [None]:
words = librosa.effects.split(y)
words

In [None]:

for topdb in range(90, 200, 10):
    print(f"words with topdb = {topdb} =  {len(librosa.effects.split(y, top_db=topdb))}")
    print("*****")



In [None]:
tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
print(f"Estimated Tempo: {tempo} beats per minute")

In [None]:
from parselmouth.praat import call
snd.to_pitch()
formant = call(snd, "To Formant (burg)", 0.0, 5, 5500, 0.025, 50.0)
print(formant)
#formant.values

In [None]:
pitch_threashold = snd.to_pitch(time_step=0.1, pitch_floor=75.0, pitch_ceiling=600.0)
pitch_threashold

In [None]:
len(pitch)

In [None]:
len(pitch_threashold)

In [None]:

pitch_threashold_values = pitch_threashold.selected_array['frequency']
# NOt plotting zero values
pitch_threashold_values[pitch_threashold_values==0] = np.nan    
plt.plot(pitch_threashold.xs(), pitch_threashold_values, '.', markersize=4, color="black")            
#plt.ylim(0, np.nanmax(pitch_threashold_values))  

plt.ylabel("fundamental frequency [Hz]")
plt.xlabel("time [secs]")

#draw_horizontal_shadows(plt, 0.2)
#draw_legend(plt, 0.4)
plt.show()

When I tried to install spanish at sphnx

BEfore realizing google freee speech recognition does even a better job 

Trying Sphinx dfor spanish

In [None]:
from pocketsphinx import get_model_path

pocketsphinx_model_path = get_model_path()
pocketsphinx_model_path

downloaded_path = "/workspaces/voice-analysis/dependency/pocketsphinx_language"

pocketsphinx_model_path_es = os.path.join(pocketsphinx_model_path, "es-ES")
# /usr/local/python/3.10.15/lib/python3.10/site-packages/pocketsphinx/model/es-ES'


pocketsphinx_model_path_es = "/usr/local/python/3.10.15/lib/python3.10/site-packages/speech_recognition/pocketsphinx-data/es-ES"

os.makedirs(pocketsphinx_model_path_es, exist_ok=True)

shutil.copy2(os.path.join(downloaded_path, "es-20k.lm.gz"), pocketsphinx_model_path_es)
shutil.copy2(os.path.join(downloaded_path, "es.dict"), pocketsphinx_model_path_es)
shutil.copy2(os.path.join(downloaded_path, "cmusphinx-es-5.2.tar.gz"), pocketsphinx_model_path_es)

## Chunkng to get WPM by sections

In [None]:


# times between which to extract the wave from
start = 6 # seconds
end = 11 # seconds

# file to extract the snippet from
with wave.open(audio_filename, "rb") as infile:
    # get file data
    nchannels = infile.getnchannels()
    sampwidth = infile.getsampwidth()
    framerate = infile.getframerate()
    # set position in wave to start of segment
    infile.setpos(int(start * framerate))
    # extract data
    data = infile.readframes(int((end - start) * framerate))

# write the extracted data to a new file
with wave.open(audio_filename+"second_half.wav", 'w') as outfile:
    outfile.setnchannels(nchannels)
    outfile.setsampwidth(sampwidth)
    outfile.setframerate(framerate)
    outfile.setnframes(int(len(data) / sampwidth))
    outfile.writeframes(data)
with sr.AudioFile(audio_filename+"second_half.wav") as source:
    audio_data = recognizer.record(source)
    # Recognize speech using Google Web Speech API
    text = recognizer.recognize_google(audio_data, language="es-ES")
    print("Transcription:", text)
