In [63]:
!pip install -q transformers torch pydub openai-whisper accelerate numba syllables librosa torchaudio openpyxl

In [None]:
import torch
from transformers import pipeline
from IPython.display import Audio, display
import numpy as np
import whisper

# Pick the best available torch backend instead of assuming Apple-silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

Test with a single audio file

In [None]:
from pydub import AudioSegment

# Decode the sample recording (format is detected by pydub) and re-export it
# as "output.wav", which the following cells read.
sample_m4a_path = "speechscoring/speech_data/audio onepte-repeat sentence/audio_0a5a69f5-ba7d-4711-b43d-06d71d8cb59d.m4a"
audio = AudioSegment.from_file(sample_m4a_path)
audio.export("output.wav", format="wav")

In [None]:
# Loaded Whisper models keyed by model name, so repeated calls reuse the weights
# instead of reloading them from disk every time.
_WHISPER_MODEL_CACHE = {}


def transcribe_audio(file_path, model_name="base", language="en"):
    """Transcribe an audio file with Whisper, including word-level timestamps.

    Args:
        file_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``; defaults to "base".
        language: Language code passed to ``transcribe``; defaults to "en".

    Returns:
        The result returned by ``model.transcribe``. The rest of the notebook
        reads its "text" and "segments" entries, and each segment's "words".
    """
    if model_name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[model_name] = whisper.load_model(model_name)
    model = _WHISPER_MODEL_CACHE[model_name]
    # word_timestamps=True adds per-word start/end times to every segment
    return model.transcribe(audio=file_path, word_timestamps=True, language=language)

result = transcribe_audio("output.wav")
print("Transcribed Text:")
print(result["text"])
print(result["segments"])

# Show every segment on one line as word[start/end] tokens
for segment in result["segments"]:
    timed_words = [
        f"{entry['word']}[{entry['start']}/{entry['end']}]"
        for entry in segment["words"]
    ]
    print("".join(timed_words))

In [None]:
# Keep the full list of transcript segments and show it as the cell output.
# NOTE(review): a later cell rebinds `whisper_output` to a list of word entries,
# so this name refers to two different kinds of data across the notebook.
whisper_output = result['segments']
whisper_output

In [None]:
# Minimum silent gap between consecutive words, in seconds, that counts as a pause
pause_threshold = 0.25

# Use the words of EVERY segment, not just the first one: the syllable count in
# the next cell is taken from the full transcript text, so the timings must
# cover the same words. This also avoids an IndexError when there are no segments.
whisper_output = [
    word
    for segment in result['segments']
    for word in segment.get('words', [])
]

# Detect pauses: a gap between one word's end and the next word's start
# that exceeds the threshold
pauses = []
for prev_word, curr_word in zip(whisper_output, whisper_output[1:]):
    pause_duration = curr_word["start"] - prev_word["end"]
    if pause_duration > pause_threshold:
        pauses.append({
            "start": prev_word["end"],
            "end": curr_word["start"],
            "duration": pause_duration
        })

for pause in pauses:
    print(f"Pause from {pause['start']:.2f}s to {pause['end']:.2f}s (duration: {pause['duration']:.2f}s)")

# Calculate total pause duration and frequency
total_pause_duration = sum(pause["duration"] for pause in pauses)
pause_frequency = len(pauses)

print("\nTotal pause duration:", total_pause_duration)
print("Pause frequency:", pause_frequency)

In [None]:
# Count syllables (requires a syllable counter)
from syllables import estimate

# Estimated syllables in the full transcript text
syllable_count = estimate(result['text'])

# Durations: the end time of the last word (pauses included), and the same
# value with the detected pauses subtracted
total_duration = whisper_output[-1]["end"]
speaking_duration = total_duration - total_pause_duration

# Syllables per unit of total time vs. per unit of speaking time
speech_rate = syllable_count / total_duration
articulation_rate = syllable_count / speaking_duration

# Mean pause length; 0 when no pause was detected
if pause_frequency > 0:
    average_pause_duration = total_pause_duration / pause_frequency
else:
    average_pause_duration = 0

# Share of the total duration spent speaking
# TODO: confirm this matches the reference formula exactly
phonation_time_ratio = speaking_duration / total_duration

print("\nFluency Features:")
for label, value in (
    ("Speech Rate:", speech_rate),
    ("Articulation Rate:", articulation_rate),
    ("Average Pause Duration:", average_pause_duration),
    ("Phonation Time Ratio:", phonation_time_ratio),
):
    print(label, value)

In [1]:
import librosa

# Load the WAV file written by the conversion cell, requesting a 16000 Hz sampling rate
audio_path = "output.wav"
y, sr = librosa.load(audio_path, sr=16000)  # y: audio signal, sr: sampling rate

In [None]:
import matplotlib.pyplot as plt

# Plot the waveform against time in seconds (sample index / sampling rate),
# so the x-axis really is time rather than a raw sample index.
time_axis = np.arange(len(y)) / sr

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(time_axis, y)
ax.set_title('Waveform of the Audio Signal')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
plt.show()

Process the dataset

In [None]:
import openpyxl
import os

# Load the Whisper "base" model once at cell level; extract_features below
# reads this global for every file it transcribes.
model = whisper.load_model("base")

# Minimum gap between one word's end and the next word's start that counts as a
# pause (same value as the single-file cell, where gaps are printed in seconds)
pause_threshold = 0.25

# Function to extract features from audio
def _collect_words(result):
    """Return the word-timing entries of every segment of a Whisper result, in order."""
    return [
        word
        for segment in result.get("segments", [])
        for word in segment.get("words", [])
    ]


def _detect_pauses(words, threshold):
    """Return the gaps between consecutive words that are longer than ``threshold``."""
    pauses = []
    for previous, current in zip(words, words[1:]):
        gap = current["start"] - previous["end"]
        if gap > threshold:
            pauses.append({
                "start": previous["end"],
                "end": current["start"],
                "duration": gap
            })
    return pauses


def extract_features(audio_path):
    """Compute fluency features for one audio file.

    The file is converted to "temp.wav", transcribed with the notebook-level
    Whisper ``model`` using word timestamps, and pauses are detected with the
    notebook-level ``pause_threshold``. Also relies on ``AudioSegment`` and
    ``estimate`` imported in earlier cells.

    Args:
        audio_path: Path of the audio file; pydub detects the input format.

    Returns:
        dict with the keys "speech_rate", "articulation_rate",
        "average_pause_duration" and "phonation_time_ratio". All four are 0
        when the transcript contains no words, and any ratio whose denominator
        is not positive is reported as 0 instead of raising.
    """
    # Automatically detect format based on file extension
    audio = AudioSegment.from_file(audio_path)
    audio.export("temp.wav", format="wav")

    # Transcribe audio
    result = model.transcribe(audio="temp.wav", word_timestamps=True)

    # Use the words of every segment: the syllable count below comes from the
    # full transcript text, so the timings must cover the same words.
    words = _collect_words(result)
    if not words:
        # No recognised speech: report neutral features instead of failing
        return {
            "speech_rate": 0.0,
            "articulation_rate": 0.0,
            "average_pause_duration": 0.0,
            "phonation_time_ratio": 0.0
        }

    # Pause statistics
    pauses = _detect_pauses(words, pause_threshold)
    total_pause_duration = sum(pause["duration"] for pause in pauses)
    pause_frequency = len(pauses)

    # Count syllables
    syllable_count = estimate(result['text'])

    # Total duration is the end time of the last word (pauses included);
    # speaking duration excludes the detected pauses
    total_duration = words[-1]["end"]
    speaking_duration = total_duration - total_pause_duration

    # Calculate features, guarding every division against a zero denominator
    speech_rate = syllable_count / total_duration if total_duration > 0 else 0.0
    articulation_rate = syllable_count / speaking_duration if speaking_duration > 0 else 0.0
    average_pause_duration = total_pause_duration / pause_frequency if pause_frequency > 0 else 0
    phonation_time_ratio = speaking_duration / total_duration if total_duration > 0 else 0.0

    # Return features as a dictionary
    return {
        "speech_rate": speech_rate,
        "articulation_rate": articulation_rate,
        "average_pause_duration": average_pause_duration,
        "phonation_time_ratio": phonation_time_ratio
    }

# Path to the directory containing audio files
audio_dir = "speechscoring/speech_data/audio onepte-repeat sentence"

# Load existing Excel file
input_excel_path = "speechscoring/processed_audio_sample_scoring.xlsx"
wb = openpyxl.load_workbook(input_excel_path)
ws = wb.active

# Find the last row with data
last_row = ws.max_row

# Header text of each feature column -> key in the dict returned by extract_features
feature_headers = {
    "Speech Rate": "speech_rate",
    "Articulation Rate": "articulation_rate",
    "Average Pause Duration": "average_pause_duration",
    "Phonation Time Ratio": "phonation_time_ratio",
}

# Resolve every feature column by its header text and append any header that is
# missing. Addressing columns by header (rather than as offsets from
# ws.max_column) keeps the values in the right place even when the feature
# headers are not the last four columns of the sheet.
header_columns = {cell.value: cell.column for cell in ws[1]}
for header in feature_headers:
    if header not in header_columns:
        new_column = ws.max_column + 1
        ws.cell(row=1, column=new_column, value=header)
        header_columns[header] = new_column

# Index the data rows once by the file name stored in the second column;
# setdefault keeps the first row when a name appears more than once.
row_by_name = {}
for (name_cell,) in ws.iter_rows(min_row=2, max_row=last_row, min_col=2, max_col=2):
    row_by_name.setdefault(name_cell.value, name_cell.row)

# Iterate through audio files
for file_name in os.listdir(audio_dir):
    if not file_name.endswith((".m4a", ".mp3")):
        continue

    # Remove file extension for matching
    file_name_without_extension = os.path.splitext(file_name)[0]
    target_row = row_by_name.get(file_name_without_extension)
    if target_row is None:
        # Nothing to write for this file, so skip the expensive transcription
        print(f"Skipping {file_name}: no matching row in {input_excel_path}")
        continue

    file_path = os.path.join(audio_dir, file_name)
    print(f"Processing {file_name}...")

    # Extract features and write them to the matching row
    features = extract_features(file_path)
    for header, feature_key in feature_headers.items():
        ws.cell(row=target_row, column=header_columns[header], value=features[feature_key])

# Save the updated Excel file
output_excel_path = "updated_speech_features.xlsx"
wb.save(output_excel_path)
print(f"Features saved to {output_excel_path}")