In [1]:
import os, shutil, torch, whisper as ws
from typing import Any

print("Torch:", torch.__version__, "CUDA:", torch.cuda.is_available())
print("ffmpeg on PATH:", bool(shutil.which("ffmpeg")))

Torch: 2.8.0+cpu CUDA: False
ffmpeg on PATH: True


In [None]:
import json as js
audio_path = r"d:\Early Spark\backend\audio\audio2.mp3"
print("Audio exists:", os.path.exists(audio_path))

model: Any = ws.load_model("small", device="cpu", download_root=r"D:\Models\whisper_cache")
result: dict = model.transcribe(audio_path, fp16=False, language="en", word_timestamps=True)
with open("transcription.json", "w", encoding="utf-8") as f:
    js.dump(result, f, ensure_ascii=False, indent=4)

Audio exists: True


In [11]:
import numpy as np
with open("transcription.json", "r", encoding="utf-8") as f:
	text = js.load(f)

total_time:float = -0.0
total_pause_time:float = 0.0
for segment in text["segments"]:
    total_time = max(total_time, segment["end"])
    if(np.abs(segment["start"] - segment["end"])>0.8 ):
        total_pause_time += np.abs(segment["start"] - segment["end"])
print("Total time:", total_time)
print("Total pause time:", total_pause_time)

Total time: 18.8
Total pause time: 16.360000000000003


In [14]:
pause_density : float = np.round(total_pause_time / (total_pause_time + total_time) * 100,4)
print("Pause density (%):", pause_density)

Pause density (%): 46.5301


In [19]:
words_count: dict[str, int] = {}

new_text:str = text["text"].replace("\n", " ").replace(".", " ").replace(",", " ").replace("!", " ").replace("?", " ").replace(";", " ").replace(":", " ").replace("\"", " ").replace("'", " ").replace("(", " ").replace(")", " ").replace("[", " ").replace("]", " ").replace("{", " ").replace("}", " ").replace("-", " ").replace("_", " ")

for word in new_text.split():
    words_count[word.lower()] = words_count.get(word.lower(), 0) + 1

words_count


{'e': 1,
 'you': 2,
 'woke': 1,
 'up': 1,
 'late': 1,
 'today': 1,
 'um': 1,
 'then': 1,
 'i': 5,
 'went': 1,
 'to': 1,
 'the': 5,
 'kitchen': 1,
 'and': 3,
 'uh': 1,
 'made': 1,
 'tea': 1,
 'sat': 2,
 'down': 2,
 'in': 1,
 'chair': 1,
 'just': 1,
 'stared': 1,
 'at': 1,
 'clock': 1,
 'know': 1,
 'was': 1,
 'thinking': 1,
 'about': 1,
 'river': 2,
 'near': 1,
 'my': 1,
 'old': 1,
 'house': 1,
 'mango': 1,
 'trees': 1,
 'paper': 1,
 'boats': 1,
 'it': 1,
 's': 1,
 'a': 1}

In [22]:
repeated_words:int =0
for i in words_count:
    if words_count[i] > 1:
        repeated_words += words_count.get(i,0)-1
print("Repeated words:", repeated_words)

Repeated words: 14


In [26]:
filler_Count :int =0
for i in ["um","uh","like","you know","so","actually","basically","right","i mean","and","but","or"]:
    filler_Count += words_count.get(i,0)

filler_frequency : float = np.round(filler_Count / np.sum(list(words_count.values())) * 100,4)
print("Filler words:", filler_Count)
print("Filler frequency:", filler_frequency)

Filler words: 5
Filler frequency: 9.0909


In [27]:
unique_words:int =0
for i in words_count:
    if words_count[i] ==1:
        unique_words += 1
print("Unique words:", unique_words)

Unique words: 34


In [28]:
lexical_diversity : float = np.round(unique_words / np.sum(list(words_count.values())) * 100,4)
print("Lexical diversity (%):", lexical_diversity)

Lexical diversity (%): 61.8182


In [31]:
speech_fluency : float = np.round(100 - (pause_density * 0.6) - (filler_frequency * 0.8) - (repeated_words * 1.5) + (lexical_diversity * 0.2), 2)
print("Speech fluency (words/sec):", speech_fluency)

Speech fluency (words/sec): 56.17


In [32]:
final_rest :dict[str,float] = {
    "Total time": total_time,
    "Total pause time": total_pause_time,
    "Pause density (%)": pause_density,
    "Repeated words": repeated_words,
    "Filler words": filler_Count,
    "Filler frequency (%)": filler_frequency,
    "Unique words": unique_words,
    "Lexical diversity (%)": lexical_diversity,
    "Speech fluency (words/sec)": speech_fluency
}