In [4]:
# !pip install pydub

In [5]:
# from google.colab import drive

In [6]:
# drive.mount('/content/drive')

# images = '/content/drive/MyDrive/Colab Notebooks/audio_samples/' # zip file path
# ouput = '/content/dataset' # extracted dataset

In [None]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import speech_recognition as sr
import spacy
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from pydub import AudioSegment

nlp = spacy.load("en_core_web_sm")

# audio preprocessing & speech to text
def preprocess_audio(audio_path, sr_target=16000):
    y, sr = librosa.load(audio_path, sr=sr_target, mono=True)
    y_trimmed, _ = librosa.effects.trim(y)
    return y_trimmed, sr

def transcribe_audio(audio_path):
    # convert audio to WAV format if it's not already
    if not audio_path.lower().endswith(('.wav', '.flac', '.aiff', '.aifc')):
        try:
            audio = AudioSegment.from_file(audio_path)
            temp_audio_path = audio_path + '.wav'  # create a temporary WAV file
            audio.export(temp_audio_path, format="wav")
            audio_path = temp_audio_path  # sse the temporary WAV file for transcription
        except Exception as e:
            print(f"Error converting audio file: {e}")
            return ""  # return empty string if conversion fails

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        text = ""

    # removing the temporary WAV file if it was created
    if audio_path.endswith('.wav') and audio_path != audio_path:
        os.remove(audio_path)

    return text

# feature extraction
def extract_audio_features(y, sr):
    duration = librosa.get_duration(y=y, sr=sr)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = pitches[magnitudes > np.median(magnitudes)]
    pitch_std = np.std(pitch) if len(pitch) > 0 else 0
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    return {
        "speech_rate": tempo,
        "pitch_variability": pitch_std,
        "duration": duration
    }

def extract_text_features(text):
    doc = nlp(text)
    num_sentences = len(list(doc.sents))
    num_words = len([t for t in doc if t.is_alpha])
    hesitations = sum(1 for word in text.lower().split() if word in ["uh", "um", "like"])
    pause_rate = hesitations / num_sentences if num_sentences > 0 else 0
    word_per_sentence = num_words / num_sentences if num_sentences > 0 else 0
    return {
        "pause_rate": pause_rate,
        "word_per_sentence": word_per_sentence,
        "hesitation_count": hesitations
    }

# unsupervised analysis #
def detect_anomalies(features):
    feature_matrix = np.array([[
        f["speech_rate"][0] if isinstance(f["speech_rate"], np.ndarray) else f["speech_rate"],  # access the first element if it's an array
        f["pitch_variability"],
        f["pause_rate"],
        f["word_per_sentence"]
    ] for f in features])
    model = IsolationForest(contamination=0.2)
    anomalies = model.fit_predict(feature_matrix)
    return anomalies

# demo run #
def analyze_audio_files(folder_path):
    results = []
    for file in os.listdir(folder_path):
        if file.endswith((".wav", ".ogg")):
            audio_path = os.path.join(folder_path, file)
            y, sr = preprocess_audio(audio_path)
            text = transcribe_audio(audio_path)
            audio_feats = extract_audio_features(y, sr)
            text_feats = extract_text_features(text)
            all_feats = {**audio_feats, **text_feats, "filename": file, "text": text}
            results.append(all_feats)
    return results

# risk scoring api #
def get_cognitive_decline_risk_score(audio_path):
    y, sr = preprocess_audio(audio_path)
    text = transcribe_audio(audio_path)
    audio_feats = extract_audio_features(y, sr)
    text_feats = extract_text_features(text)
    # simple scoring: normalized sum of key features
    score = (
        audio_feats["pitch_variability"] * 0.3 +
        text_feats["pause_rate"] * 0.4 +
        text_feats["hesitation_count"] * 0.3
    )
    return min(score, 1.0)

if __name__ == "__main__":
    folder = './audio_samples/'  # path to the folder containing audio files
    print("Analyzing audio files in:", folder)
    all_features = analyze_audio_files(folder)
    for entry in all_features:
        print(f"\nFile: {entry['filename']}")
        print(f"Transcript: {entry['text']}")
        print("Features:", {k: v for k, v in entry.items() if k not in ["filename", "text"]})

    anomalies = detect_anomalies(all_features)
    for i, entry in enumerate(all_features):
        entry["anomaly"] = anomalies[i]
        print(f"{entry['filename']} - {'At Risk' if anomalies[i] == -1 else 'Normal'}")


Analyzing audio files in: /content/drive/MyDrive/Colab Notebooks/audio_samples/

File: voice_one.ogg
Transcript: what's your name
Features: {'speech_rate': array([170.45454545]), 'pitch_variability': np.float32(816.0424), 'duration': 1.6135, 'pause_rate': 0.0, 'word_per_sentence': 3.0, 'hesitation_count': 0}

File: voice_two.ogg
Transcript: hello everyone my name is harshad and I am from mtk today I'm going to leave a Road Plantation on topic energy efficient and LP so as per the growing models and it'll be doing best model are very low so as you can see
Features: {'speech_rate': array([156.25]), 'pitch_variability': np.float32(950.50323), 'duration': 35.0335, 'pause_rate': 0.0, 'word_per_sentence': 15.0, 'hesitation_count': 0}

File: voice_one.ogg.wav
Transcript: what's your name
Features: {'speech_rate': array([170.45454545]), 'pitch_variability': np.float32(888.43646), 'duration': 1.6135, 'pause_rate': 0.0, 'word_per_sentence': 3.0, 'hesitation_count': 0}

File: voice_two.ogg.wav
Tr