<a href="https://colab.research.google.com/github/kandika-Rohan/audio_to_text/blob/emotion_added/emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:




# --------------------------
# 2. EMOTION DATASET PREPARATION
# --------------------------
# Download and extract RAVDESS emotional speech dataset
# The archive is fetched into the current working directory and unpacked
# quietly (-q) into /content/emotion_dataset, the directory that
# load_emotion_data() is pointed at further down.
!wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
!unzip -q Audio_Speech_Actors_01-24.zip -d "/content/emotion_dataset"

def load_emotion_data(dataset_path):
    """Collect MFCC feature vectors and emotion labels from a dataset tree.

    Walks ``dataset_path`` recursively. For every ``.wav`` file the emotion is
    taken from the third dash-separated field of the file name (an integer
    code 1-8), the audio is loaded at 22050 Hz, noise-reduced, and summarised
    as the mean of 40 MFCCs.

    Files whose name does not carry a known emotion code are reported and
    skipped instead of aborting the whole load.

    Args:
        dataset_path: Root directory to scan.

    Returns:
        A ``(features, emotions)`` tuple of NumPy arrays with one entry per
        usable file, in a deterministic (sorted) traversal order.
    """
    # Built once rather than per file; keys are the integer emotion codes.
    emotion_names = {
        1: 'neutral', 2: 'calm', 3: 'happy',
        4: 'sad', 5: 'angry', 6: 'fear',
        7: 'disgust', 8: 'surprise'
    }
    emotions = []
    features = []

    for root, dirs, files in os.walk(dataset_path):
        # os.walk yields entries in filesystem order; sorting keeps the sample
        # order (and therefore any seeded train/test split) reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            file_path = os.path.join(root, file)
            try:
                emotion = emotion_names[int(file.split("-")[2])]
            except (IndexError, ValueError, KeyError):
                print(f"Skipping {file_path}: file name has no known emotion code")
                continue

            # Load and preprocess audio
            y, sr = librosa.load(file_path, sr=22050)
            y = nr.reduce_noise(y=y, sr=sr)

            # Extract MFCC features and average them into one 40-value vector
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
            mfcc_processed = np.mean(mfcc.T, axis=0)

            # Append label and features together so the two lists cannot get
            # out of step if loading fails part-way through a file.
            features.append(mfcc_processed)
            emotions.append(emotion)

    return np.array(features), np.array(emotions)

# Load and prepare dataset
# features holds one MFCC vector per recording, emotions the matching labels.
print("Loading emotion dataset...")
features, emotions = load_emotion_data("/content/emotion_dataset")

# Encode labels
# The fitted encoder is kept at module level: SafetyAnalyzer reuses it to turn
# predicted class indices back into emotion names.
label_encoder = LabelEncoder()
encoded_emotions = label_encoder.fit_transform(emotions)

# Split dataset
# 20% of the samples are held out; random_state pins the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, encoded_emotions, test_size=0.2, random_state=42
)

# --------------------------
# 3. EMOTION RECOGNITION MODEL
# --------------------------
def create_emotion_model(input_shape, num_classes):
    """Assemble and compile the dense emotion classifier.

    The network is two ReLU hidden layers (256 and 128 units), each followed
    by dropout (0.5 and 0.3), and a softmax output with ``num_classes`` units.
    It is compiled with Adam, sparse categorical cross-entropy and accuracy.

    Args:
        input_shape: Shape tuple of a single feature vector.
        num_classes: Number of output classes.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    classifier = models.Sequential()
    classifier.add(layers.Input(shape=input_shape))

    # (units, dropout rate) for each hidden stage, in order.
    for units, drop_rate in ((256, 0.5), (128, 0.3)):
        classifier.add(layers.Dense(units, activation='relu'))
        classifier.add(layers.Dropout(drop_rate))

    classifier.add(layers.Dense(num_classes, activation='softmax'))

    classifier.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Create and train model
# The input width is the length of one feature vector; the output width is the
# number of distinct labels seen by the encoder.
print("Training emotion recognition model...")
model = create_emotion_model((X_train.shape[1],), len(label_encoder.classes_))
# The held-out split doubles as validation data, so the per-epoch validation
# metrics are measured on the same samples as X_test / y_test.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

# Save model
# SafetyAnalyzer loads the model back from this exact path.
# NOTE(review): the path is under /content/drive, so Google Drive presumably
# has to be mounted before this line runs - confirm the cell order.
model.save('/content/drive/MyDrive/emotion_model.h5')

# --------------------------
# 4. AUDIO PROCESSING PIPELINE
# --------------------------
class SafetyAnalyzer:
    """Screen a single recording for signs of distress.

    Combines three steps: audio clean-up (``process_audio``), emotion
    prediction from MFCC features (``analyze_emotion``) and Whisper
    transcription (``transcribe_audio``). ``analyze_safety`` runs all three
    and flags the recording when a panic word is heard or the predicted
    emotion is 'fear' or 'angry'.
    """

    def __init__(self):
        """Load both models and set up the panic vocabulary.

        Expects the trained emotion model at
        ``/content/drive/MyDrive/emotion_model.h5`` and a fitted module-level
        ``label_encoder``.
        """
        # Lower-case words and phrases searched for in the transcription.
        self.panic_words = {
            "help", "fire", "emergency", "danger", "run", "stop", "police",
            "watch out", "look out", "get down", "move", "duck", "stay back",
            "back off", "stay away", "call 911", "call an ambulance",
            "call the police", "call for help", "mayday", "sos", "alert"
        }
        self.whisper_model = whisper.load_model("base")
        self.emotion_model = tf.keras.models.load_model('/content/drive/MyDrive/emotion_model.h5')
        # The encoder fitted during training maps class indices back to the
        # same emotion names the model was trained on.
        self.label_encoder = label_encoder

    @staticmethod
    def _swap_suffix(path, new_suffix):
        """Return *path* with only its final extension replaced by *new_suffix*."""
        return os.path.splitext(path)[0] + new_suffix

    def process_audio(self, input_path):
        """Convert to WAV if needed, denoise, and write a ``*_clean.wav`` copy.

        An ``.opus`` input is exported as a ``.wav`` next to it and the
        original ``.opus`` file is deleted. The audio is then loaded at
        22050 Hz, noise-reduced and written beside the input.

        Args:
            input_path: Path of the recording to process.

        Returns:
            Path of the noise-reduced ``*_clean.wav`` file.
        """
        # Convert to WAV
        if input_path.endswith(".opus"):
            audio = AudioSegment.from_file(input_path)
            # Only the extension is swapped; a ".opus" elsewhere in the path
            # (for example in a directory name) is left untouched.
            wav_path = self._swap_suffix(input_path, ".wav")
            audio.export(wav_path, format="wav")
            os.remove(input_path)
            input_path = wav_path

        # Noise reduction
        y, sr = librosa.load(input_path, sr=22050)
        y_clean = nr.reduce_noise(y=y, sr=sr)
        # Always a sibling file, so the source recording is never overwritten
        # even when its extension is not exactly ".wav".
        clean_path = self._swap_suffix(input_path, "_clean.wav")
        sf.write(clean_path, y_clean, sr)

        return clean_path

    def analyze_emotion(self, audio_path):
        """Predict the emotion of a recording from its mean MFCC vector.

        Args:
            audio_path: Path of the audio file to classify.

        Returns:
            ``(emotion, confidence)``: the decoded label of the
            highest-scoring class and that class's score.
        """
        y, sr = librosa.load(audio_path, sr=22050)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        features = np.mean(mfcc.T, axis=0)
        # The model expects a batch, so add a leading axis of size 1.
        prediction = self.emotion_model.predict(np.expand_dims(features, axis=0))
        emotion = self.label_encoder.inverse_transform([np.argmax(prediction)])[0]
        confidence = np.max(prediction)
        return emotion, confidence

    def transcribe_audio(self, audio_path):
        """Return the text Whisper transcribes from ``audio_path``."""
        result = self.whisper_model.transcribe(audio_path)
        return result["text"]

    def analyze_safety(self, audio_path):
        """Run the full pipeline on one recording and build its report.

        Args:
            audio_path: Path of the recording to analyse.

        Returns:
            A dict with keys ``file_path``, ``emotion``, ``confidence``,
            ``transcription``, ``panic_words`` and ``requires_attention``.
        """
        # Process audio
        clean_path = self.process_audio(audio_path)

        # Emotion analysis
        emotion, confidence = self.analyze_emotion(clean_path)

        # Transcription analysis
        transcription = self.transcribe_audio(clean_path)
        # Plain substring search on the lower-cased text, so "danger" also
        # matches "dangerous". Sorted so the report order is stable.
        lowered = transcription.lower()
        panic_detected = sorted(
            word for word in self.panic_words if word in lowered
        )

        # Generate report
        report = {
            "file_path": audio_path,
            "emotion": emotion,
            "confidence": float(confidence),
            "transcription": transcription,
            "panic_words": panic_detected,
            "requires_attention": bool(panic_detected) or emotion in ('fear', 'angry')
        }

        return report

# --------------------------
# 5. MAIN EXECUTION
# --------------------------
def analyze_directory(directory_path):
    """Run the safety analysis on every audio file under a directory.

    Walks ``directory_path`` recursively, analyses each ``.wav`` / ``.opus``
    file with a single ``SafetyAnalyzer`` and prints an alert for every report
    that requires attention. A failure on one file is printed and does not
    stop the remaining files from being processed.

    Args:
        directory_path: Root directory containing the recordings.

    Returns:
        List of report dicts, one per successfully analysed file.
    """
    analyzer = SafetyAnalyzer()
    results = []

    for root, _, files in os.walk(directory_path):
        for file in files:
            if not file.endswith((".wav", ".opus")):
                continue
            # SafetyAnalyzer.process_audio() writes its noise-reduced copy next
            # to the input as "<name>_clean.wav". Skip those so that a re-run
            # does not analyse the outputs of an earlier run again.
            if file.endswith("_clean.wav"):
                continue

            file_path = os.path.join(root, file)
            try:
                report = analyzer.analyze_safety(file_path)
                results.append(report)

                if report["requires_attention"]:
                    print(f"🚨 SAFETY ALERT: {file_path}")
                    print(f"Detected emotion: {report['emotion']} ({report['confidence']:.2%})")
                    print(f"Panic words detected: {report['panic_words']}")
                    print(f"Transcription: {report['transcription']}\n")

            except Exception as e:
                # Batch boundary: report the failure and carry on.
                print(f"Error processing {file_path}: {e}")

    return results

# Run analysis on your audio directory
# results receives one report dict per file that was analysed successfully.
audio_directory = "/content/drive/MyDrive/audiosdata/audiototext/audio data"
results = analyze_directory(audio_directory)

# --------------------------
# 6. VISUALIZATION
# --------------------------
def visualize_results(results):
    """Plot the emotion distribution and the share of safety alerts.

    Draws a bar chart of predicted emotions next to a pie chart of the
    ``requires_attention`` flags.

    Args:
        results: List of report dicts, each with ``'emotion'`` and
            ``'requires_attention'`` keys.

    Returns:
        None. Prints a message and returns early when ``results`` is empty,
        because there is nothing to plot.
    """
    if not results:
        print("No results to visualize.")
        return

    # Imported locally so this function does not depend on a later cell
    # having imported pandas first.
    import pandas as pd

    emotions = [r['emotion'] for r in results]
    alerts = [r['requires_attention'] for r in results]

    plt.figure(figsize=(15, 5))

    # Emotion distribution
    plt.subplot(1, 2, 1)
    pd.Series(emotions).value_counts().plot(kind='bar')
    plt.title("Emotion Distribution")

    # Alert distribution
    plt.subplot(1, 2, 2)
    pd.Series(alerts).value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.title("Safety Alerts Distribution")

    plt.tight_layout()
    plt.show()


In [None]:
%%capture
# Install the Python packages this notebook imports, plus the ffmpeg system
# package. %%capture suppresses the installer output for this cell.
!pip install pydub noisereduce librosa openai-whisper soundfile tensorflow scikit-learn matplotlib
!apt install ffmpeg


In [None]:
# --------------------------
# 1. SETUP AND CONFIGURATION
# --------------------------
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
import librosa
import noisereduce as nr
import soundfile as sf
from pydub import AudioSegment
import whisper
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive. The saved emotion model and the audio
# directory used in later cells both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Download and extract RAVDESS emotional speech dataset
# The archive is fetched into the current working directory and unpacked
# quietly (-q) into /content/emotion_dataset, the directory passed to
# load_emotion_data() in a later cell.
!wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
!unzip -q Audio_Speech_Actors_01-24.zip -d "/content/emotion_dataset"

--2025-03-10 10:39:28--  https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.48.194, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/1188976/files/Audio_Speech_Actors_01-24.zip [following]
--2025-03-10 10:39:28--  https://zenodo.org/records/1188976/files/Audio_Speech_Actors_01-24.zip
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 208468073 (199M) [application/octet-stream]
Saving to: ‘Audio_Speech_Actors_01-24.zip’


2025-03-10 10:44:36 (663 KB/s) - ‘Audio_Speech_Actors_01-24.zip’ saved [208468073/208468073]



In [None]:
# --------------------------
# 2. EMOTION DATASET PREPARATION
# --------------------------


def load_emotion_data(dataset_path):
    """Collect MFCC feature vectors and emotion labels from a dataset tree.

    Walks ``dataset_path`` recursively. For every ``.wav`` file the emotion is
    taken from the third dash-separated field of the file name (an integer
    code 1-8), the audio is loaded at 22050 Hz, noise-reduced, and summarised
    as the mean of 40 MFCCs.

    Files whose name does not carry a known emotion code are reported and
    skipped instead of aborting the whole load.

    Args:
        dataset_path: Root directory to scan.

    Returns:
        A ``(features, emotions)`` tuple of NumPy arrays with one entry per
        usable file, in a deterministic (sorted) traversal order.
    """
    # Built once rather than per file; keys are the integer emotion codes.
    emotion_names = {
        1: 'neutral', 2: 'calm', 3: 'happy',
        4: 'sad', 5: 'angry', 6: 'fear',
        7: 'disgust', 8: 'surprise'
    }
    emotions = []
    features = []

    for root, dirs, files in os.walk(dataset_path):
        # os.walk yields entries in filesystem order; sorting keeps the sample
        # order (and therefore any seeded train/test split) reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            file_path = os.path.join(root, file)
            try:
                emotion = emotion_names[int(file.split("-")[2])]
            except (IndexError, ValueError, KeyError):
                print(f"Skipping {file_path}: file name has no known emotion code")
                continue

            # Load and preprocess audio
            y, sr = librosa.load(file_path, sr=22050)
            y = nr.reduce_noise(y=y, sr=sr)

            # Extract MFCC features and average them into one 40-value vector
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
            mfcc_processed = np.mean(mfcc.T, axis=0)

            # Append label and features together so the two lists cannot get
            # out of step if loading fails part-way through a file.
            features.append(mfcc_processed)
            emotions.append(emotion)

    return np.array(features), np.array(emotions)

In [None]:
# Load and prepare dataset
# features holds one MFCC vector per recording, emotions the matching labels.
print("Loading emotion dataset...")
features, emotions = load_emotion_data("/content/emotion_dataset")

Loading emotion dataset...


In [None]:
# Encode labels
# The fitted encoder is kept at module level: SafetyAnalyzer reuses it to turn
# predicted class indices back into emotion names.
label_encoder = LabelEncoder()
encoded_emotions = label_encoder.fit_transform(emotions)

In [None]:
# Split dataset
# 20% of the samples are held out; random_state pins the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, encoded_emotions, test_size=0.2, random_state=42
)

In [None]:
# --------------------------
# 3. EMOTION RECOGNITION MODEL
# --------------------------
def create_emotion_model(input_shape, num_classes):
    """Assemble and compile the dense emotion classifier.

    The network is two ReLU hidden layers (256 and 128 units), each followed
    by dropout (0.5 and 0.3), and a softmax output with ``num_classes`` units.
    It is compiled with Adam, sparse categorical cross-entropy and accuracy.

    Args:
        input_shape: Shape tuple of a single feature vector.
        num_classes: Number of output classes.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    classifier = models.Sequential()
    classifier.add(layers.Input(shape=input_shape))

    # (units, dropout rate) for each hidden stage, in order.
    for units, drop_rate in ((256, 0.5), (128, 0.3)):
        classifier.add(layers.Dense(units, activation='relu'))
        classifier.add(layers.Dropout(drop_rate))

    classifier.add(layers.Dense(num_classes, activation='softmax'))

    classifier.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Report how many GPU devices TensorFlow can see in this runtime.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
# Pin TensorFlow to 2.12.0.
# NOTE(review): TensorFlow is already imported by earlier cells, so a runtime
# restart is presumably needed before the pinned version is used - confirm.
!pip install tensorflow==2.12.0



In [None]:
# Create and train model
# The input width is the length of one feature vector; the output width is the
# number of distinct labels seen by the encoder.
print("Training emotion recognition model...")
model = create_emotion_model((X_train.shape[1],), len(label_encoder.classes_))
# The held-out split doubles as validation data, so the per-epoch validation
# metrics are measured on the same samples as X_test / y_test.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

Training emotion recognition model...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Save model
# SafetyAnalyzer loads the model back from this exact path on Google Drive.
model.save('/content/drive/MyDrive/emotion_model.h5')


In [None]:
# --------------------------
# 4. AUDIO PROCESSING PIPELINE
# --------------------------
class SafetyAnalyzer:
    """Screen a single recording for signs of distress.

    Combines three steps: audio clean-up (``process_audio``), emotion
    prediction from MFCC features (``analyze_emotion``) and Whisper
    transcription (``transcribe_audio``). ``analyze_safety`` runs all three
    and flags the recording when a panic word is heard or the predicted
    emotion is 'fear' or 'angry'.
    """

    def __init__(self):
        """Load both models and set up the panic vocabulary.

        Expects the trained emotion model at
        ``/content/drive/MyDrive/emotion_model.h5`` and a fitted module-level
        ``label_encoder``.
        """
        # Lower-case words and phrases searched for in the transcription.
        self.panic_words = {
            "help", "fire", "emergency", "danger", "run", "stop", "police",
            "watch out", "look out", "get down", "move", "duck", "stay back",
            "back off", "stay away", "call 911", "call an ambulance",
            "call the police", "call for help", "mayday", "sos", "alert"
        }
        self.whisper_model = whisper.load_model("base")
        self.emotion_model = tf.keras.models.load_model('/content/drive/MyDrive/emotion_model.h5')
        # The encoder fitted during training maps class indices back to the
        # same emotion names the model was trained on.
        self.label_encoder = label_encoder

    @staticmethod
    def _swap_suffix(path, new_suffix):
        """Return *path* with only its final extension replaced by *new_suffix*."""
        return os.path.splitext(path)[0] + new_suffix

    def process_audio(self, input_path):
        """Convert to WAV if needed, denoise, and write a ``*_clean.wav`` copy.

        An ``.opus`` input is exported as a ``.wav`` next to it and the
        original ``.opus`` file is deleted. The audio is then loaded at
        22050 Hz, noise-reduced and written beside the input.

        Args:
            input_path: Path of the recording to process.

        Returns:
            Path of the noise-reduced ``*_clean.wav`` file.
        """
        # Convert to WAV
        if input_path.endswith(".opus"):
            audio = AudioSegment.from_file(input_path)
            # Only the extension is swapped; a ".opus" elsewhere in the path
            # (for example in a directory name) is left untouched.
            wav_path = self._swap_suffix(input_path, ".wav")
            audio.export(wav_path, format="wav")
            os.remove(input_path)
            input_path = wav_path

        # Noise reduction
        y, sr = librosa.load(input_path, sr=22050)
        y_clean = nr.reduce_noise(y=y, sr=sr)
        # Always a sibling file, so the source recording is never overwritten
        # even when its extension is not exactly ".wav".
        clean_path = self._swap_suffix(input_path, "_clean.wav")
        sf.write(clean_path, y_clean, sr)

        return clean_path

    def analyze_emotion(self, audio_path):
        """Predict the emotion of a recording from its mean MFCC vector.

        Args:
            audio_path: Path of the audio file to classify.

        Returns:
            ``(emotion, confidence)``: the decoded label of the
            highest-scoring class and that class's score.
        """
        y, sr = librosa.load(audio_path, sr=22050)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        features = np.mean(mfcc.T, axis=0)
        # The model expects a batch, so add a leading axis of size 1.
        prediction = self.emotion_model.predict(np.expand_dims(features, axis=0))
        emotion = self.label_encoder.inverse_transform([np.argmax(prediction)])[0]
        confidence = np.max(prediction)
        return emotion, confidence

    def transcribe_audio(self, audio_path):
        """Return the text Whisper transcribes from ``audio_path``."""
        result = self.whisper_model.transcribe(audio_path)
        return result["text"]

    def analyze_safety(self, audio_path):
        """Run the full pipeline on one recording and build its report.

        Args:
            audio_path: Path of the recording to analyse.

        Returns:
            A dict with keys ``file_path``, ``emotion``, ``confidence``,
            ``transcription``, ``panic_words`` and ``requires_attention``.
        """
        # Process audio
        clean_path = self.process_audio(audio_path)

        # Emotion analysis
        emotion, confidence = self.analyze_emotion(clean_path)

        # Transcription analysis
        transcription = self.transcribe_audio(clean_path)
        # Plain substring search on the lower-cased text, so "danger" also
        # matches "dangerous". Sorted so the report order is stable.
        lowered = transcription.lower()
        panic_detected = sorted(
            word for word in self.panic_words if word in lowered
        )

        # Generate report
        report = {
            "file_path": audio_path,
            "emotion": emotion,
            "confidence": float(confidence),
            "transcription": transcription,
            "panic_words": panic_detected,
            "requires_attention": bool(panic_detected) or emotion in ('fear', 'angry')
        }

        return report

In [None]:
# --------------------------
# 5. MAIN EXECUTION
# --------------------------
def analyze_directory(directory_path):
    """Run the safety analysis on every ``.wav`` file under a directory.

    Walks ``directory_path`` recursively, analyses each ``.wav`` file with a
    single ``SafetyAnalyzer`` and prints an alert for every report that
    requires attention. A failure on one file is printed and does not stop the
    remaining files from being processed.

    Args:
        directory_path: Root directory containing the recordings.

    Returns:
        List of report dicts, one per successfully analysed file.
    """
    analyzer = SafetyAnalyzer()
    results = []

    for root, _, files in os.walk(directory_path):
        for file in files:
            # SafetyAnalyzer.process_audio() writes its noise-reduced copy next
            # to the input as "<name>_clean.wav". Skip those so that a re-run
            # does not analyse the outputs of an earlier run again.
            if not file.endswith(".wav") or file.endswith("_clean.wav"):
                continue

            file_path = os.path.join(root, file)
            try:
                report = analyzer.analyze_safety(file_path)
                results.append(report)

                if report["requires_attention"]:
                    print(f"🚨 SAFETY ALERT: {file_path}")
                    print(f"Detected emotion: {report['emotion']} ({report['confidence']:.2%})")
                    print(f"Panic words detected: {report['panic_words']}")
                    print(f"Transcription: {report['transcription']}\n")

            except Exception as e:
                # Batch boundary: report the failure and carry on.
                print(f"Error processing {file_path}: {e}")

    return results


In [None]:
# Run analysis on your audio directory
# results receives one report dict per file that was analysed successfully.
audio_directory = "/content/drive/MyDrive/audiosdata/audiototext/audio data"
results = analyze_directory(audio_directory)


100%|███████████████████████████████████████| 139M/139M [00:05<00:00, 27.9MiB/s]
  checkpoint = torch.load(fp, map_location=device)


🚨 SAFETY ALERT: /content/drive/MyDrive/audiosdata/audiototext/audio data/WhatsApp Audio 2025-02-27 at 21.41.46_eeea78d8.wav
Detected emotion: calm (13.83%)
Panic words detected: ['help']
Transcription:  Help me someone is following

🚨 SAFETY ALERT: /content/drive/MyDrive/audiosdata/audiototext/audio data/WhatsApp Audio 2025-02-27 at 21.41.46_da5dbe97.wav
Detected emotion: calm (13.83%)
Panic words detected: ['danger']
Transcription:  I am danger.

🚨 SAFETY ALERT: /content/drive/MyDrive/audiosdata/audiototext/audio data/WhatsApp Audio 2025-02-27 at 21.42.00_a799a120.wav
Detected emotion: calm (13.83%)
Panic words detected: ['help']
Transcription:  help he is attacking me

🚨 SAFETY ALERT: /content/drive/MyDrive/audiosdata/audiototext/audio data/WhatsApp Audio 2025-02-27 at 21.42.01_43b7e3fa.wav
Detected emotion: calm (13.83%)
Panic words detected: ['danger']
Transcription:  Someone shot me, he's dangerous!

🚨 SAFETY ALERT: /content/drive/MyDrive/audiosdata/audiototext/audio data/WhatsApp

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


In [None]:
def visualize_results(results):
    """Plot the emotion distribution and the share of safety alerts.

    Draws a bar chart of predicted emotions next to a pie chart of the
    ``requires_attention`` flags. Reports missing one of the keys are left
    out of the corresponding chart.

    Args:
        results: List of report dicts, normally with ``'emotion'`` and
            ``'requires_attention'`` keys.

    Returns:
        None. Prints a message and returns early when there is nothing to
        plot.
    """
    if not results:
        print("No results to visualize.")
        return

    emotions = [r['emotion'] for r in results if 'emotion' in r]
    alerts = [r['requires_attention'] for r in results if 'requires_attention' in r]

    if not emotions:
        print("No emotion data available.")
        return

    if not alerts:
        print("No safety alert data available.")
        return

    plt.figure(figsize=(15, 5))

    # Emotion distribution
    plt.subplot(1, 2, 1)
    pd.Series(emotions).value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
    plt.title("Emotion Distribution")
    plt.xlabel("Emotions")
    plt.ylabel("Count")

    # Alert distribution
    plt.subplot(1, 2, 2)
    alert_counts = pd.Series(alerts).value_counts()
    # value_counts() orders the slices by frequency, not by label, so each
    # colour is chosen from the slice's own flag: red for alerts, green
    # otherwise. A fixed ['red', 'green'] list would swap the colours whenever
    # non-alerts are the majority.
    slice_colors = ['red' if flagged else 'green' for flagged in alert_counts.index]
    alert_counts.plot(kind='pie', autopct='%1.1f%%', colors=slice_colors)
    plt.title("Safety Alerts Distribution")

    plt.tight_layout()
    plt.show()
