In [2]:
!pip install gtts


Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.4


In [3]:
import os
from collections import Counter

import librosa
import nltk
import numpy as np
import torch
from gtts import gTTS
from pydub import AudioSegment, silence
from pydub.generators import Sine
from transformers import AutoTokenizer, BertForSequenceClassification

In [4]:
# Download the NLTK sentence-tokenizer data used by sent_tokenize.
# Older NLTK releases load the pickled "punkt" model, while recent releases
# look up "punkt_tab" instead; fetching both keeps this cell working on either.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load the tokenizer and classifier for emotion detection.
# Both come from the same pretrained checkpoint, so the id is named once.
EMOTION_CHECKPOINT = "bhadresh-savani/bert-base-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(EMOTION_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(EMOTION_CHECKPOINT)


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
# Emotion label names; detect_emotions indexes this list with the predicted
# class id, so the order must not change.
emotion_labels = [
    "admiration",      # 0
    "amusement",       # 1
    "anger",           # 2
    "annoyance",       # 3
    "approval",        # 4
    "caring",          # 5
    "confusion",       # 6
    "curiosity",       # 7
    "desire",          # 8
    "disappointment",  # 9
    "disapproval",     # 10
    "disgust",         # 11
    "embarrassment",   # 12
    "excitement",      # 13
    "fear",            # 14
    "gratitude",       # 15
    "grief",           # 16
    "joy",             # 17
    "love",            # 18
    "nervousness",     # 19
    "optimism",        # 20
    "pride",           # 21
    "realization",     # 22
    "relief",          # 23
    "remorse",         # 24
    "sadness",         # 25
    "surprise",        # 26
    "neutral",         # 27
]



In [7]:
# Text-to-speech helper.
# Only used to test the pipeline: the real input is expected to be audio.
def generate_speech(text, output_file="generated_narration.mp3"):
    """Synthesize ``text`` with gTTS and save the result to ``output_file``.

    Args:
        text: Text to be spoken.
        output_file: Destination path handed to ``gTTS.save``.

    Returns:
        ``output_file``, unchanged, so the call can feed the next step.
    """
    gTTS(text).save(output_file)
    print(f"Generated speech saved as {output_file}")
    return output_file

In [8]:
# Per-sentence emotion detection
def detect_emotions(text):
    """Label every sentence of ``text`` with its predicted emotion.

    The text is split with ``sent_tokenize``; each sentence is classified on
    its own and the highest-scoring class id is looked up in
    ``emotion_labels``.

    Returns:
        dict mapping the 1-based sentence position to a
        ``(sentence, emotion_label)`` tuple.
    """
    results = {}
    for position, sentence in enumerate(sent_tokenize(text), start=1):
        encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            scores = model(**encoded).logits
        label_id = scores.argmax(dim=1).item()
        results[position] = (sentence, emotion_labels[label_id])
    return results

In [9]:
# Function to select background music based on dominant emotion
def select_background_music(dominant_emotion, dataset_path="/kaggle/input/background-music"):
    """Return the path of the background track for ``dominant_emotion``.

    Args:
        dominant_emotion: Emotion label to look up.
        dataset_path: Directory containing the music files. Defaults to the
            Kaggle dataset location the notebook was written against, and can
            be overridden to run elsewhere.

    Returns:
        ``dataset_path`` joined with the track file name. Emotions without a
        dedicated track fall back to ``calm_music.mp3``.
    """
    fallback_track = "calm_music.mp3"
    music_tracks = {
        "joy": "joyful_music.mp3",
        "anger": "intense_music.mp3",
        "sadness": "sad_music.mp3",
        "fear": "suspense_music.mp3",
        "surprise": "mystery_music.mp3",
        "excitement": "energetic_music.mp3",
        "neutral": fallback_track,
    }
    return os.path.join(dataset_path, music_tracks.get(dominant_emotion, fallback_track))


In [10]:
# Function to remove long silences while keeping natural pauses
def remove_silence(input_file, output_file, silence_thresh=-40, min_silence_len=700, max_silence_len=2000):
    """Cap long silences in an audio file and write the result as WAV.

    Every silent stretch found by ``silence.detect_silence`` is replaced by
    generated silence of the same length, capped at ``max_silence_len``.
    Stretches shorter than ``min_silence_len`` are not detected and stay as-is.

    Args:
        input_file: Path of the audio file to process.
        output_file: Path of the WAV file to write.
        silence_thresh: Threshold passed to ``detect_silence``.
        min_silence_len: Minimum silence length (ms) passed to ``detect_silence``.
        max_silence_len: Longest pause (ms) kept in the output.

    Returns:
        ``output_file``, so the call can feed the next pipeline step.
    """
    # from_file does not force a format, so non-MP3 inputs are accepted too.
    audio = AudioSegment.from_file(input_file)
    silent_ranges = silence.detect_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    processed_audio = AudioSegment.empty()
    prev_end = 0

    for start, end in silent_ranges:
        processed_audio += audio[prev_end:start]
        pause_length = min(end - start, max_silence_len)
        # Generate the pause at the input's own frame rate. The default rate
        # of AudioSegment.silent would otherwise force pydub to resample when
        # segments are joined, changing the output rate for low-rate inputs.
        processed_audio += AudioSegment.silent(duration=pause_length, frame_rate=audio.frame_rate)
        prev_end = end

    # Keep whatever follows the last detected silence.
    processed_audio += audio[prev_end:]
    processed_audio.export(output_file, format="wav")
    print(f"Processed audio saved as {output_file}")
    return output_file


In [11]:
# Function to add background music with fade in/out
def add_background_music(narration_file, output_file, emotion_map, music_gain_db=-15, fade_ms=3000):
    """Mix emotion-matched background music under a narration and export WAV.

    Args:
        narration_file: Path of the narration audio (any format ffmpeg reads).
        output_file: Path of the WAV file to write.
        emotion_map: Mapping whose values are ``(sentence, emotion)`` tuples,
            as returned by ``detect_emotions``.
        music_gain_db: Gain in dB applied to the music. The default of -15
            keeps it quieter than the narration.
        fade_ms: Fade-in and fade-out length in ms, clamped to half the music
            length so the two fades never overlap on short clips.

    Returns:
        ``output_file``.
    """
    # from_file detects the format itself; the pipeline passes a WAV file here.
    narration = AudioSegment.from_file(narration_file).set_channels(1).set_frame_rate(16000)

    emotion_list = [emotion for _, emotion in emotion_map.values()]
    if emotion_list:
        # Counter breaks ties by first occurrence, so the choice is repeatable.
        dominant_emotion = Counter(emotion_list).most_common(1)[0][0]
    else:
        # No sentences detected: use the neutral track.
        dominant_emotion = "neutral"

    background_music_file = select_background_music(dominant_emotion)
    background_music = AudioSegment.from_file(background_music_file).set_channels(1).set_frame_rate(16000)

    # Attenuate the music so it sits underneath the voice.
    background_music = background_music.apply_gain(music_gain_db)

    # Loop the track when it is shorter than the narration, then trim to length.
    if 0 < len(background_music) < len(narration):
        repeats = -(-len(narration) // len(background_music))  # ceiling division
        background_music = background_music * repeats
    background_music = background_music[:len(narration)]

    fade = min(fade_ms, len(background_music) // 2)
    if fade > 0:
        background_music = background_music.fade_in(fade).fade_out(fade)

    final_audio = narration.overlay(background_music)

    final_audio.export(output_file, format="wav")
    print(f"Final audio saved as {output_file}")
    return output_file


In [12]:
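# Full execution pipeline: synthesize the narration, trim long silences, detect per-sentence emotions, then mix in background music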
# Narration script; adjacent string literals are concatenated into one string.
input_text = (
    "Welcome, everyone. "
    "Today, we begin with Yogaasana — the gentle art of uniting breath, body, and mind. "
    "As you settle onto your mat, let go of any rush or worry. "
    "Each posture is not just movement, but a moment of stillness within. "
    "Breathe deeply, and allow yourself to simply be. "
    "Let the calm flow through you... and trust your body to guide you gently."
)  # Placeholder for ASR

# 1. Text -> speech (MP3)
raw_audio = generate_speech(input_text)

# 2. Cap long silences and write a WAV
cleaned_audio = "cleaned_narration.wav"
remove_silence(raw_audio, cleaned_audio)

# 3. Per-sentence emotions drive the choice of background track
emotion_map = detect_emotions(input_text)

# 4. Mix narration and music into the final file
add_background_music(cleaned_audio, "final_output.wav", emotion_map)

Generated speech saved as generated_narration.mp3
Processed audio saved as cleaned_narration.wav
Final audio saved as final_output.wav
