In [None]:
!pip uninstall -y numpy
!pip install --upgrade --force-reinstall numpy==1.26.4 pandas==2.2.2 torch torchtext nltk transformers matplotlib scikit-learn librosa pydub midiutil pretty_midi music21 scipy


Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==2.2.2
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting sciki

 Initial Setup & Dependencies

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.preprocessing import MinMaxScaler
import json
import os
import random
import librosa
from pydub import AudioSegment
from pydub.playback import play
import io
import base64
import tempfile
import music21

# For MIDI handling
from midiutil import MIDIFile
import pretty_midi

# For interactive audio display in Colab
import IPython.display as ipd

# Download necessary NLTK resources.
# NLTK 3.9+ (3.9.1 is what the install cell pulls in) loads its sentence and
# word tokenizers from 'punkt_tab'. With only the legacy 'punkt' package
# present, sent_tokenize/word_tokenize raise LookupError, so both are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Text Processing Pipeline
This stage handles text cleaning and segmentation to prepare raw text for emotion analysis.

In [None]:
class TextProcessor:
    """Clean and segment raw text ahead of emotion analysis.

    The class relies on NLTK's English stop-word list, the WordNet
    lemmatizer, and the NLTK sentence/word tokenizers. Missing tokenizer
    data is tolerated: both tokenizing methods fall back to simple string
    splitting when NLTK raises ``LookupError``.
    """

    def __init__(self):
        """Load the stop-word set and lemmatizer, downloading data if absent."""
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            nltk.download('stopwords')
            self.stop_words = set(stopwords.words('english'))

        self.lemmatizer = WordNetLemmatizer()
        try:
            # WordNet is loaded lazily: constructing the lemmatizer never
            # touches the corpus, so a missing corpus only surfaces on the
            # first lemmatize() call. Probe once here so the download
            # fallback can actually trigger.
            self.lemmatizer.lemmatize('probe')
        except LookupError:
            nltk.download('wordnet')

    def clean_text(self, text):
        """Return a lower-cased, punctuation-free, lemmatized version of ``text``.

        Args:
            text: Raw text to normalise.

        Returns:
            A single space-joined string of lemmatized tokens with English
            stop words removed.
        """
        text = text.lower()
        # Drop everything that is neither a word character nor whitespace.
        text = re.sub(r'[^\w\s]', '', text)
        try:
            words = word_tokenize(text)
        except LookupError:
            # Tokenizer data unavailable: whitespace splitting is good enough
            # because punctuation has already been stripped.
            words = text.split()
        cleaned_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(cleaned_words)

    def segment_text(self, text, segment_size=500):
        """Group whole sentences into segments of roughly ``segment_size`` characters.

        Sentences are never split. A segment is closed as soon as adding the
        next sentence would push the running character count (sentence
        lengths only, joining spaces are not counted) above ``segment_size``.
        A single sentence longer than ``segment_size`` therefore forms its
        own over-long segment.

        Args:
            text: Raw text to segment.
            segment_size: Target maximum number of characters per segment.

        Returns:
            A list of segment strings (empty when ``text`` has no sentences).
        """
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            # Tokenizer data unavailable: approximate sentences by splitting
            # on full stops and restoring the terminator.
            sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
        segments = []
        current_segment = []
        current_length = 0
        for sentence in sentences:
            sentence_length = len(sentence)
            if current_length + sentence_length > segment_size and current_segment:
                segments.append(' '.join(current_segment))
                current_segment = [sentence]
                current_length = sentence_length
            else:
                current_segment.append(sentence)
                current_length += sentence_length
        if current_segment:
            segments.append(' '.join(current_segment))
        return segments

    def process_text(self, text, segment_size=500):
        """Segment ``text`` and clean every segment.

        Args:
            text: Raw text to process.
            segment_size: Forwarded to :meth:`segment_text`.

        Returns:
            A ``(segments, cleaned_segments)`` tuple of equally long lists:
            the original segments and their cleaned counterparts.
        """
        segments = self.segment_text(text, segment_size)
        cleaned_segments = [self.clean_text(segment) for segment in segments]
        return segments, cleaned_segments


Emotion Extraction Engine
Analyzes text segments to detect and quantify emotional content using a state-of-the-art transformer model.

In [None]:
class EmotionExtractor:
    """Score text segments against nine emotion categories.

    Seven categories are read from the
    ``j-hartmann/emotion-english-distilroberta-base`` classifier output;
    ``anticipation`` and ``trust`` are derived here as averages of two of
    those scores. Each per-segment map is normalised to sum to 1.
    """

    def __init__(self):
        """Build the classification pipeline and the category list."""
        model_name = "j-hartmann/emotion-english-distilroberta-base"
        try:
            self.emotion_classifier = pipeline(
                "text-classification",
                model=model_name,
                return_all_scores=True
            )
        except Exception:
            # `return_all_scores` is the older spelling; retry with the newer
            # `top_k=None`. `Exception` (not a bare except) keeps
            # Ctrl-C / SystemExit from being swallowed during the download.
            self.emotion_classifier = pipeline(
                "text-classification",
                model=model_name,
                top_k=None
            )
        self.emotion_categories = [
            'joy', 'sadness', 'anger', 'fear', 'surprise',
            'disgust', 'neutral', 'anticipation', 'trust'
        ]

    def extract_emotions(self, text_segments):
        """Return one normalised emotion map per segment.

        Args:
            text_segments: Iterable of text strings.

        Returns:
            A list of dicts mapping each of the nine emotion names to a
            score. If the classifier fails for a segment, a warning is
            printed and that segment is reported as fully ``neutral``.
        """
        emotion_maps = []
        for segment in text_segments:
            try:
                # truncation=True keeps over-long segments within the model's
                # input limit instead of raising (which would silently turn
                # the segment into "neutral" below).
                emotion_scores = self.emotion_classifier(segment, truncation=True)[0]
                emotion_dict = {item['label']: item['score'] for item in emotion_scores}
                joy = emotion_dict.get('joy', 0)
                surprise = emotion_dict.get('surprise', 0)
                neutral = emotion_dict.get('neutral', 0)
                mapped_emotions = {
                    'joy': joy,
                    'sadness': emotion_dict.get('sadness', 0),
                    'anger': emotion_dict.get('anger', 0),
                    'fear': emotion_dict.get('fear', 0),
                    'surprise': surprise,
                    'disgust': emotion_dict.get('disgust', 0),
                    'neutral': neutral,
                    # Derived categories: the classifier has no label for them.
                    'anticipation': (joy + surprise) / 2,
                    'trust': (joy + neutral) / 2
                }
                # Re-normalise because the derived categories add extra mass.
                total = sum(mapped_emotions.values())
                if total > 0:
                    mapped_emotions = {k: v/total for k, v in mapped_emotions.items()}
            except Exception as e:
                print(f"Warning: Error processing segment: {e}")
                mapped_emotions = {emotion: 0.0 for emotion in self.emotion_categories}
                mapped_emotions['neutral'] = 1.0
            emotion_maps.append(mapped_emotions)
        return emotion_maps

    def create_emotional_progression(self, emotion_maps):
        """Transpose per-segment maps into one time series per emotion.

        Args:
            emotion_maps: List of emotion dicts, one per segment.

        Returns:
            Dict mapping every category name to a list with one value per
            segment (0 where a map lacks that category).
        """
        progression = {emotion: [] for emotion in self.emotion_categories}
        for emotion_map in emotion_maps:
            for emotion in self.emotion_categories:
                progression[emotion].append(emotion_map.get(emotion, 0))
        return progression

    def get_dominant_emotions(self, emotion_maps, top_n=2):
        """Return the ``top_n`` highest-scoring emotions of every segment.

        Args:
            emotion_maps: List of emotion dicts, one per segment.
            top_n: Number of emotions to keep per segment.

        Returns:
            A list with, per segment, a list of ``(emotion, score)`` tuples
            ordered from highest to lowest score.
        """
        dominant_emotions = []
        for emotion_map in emotion_maps:
            sorted_emotions = sorted(emotion_map.items(), key=lambda x: x[1], reverse=True)
            dominant_emotions.append(sorted_emotions[:top_n])
        return dominant_emotions

    def emotion_trend_analysis(self, emotional_progression):
        """Label every emotion series as increasing, decreasing or stable.

        A series is ``"stable"`` when it has at most one value, when its
        standard deviation is below 0.05, or when the slope of its
        least-squares line lies within [-0.05, 0.05].

        Args:
            emotional_progression: Dict of emotion name -> list of scores.

        Returns:
            Dict of emotion name -> ``"increasing"``, ``"decreasing"`` or
            ``"stable"``.
        """
        trends = {}
        for emotion, values in emotional_progression.items():
            if len(values) <= 1:
                trends[emotion] = "stable"
                continue
            x = np.arange(len(values))
            y = np.array(values)
            if np.std(y) < 0.05:
                trends[emotion] = "stable"
                continue
            # First coefficient of the degree-1 fit is the slope per segment.
            slope = np.polyfit(x, y, 1)[0]
            if slope > 0.05:
                trends[emotion] = "increasing"
            elif slope < -0.05:
                trends[emotion] = "decreasing"
            else:
                trends[emotion] = "stable"
        return trends


Neural Mapping from Emotions to Musical Features
This class maps emotional inputs to musical features via a neural network, with predefined music theory mappings and probabilistic correlations. It generates detailed prompts for music synthesis, capturing nuances like tempo, key, mode, instrumentation, and complexity, tailored to the emotional context.

In [None]:
class EmotionToMusicMapper(nn.Module):
    """Map emotion vectors to musical features and text prompts.

    The module bundles three things:

    * a small feed-forward network (``forward``) that turns an emotion vector
      into ``output_dim`` sigmoid-normalised feature values;
    * lookup tables (feature ranges, emotion correlations, scales, chord
      progressions, General-MIDI-style instrument numbers);
    * helpers that turn feature values and emotion scores into a
      comma-separated text prompt for a text-to-music model.
    """

    def __init__(self, input_dim=9, hidden_dim=64, output_dim=12):
        """Create the network and the static music-theory tables.

        Args:
            input_dim: Size of the emotion vector.
            hidden_dim: Width of the two hidden layers.
            output_dim: Number of normalised musical features produced.
        """
        super().__init__()

        # Neural network for mapping emotions to musical features
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid()  # Normalized output
        )

        # Define music feature mappings (expanded)
        self.musical_features = {
            'tempo': {'min': 60, 'max': 180},     # BPM
            'key': {'min': 0, 'max': 11},         # C=0, C#=1, ..., B=11
            'mode': {'min': 0, 'max': 1},         # Minor=0, Major=1
            'intensity': {'min': 0, 'max': 1},    # Soft to loud
            'instrumentation': {'min': 0, 'max': 8},  # Expanded instrument groups
            'rhythm_complexity': {'min': 0, 'max': 1},
            'harmonic_complexity': {'min': 0, 'max': 1},
            'melodic_range': {'min': 0, 'max': 1},
            'texture': {'min': 0, 'max': 1},      # Sparse to dense
            'articulation': {'min': 0, 'max': 1},  # Staccato to legato
            'modulation': {'min': 0, 'max': 1},    # Likelihood of key changes
            'scale_type': {'min': 0, 'max': 5}     # Different scale types (major, minor, pentatonic, etc.)
        }

        # Map features to index in output
        self.feature_to_idx = {feature: i for i, feature in enumerate(self.musical_features.keys())}

        # Define emotional correlations (expanded with more nuanced mappings)
        self.emotion_correlations = {
            'joy': {
                'tempo': {'value': 'high', 'variance': 0.2},
                'mode': {'value': 'major', 'variance': 0.1},
                'intensity': {'value': 'moderate-high', 'variance': 0.3},
                'rhythm_complexity': {'value': 'moderate', 'variance': 0.3},
                'harmonic_complexity': {'value': 'moderate', 'variance': 0.2},
                'scale_type': ['major', 'lydian', 'pentatonic_major'],
                'instrumentation': ['piano', 'strings', 'synth', 'woodwinds', 'brass'],
                'description': ['uplifting', 'bright', 'cheerful', 'buoyant', 'exuberant', 'playful', 'optimistic']
            },
            'sadness': {
                'tempo': {'value': 'low', 'variance': 0.1},
                'mode': {'value': 'minor', 'variance': 0.2},
                'intensity': {'value': 'low', 'variance': 0.2},
                'rhythm_complexity': {'value': 'low', 'variance': 0.1},
                'harmonic_complexity': {'value': 'moderate-high', 'variance': 0.3},
                'scale_type': ['minor', 'dorian', 'phrygian'],
                'instrumentation': ['piano', 'strings', 'guitar', 'cello', 'violin'],
                'description': ['melancholic', 'somber', 'wistful', 'contemplative', 'haunting', 'bittersweet', 'mournful']
            },
            'anger': {
                'tempo': {'value': 'high', 'variance': 0.2},
                'mode': {'value': 'minor', 'variance': 0.1},
                'intensity': {'value': 'high', 'variance': 0.1},
                'rhythm_complexity': {'value': 'high', 'variance': 0.3},
                'harmonic_complexity': {'value': 'high', 'variance': 0.2},
                'scale_type': ['minor', 'phrygian', 'locrian'],
                'instrumentation': ['synth', 'percussion', 'orchestral', 'brass', 'electric_guitar'],
                'description': ['intense', 'aggressive', 'powerful', 'driving', 'dissonant', 'chaotic', 'turbulent']
            },
            'fear': {
                'tempo': {'value': 'variable', 'variance': 0.4},
                'mode': {'value': 'minor', 'variance': 0.1},
                'intensity': {'value': 'variable', 'variance': 0.4},
                'rhythm_complexity': {'value': 'low', 'variance': 0.2},
                'harmonic_complexity': {'value': 'high', 'variance': 0.2},
                'scale_type': ['minor', 'locrian', 'diminished'],
                'instrumentation': ['strings', 'synth', 'percussion', 'violin', 'cello'],
                'description': ['tense', 'suspenseful', 'eerie', 'unsettling', 'mysterious', 'foreboding', 'chilling']
            },
            'surprise': {
                'tempo': {'value': 'variable', 'variance': 0.5},
                'mode': {'value': 'variable', 'variance': 0.5},
                'intensity': {'value': 'variable', 'variance': 0.5},
                'rhythm_complexity': {'value': 'high', 'variance': 0.3},
                'harmonic_complexity': {'value': 'moderate', 'variance': 0.3},
                'scale_type': ['chromatic', 'whole_tone', 'lydian'],
                'instrumentation': ['piano', 'synth', 'orchestral', 'percussion', 'woodwinds'],
                'description': ['unexpected', 'quirky', 'sudden', 'playful', 'whimsical', 'unpredictable', 'startling']
            },
            'disgust': {
                'tempo': {'value': 'low-moderate', 'variance': 0.3},
                'mode': {'value': 'minor', 'variance': 0.2},
                'intensity': {'value': 'moderate', 'variance': 0.3},
                'rhythm_complexity': {'value': 'moderate', 'variance': 0.3},
                'harmonic_complexity': {'value': 'high', 'variance': 0.2},
                'scale_type': ['locrian', 'chromatic', 'diminished'],
                'instrumentation': ['synth', 'percussion', 'orchestral', 'brass', 'prepared_piano'],
                'description': ['dissonant', 'unsettling', 'gritty', 'uncomfortable', 'jarring', 'off-kilter', 'distorted']
            },
            'neutral': {
                'tempo': {'value': 'moderate', 'variance': 0.2},
                'mode': {'value': 'variable', 'variance': 0.5},
                'intensity': {'value': 'moderate', 'variance': 0.2},
                'rhythm_complexity': {'value': 'moderate', 'variance': 0.2},
                'harmonic_complexity': {'value': 'moderate', 'variance': 0.2},
                'scale_type': ['major', 'minor', 'pentatonic_major'],
                'instrumentation': ['piano', 'strings', 'guitar', 'acoustic_guitar', 'flute'],
                'description': ['balanced', 'ambient', 'atmospheric', 'calm', 'steady', 'peaceful', 'flowing']
            },
            'anticipation': {
                'tempo': {'value': 'moderate-high', 'variance': 0.3},
                'mode': {'value': 'variable', 'variance': 0.4},
                'intensity': {'value': 'building', 'variance': 0.3},
                'rhythm_complexity': {'value': 'moderate', 'variance': 0.3},
                'harmonic_complexity': {'value': 'moderate-high', 'variance': 0.3},
                'scale_type': ['mixolydian', 'lydian', 'major'],
                'instrumentation': ['strings', 'percussion', 'orchestral', 'piano', 'harp'],
                'description': ['building', 'hopeful', 'expectant', 'curious', 'forward-moving', 'suspenseful', 'rising']
            },
            'trust': {
                'tempo': {'value': 'moderate', 'variance': 0.2},
                'mode': {'value': 'major', 'variance': 0.2},
                'intensity': {'value': 'moderate', 'variance': 0.2},
                'rhythm_complexity': {'value': 'low-moderate', 'variance': 0.2},
                'harmonic_complexity': {'value': 'moderate', 'variance': 0.2},
                'scale_type': ['major', 'mixolydian', 'pentatonic_major'],
                'instrumentation': ['piano', 'acoustic_guitar', 'strings', 'harp', 'woodwinds'],
                'description': ['warm', 'secure', 'comforting', 'resolute', 'stable', 'reliable', 'grounded']
            }
        }

        # Define scale mappings for the scale_type feature
        # (semitone offsets from the tonic; dict order defines the numeric index)
        self.scale_mappings = {
            'major': [0, 2, 4, 5, 7, 9, 11],  # Major scale
            'minor': [0, 2, 3, 5, 7, 8, 10],  # Natural minor
            'harmonic_minor': [0, 2, 3, 5, 7, 8, 11],  # Harmonic minor
            'melodic_minor': [0, 2, 3, 5, 7, 9, 11],  # Melodic minor (ascending)
            'dorian': [0, 2, 3, 5, 7, 9, 10],  # Dorian mode
            'phrygian': [0, 1, 3, 5, 7, 8, 10],  # Phrygian mode
            'lydian': [0, 2, 4, 6, 7, 9, 11],  # Lydian mode
            'mixolydian': [0, 2, 4, 5, 7, 9, 10],  # Mixolydian mode
            'locrian': [0, 1, 3, 5, 6, 8, 10],  # Locrian mode
            'pentatonic_major': [0, 2, 4, 7, 9],  # Major pentatonic
            'pentatonic_minor': [0, 3, 5, 7, 10],  # Minor pentatonic
            'blues': [0, 3, 5, 6, 7, 10],  # Blues scale
            'whole_tone': [0, 2, 4, 6, 8, 10],  # Whole tone scale
            'chromatic': list(range(12)),  # Chromatic scale
            'diminished': [0, 2, 3, 5, 6, 8, 9, 11]  # Diminished scale
        }

        # Advanced chord progressions for different modes
        self.advanced_progressions = {
            'major': [
                [1, 4, 5, 1],       # I-IV-V-I
                [1, 6, 4, 5],       # I-vi-IV-V
                [1, 5, 6, 4],       # I-V-vi-IV
                [2, 5, 1, 6],       # ii-V-I-vi
                [1, 3, 4, 5],       # I-iii-IV-V
                [6, 2, 5, 1],       # vi-ii-V-I (jazz)
                [1, 5, 6, 3, 4, 1, 4, 5]  # I-V-vi-iii-IV-I-IV-V
            ],
            'minor': [
                [1, 4, 5, 1],       # i-iv-v-i
                [1, 6, 3, 7],       # i-VI-III-VII
                [1, 7, 6, 5],       # i-VII-VI-v
                [1, 4, 7, 3],       # i-iv-VII-III
                [1, 4, 5, 6],       # i-iv-v-VI
                [1, 2, 5, 1],       # i-iidim-v-i
                [1, 6, 4, 3, 2, 5, 1]  # i-VI-iv-III-iidim-V-i
            ],
            'dorian': [
                [1, 4, 1, 7],       # i-IV-i-VII
                [1, 2, 3, 7],       # i-ii-III-VII
                [1, 3, 4, 7],       # i-III-IV-VII
                [1, 7, 4, 1]        # i-VII-IV-i
            ],
            'phrygian': [
                [1, 2, 7, 1],       # i-II-VII-i
                [1, 6, 7, 1],       # i-VI-VII-i
                [1, 4, 7, 1]        # i-iv-VII-i
            ],
            'lydian': [
                [1, 2, 5, 7],       # I-II-V-VII
                [1, 3, 5, 7],       # I-iii-V-VII
                [1, 2, 7, 1]        # I-II-VII-I
            ],
            'mixolydian': [
                [1, 7, 4, 5],       # I-VII-IV-V
                [1, 5, 7, 4],       # I-v-VII-IV
                [5, 7, 1, 4]        # v-VII-I-IV
            ]
        }

        # Expanded instrumentation mapping
        self.instrument_mappings = {
            'piano': 0,             # Acoustic Grand Piano
            'strings': 48,          # String Ensemble 1
            'guitar': 24,           # Acoustic Guitar (nylon)
            'acoustic_guitar': 25,  # Acoustic Guitar (steel)
            'electric_guitar': 27,  # Electric Guitar (clean)
            'synth': 80,            # Lead 1 (square)
            'orchestral': 48,       # String Ensemble 1
            'percussion': 118,      # Synth Drum
            'violin': 40,           # Violin
            'cello': 42,            # Cello
            'harp': 46,             # Harp
            'woodwinds': 71,        # Clarinet
            'flute': 73,            # Flute
            'brass': 56,            # Trumpet
            'prepared_piano': 1     # Bright Acoustic Piano (as substitute for prepared piano)
        }

    def forward(self, emotion_vector):
        """Run the emotion vector through the network.

        Returns:
            Tensor of sigmoid-normalised feature values in (0, 1).
        """
        return self.network(emotion_vector)

    def map_to_actual_values(self, normalized_features):
        """Scale normalised network outputs to their musical ranges.

        Args:
            normalized_features: Indexable 1-D sequence of tensors/scalars
                exposing ``.item()``, ordered as ``self.musical_features``.

        Returns:
            Dict of feature name -> value inside that feature's
            ``[min, max]`` range. ``key``, ``instrumentation`` and
            ``scale_type`` are rounded to integers. Features whose index is
            beyond the input length get the midpoint of their range.
        """
        actual_values = {}

        for feature, feature_range in self.musical_features.items():
            idx = self.feature_to_idx.get(feature, 0)
            if idx >= len(normalized_features):
                # Handle case where feature isn't in the output
                actual_values[feature] = (feature_range['min'] + feature_range['max']) / 2
                continue

            norm_value = normalized_features[idx].item()

            # Scale to the actual range
            min_val = feature_range['min']
            max_val = feature_range['max']
            actual_value = min_val + norm_value * (max_val - min_val)

            # Discrete features are indices, so snap them to integers
            if feature in ['key', 'instrumentation', 'scale_type']:
                actual_value = round(actual_value)

            actual_values[feature] = actual_value

        return actual_values

    def get_scale_for_feature(self, scale_type_value):
        """Translate a numeric ``scale_type`` into a scale name and its intervals.

        The value is truncated to an int and clamped to the valid index range
        of ``self.scale_mappings`` (in insertion order), so negative or
        too-large values select the first or last scale instead of wrapping
        around or raising.

        Returns:
            ``(scale_name, intervals)``.
        """
        scale_types = list(self.scale_mappings.keys())
        scale_idx = max(0, min(int(scale_type_value), len(scale_types) - 1))
        scale_name = scale_types[scale_idx]

        return scale_name, self.scale_mappings[scale_name]

    def get_key_diversity(self, emotion_scores):
        """Randomly pick a key (0-11) while keeping F# rare.

        A diversity factor is computed from the surprise, joy and anger
        scores and clamped to [0, 1]. Every key except F# gets the same
        weight; F# gets at most one tenth of that weight, scaled by the
        diversity factor (0 when the factor is 0). This keeps the weights
        valid for any input and guarantees F# is never the most likely key.

        Args:
            emotion_scores: Dict of emotion name -> score.

        Returns:
            The selected key as an ``int`` in ``range(12)``.
        """
        surprise_factor = emotion_scores.get('surprise', 0) * 0.8
        joy_factor = emotion_scores.get('joy', 0) * 0.6
        anger_factor = emotion_scores.get('anger', 0) * 0.4

        # Clamp so un-normalised scores cannot produce negative weights.
        diversity_factor = float(np.clip(surprise_factor + joy_factor + anger_factor, 0.0, 1.0))

        key_weights = np.ones(12)
        key_weights[6] = 0.1 * diversity_factor  # Much lower probability for F#

        # Normalize weights into a probability distribution
        key_weights = key_weights / np.sum(key_weights)

        return int(np.random.choice(12, p=key_weights))

    def get_descriptors_for_emotion_blend(self, emotion_scores):
        """Pick descriptors, instruments and scales for the three strongest emotions.

        Emotions scoring below 0.1 are ignored. For each remaining emotion
        the number of descriptors (1-3) and instruments (1-2) sampled grows
        with its share of the top-three total; one scale is chosen per
        emotion. Results are de-duplicated in first-seen order, so a seeded
        ``random`` module gives repeatable output.

        Args:
            emotion_scores: Dict of emotion name -> score.

        Returns:
            ``(descriptors, instruments, scales)`` - three lists of unique
            strings.
        """
        sorted_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)
        top_emotions = sorted_emotions[:3]

        # Dicts are used as insertion-ordered sets.
        descriptors = {}
        instruments = {}
        scales = {}

        # Weighted selection of descriptors based on emotion intensity
        total_weight = sum(score for _, score in top_emotions)

        for emotion, score in top_emotions:
            # Skip emotions with very low scores (also guards total_weight == 0)
            if score < 0.1:
                continue
            if emotion not in self.emotion_correlations:
                continue

            # Weight by the emotion's intensity
            weight = score / total_weight
            correlations = self.emotion_correlations[emotion]

            # Add descriptors proportional to the emotion's weight
            emotion_descriptors = correlations.get('description', [])
            num_descriptors = max(1, int(weight * 3))  # At least 1, up to 3 descriptors
            if emotion_descriptors:
                selected_descriptors = random.sample(
                    emotion_descriptors,
                    min(num_descriptors, len(emotion_descriptors))
                )
                descriptors.update(dict.fromkeys(selected_descriptors))

            # Choose 1-2 instruments based on weight
            emotion_instruments = correlations.get('instrumentation', [])
            num_instruments = max(1, int(weight * 2))
            if emotion_instruments:
                selected_instruments = random.sample(
                    emotion_instruments,
                    min(num_instruments, len(emotion_instruments))
                )
                instruments.update(dict.fromkeys(selected_instruments))

            # Add one potential scale
            emotion_scales = correlations.get('scale_type', [])
            if emotion_scales:
                scales[random.choice(emotion_scales)] = None

        return list(descriptors), list(instruments), list(scales)

    @staticmethod
    def _harmonize_mode(scale_name, mode_name):
        """Return ``mode_name`` adjusted so it does not contradict ``scale_name``."""
        if 'major' in scale_name and mode_name == 'minor':
            return 'major'
        if ('minor' in scale_name or scale_name in ['phrygian', 'locrian']) and mode_name == 'major':
            return 'minor'
        return mode_name

    def generate_musiclm_prompt(self, musical_features, emotion_scores=None):
        """Build a comma-separated text prompt from musical features.

        The key/mode phrase always comes first; the remaining phrases (scale,
        tempo, intensity, instruments, optional emotion descriptors and one
        or two extra details) are shuffled for variety.

        Args:
            musical_features: Dict with at least ``key``, ``mode``,
                ``scale_type``, ``instrumentation``, ``tempo``,
                ``intensity``, ``rhythm_complexity`` and
                ``harmonic_complexity``.
            emotion_scores: Optional dict of emotion name -> score used to
                add descriptors, extra instruments and a scale suggestion.

        Returns:
            The prompt string.
        """
        # Map key number to name; pitch classes are cyclic, so wrap with % 12
        # instead of letting an out-of-range value raise IndexError.
        key_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        key_name = key_names[int(round(musical_features['key'])) % 12]

        # Map mode number to name
        mode_name = "major" if musical_features['mode'] > 0.5 else "minor"

        # Get scale type and let it override a contradicting mode
        scale_name, _ = self.get_scale_for_feature(musical_features['scale_type'])
        mode_name = self._harmonize_mode(scale_name, mode_name)

        # Map instrumentation to instrument types (index clamped to the table)
        instrument_types = list(self.instrument_mappings.keys())
        instrument_idx = max(0, min(round(musical_features['instrumentation']), len(instrument_types) - 1))
        instrument = instrument_types[instrument_idx]

        # Determine tempo description
        tempo = musical_features['tempo']
        if tempo < 80:
            tempo_desc = "slow"
        elif tempo < 120:
            tempo_desc = "moderate"
        else:
            tempo_desc = "fast"

        # Determine intensity description
        intensity = musical_features['intensity']
        if intensity < 0.3:
            intensity_desc = "soft"
        elif intensity < 0.7:
            intensity_desc = "moderate"
        else:
            intensity_desc = "powerful"

        # Get emotional descriptors if emotion scores are provided
        descriptors = []
        additional_instruments = []
        scale_suggestions = []

        if emotion_scores:
            descriptors, additional_instruments, scale_suggestions = self.get_descriptors_for_emotion_blend(emotion_scores)

        # Combine the main instrument with up to two emotion-derived ones,
        # dropping repeats so the prompt never reads "piano, piano".
        if additional_instruments:
            all_instruments = list(dict.fromkeys([instrument] + additional_instruments))
            instruments_text = ", ".join(all_instruments[:3])
        else:
            instruments_text = instrument

        # Decide which scale (if any) is mentioned in the prompt
        scale_element = None
        if scale_suggestions and scale_name not in scale_suggestions:
            # Use one of the emotion-derived scales instead, and re-check the
            # mode so the prompt cannot say e.g. "C minor, pentatonic_major scale".
            scale_name = random.choice(scale_suggestions)
            mode_name = self._harmonize_mode(scale_name, mode_name)
            scale_element = f"{scale_name} scale"
        elif scale_name not in ['major', 'minor']:  # Don't repeat if already mentioned in mode
            scale_element = f"{scale_name} scale"

        # Build the prompt with variety
        prompt_elements = [f"{key_name} {mode_name}"]
        if scale_element is not None:
            prompt_elements.append(scale_element)
        prompt_elements.append(f"{tempo_desc} tempo")
        prompt_elements.append(f"{intensity_desc} intensity")
        prompt_elements.append(f"featuring {instruments_text}")

        # Add random descriptors for variety
        if descriptors:
            # Choose 2-3 descriptors (fewer if not enough are available)
            num_descriptors = min(len(descriptors), random.randint(2, 3))
            selected_descriptors = random.sample(descriptors, num_descriptors)

            # Format descriptors with different patterns
            descriptor_formats = [
                f"with a {selected_descriptors[0]} feel",
                f"creating a {' and '.join(selected_descriptors[:2])} atmosphere",
                f"{', '.join(selected_descriptors[:3])} in character"
            ]

            prompt_elements.append(random.choice(descriptor_formats))

        # Random additions for variety
        rhythm_desc = 'complex' if musical_features['rhythm_complexity'] > 0.5 else 'simple'
        harmony_desc = 'rich' if musical_features['harmonic_complexity'] > 0.5 else 'straightforward'
        additional_elements = [
            f"approximately {round(musical_features['tempo'])} BPM",
            f"with {rhythm_desc} rhythm",
            f"featuring {harmony_desc} harmonies"
        ]

        # Add 1-2 additional elements
        num_additional = random.randint(1, 2)
        prompt_elements.extend(random.sample(additional_elements, num_additional))

        # Randomize the order a bit (keeping key/mode first)
        first_element = prompt_elements[0]
        remaining_elements = prompt_elements[1:]
        random.shuffle(remaining_elements)

        # Build the final prompt
        return f"{first_element}, {', '.join(remaining_elements)}"

In [None]:
# Seed Python, NumPy and PyTorch so the randomly initialised mapper and the
# randomised prompt wording can be repeated from run to run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Define a sample text
sample_text = """
It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness.
The world was in turmoil, yet hope shone in the hearts of the brave. A new chapter of life awaited, full of promise and uncertainty.
"""

# Segment and clean the text
text_processor = TextProcessor()
original_segments, cleaned_segments = text_processor.process_text(sample_text)

# Extract emotions from the ORIGINAL (uncleaned) segments: the classifier is
# given the unmodified sentences rather than the stop-word-stripped text.
emotion_extractor = EmotionExtractor()
emotion_maps = emotion_extractor.extract_emotions(original_segments)

# For demonstration, print the first segment's extracted emotions
print("Extracted Emotions for first segment:")
print(emotion_maps[0])

# Instantiate the mapper
music_mapper = EmotionToMusicMapper()

# (For testing) Hand-written musical features. In practice these would come
# from music_mapper.map_to_actual_values(music_mapper(emotion_vector)).
sample_musical_features = {
    'tempo': 120,
    'key': 0,
    'mode': 1,  # 1 indicates major in our mapping
    'intensity': 0.5,
    'instrumentation': 0,  # First instrument mapping (e.g., piano)
    'rhythm_complexity': 0.5,
    'harmonic_complexity': 0.5,
    'melodic_range': 0.5,
    'texture': 0.5,
    'articulation': 0.5,
    'modulation': 0.5,
    'scale_type': 2  # Arbitrary value; will be mapped via get_scale_for_feature
}

# Build the text prompt. The method is named generate_musiclm_prompt, but its
# output is a plain text description that the MusicGen stage below consumes.
prompt = music_mapper.generate_musiclm_prompt(sample_musical_features, emotion_maps[0])
print("Generated MuseNet Prompt:")
print(prompt)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Extracted Emotions for first segment:
{'joy': 0.10362333210054346, 'sadness': 0.19321911986271012, 'anger': 0.014818017410239975, 'fear': 0.19282435152686717, 'surprise': 0.0021597877189712717, 'disgust': 0.11584679137679749, 'neutral': 0.18187024936256097, 'anticipation': 0.052891559909757364, 'trust': 0.14274679073155222}
Generated MuseNet Prompt:
C minor, flowing, chilling, wistful in character, moderate intensity, featuring piano, guitar, percussion, fast tempo, diminished scale, featuring straightforward harmonies


MusicGen Integration for Text-to-Music Synthesis
 This code block implements the final stage of the pipeline: generating music from text prompts using Meta's MusicGen model.

In [None]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy.io.wavfile as wavfile
import torch
import os

# Loaded (processor, model) pairs keyed by (checkpoint name, device).  The
# pipeline calls generate_music_with_musicgen once per text segment; without
# this cache every call re-reads the multi-gigabyte checkpoint from disk.
# Call _MUSICGEN_CACHE.clear() to release the cached models.
_MUSICGEN_CACHE = {}


def _load_musicgen(model_name, device):
    """Return a cached ``(processor, model)`` pair for ``model_name`` placed on ``device``."""
    cache_key = (model_name, device)
    if cache_key not in _MUSICGEN_CACHE:
        processor = AutoProcessor.from_pretrained(model_name)
        model = MusicgenForConditionalGeneration.from_pretrained(model_name).to(device)
        _MUSICGEN_CACHE[cache_key] = (processor, model)
    return _MUSICGEN_CACHE[cache_key]


def generate_music_with_musicgen(prompt, output_path="generated_music", model_size="small", duration_seconds=10):
    """Generate an audio clip from a text prompt with MusicGen and save it as a WAV file.

    Args:
        prompt: Text description of the music to generate.
        output_path: Directory the WAV file is written to; created if missing.
        model_size: Suffix of the ``facebook/musicgen-<model_size>`` checkpoint to load.
        duration_seconds: Requested clip length in seconds. It is converted to a
            generation budget of 50 tokens per second.
            NOTE(review): the MusicGen documentation describes a limit of roughly
            30 seconds per generation -- confirm before requesting longer clips.

    Returns:
        Path of the written WAV file:
        ``<output_path>/<sanitised first 20 prompt chars>_<model_size>.wav``.

    Raises:
        ValueError: If ``duration_seconds`` does not yield at least one token.
    """
    # Validate up front so a bad duration fails before the expensive model load.
    max_new_tokens = int(duration_seconds * 50)
    if max_new_tokens < 1:
        raise ValueError(
            f"duration_seconds must yield at least one token, got {duration_seconds!r}"
        )

    print(f"Generating music with prompt: '{prompt}'")

    os.makedirs(output_path, exist_ok=True)

    # Run on the GPU when one is available; the model and its inputs must live
    # on the same device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor, model = _load_musicgen(f"facebook/musicgen-{model_size}", device)

    # Prepare inputs
    inputs = processor(
        text=[prompt],
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Generate audio
    with torch.no_grad():
        audio_values = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True  # Enable sampling for more variety
        )

    # Get the sampling rate from the model config
    sampling_rate = model.config.audio_encoder.sampling_rate

    # Build a file-system-safe stem from the first 20 characters of the prompt.
    # A prompt whose prefix contains no usable characters would otherwise
    # produce a file named just "_<model_size>.wav".
    safe_prompt = "".join(
        ch for ch in prompt[:20] if ch.isalnum() or ch in (' ', '_')
    ).strip().replace(' ', '_') or "untitled"
    filename = os.path.join(output_path, f"{safe_prompt}_{model_size}.wav")

    # Save the first item of the batch, first channel; .cpu() brings the
    # samples back to host memory before converting to NumPy.
    wavfile.write(
        filename,
        rate=sampling_rate,
        data=audio_values[0, 0].cpu().numpy()
    )

    print(f"Music saved to {filename}")
    return filename


In [None]:
def generate_music_for_segments(emotion_maps, music_mapper, model_size="small", duration_seconds=10):
    """Render one MusicGen clip per segment-level emotion map.

    For each emotion map the mapper is asked for normalized musical features,
    those are converted to actual musical values and then to a text prompt,
    and the prompt is synthesised into a WAV file under ``segment_<n>``.

    Args:
        emotion_maps: Sequence of mappings from emotion name to score.
        music_mapper: Callable mapper exposing ``emotion_correlations``,
            ``map_to_actual_values`` and ``generate_musiclm_prompt``.
        model_size: Forwarded to ``generate_music_with_musicgen``.
        duration_seconds: Forwarded to ``generate_music_with_musicgen``.

    Returns:
        ``(audio_files, prompts)``: two parallel lists, one entry per emotion
        map, in input order.
    """
    generated_prompts = []
    generated_files = []

    for index, segment_emotions in enumerate(emotion_maps):
        segment_number = index + 1
        print(f"\nProcessing segment {segment_number}/{len(emotion_maps)}...")

        # Order the scores by the mapper's own emotion list; emotions missing
        # from this segment's map count as 0.0.
        scores = []
        for emotion_name in music_mapper.emotion_correlations.keys():
            scores.append(segment_emotions.get(emotion_name, 0.0))
        emotion_vector = torch.tensor(scores)

        # Inference only: no gradients are needed for the mapper's forward pass.
        with torch.no_grad():
            normalized_features = music_mapper(emotion_vector)

        # Normalized mapper output -> actual musical values -> text prompt.
        musical_features = music_mapper.map_to_actual_values(normalized_features)
        segment_prompt = music_mapper.generate_musiclm_prompt(musical_features, segment_emotions)
        generated_prompts.append(segment_prompt)
        print(f"Generated prompt: {segment_prompt}")

        # Each segment gets its own output directory.
        segment_file = generate_music_with_musicgen(
            prompt=segment_prompt,
            output_path=f"segment_{segment_number}",
            model_size=model_size,
            duration_seconds=duration_seconds,
        )
        generated_files.append(segment_file)

    return generated_files, generated_prompts


In [None]:
# Imported once for the whole cell rather than on every loop iteration.
from IPython.display import Audio, display

# Define a sample text
sample_text = """
It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness.
The world was in turmoil, yet hope shone in the hearts of the brave. A new chapter of life awaited, full of promise and uncertainty.
"""

# Build the three pipeline stages.
text_processor = TextProcessor()
emotion_extractor = EmotionExtractor()
music_mapper = EmotionToMusicMapper()

# Segment the text, then extract emotions from the original (not the cleaned) segments.
original_segments, cleaned_segments = text_processor.process_text(sample_text)
emotion_maps = emotion_extractor.extract_emotions(original_segments)

# Generate one prompt and one audio clip per segment.
audio_files, prompts = generate_music_for_segments(
    emotion_maps=emotion_maps,
    music_mapper=music_mapper,
    model_size="small",
    duration_seconds=10
)

# Report each segment next to its prompt and an inline audio player.
for i, (segment, prompt, audio_file) in enumerate(zip(original_segments, prompts, audio_files)):
    print(f"\nSegment {i+1}:")
    print(f"Text: {segment[:100]}...")
    print(f"Prompt: {prompt}")
    print(f"Audio file: {audio_file}")

    display(Audio(audio_file))


Device set to use cuda:0



Processing segment 1/1...
Generated prompt: F minor, fast tempo, featuring electric_guitar, strings, piano, moderate intensity, with simple rhythm, approximately 121 BPM, pentatonic_major scale, with a bittersweet feel
Generating music with prompt: 'F minor, fast tempo, featuring electric_guitar, strings, piano, moderate intensity, with simple rhythm, approximately 121 BPM, pentatonic_major scale, with a bittersweet feel'


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.87k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summ

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]



Music saved to segment_1/F_minor_fast_tempo_small.wav

Segment 1:
Text: It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foo...
Prompt: F minor, fast tempo, featuring electric_guitar, strings, piano, moderate intensity, with simple rhythm, approximately 121 BPM, pentatonic_major scale, with a bittersweet feel
Audio file: segment_1/F_minor_fast_tempo_small.wav


In [None]:
# Prompt-only demo: runs text segmentation and emotion extraction on a sample
# passage, then builds a text prompt from hand-written musical features.
# No audio is generated in this cell.

# Define a sample text
sample_text = """ His jaws opened, and he muttered some inarticulate sounds, while a grin wrinkled his cheeks. He might have spoken, but I did not hear; one hand was stretched out, seemingly to detain me, but I escaped and rushed down stairs. I took refuge in the courtyard belonging to the house which I inhabited; where I remained during the rest of the night, walking up and down in the greatest agitation, listening attentively, catching and fearing each sound as if it were to announce the approach of the demoniacal corpse to which I had so miserably given life."""

# Instantiate the text processor and split the sample into segments
text_processor = TextProcessor()
original_segments, cleaned_segments = text_processor.process_text(sample_text)

# Instantiate the emotion extractor and extract emotions on the original
# (not the cleaned) segments
emotion_extractor = EmotionExtractor()
emotion_maps = emotion_extractor.extract_emotions(original_segments)

# For demonstration, print the first segment's extracted emotions
print("Extracted Emotions for first segment:")
print(emotion_maps[0])

# Instantiate the mapper
music_mapper = EmotionToMusicMapper()

# (For testing) Hand-written musical features used in place of the mapper's
# own output, so the prompt builder can be exercised in isolation.
sample_musical_features = {
    'tempo': 120,
    'key': 0,
    # NOTE(review): the inline comment below says 1 means major, but the recorded
    # output of this cell reads "C minor" -- confirm the mapper's mode encoding.
    'mode': 1,  # 1 indicates major in our mapping
    'intensity': 0.5,
    'instrumentation': 0,  # First instrument mapping (e.g., piano)
    'rhythm_complexity': 0.5,
    'harmonic_complexity': 0.5,
    'melodic_range': 0.5,
    'texture': 0.5,
    'articulation': 0.5,
    'modulation': 0.5,
    'scale_type': 2  # Arbitrary value; will be mapped via get_scale_for_feature
}

# Build a text prompt from the dummy features and the first segment's emotions.
# The printed label says "MuseNet" and the method is named generate_musiclm_prompt;
# elsewhere in this notebook the same method's output is passed to MusicGen.
prompt = music_mapper.generate_musiclm_prompt(sample_musical_features, emotion_maps[0])
print("Generated MuseNet Prompt:")
print(prompt)


Device set to use cuda:0


Extracted Emotions for first segment:
{'joy': 0.002356462646023033, 'sadness': 0.03356684487501281, 'anger': 0.44884204388067234, 'fear': 0.1568035531341478, 'surprise': 0.011603535992204463, 'disgust': 0.23627152882111951, 'neutral': 0.06826520000579651, 'anticipation': 0.006979999319113748, 'trust': 0.03531083132590977}
Generated MuseNet Prompt:
C minor, fast tempo, diminished scale, featuring straightforward harmonies, approximately 120 BPM, moderate intensity, gritty, foreboding in character, featuring piano, synth, brass




In [None]:
# Imported once for the whole cell rather than on every loop iteration.
from IPython.display import Audio, display

# Define a sample text
sample_text = """ His jaws opened, and he muttered some inarticulate sounds, while a grin wrinkled his cheeks. He might have spoken, but I did not hear; one hand was stretched out, seemingly to detain me, but I escaped and rushed down stairs. I took refuge in the courtyard belonging to the house which I inhabited; where I remained during the rest of the night, walking up and down in the greatest agitation, listening attentively, catching and fearing each sound as if it were to announce the approach of the demoniacal corpse to which I had so miserably given life."""


# Build the three pipeline stages.
text_processor = TextProcessor()
emotion_extractor = EmotionExtractor()
music_mapper = EmotionToMusicMapper()

# Segment the text, then extract emotions from the original (not the cleaned) segments.
original_segments, cleaned_segments = text_processor.process_text(sample_text)
emotion_maps = emotion_extractor.extract_emotions(original_segments)

# Generate one prompt and one audio clip per segment.
audio_files, prompts = generate_music_for_segments(
    emotion_maps=emotion_maps,
    music_mapper=music_mapper,
    model_size="small",
    duration_seconds=10
)

# Report each segment next to its prompt and an inline audio player.
for i, (segment, prompt, audio_file) in enumerate(zip(original_segments, prompts, audio_files)):
    print(f"\nSegment {i+1}:")
    print(f"Text: {segment[:100]}...")
    print(f"Prompt: {prompt}")
    print(f"Audio file: {audio_file}")

    display(Audio(audio_file))


Device set to use cuda:0



Processing segment 1/2...
Generated prompt: F minor, aggressive, gritty in character, moderate intensity, featuring electric_guitar, percussion, orchestral, moderate tempo, approximately 117 BPM, locrian scale, with complex rhythm
Generating music with prompt: 'F minor, aggressive, gritty in character, moderate intensity, featuring electric_guitar, percussion, orchestral, moderate tempo, approximately 117 BPM, locrian scale, with complex rhythm'


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summ

Music saved to segment_1/F_minor_aggressive_small.wav

Processing segment 2/2...
Generated prompt: F minor, moderate tempo, creating a mysterious and eerie atmosphere, featuring electric_guitar, strings, approximately 117 BPM, moderate intensity, minor scale
Generating music with prompt: 'F minor, moderate tempo, creating a mysterious and eerie atmosphere, featuring electric_guitar, strings, approximately 117 BPM, moderate intensity, minor scale'


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summ

Music saved to segment_2/F_minor_moderate_te_small.wav

Segment 1:
Text: His jaws opened, and he muttered some inarticulate sounds, while a grin wrinkled his cheeks. He migh...
Prompt: F minor, aggressive, gritty in character, moderate intensity, featuring electric_guitar, percussion, orchestral, moderate tempo, approximately 117 BPM, locrian scale, with complex rhythm
Audio file: segment_1/F_minor_aggressive_small.wav



Segment 2:
Text: I took refuge in the courtyard belonging to the house which I inhabited; where I remained during the...
Prompt: F minor, moderate tempo, creating a mysterious and eerie atmosphere, featuring electric_guitar, strings, approximately 117 BPM, moderate intensity, minor scale
Audio file: segment_2/F_minor_moderate_te_small.wav
