# Import libraries, modules and packages

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

from audiocraft.models import MusicGen


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\julio\anaconda3\envs\singer-doge\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\julio\anaconda3\envs\singer-doge\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\julio\anaconda3\envs\singer-doge\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\julio\anaconda3\envs\singer-doge\lib\site-packages\traitlets\config\application.py", line 1075, in launch_in

# Check CUDA device availability

In [None]:
# Prefer the CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Bare last expression so the notebook displays the selected device.
device

# Student Model Architecture

In [None]:
# student model: convert lyrics/text features plus music tokens into MIDI-token logits
class Lyrics2MusicModel(nn.Module):
    """Text-conditioned transformer that predicts MIDI-token logits.

    The text features are projected to the transformer width, encoded, and
    handed to the music decoder as its cross-attention ``memory``; the decoder
    attends to that memory while processing the embedded music tokens.

    All transformer layers are built with PyTorch's default sequence-first
    layout (``batch_first=False``), so batched tensors are ``(seq, batch, ...)``.

    Args:
        text_emb_size: Width of the text features passed to ``forward``.
        input_size: Number of distinct music token ids (embedding table size).
        hidden_size: Transformer model width (``d_model``).
        output_size: Number of output classes produced per music position.
        num_heads: Attention heads per layer; must divide ``hidden_size``.
        num_layers: Number of stacked layers in both encoder and decoder.
    """

    def __init__(self, text_emb_size, input_size, hidden_size, output_size, num_heads, num_layers):
        super().__init__()
        # music embedding layer: token id -> hidden vector
        self.music_emb = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        # text projection layer: bring text features to the transformer width
        self.text_proj = nn.Linear(text_emb_size, hidden_size)
        # text encoder layer
        self.text_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads),
            num_layers=num_layers
        )
        # music decoder layer
        self.music_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_size, nhead=num_heads),
            num_layers=num_layers
        )
        # fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, music_input, text_input):
        """Return per-position logits for the music sequence.

        Args:
            music_input: Integer token ids, ``(music_seq, batch)`` or
                ``(music_seq,)`` when unbatched.
            text_input: Float text features whose last axis is
                ``text_emb_size``. Either a sequence
                ``(text_seq, batch, text_emb_size)`` or one pooled vector per
                sample ``(batch, text_emb_size)``; the unbatched equivalents
                are accepted as well.

        Returns:
            Tensor shaped like the embedded music input with the last axis
            replaced by ``output_size``.
        """
        music_emb = self.music_emb(music_input)

        # A text tensor with one axis fewer than the embedded music has no
        # sequence axis (one vector per sample); treat it as a length-1
        # sequence so it can be used as decoder memory.
        if text_input.dim() == music_emb.dim() - 1:
            text_input = text_input.unsqueeze(0)

        text_memory = self.text_encoder(self.text_proj(text_input))

        # text conditioning: the decoder cross-attends to the encoded text.
        # nn.TransformerDecoder requires `memory`; omitting it raises TypeError.
        # NOTE(review): no tgt_mask is passed, so each music position can attend
        # to later positions — add a causal mask if training autoregressively.
        decoded = self.music_decoder(tgt=music_emb, memory=text_memory)
        return self.fc(decoded)

# Teacher Model & Synthetic Training Data

In [1]:
# Theme vocabulary: the pool of words and short phrases from which synthetic
# music titles are sampled (it is passed to random_generate_music_title below).
# Entries are not unique — e.g. "Harmony" and "Mystery" appear several times —
# so drawing uniformly over this list picks repeated entries more often.
themes_word_elements = [
    "Love", "Heartbreak", "Nature", "Exploration", "Sci-fi", "Futuristic", "Motivation", "Resilience", "Storytelling", "Legends",
    "Philosophy", "Romantic", "Thoughts", "Sadness", "Melancholy", "Deep", "Joy", "Celebration", "Mystery", "Dark", "Light", "Fantasy",
    "Ecstasy", "Nostalgia", "Anger", "Serenity", "Anxiety", "Hope", "Envy", "Awe", "Underwater", "Desert", "Jungle", "Space Station",
    "Carnival", "Graveyard", "Medieval", "Renaissance", "Ancient Egypt", "Japanese Samurai", "African Safari", "Running", "Dancing",
    "Fighting", "Reading", "Flying", "Adventure", "Loneliness", "Magic", "Victory", "Reflection", "Surprise", "Tranquility", "Urgency",
    "Melancholy", "Nostalgia", "Epic", "Battle", "Cyberpunk", "City", "Shine", "Sunset", "Dark", "Forest", "Mystery", "Space", "Odyssey",
    "Chill", "Lo-Fi", "Vibes", "Jazz", "Lounge", "Night", "Day", "Ancient", "Ruins", "Exploration", "Medieval", "Folk", "Dance",
    "Dream", "Awakening", "Journey", "Destiny", "Time", "Eternity", "Universe", "Cosmic", "Stars", "Moon", "Sun", "Rain", "Snow",
    "Wind", "Fire", "Water", "Earth", "Sky", "Ocean", "River", "Mountain", "Valley", "Flower", "Tree", "Leaf", "Bird", "Animal",
    "Whisper", "Silence", "Echo", "Shadow", "Light", "Color", "Sound", "Harmony", "Chaos", "Balance", "Peace", "War", "Life", "Death",
    "Spirit", "Soul", "Heart", "Mind", "Body", "Strength", "Weakness", "Courage", "Fear", "Passion", "Desire", "Memory", "Future",
    "Present", "Past", "Childhood", "Adulthood", "Wisdom", "Knowledge", "Truth", "Lie", "Hope", "Despair", "Faith", "Doubt",
    "Change", "Growth", "Decay", "Creation", "Destruction", "Freedom", "Imprisonment", "Justice", "Injustice", "Love Song",
    "Ballad", "Anthem", "Hymn", "Lullaby", "Elegy", "Ode", "Symphony", "Concerto", "Sonata", "Etude", "Nocturne", "Waltz", "Tango",
    "Rumba", "Samba", "Cha-cha", "Swing", "Blues", "Rock", "Pop", "Hip-hop", "Electronic", "Classical", "World Music", "Ambient",
    "Minimalist", "Experimental", "Avant-garde", "Indie", "Alternative", "Underground", "Mainstream", "Commercial", "Independent",
    "Art", "Music", "Dance", "Theater", "Film", "Literature", "Poetry", "Painting", "Sculpture", "Architecture", "Science",
    "Technology", "Engineering", "Mathematics", "History", "Geography", "Culture", "Society", "Politics", "Economics", "Religion",
    "Philosophy", "Psychology", "Sociology", "Anthropology", "Education", "Health", "Environment", "Sustainability", "Innovation",
    "Progress", "Revolution", "Evolution", "Transformation", "Harmony", "Dissonance", "Contrast", "Balance", "Symmetry", "Asymmetry",
    "Repetition", "Variation", "Improvisation", "Composition", "Performance", "Audience", "Concert", "Festival", "Club", "Studio",
    "Stage", "Backstage", "Microphone", "Instrument", "Voice", "Melody", "Harmony", "Rhythm", "Tempo", "Dynamics", "Timbre", "Texture",
    "Form", "Structure", "Style", "Genre", "Era", "Movement", "Influence", "Inspiration", "Creativity", "Expression", "Communication",
    "Emotion", "Feeling", "Mood", "Atmosphere", "Story", "Narrative", "Theme", "Motif", "Symbol", "Metaphor", "Allegory", "Irony",
    "Humor", "Wit", "Sarcasm", "Paradox", "Mystery", "Suspense", "Drama", "Comedy", "Tragedy", "Romance", "Adventure", "Fantasy",
    "Horror", "Thriller", "Science Fiction", "Historical Fiction", "Contemporary", "Classic", "Modern", "Postmodern", "Abstract",
    "Surreal", "Realism", "Naturalism", "Romanticism", "Symbolism", "Expressionism", "Dadaism", "Surrealism", "Modernism",
    "Postmodernism", "Minimalism", "Conceptual Art", "Performance Art", "Installation Art", "Video Art", "Digital Art", "Pop Art",
    "Op Art", "Land Art", "Environmental Art", "Social Commentary", "Political Satire", "Personal Reflection", "Spiritual Journey",
    "Inner Peace", "Outer Chaos", "Human Condition", "Universal Themes", "Timeless Truths", "Ephemeral Beauty", "Fleeting Moments",
    "Precious Memories", "Lost Loves", "Unspoken Words", "Hidden Meanings", "Secret Desires", "Dreams", "Nightmares", "Hopes", "Fears",
    "Joys", "Sorrows", "Triumphs", "Failures", "Beginnings", "Endings", "Life", "Death", "Creation", "Destruction", "Order", "Chaos",
    "Light", "Shadow", "Good", "Evil", "Love", "Hate", "Peace", "War", "Freedom", "Imprisonment", "Justice", "Injustice",
    "Truth", "Lies", "Faith", "Doubt", "Certainty", "Uncertainty", "Change", "Growth", "Decay", "Renewal", "Progress", "Regression",
    "Evolution", "Transformation", "Harmony", "Dissonance", "Balance", "Imbalance", "Symmetry", "Asymmetry", "Repetition", "Variation",
    "Improvisation", "Composition", "Performance", "Audience", "Concert", "Festival", "Club", "Studio", "Stage", "Backstage",
    "Microphone", "Instrument", "Voice", "Melody", "Harmony", "Rhythm", "Tempo", "Dynamics", "Timbre", "Texture", "Form", "Structure",
    "Style", "Genre", "Era", "Movement", "Influence", "Inspiration", "Creativity", "Expression", "Communication", "Emotion", "Feeling",
    "Mood", "Atmosphere", "Story", "Narrative", "Theme", "Motif", "Symbol", "Metaphor", "Allegory", "Irony", "Humor", "Wit", "Sarcasm",
    "Paradox", "Mystery", "Suspense", "Drama", "Comedy", "Tragedy", "Romance", "Adventure", "Fantasy", "Horror", "Thriller",
    "Science", "Fiction", "Historical", "Contemporary", "Classic", "Modern", "Postmodern", "Abstract", "Surreal", "Realism",
    "Naturalism", "Romanticism", "Symbolism", "Expressionism", "Dadaism", "Surrealism", "Modernism", "Postmodernism", "Minimalism",
    "Conceptual", "Art", "Performance", "Installation", "Video", "Digital", "Pop", "Op", "Land", "Environmental", "Social", "Commentary",
    "Political", "Satire", "Personal", "Reflection", "Spiritual", "Journey", "Inner", "Outer", "Human", "Condition", "Universal",
    "Themes", "Timeless", "Truths", "Ephemeral", "Beauty", "Fleeting", "Moments", "Precious", "Memories", "Lost", "Loves",
    "Unspoken", "Words", "Hidden", "Meanings", "Secret", "Desires", "Dreams", "Nightmares", "Hopes", "Fears", "Joys", "Sorrows",
    "Triumphs", "Failures", "Beginnings", "Endings", "Life", "Death", "Creation", "Destruction", "Order", "Chaos", "Light", "Shadow",
    "Good", "Evil", "Love", "Hate", "Peace", "War", "Freedom", "Imprisonment", "Justice", "Injustice", "Truth", "Lies", "Faith",
    "Doubt", "Certainty", "Uncertainty", "Change", "Growth", "Decay", "Renewal", "Progress", "Regression", "Evolution", "Transformation",
    "Harmony", "Dissonance", "Balance", "Imbalance", "Symmetry", "Asymmetry", "Repetition", "Variation"
    ]

In [None]:
# script to generate 5k random music titles/lyrics, used as prompts to produce synthetic midi output for student model training
def random_generate_music_title(elem_array, min_words=1, max_words=5):
    """Build a random title by joining randomly chosen words with single spaces.

    The number of words is drawn uniformly from ``[min_words, max_words]``
    (both inclusive) and each word is drawn independently, with replacement,
    from ``elem_array``.

    Args:
        elem_array: Non-empty sequence of words or phrases to sample from.
        min_words: Smallest number of words in a title (at least 1).
        max_words: Largest number of words in a title (>= ``min_words``).

    Returns:
        The generated title, with no leading or trailing space.

    Raises:
        ValueError: If ``elem_array`` is empty or the word-count bounds are invalid.
    """
    if len(elem_array) == 0:
        raise ValueError("elem_array must contain at least one word")
    if min_words < 1 or max_words < min_words:
        raise ValueError("expected 1 <= min_words <= max_words")

    # random.choice needs a sequence; passing the `int` type raises TypeError,
    # so the title length is drawn with randint instead.
    word_count = random.randint(min_words, max_words)
    return " ".join(random.choice(elem_array) for _ in range(word_count))

# Destination text file; it receives one generated title per line.
file_path = r"D:\notebook\singer-doge\lyrics2music-dataset\music_title.txt"

# Write 5000 titles sampled from the theme vocabulary, each followed by a newline.
with open(file_path, "w", encoding="utf-8") as file:
    file.writelines(
        random_generate_music_title(themes_word_elements) + "\n"
        for _ in range(5000)
    )

In [None]:
# Load the pretrained MusicGen checkpoint used as the teacher model.
# audiocraft exposes the loader as `get_pretrained`; `MusicGen.get_trained`
# does not exist and raises AttributeError.
teacher_model = MusicGen.get_pretrained("facebook/musicgen-small")
# Placeholder prompt list (currently a single empty string).
lyrics = [
    ""
]

# Model Training

In [None]:
# initialize the student model, the loss and the optimizer
# text_emb_size is a required constructor argument; leaving it out raises TypeError.
model = Lyrics2MusicModel(
    text_emb_size=512,  # width of the text features fed to forward(); set equal to hidden_size here — TODO confirm against the real text encoder
    input_size=128,
    hidden_size=512,
    output_size=128,
    num_heads=8,
    num_layers=6,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 50

# training loop
# NOTE(review): `dataloader` is not defined in the visible notebook cells; it is
# assumed to yield (music_input, text_input, target_output) tensor triples.
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    num_batches = 0
    for music_input, text_input, target_output in dataloader:
        # the model was moved to `device`, so every batch tensor must live there too
        music_input = music_input.to(device)
        text_input = text_input.to(device)
        target_output = target_output.to(device)

        optimizer.zero_grad()
        output = model(music_input, text_input)
        # CrossEntropyLoss expects the class axis at dim 1, while the model
        # emits it last; move it so batched (seq, batch, classes) logits line up.
        # assumes target_output holds class indices shaped like output without
        # its last axis — TODO confirm against the dataset
        loss = criterion(output.movedim(-1, 1), target_output)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
    # report the mean batch loss every 5th epoch, labelled with the 1-based
    # epoch number that the condition tests; skip if no batch was seen
    if (epoch + 1) % 5 == 0 and num_batches > 0:
        print(f"epoch {epoch + 1}, loss: {running_loss / num_batches:.4f}")

# Model Evaluation