# Import libraries, modules and packages

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os

from audiocraft.models import MusicGen

# Check CUDA device availability

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

device

device(type='cuda')

# Student Model Architecture

In [4]:
# Student model: turns a title embedding (called "lyrics" in this notebook just for
# easier memorization) plus a music-token sequence into per-position output logits.
class Lyrics2MusicModel(nn.Module):
    """Text-conditioned Transformer decoder over discrete music tokens.

    The text embedding is projected to ``hidden_size``, passed through a Transformer
    encoder, and then used as the cross-attention ``memory`` of a Transformer decoder
    that runs over the embedded music tokens. All tensors are batch-first.

    Args:
        text_emb_size: Feature size of the incoming text embedding.
        input_size: Number of distinct music tokens (embedding table size).
        hidden_size: Model width shared by the encoder and the decoder.
        output_size: Number of output classes per music position.
        num_heads: Attention heads per layer; must divide ``hidden_size``.
        num_layers: Number of layers in both the encoder and the decoder.

    NOTE(review): no positional encoding is applied to either sequence, so token
    order is only visible to the decoder through an optional ``tgt_mask``.
    """

    def __init__(self, text_emb_size, input_size, hidden_size, output_size, num_heads, num_layers):
        super().__init__()
        # music embedding layer
        self.music_emb = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        # project the text features to the model width so the encoder can consume them
        self.text_proj = nn.Linear(text_emb_size, hidden_size)
        # text encoder layer
        self.text_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
        )
        # music decoder layer
        self.music_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_size, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
        )
        # fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, music_input, text_input, tgt_mask=None):
        """Compute output logits for every music position.

        Args:
            music_input: Integer token ids of shape ``(batch, music_len)``.
            text_input: Float text embedding of shape ``(batch, text_len, text_emb_size)``,
                or ``(batch, text_emb_size)`` for a single pooled vector per example.
            tgt_mask: Optional ``(music_len, music_len)`` attention mask for the decoder's
                self-attention (e.g. a causal mask for autoregressive training).

        Returns:
            Tensor of shape ``(batch, music_len, output_size)`` with unnormalized logits.
        """
        music_emb = self.music_emb(music_input)

        # A pooled (batch, text_emb_size) embedding is treated as a length-1 sequence.
        if text_input.dim() == 2:
            text_input = text_input.unsqueeze(1)
        text_memory = self.text_encoder(self.text_proj(text_input))

        # Text conditioning happens through cross-attention: nn.TransformerDecoder requires
        # a `memory` argument, and this also lets text and music lengths differ.
        decoded = self.music_decoder(tgt=music_emb, memory=text_memory, tgt_mask=tgt_mask)
        return self.fc(decoded)

# Teacher Model & Synthetic Training Data

In [5]:
# Vocabulary of theme words from which random music titles are assembled.
# NOTE(review): several entries occur more than once (e.g. "Harmony", "Mystery",
# "Melancholy"), so a uniform draw from this list (random.choice) favours the
# repeated words -- confirm whether that weighting is intended.
themes_word_elements = [
    "Love", "Heartbreak", "Nature", "Exploration", "Sci-fi", "Futuristic", "Motivation", "Resilience", "Storytelling", "Legends",
    "Philosophy", "Romantic", "Thoughts", "Sadness", "Melancholy", "Deep", "Joy", "Celebration", "Mystery", "Dark", "Light", "Fantasy",
    "Ecstasy", "Nostalgia", "Anger", "Serenity", "Anxiety", "Hope", "Envy", "Awe", "Underwater", "Desert", "Jungle", "Space Station",
    "Carnival", "Graveyard", "Medieval", "Renaissance", "Ancient Egypt", "Japanese Samurai", "African Safari", "Running", "Dancing",
    "Fighting", "Reading", "Flying", "Adventure", "Loneliness", "Magic", "Victory", "Reflection", "Surprise", "Tranquility", "Urgency",
    "Melancholy", "Nostalgia", "Epic", "Battle", "Cyberpunk", "City", "Shine", "Sunset", "Dark", "Forest", "Mystery", "Space", "Odyssey",
    "Chill", "Lo-Fi", "Vibes", "Jazz", "Lounge", "Night", "Day", "Ancient", "Ruins", "Exploration", "Medieval", "Folk", "Dance",
    "Dream", "Awakening", "Journey", "Destiny", "Time", "Eternity", "Universe", "Cosmic", "Stars", "Moon", "Sun", "Rain", "Snow",
    "Wind", "Fire", "Water", "Earth", "Sky", "Ocean", "River", "Mountain", "Valley", "Flower", "Tree", "Leaf", "Bird", "Animal",
    "Whisper", "Silence", "Echo", "Shadow", "Light", "Color", "Sound", "Harmony", "Chaos", "Balance", "Peace", "War", "Life", "Death",
    "Spirit", "Soul", "Heart", "Mind", "Body", "Strength", "Weakness", "Courage", "Fear", "Passion", "Desire", "Memory", "Future",
    "Present", "Past", "Childhood", "Adulthood", "Wisdom", "Knowledge", "Truth", "Lie", "Hope", "Despair", "Faith", "Doubt",
    "Change", "Growth", "Decay", "Creation", "Destruction", "Freedom", "Imprisonment", "Justice", "Injustice", "Love Song",
    "Ballad", "Anthem", "Hymn", "Lullaby", "Elegy", "Ode", "Symphony", "Concerto", "Sonata", "Etude", "Nocturne", "Waltz", "Tango",
    "Rumba", "Samba", "Cha-cha", "Swing", "Blues", "Rock", "Pop", "Hip-hop", "Electronic", "Classical", "World Music", "Ambient",
    "Minimalist", "Experimental", "Avant-garde", "Indie", "Alternative", "Underground", "Mainstream", "Commercial", "Independent",
    "Art", "Music", "Dance", "Theater", "Film", "Literature", "Poetry", "Painting", "Sculpture", "Architecture", "Science",
    "Technology", "Engineering", "Mathematics", "History", "Geography", "Culture", "Society", "Politics", "Economics", "Religion",
    "Philosophy", "Psychology", "Sociology", "Anthropology", "Education", "Health", "Environment", "Sustainability", "Innovation",
    "Progress", "Revolution", "Evolution", "Transformation", "Harmony", "Dissonance", "Contrast", "Balance", "Symmetry", "Asymmetry",
    "Repetition", "Variation", "Improvisation", "Composition", "Performance", "Audience", "Concert", "Festival", "Club", "Studio",
    "Stage", "Backstage", "Microphone", "Instrument", "Voice", "Melody", "Harmony", "Rhythm", "Tempo", "Dynamics", "Timbre", "Texture",
    "Form", "Structure", "Style", "Genre", "Era", "Movement", "Influence", "Inspiration", "Creativity", "Expression", "Communication",
    "Emotion", "Feeling", "Mood", "Atmosphere", "Story", "Narrative", "Theme", "Motif", "Symbol", "Metaphor", "Allegory", "Irony",
    "Humor", "Wit", "Sarcasm", "Paradox", "Mystery", "Suspense", "Drama", "Comedy", "Tragedy", "Romance", "Adventure", "Fantasy",
    "Horror", "Thriller", "Science Fiction", "Historical Fiction", "Contemporary", "Classic", "Modern", "Postmodern", "Abstract",
    "Surreal", "Realism", "Naturalism", "Romanticism", "Symbolism", "Expressionism", "Dadaism", "Surrealism", "Modernism",
    "Postmodernism", "Minimalism", "Conceptual Art", "Performance Art", "Installation Art", "Video Art", "Digital Art", "Pop Art",
    "Op Art", "Land Art", "Environmental Art", "Social Commentary", "Political Satire", "Personal Reflection", "Spiritual Journey",
    "Inner Peace", "Outer Chaos", "Human Condition", "Universal Themes", "Timeless Truths", "Ephemeral Beauty", "Fleeting Moments",
    "Precious Memories", "Lost Loves", "Unspoken Words", "Hidden Meanings", "Secret Desires", "Dreams", "Nightmares", "Hopes", "Fears",
    "Joys", "Sorrows", "Triumphs", "Failures", "Beginnings", "Endings", "Life", "Death", "Creation", "Destruction", "Order", "Chaos",
    "Light", "Shadow", "Good", "Evil", "Love", "Hate", "Peace", "War", "Freedom", "Imprisonment", "Justice", "Injustice",
    "Truth", "Lies", "Faith", "Doubt", "Certainty", "Uncertainty", "Change", "Growth", "Decay", "Renewal", "Progress", "Regression",
    "Evolution", "Transformation", "Harmony", "Dissonance", "Balance", "Imbalance", "Symmetry", "Asymmetry", "Repetition", "Variation",
    "Improvisation", "Composition", "Performance", "Audience", "Concert", "Festival", "Club", "Studio", "Stage", "Backstage",
    "Microphone", "Instrument", "Voice", "Melody", "Harmony", "Rhythm", "Tempo", "Dynamics", "Timbre", "Texture", "Form", "Structure",
    "Style", "Genre", "Era", "Movement", "Influence", "Inspiration", "Creativity", "Expression", "Communication", "Emotion", "Feeling",
    "Mood", "Atmosphere", "Story", "Narrative", "Theme", "Motif", "Symbol", "Metaphor", "Allegory", "Irony", "Humor", "Wit", "Sarcasm",
    "Paradox", "Mystery", "Suspense", "Drama", "Comedy", "Tragedy", "Romance", "Adventure", "Fantasy", "Horror", "Thriller",
    "Science", "Fiction", "Historical", "Contemporary", "Classic", "Modern", "Postmodern", "Abstract", "Surreal", "Realism",
    "Naturalism", "Romanticism", "Symbolism", "Expressionism", "Dadaism", "Surrealism", "Modernism", "Postmodernism", "Minimalism",
    "Conceptual", "Art", "Performance", "Installation", "Video", "Digital", "Pop", "Op", "Land", "Environmental", "Social", "Commentary",
    "Political", "Satire", "Personal", "Reflection", "Spiritual", "Journey", "Inner", "Outer", "Human", "Condition", "Universal",
    "Themes", "Timeless", "Truths", "Ephemeral", "Beauty", "Fleeting", "Moments", "Precious", "Memories", "Lost", "Loves",
    "Unspoken", "Words", "Hidden", "Meanings", "Secret", "Desires", "Dreams", "Nightmares", "Hopes", "Fears", "Joys", "Sorrows",
    "Triumphs", "Failures", "Beginnings", "Endings", "Life", "Death", "Creation", "Destruction", "Order", "Chaos", "Light", "Shadow",
    "Good", "Evil", "Love", "Hate", "Peace", "War", "Freedom", "Imprisonment", "Justice", "Injustice", "Truth", "Lies", "Faith",
    "Doubt", "Certainty", "Uncertainty", "Change", "Growth", "Decay", "Renewal", "Progress", "Regression", "Evolution", "Transformation",
    "Harmony", "Dissonance", "Balance", "Imbalance", "Symmetry", "Asymmetry", "Repetition", "Variation"
    ]

In [6]:
# Generates one random music title. The notebook draws 5k of these as prompts for the
# teacher model, whose outputs become the student model's synthetic training data.
def random_generate_music_title(elem_array, min_words=1, max_words=3):
    """Build a random title from randomly chosen words separated by single spaces.

    Args:
        elem_array: Non-empty sequence of words to sample from (with replacement).
        min_words: Smallest number of words in the title (inclusive). Defaults to 1.
        max_words: Largest number of words in the title (inclusive). Defaults to 3.

    Returns:
        The chosen words joined by a single space each.

    Raises:
        IndexError: If ``elem_array`` is empty (raised by ``random.choice``).
        ValueError: If ``min_words`` is greater than ``max_words``
            (raised by ``random.randint``).
    """
    # Draw the length first, then the words, one random.choice call per word.
    word_count = random.randint(min_words, max_words)
    return " ".join(random.choice(elem_array) for _ in range(word_count))

# Output locations for the synthetic dataset. The default points at the author's HDD
# (the notebook machine lacks disk space); set the LYRICS2MUSIC_DATA_DIR environment
# variable to relocate the dataset without editing the notebook.
music_quantity = 5000
hdd_directory_path = os.environ.get(
    "LYRICS2MUSIC_DATA_DIR", r"D:\notebook\singer-doge\lyrics2music-dataset"
)
music_titles_path = os.path.join(hdd_directory_path, "music_title.txt")
teacher_midis_path = os.path.join(hdd_directory_path, "midi_teacher_outputs.pth")
# exist_ok=True replaces the separate exists() check and is safe to re-run.
os.makedirs(hdd_directory_path, exist_ok=True)

# Write one randomly generated title per line; any previous file is overwritten.
with open(music_titles_path, "w", encoding="utf-8") as file:
    file.writelines(
        random_generate_music_title(themes_word_elements) + "\n"
        for _ in range(music_quantity)
    )

In [12]:
# Load the pretrained MusicGen "small" checkpoint; it acts as the teacher model.
teacher_model = MusicGen.get_pretrained("facebook/musicgen-small")

# Parallel accumulators: music_titles collects the input prompts and
# teacher_midis collects the teacher output produced for each prompt.
music_titles, teacher_midis = [], []



In [None]:
# Start from empty accumulators so that re-running this cell does not append a second
# copy of every title/output to lists created in an earlier cell.
music_titles = []
teacher_midis = []

# Read the saved titles back and let the teacher generate one output per title.
with open(music_titles_path, "r", encoding="utf-8") as file:
    for line in file:
        music_title = line.strip()
        if not music_title:
            # Blank lines would otherwise be sent to the teacher as empty prompts.
            continue
        # With return_tokens=True, generate() returns (audio, tokens); without it only
        # the decoded audio waveform comes back, which is not what a token-level
        # student can train on. The audio is discarded here.
        _, midi_tokens = teacher_model.generate(
            [music_title], progress=True, return_tokens=True
        )
        # Move results off the GPU before accumulating them for the whole dataset.
        midi_tokens_cpu = [token.cpu() for token in midi_tokens]
        # Append both only after generation succeeded so the two lists stay aligned.
        music_titles.append(music_title)
        teacher_midis.append(midi_tokens_cpu)

try:
    # Save titles and teacher outputs together for later training use.
    torch.save({"title": music_titles, "midi": teacher_midis}, teacher_midis_path)
except Exception as e:
    # Best effort: the results are still in memory, so report the failure
    # instead of aborting the cell.
    print("Erro by saving teacher midis: ", e)

# Model Training

In [None]:
# Initialize the student model, the loss and the optimizer.
model = Lyrics2MusicModel(
    # The constructor requires text_emb_size; omitting it raises a TypeError.
    # 512 matches hidden_size, which is the width the text encoder (d_model=hidden_size)
    # expects. NOTE(review): adjust to the real text-embedding width once a text
    # embedder is chosen.
    text_emb_size=512,
    input_size=128,
    hidden_size=512,
    output_size=128,
    num_heads=8,
    num_layers=6,
).to(device)
# NOTE(review): input_size/output_size must cover every token id produced by the
# teacher -- confirm 128 against the teacher's token vocabulary.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 50

# NOTE(review): `dataloader` is not defined in the visible part of this notebook; it is
# expected to yield (music_input, text_input, target_output) batches -- define it in a
# cell above before running this one.
model.train()
for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0
    for music_input, text_input, target_output in dataloader:
        # The model lives on `device`; batches must be moved there as well.
        music_input = music_input.to(device)
        text_input = text_input.to(device)
        target_output = target_output.to(device)

        optimizer.zero_grad()
        output = model(music_input, text_input)
        # CrossEntropyLoss expects the class dimension second, so flatten every
        # position into its own row: logits (N, C) against class indices (N,).
        # Assumes target_output holds one class index per output position -- TODO confirm.
        loss = criterion(output.reshape(-1, output.size(-1)), target_output.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    # Report the 1-based epoch number and the mean loss over the epoch; skip the report
    # when the loader produced no batches (there is no loss to show).
    if (epoch + 1) % 5 == 0 and num_batches:
        print(f"epoch {epoch + 1}, loss: {total_loss / num_batches:.4f}")

# Model Evaluation