# Import libraries, modules and packages

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Model Architecture

In [None]:
# Model that converts lyrics to music in MIDI format
class Lyrics2MusicModel(nn.Module):
    """Transformer encoder-decoder mapping lyric features to music-token logits.

    Lyric features are projected to the model width and encoded. The music
    tokens are embedded and decoded while cross-attending to the encoded
    lyrics, and a final linear layer turns every decoder state into logits
    over ``output_size`` classes.

    All transformer layers are built with PyTorch's default sequence-first
    layout (``batch_first=False``).

    NOTE(review): no positional encoding is added to either sequence, and no
    causal mask is applied unless the caller passes ``tgt_mask``.

    Args:
        text_emb_size: Width of the incoming lyric feature vectors.
        input_size: Number of distinct music tokens (embedding vocabulary).
        hidden_size: Transformer model width (``d_model``).
        output_size: Number of output classes per music step.
        num_heads: Attention heads per layer; must divide ``hidden_size``.
        num_layers: Number of layers in both the encoder and the decoder.
    """

    def __init__(self, text_emb_size, input_size, hidden_size, output_size, num_heads, num_layers):
        super().__init__()
        # music embedding layer: token id -> hidden_size vector
        self.music_emb = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        # text projection layer: lets lyric features of any width feed the
        # encoder, which only accepts inputs of width hidden_size
        self.text_proj = nn.Linear(text_emb_size, hidden_size)
        # text encoder layer
        self.text_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads),
            num_layers=num_layers
        )
        # music decoder layer
        self.music_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_size, nhead=num_heads),
            num_layers=num_layers
        )
        # fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, music_input, text_input, tgt_mask=None):
        """Compute music-token logits conditioned on the lyrics.

        Args:
            music_input: Integer music-token ids shaped ``(music_len, batch)``
                (or ``(music_len,)`` when unbatched).
            text_input: Float lyric features of width ``text_emb_size``, either
                a sequence ``(text_len, batch, text_emb_size)`` or a single
                vector per sample ``(batch, text_emb_size)``.
            tgt_mask: Optional attention mask for the music sequence, forwarded
                unchanged to the decoder (e.g. a causal mask).

        Returns:
            Logits shaped ``music_input.shape + (output_size,)``.
        """
        music_emb = self.music_emb(music_input)

        # A single lyric vector per sample has one dimension fewer than the
        # embedded music; treat it as a length-1 sequence so that every music
        # step attends to the same lyric summary.
        if text_input.dim() == music_emb.dim() - 1:
            text_input = text_input.unsqueeze(0)

        # encode the lyrics at model width
        text_memory = self.text_encoder(self.text_proj(text_input))

        # text conditioning: the decoder requires the encoded lyrics as its
        # ``memory`` argument and cross-attends to them at every layer
        decoded = self.music_decoder(music_emb, text_memory, tgt_mask=tgt_mask)
        return self.fc(decoded)

# Check whether a GPU is available

In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Bare expression so the notebook cell displays the selected device.
device

# Dataset Collection & Processing

In [None]:
# Batching hyperparameters for the (not yet implemented) data pipeline.
batch_size = 32
sequence_length = 100

# TODO: find a dataset in which lyrics and music are synchronized.
# Placeholder until then: an empty list, so iterating over it yields no batches.
dataloader = []

# NOTE: a list (like a torch DataLoader) has no ``.to()`` method, so the
# loader itself cannot be moved to a device; the individual batch tensors
# must be moved with ``tensor.to(device)`` when they are consumed.

# Model Training

In [None]:
# initialize the model
# text_emb_size is a required constructor argument and was missing here.
# 512 assumes the lyric features have the same width as hidden_size
# -- TODO confirm against the text feature extractor that is used.
model = Lyrics2MusicModel(
    text_emb_size=512,
    input_size=128,
    hidden_size=512,
    output_size=128,
    num_heads=8,
    num_layers=6,
).to(device)
# Cross-entropy over the output classes, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 50

# Enable training-mode behavior (e.g. dropout inside the transformer layers).
model.train()

# training epoch
for epoch in range(epochs):
    running_loss = 0.0
    num_batches = 0
    for music_input, text_input, target_output in dataloader:
        # The model lives on `device`; its inputs and targets must too.
        music_input = music_input.to(device)
        text_input = text_input.to(device)
        target_output = target_output.to(device)

        optimizer.zero_grad()
        output = model(music_input, text_input)
        # CrossEntropyLoss expects the class dimension at index 1, but the
        # model output carries it last. Flatten all leading dimensions so
        # every step is scored as one sample.
        # Assumes target_output holds class indices shaped like
        # output.shape[:-1] -- TODO confirm once the dataset exists.
        loss = criterion(
            output.reshape(-1, output.size(-1)),
            target_output.reshape(-1),
        )
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Without batches there is no loss to report and nothing to learn.
        print("dataloader yielded no batches; nothing to train on")
        break

    if (epoch + 1) % 5 == 0:
        # Report the 1-based epoch number (matching the condition above) and
        # the mean loss over the epoch rather than only the last batch.
        print(f"epoch {epoch + 1}, loss: {running_loss / num_batches:.4f}")

# Model Evaluation