In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv("songSectionDataClean.csv")

In [9]:
data = df.Progression.tolist()

['I-VI#-ii-I-v-ii-I', 'VI#-ii-I-v-ii-I', 'v-ii-I', 'I-VI#-ii-I-v-ii-I', 'I-VI#-ii-I-v-ii-I', 'I-VI#-ii-I-v-ii-I', 'I-ii-vi-IV', 'I-ii-vi-IV', 'I-ii-vi-IV', 'iii-vi-V', 'I-ii-vi-IV', 'I-vi-IV-V', 'vi-IV-I-V', 'I-V-vi-IV', 'vi-IV-I-V-I-V', 'vi', 'vi', 'vi-I-IV-III', 'I-V-ii-iv', 'vi-V#-I-II-ii', 'ii-V-vi-V#-I-IV#', 'I-V-ii', 'vi', 'vi-ii-III-vi', 'vi', 'vi-ii-III-vi', 'vi-ii-III-vi', 'I-II', 'ii-vi-I-v', 'ii-i#-I#-I-ii-vi#-VII-ii-vi#-VII-vi#-ii-vi#-VII-ii', 'I-II', 'I-II-iii-V-I', 'I-II-iii', 'I-II-iii', 'I-vi-ii-I', 'IV-vi-ii-I', 'IV-vi-ii-I', 'IV-vi-ii-I', 'I-V-vi-IV-I', 'I-V-vi-IV', 'I-V-vi-IV-I-V-vi-V', 'I-V-vi-IV', 'IV-vi-V-iii', 'IV-vi-V-iii', 'iii-IV-vi-V', 'IV-vi-V', 'IV-vi-V-iii', 'I-I#', 'I-I#', 'ii-V-I-iii-vi', 'I-I#', 'I-V-vi-V', 'I-V-vi-V-I-V-vi-II', 'I-V-vi-VII-I-V-vi-II-I-VII-iii', 'I', 'I-V-ii-IV', 'I-V-ii-IV', 'vi-iii-IV-ii', 'vi-iii-IV-ii', 'vi-iii-IV-ii', 'IV-III-vi-I', 'IV-III-vi-I', 'IV-III-vi-I-IV-III-vi', 'ii-V-I-IV', 'I-V-ii-IV', 'I-V-ii-IV', 'I-V-ii-IV-V', 'ii-IV

In [10]:
tokenizer = Tokenizer(filters=' ', split="-")  # chords are separated by '-'
tokenizer.fit_on_texts(data)

# convert chord progressions to sequences of integer tokens
sequences = tokenizer.texts_to_sequences(data)

# create input and output sequences
input_sequences = []
output_sequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        input_sequences.append(sequence[:i])
        output_sequences.append(sequence[i])

# pad input sequences with zeros at the beginning
input_sequences = pad_sequences(input_sequences)

# convert output sequences to categorical (one-hot encoding)
output_sequences = to_categorical(output_sequences, num_classes=len(tokenizer.word_index) + 1)


In [11]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=input_sequences.shape[1]))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])




In [12]:
model.fit(input_sequences, output_sequences, epochs=50, verbose=1)  # choose suitable number of epochs


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x16a8bedd0>

In [20]:
def predict_chord(model, tokenizer, text, num_chords):
    sequence = tokenizer.texts_to_sequences([text])[0]
    sequence = pad_sequences([sequence], maxlen=input_sequences.shape[1])
    
    # Generate a sequence of chords
    chord_sequence = []
    for _ in range(num_chords):
        prediction = model.predict(sequence)
        predicted_class = np.argmax(prediction, axis=-1)
        chord = tokenizer.sequences_to_texts([predicted_class])[0]
        chord_sequence.append(chord)
        
        # Update sequence with predicted class for next prediction
        sequence = np.append(sequence[0, 1:], predicted_class)
        sequence = np.reshape(sequence, (1, len(sequence)))
        
    return chord_sequence

print(predict_chord(model, tokenizer, 'I', 2))

['v', 'vi']
