##### Imports:

In [None]:
from utils import NOTES_NAMES, FULL_CHORD_LIST, CUSTOM_ENCODING, INVERSE_ENCODING, separate_last_chord, mean_chord_distance_with_quality, predict_next_chords, calculate_covariance_from_chroma, separate_for_training, calculate_mu_from_chroma, calculate_transition_probabilites, format_indiv_chroma, get_unique_predicted, calculate_initial_probabilities, chord_distance_with_quality
import pickle
from chroma import get_chromagram
import pandas as pd
from tqdm import tqdm
import numpy as np
from hmmlearn import hmm
from sklearn.metrics import f1_score
import altair as alt

##### Steps:

1. Training / Testing Data Split
2. Create Chromagram from Training Data
3. Create HMM Initialization Components
    - Initial State Probabilities
    - Transition Probability Matrix
    - Mu Value (emission means)
    - Covariance Matrix (emission covariances)
4. Create HMM Object
5. Fit / Train HMM

##### Training / Test Data Split:

In [None]:
# Load the pickled dataset and split its pieces into training and test sets.
# NOTE: unpickling can execute arbitrary code, so only load a dataset.pkl that
# comes from a trusted source.
with open(r"dataset.pkl", "rb") as dataset_file:
    midi_data: dict = pickle.load(dataset_file)

# presumably 0.8 is the share of pieces assigned to training — confirm in utils
train_fraction = 0.8
training_piece_names, test_piece_names = separate_for_training(midi_data, train_fraction)

# NOTE(review): both names below are also imported from utils at the top of the
# notebook; these assignments rebind them for every later cell. Confirm the
# local definitions are meant to take precedence.
NOTES_NAMES = [
    "C", "C#", "D", "D#", "E", "F",
    "F#", "G", "G#", "A", "A#", "B",
]
# Each root note paired with a major (''), minor ('m') and diminished ('dim') suffix.
FULL_CHORD_LIST = [
    root + quality
    for root in NOTES_NAMES
    for quality in ('', 'm', 'dim')
]

##### Create Chromagram from Training Data:

In [None]:
# Build one chromagram per training song and stack them into a single frame.
song_chromagrams = []
for song_name in tqdm(list(training_piece_names)):
    indiv_chroma = get_chromagram(song_name, midi_data)
    # NOTE(review): `formatted` is never read; the unformatted `indiv_chroma`
    # is what gets appended below. Confirm whether `formatted` was meant to be
    # appended instead, or whether format_indiv_chroma modifies its argument
    # in place.
    formatted = format_indiv_chroma(indiv_chroma)
    song_chromagrams.append(indiv_chroma)

# Concatenate the per-song chromagrams and preview the first 200 rows.
chromagram = pd.concat(song_chromagrams)
chromagram.head(200)

##### Create HMM Components:

###### Initial State Probabilities:

In [None]:
# Compute the initial state probabilities from the training pieces; the result
# is later assigned to the HMM's startprob_.
initial_state_probabilties = calculate_initial_probabilities(training_piece_names, midi_data)
# Display the result as the cell output.
initial_state_probabilties

###### Transition Matrix:

In [None]:
# Compute the state-to-state transition probabilities from the training chromagram.
transition_prob_matrix = calculate_transition_probabilites(chromagram)
print(transition_prob_matrix)

# Sanity check: each row is a probability distribution over next states, so it
# has to sum to one (within floating-point tolerance).
row_totals = transition_prob_matrix.sum(axis=1)
assert np.allclose(row_totals, 1), "Not all rows sum to 1"

###### Mu Value:

In [None]:
# Emission means computed from the training chromagram; later assigned to
# model.means_ (presumably one mean vector per hidden state — confirm in utils).
mu = calculate_mu_from_chroma(chromagram)

###### Covariance Matrix:

In [None]:
# Emission covariances computed from the training chromagram. Presumably one
# matrix per hidden state: the model setup cell takes np.diag of each entry.
covars = calculate_covariance_from_chroma(chromagram)
print("Covariances shape:", covars.shape)

In [None]:
# Initialize the model and set its parameters directly from the statistics
# computed above (this cell does not call fit()).
n_states = transition_prob_matrix.shape[0]
model = hmm.GaussianHMM(n_components=n_states, covariance_type="diag")
model.startprob_ = initial_state_probabilties
model.transmat_ = transition_prob_matrix.values
model.means_ = mu

# covariance_type="diag" expects one variance vector per state, so keep only
# the diagonal of each covariance entry (assumes each entry of `covars` is a
# square 2-D matrix — TODO confirm). The 1e-6 offset keeps every variance
# strictly positive.
diag_covars = np.array(
    [np.diag(cov_matrix) + 1e-6 for cov_matrix in covars]
).reshape(-1, 12)
model.covars_ = diag_covars

# n_features is the dimensionality of the emissions, not the number of states.
# It was hard-coded to 36; take it from the covariance width so it always
# agrees with the parameters actually set above.
model.n_features = diag_covars.shape[1]

In [None]:
# Evaluate on the held-out songs: split the final chord off each test piece,
# run the model on the remaining chord sequence, and compare the last decoded
# state against the encoded true final chord.
true_labels = []
predicted_labels = []
for song_name in tqdm(list(test_piece_names)):
    song_chromagram = get_chromagram(song_name, midi_data)
    last_chord, chromagram_without_last_chord = separate_last_chord(song_chromagram)

    # Songs with nothing left after removing the final chord are skipped, so
    # the two label lists can end up shorter than test_piece_names.
    if chromagram_without_last_chord.empty:
        continue

    # Chords that are missing from CUSTOM_ENCODING are encoded as -1.
    chord_codes = chromagram_without_last_chord['Chord Actual'].apply(
        lambda chord: CUSTOM_ENCODING.get(chord, -1)
    )
    # Reshape to a single-column 2-D array (one row per chord) before predicting.
    encoded_chromagram_without_last_chord = chord_codes.values.reshape(-1, 1)

    preds = model.predict(encoded_chromagram_without_last_chord)
    predicted_labels.append(preds[-1])
    true_labels.append(CUSTOM_ENCODING.get(last_chord, -1))

f1 = f1_score(true_labels, predicted_labels, average='micro')
print(f"F1 Score: {f1}")

In [None]:
# Return chords to readable form and compute the distance between each true
# chord and its predicted chord.
inverted_custom_encoding = {code: chord for chord, code in CUSTOM_ENCODING.items()}

# NOTE(review): a label of -1 (the fallback used when a chord is missing from
# CUSTOM_ENCODING) has no entry in the inverted mapping and raises KeyError here.
true_chords = pd.Series(true_labels).apply(lambda label: inverted_custom_encoding[label])
predicted_chords = pd.Series(predicted_labels).apply(lambda label: inverted_custom_encoding[label])
song_names = pd.Series(test_piece_names)

# The label lists only contain songs that were actually evaluated, while
# song_names covers every test song. If any song was skipped, the columns would
# be paired up by position and silently attach chords to the wrong songs, so
# fail loudly instead of building a misaligned table.
if not (len(song_names) == len(true_chords) == len(predicted_chords)):
    raise ValueError(
        f"Cannot align results with songs: {len(song_names)} test songs but "
        f"{len(true_chords)} true and {len(predicted_chords)} predicted chords. "
        "Songs skipped during evaluation must be dropped from the song list too."
    )

chord_distance_df = pd.DataFrame({
    'Song Name': song_names,
    'True Chord': true_chords,
    'Predicted Chord': predicted_chords,
})

# The first argument is the true chord and the second the predicted chord; the
# argument order is unchanged, only the loop variable names now match the
# columns they come from.
chord_distance_df['Distance'] = [
    chord_distance_with_quality(true_chord, predicted_chord)
    for true_chord, predicted_chord in zip(
        chord_distance_df['True Chord'], chord_distance_df['Predicted Chord']
    )
]


chord_distance_df.head()


In [None]:
# Plot distances: one bar per test song, overlaid with a rule at the mean.
num_test_songs = chord_distance_df['Song Name'].count()

# The printed figure is the mean of the absolute distances; the red rule in
# the chart is the mean of the signed Distance column.
mean_abs_distance = chord_distance_df['Distance'].apply(np.abs).mean()
print(mean_abs_distance)

# Encoding channels for the bar layer. Song labels are hidden and the bars are
# sorted by descending y value.
song_axis = alt.X(
    'Song Name:N',
    axis=alt.Axis(labels=False, ticks=True),
    sort='-y',
).title(f'Songs (n={num_test_songs})')
distance_axis = alt.Y('Distance:Q').title('Distance (Actual - Predicted)')
bar_colour = alt.Color('Distance:N').scale(scheme='bluepurple')

distance_bars = (
    alt.Chart(chord_distance_df)
    .mark_bar(binSpacing=0.1, width=1)
    .encode(
        x=song_axis,
        y=distance_axis,
        color=bar_colour,
        tooltip=['Song Name', 'True Chord', 'Predicted Chord', 'Distance'],
    )
    .properties(
        title='Distribution of the distance between Predicted and Actual chords for each Test Song'
    )
)

# Horizontal rule at the mean of the Distance column.
distance_avg_line = (
    alt.Chart(chord_distance_df)
    .mark_rule(color='red')
    .encode(y=alt.Y('mean(Distance):Q', title=''))
)

combined_chart = alt.layer(distance_bars, distance_avg_line).properties(width=1000)
combined_chart