In [4]:
import pickle
import statistics
import librosa as lbs
from tqdm import tqdm
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import random
from chroma import *
import traceback
from utils import preprocess, separate_for_training

In [7]:
NOTES_NAMES =   ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
FULL_CHORD_LIST = [note + suffix for note in NOTES_NAMES for suffix in ['', 'm', 'dim']]

In [22]:
def format_indiv_chroma(unformatted_chroma:pd.DataFrame):
    # TODO: determine what values go in the start and end rows
    zeroed_vals = [[0 for i in range(unformatted_chroma.shape[1])]]
    # start = pd.DataFrame([zeroed_vals[0][:-1] + [['<S>']]], columns=unformatted_chroma.columns)
    middle = unformatted_chroma
    end = pd.DataFrame([zeroed_vals[0][:-1] + [['<E>']]], columns=unformatted_chroma.columns)

    formatted_chroma = pd.concat([middle, end]).reset_index(drop=True)
    return formatted_chroma

In [5]:
# Load data and split into training and test
piece_name_dict = preprocess('dataset.pkl')

training_piece_names, _, test_piece_names = separate_for_training(piece_name_dict, 0.8, 0.)

with open(r"dataset.pkl", 'rb') as data:
    midi_data = pickle.load(data)

100%|██████████| 5762/5762 [00:13<00:00, 425.90it/s]


##### Initial Probabilities:

In [14]:
def get_initial_chord(file_name, midi_data):
    mode = midi_data[file_name]['mode']
    # check if sequence is in a minor or major scale
    if mode == 'm':
        seq_scale = get_minor_scale(NOTES_NAMES, file_name, midi_data)
    else:
        seq_scale = get_maj_scale(NOTES_NAMES, file_name, midi_data)
    # get the first chord
    chord = list(set(get_progression(file_name, midi_data)))[0]
    if chord not in seq_scale:
        # define regex pattern to get an instance of the chord
        pattern = r'\b\w*{}\w*\b'.format(re.escape(chord))
        # join list of chords into a single string to parse for regex and replace in chroma labels
        # each seq_scale is of length 7
        scale_to_string = (' ').join(seq_scale)
        # find true chord
        found_chords = re.findall(pattern, scale_to_string, flags=re.IGNORECASE)
        if found_chords:
            chord = found_chords[0]
        else:
            return None
    return chord

#returns all initial probabilities, also adapts for dimensions of transition matrix
#returns a 36x1 of probabilities for each chord
def calculate_initial_probabilities(filenames, midi_data):
    first_chords = []
    # Get all initial chords
    for file_name in filenames:
        chord = get_initial_chord(file_name, midi_data)
        if chord is not None:
            first_chords.append(chord)
    chord_counts = np.unique(first_chords, return_counts=True)
    total_num_chords = chord_counts[1].sum()
    probabilities = chord_counts[1].astype(np.float64)/float(total_num_chords)
    # Create a Series from the counts
    initial_probs = pd.Series(probabilities, index=chord_counts[0])
    # all_chords = pd.Series(np.zeros(len(FULL_CHORD_LIST)), index=FULL_CHORD_LIST)
    # all_chords.update(initial_probs)
    dif = 1.0 - initial_probs.sum() 
    if dif != 0:
        max_index = np.argmax(initial_probs)
        initial_probs.iloc[max_index] += dif

    return initial_probs

In [19]:
initial_probabilities = calculate_initial_probabilities(test_piece_names, midi_data)
initial_probabilities, initial_probabilities.shape

(A      0.017346
 B      0.230703
 Bm     0.397225
 C      0.014744
 C#m    0.000867
 D      0.010408
 D#m    0.156982
 Em     0.018213
 F#     0.111882
 F#m    0.041631
 dtype: float64,
 (10,))

##### Transition Matrix:

In [30]:
def calculate_chord_prob(chord_notes):
    group_count = chord_notes.groupby('following_chords').size().reset_index()
    group_count.columns = ['following_chords', 'count']
    total = group_count['count'].sum()
    group_count['transition_probability'] = group_count['count'] / total
    return group_count

def calculate_transition_probabilites(chroma):
    # Look into splitting between songs somehow
    initial_chords = chroma['Chord Actual'].values[:-1]
    following_chords = chroma['Chord Actual'][1:].tolist()

    sequence_df = pd.DataFrame({'initial_chords': initial_chords, 'following_chords': following_chords})

    transition_prob_matrix = sequence_df.groupby('initial_chords').apply(calculate_chord_prob).reset_index().drop('level_1', axis=1)

    transition_prob_matrix = transition_prob_matrix.pivot(index='initial_chords', columns='following_chords', values='transition_probability')

    # Transition probabilities for start and end states
    # transition_prob_matrix['<E>'] = pd.Series(np.zeros(transition_prob_matrix.shape[1]), name='<E>')

    # transition_prob_matrix = transition_prob_matrix.fillna(0)

    # transition_prob_matrix['<S>'] = 0
    # transition_prob_matrix.loc['<E>'] = 0
    # transition_prob_matrix.loc['<E>', '<E>'] = 1

    # # Initialize a 36x36 DataFrame with zeros
    # all_chords_matrix = pd.DataFrame(0, index=FULL_CHORD_LIST, columns=FULL_CHORD_LIST)

    # # Update this matrix with the calculated transition probabilities
    # all_chords_matrix.update(transition_prob_matrix)

    # # Fill any NaN values with 0
    # all_chords_matrix = all_chords_matrix.fillna(0)

    transition_prob_matrix = transition_prob_matrix.fillna(0.)

    # for chord in transition_prob_matrix.index:
    #     row = transition_prob_matrix.loc[chord]
    #     row_sum = row.sum()
    #     if row_sum != 1:
    #         row[chord] = 1.
    #     transition_prob_matrix.loc[chord] = row

    return transition_prob_matrix

In [33]:
song_chromagrams = []
for song_name in tqdm(list(training_piece_names)):
    indiv_chroma = get_chromagram(song_name, midi_data)
    formatted = format_indiv_chroma(indiv_chroma)
    song_chromagrams.append(indiv_chroma)

chromagram = pd.concat(song_chromagrams)
chromagram.head(-1)

100%|██████████| 4609/4609 [01:38<00:00, 46.65it/s] 


Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,Chord Actual
0,0,0,257,0,0,0,0,67,0,75,0,145,G
1,0,0,257,0,0,0,0,67,0,75,0,145,G
2,0,0,257,0,0,0,0,67,0,75,0,145,G
3,0,0,257,0,0,0,0,67,0,75,0,145,G
4,0,0,257,0,0,0,0,67,0,75,0,145,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,0,0,0,0,0,0,0,0,0,0,127,0,F#
26,0,0,0,127,0,127,0,127,0,0,254,0,F#
28,0,0,0,127,0,127,0,127,0,0,254,0,F#
29,0,0,0,0,0,127,0,0,0,0,127,0,F#


In [32]:
transition_matrix = calculate_transition_probabilites(chromagram)
transition_matrix, transition_matrix.shape, [transition_matrix.loc[i].sum() for i in transition_matrix.index]

  transition_prob_matrix = sequence_df.groupby('initial_chords').apply(calculate_chord_prob).reset_index().drop('level_1', axis=1)


(following_chords         A        A#     A#dim         B        Bm         C  \
 initial_chords                                                                 
 A                 0.861361  0.000000  0.000072  0.005890  0.042310  0.000216   
 A#                0.000000  0.875000  0.000000  0.000000  0.000000  0.019231   
 A#dim             0.000000  0.000000  0.878641  0.012136  0.021845  0.000000   
 B                 0.001176  0.000000  0.000181  0.863912  0.004340  0.000000   
 Bm                0.042588  0.000000  0.000000  0.002165  0.863114  0.000131   
 C                 0.005894  0.000000  0.000000  0.005894  0.005894  0.860511   
 C#dim             0.002597  0.000000  0.000000  0.012987  0.036364  0.000000   
 C#m               0.000444  0.000000  0.000000  0.007398  0.004143  0.000000   
 D                 0.051112  0.000086  0.000000  0.004037  0.028262  0.000344   
 D#                0.000000  0.000000  0.000000  0.000000  0.015625  0.000000   
 D#m               0.001008 

##### Mu and Covariance:

In [76]:
def calculate_mu_from_chroma(chroma):
    ''' 
    
    '''
    return chroma[NOTES_NAMES].mean()

def calculate_emission_from_chroma(chroma):
    
    matrices = []

    # seen_chord_groups = chroma.groupby('Chord Actual')
    # for chord in FULL_CHORD_LIST:
    #     if chord in seen_chord_groups.index:
    #         # actual covariance
    #     else:
    #         # Fake covariance
        
        
    chord_groups = chroma.groupby('Chord Actual')
    print(chord_groups.groups.keys())
    for chord, group in chord_groups:
        chord_cov_matrix = group[list(chord_groups.groups.keys())].cov().values
        matrices.append(chord_cov_matrix)

    return np.array(matrices)
    # return chord_groups

In [67]:
calculate_emission_from_chroma(chromagram)

Unnamed: 0_level_0,C,C,C,C,C,C,C,C,C#,C#,...,A#,A#,B,B,B,B,B,B,B,B
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Chord Actual,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A,13921.0,1.855757,16.328036,0.0,0.0,0.0,0.0,226.0,13921.0,89.86488,...,62.0,300.0,13921.0,19.233029,42.928168,0.0,0.0,0.0,0.0,254.0
A#,104.0,19.769231,29.816667,0.0,0.0,0.0,62.0,67.0,104.0,0.0,...,120.0,173.0,104.0,14.0,28.576588,0.0,0.0,0.0,0.0,97.0
A#dim,412.0,5.097087,26.437279,0.0,0.0,0.0,0.0,181.0,412.0,0.0,...,191.25,392.0,412.0,1.786408,8.906817,0.0,0.0,0.0,0.0,48.0
B,11059.0,0.465142,6.251723,0.0,0.0,0.0,0.0,118.0,11059.0,31.426259,...,0.0,235.0,11059.0,134.502035,82.735313,0.0,66.0,128.0,177.0,442.0
Bm,15239.0,2.003872,16.211812,0.0,0.0,0.0,0.0,218.0,15239.0,10.580812,...,0.0,227.0,15239.0,96.979198,74.464303,0.0,53.0,88.0,140.0,461.0
C,509.0,157.031434,112.309134,0.0,58.0,144.0,228.0,457.0,509.0,0.848723,...,0.0,79.0,509.0,14.37721,36.269051,0.0,0.0,0.0,0.0,214.0
C#dim,385.0,4.072727,27.994696,0.0,0.0,0.0,0.0,196.0,385.0,134.654545,...,0.0,184.0,385.0,1.755844,10.275508,0.0,0.0,0.0,0.0,81.0
C#m,6759.0,7.96316,29.314991,0.0,0.0,0.0,0.0,213.0,6759.0,87.407013,...,118.0,399.0,6759.0,17.600829,45.457526,0.0,0.0,0.0,0.0,249.0
D,11641.0,0.022077,1.454444,0.0,0.0,0.0,0.0,127.0,11641.0,9.05704,...,0.0,94.0,11641.0,3.248518,16.865493,0.0,0.0,0.0,0.0,207.0
D#,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,...,113.0,121.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
mu = calculate_mu_from_chroma(chromagram)
cov = calculate_emission_from_chroma(chromagram)
mu, cov, cov.shape

dict_keys(['A', 'A#', 'A#dim', 'B', 'Bm', 'C', 'C#dim', 'C#m', 'D', 'D#', 'D#m', 'E', 'Em', 'F', 'F#', 'F#m', 'G', 'G#', 'G#m', 'dim'])


KeyError: "['A#dim', 'Bm', 'C#dim', 'C#m', 'D#m', 'Em', 'F#m', 'G#m', 'dim'] not in index"