Dataset Evaluation
===============
This notebook produces data-analysis plots (pitch, note-length and accidental statistics) for the pieces generated by a model.

In [None]:
from music21 import corpus, converter, instrument, note, chord, stream, pitch, key
import sys
sys.path.append("..")
from preprocessing import read_mxl, get_notes
import pandas as pd
from fractions import Fraction
import os
from scipy.special import softmax
import matplotlib.pyplot as plt
from statistics import mean
from PIL import Image
import pickle

In [None]:
# Folder of the model whose generated pieces are analysed; the driver code at
# the end of this notebook looks for a "test_results" sub-folder inside it.
grid_search_dataset = '../../example_models/notes_slurs_dynamics_spanners_articulations_text-expressions'

In [None]:
def items_in_pieces(pieces, all_pitches, all_quarterLengths):
    """Tabulate pitch and quarter-length frequencies for every piece.

    Args:
        pieces: Mapping of piece name to an iterable of note events, where
            element 0 of each event is read as the pitch and element 1 as
            the quarter length.
        all_pitches: Every pitch value that gets its own column.
        all_quarterLengths: Every quarter-length value that gets its own column.

    Returns:
        A ``(pitch_frequencies, quarter_length_frequencies)`` pair, each being
        whatever ``frequencies_of_items`` returns for the respective data.
    """
    pitches_by_piece = {}
    lengths_by_piece = {}
    for name, events in pieces.items():
        # Materialise once so both projections read the same events.
        events = list(events)
        pitches_by_piece[name] = [event[0] for event in events]
        lengths_by_piece[name] = [event[1] for event in events]

    pitch_table = frequencies_of_items(pitches_by_piece, all_pitches)
    length_table = frequencies_of_items(lengths_by_piece, all_quarterLengths)
    return pitch_table, length_table

def frequencies_of_items(num_in_pieces, all_items):
    """Count how often each item occurs in each piece.

    Args:
        num_in_pieces: Mapping of piece name to the list of items (e.g.
            pitches or quarter lengths) that occur in that piece.
        all_items: Iterable of the items to count; each one becomes a column.

    Returns:
        A DataFrame with one row per piece. The first column, ``"file"``,
        holds the piece name; the remaining columns (in the order of
        ``all_items``) hold the occurrence count of that item in the piece.
    """
    items = list(all_items)
    # Build all rows first and create the frame once: DataFrame.append and
    # DataFrame.iteritems were removed in pandas 2.0, and appending row by
    # row copies the whole frame on every iteration.
    rows = [
        [name] + [sequence.count(item) for item in items]
        for name, sequence in num_in_pieces.items()
    ]
    return pd.DataFrame(rows, columns=["file"] + items)

In [None]:
def sum_df_frequencies(df_pitch_frequencies):
    """Total every frequency column over all rows.

    The ``"file"`` label column is removed first, so the returned Series is
    indexed by the remaining column names and holds their column sums.
    """
    counts_only = df_pitch_frequencies.drop(columns="file")
    return counts_only.sum()

In [None]:
def concat_generation_types(gt_frequencies):
    """Place the given frequency Series side by side as columns of one frame.

    The row index is not sorted; labels missing from a Series are left as NaN
    in that Series' column.
    """
    side_by_side = pd.concat(gt_frequencies, axis="columns", sort=False)
    return side_by_side

In [None]:
# Large default font size for all plots; the figures created in this notebook
# are very large (30 or 100 inches wide by 30 inches tall).
plt.rcParams.update({'font.size': 40})
def pitches_per_piece(df_pitch_frequencies):
    """Count, for each row, how many pitch columns have a non-zero frequency.

    Args:
        df_pitch_frequencies: Frequency table with one row per piece. A
            ``"file"`` label column, if present, is ignored.

    Returns:
        A Series (indexed like the input rows) with the number of distinct
        pitches used in each piece.
    """
    # The "file" column holds non-empty strings, which are truthy; leaving it
    # in would add one to every piece's pitch count.
    counts_only = df_pitch_frequencies.drop(columns="file", errors="ignore")
    return counts_only.astype(bool).sum(axis=1)
def produce_graphs(dataset):
    """Analyse every sub-folder of ``dataset`` and save comparison plots into it.

    Each directory inside ``dataset`` is treated as one generation technique.
    The fixed folder ``'abrsm_grade_5_violin/data'`` (relative to the current
    working directory) is analysed as well and, if it exists, is always the
    last entry. Entries that are not directories are skipped.

    The following PNG files are written into ``dataset``:
    ``pitch_frequency_analysis``, ``mean_pitches_per_piece``,
    ``mean_accidentals_per_piece``, ``pitch_frequency_analysis_single_octave``,
    ``ql_frequency_analysis``, ``accidentals_per_piece``, ``pitches_per_piece``
    and ``useful_plots`` (a 2x2 sheet of the last four plots).

    Args:
        dataset: Path of the folder holding one sub-folder per generation
            technique.

    Returns:
        A 4-tuple of:
        * dict mapping folder name -> {file: accidental count},
        * the accidental counts of the last analysed folder,
        * dict mapping folder name -> {file: number of distinct pitches},
        * the per-piece distinct pitch counts of the last analysed folder.
    """
    reference_folder = 'abrsm_grade_5_violin/data'

    gen_techs = []
    pitch_freq_series = []
    so_pitch_freq_series = []
    ql_freq_series = []
    counted_acc_per_gen_tech = {}
    counted_pitches_per_gen_tech = {}
    counted_pitches_per_piece = []
    counted_accidentals = []
    mean_pitches_per_piece = []
    mean_accidentals = []

    for piece_folder in os.listdir(dataset) + [reference_folder]:
        if piece_folder == reference_folder:
            full_folder_name = piece_folder
        else:
            full_folder_name = dataset + "/" + piece_folder
        if not os.path.isdir(full_folder_name):
            continue
        gen_techs.append(piece_folder)

        parts_paths, time_signatures, key_signatures, files = read_mxl(full_folder_name)

        accidentals = get_accidentals(parts_paths, key_signatures)
        mean_accidentals.append(mean(accidentals))
        counted_accidentals.append(accidentals)
        counted_acc_per_gen_tech[piece_folder] = dict(zip(files, accidentals))

        pieces, all_pitches, all_quarterLengths = get_notes(parts_paths, single_octave=False)
        so_pieces, so_all_pitches, _ = get_notes(parts_paths, single_octave=True)
        pieces = dict(zip(files, pieces))
        so_pieces = dict(zip(files, so_pieces))

        df_pitch_frequencies, df_ql_frequencies = items_in_pieces(pieces, all_pitches, all_quarterLengths)
        df_so_pitch_frequencies, _ = items_in_pieces(so_pieces, so_all_pitches, all_quarterLengths)

        # Drop the "file" label column before counting: its non-empty strings
        # are truthy and would otherwise add one to every piece's count.
        distinct_pitches = df_pitch_frequencies.drop(columns="file").astype(bool).sum(axis=1)
        counted_pitches_per_gen_tech[piece_folder] = dict(zip(files, distinct_pitches))
        counted_pitches_per_piece.append(distinct_pitches)
        mean_pitches_per_piece.append(distinct_pitches.mean())

        pitch_freq_series.append(_relative_frequencies(df_pitch_frequencies))
        so_pitch_freq_series.append(_relative_frequencies(df_so_pitch_frequencies))
        ql_freq_series.append(_relative_frequencies(df_ql_frequencies))

    plot_freq_dist(pitch_freq_series, gen_techs)
    plt.title("Pitch Frequency Analysis")
    plt.savefig(dataset + "/pitch_frequency_analysis.png")
    # Only clear here: the two bar charts below are drawn on this same figure
    # so that they keep its size.
    plt.clf()

    plt.bar(gen_techs, mean_pitches_per_piece, align='center')
    plt.title("Mean Number of Pitches Per Piece")
    plt.savefig(dataset + "/mean_pitches_per_piece.png")
    plt.clf()

    plt.bar(gen_techs, mean_accidentals, align='center')
    plt.title("Mean Number of Accidentals Per Piece")
    plt.savefig(dataset + "/mean_accidentals_per_piece.png")
    # Every later plot creates its own figure, so release this one instead of
    # leaving it open (figures otherwise pile up across repeated calls).
    plt.close()

    useful_plots = []

    plot_freq_dist(so_pitch_freq_series, gen_techs)
    plt.title("Pitch Frequency Analysis Single Octave")
    plt.savefig(dataset + "/pitch_frequency_analysis_single_octave.png")
    useful_plots.append(dataset + "/pitch_frequency_analysis_single_octave.png")
    plt.close()

    plot_freq_dist(ql_freq_series, gen_techs)
    plt.title("Note Length Frequency Analysis")
    plt.savefig(dataset + "/ql_frequency_analysis.png")
    useful_plots.append(dataset + "/ql_frequency_analysis.png")
    plt.close()

    _save_box_plot(
        counted_accidentals,
        gen_techs,
        'Box Plot Showing The Distribution of Accidentals Accross The Generated Pieces',
        dataset + "/accidentals_per_piece.png",
    )
    useful_plots.append(dataset + "/accidentals_per_piece.png")

    _save_box_plot(
        counted_pitches_per_piece,
        gen_techs,
        'Box Plot Showing The Distribution of Pitches Accross The Generated Pieces',
        dataset + "/pitches_per_piece.png",
    )
    useful_plots.append(dataset + "/pitches_per_piece.png")

    _save_plot_sheet(useful_plots, dataset + '/useful_plots.png')

    return counted_acc_per_gen_tech, counted_accidentals[-1], counted_pitches_per_gen_tech, counted_pitches_per_piece[-1]


def _relative_frequencies(df_frequencies):
    """Return the column totals of a frequency table divided by their grand total."""
    totals = sum_df_frequencies(df_frequencies)
    return totals.div(totals.sum())


def _save_box_plot(data, labels, title, path):
    """Draw one box per entry of ``data`` on a 30x30 figure, save it, and close it."""
    line_style = dict(linewidth=4.0, color='black')
    median_style = dict(linewidth=4.0, color='orange')
    mean_style = dict(linewidth=4.0, color='blue')
    fig, ax = plt.subplots(figsize=(30, 30))
    ax.set_title(title)
    # NOTE(review): recent matplotlib releases deprecate `labels` in favour of
    # `tick_labels`; kept as-is because the installed version is not known here.
    ax.boxplot(
        data,
        labels=labels,
        boxprops=line_style,
        whiskerprops=line_style,
        capprops=line_style,
        flierprops=line_style,
        medianprops=median_style,
        meanprops=mean_style,
        meanline=True,
        showmeans=True,
    )
    fig.savefig(path)
    plt.close(fig)


def _save_plot_sheet(image_paths, target_path):
    """Paste the images row by row into a two-column grid and save it.

    The sheet is two cells wide and two cells high, each cell as large as the
    largest input image, matching the four plots passed by ``produce_graphs``.
    """
    opened = [Image.open(path) for path in image_paths]
    try:
        widths, heights = zip(*(image.size for image in opened))
        width = max(widths)
        height = max(heights)
        sheet = Image.new('RGB', (width * 2, height * 2))
        for position, image in enumerate(opened):
            sheet.paste(image, ((position % 2) * width, (position // 2) * height))
        sheet.save(target_path)
    finally:
        # Release the file handles of the source images.
        for image in opened:
            image.close()
    
def plot_freq_dist(freq_series, gen_techs):
    """Draw a grouped bar chart comparing frequency distributions.

    Args:
        freq_series: One frequency Series per generation technique.
        gen_techs: Names of the generation techniques, in the same order as
            ``freq_series``; they replace the positional column labels.

    The chart is drawn on a new figure that is 30 inches tall and 30 inches
    wide, or 100 inches wide when there are more than 20 rows (distinct
    items) to show. Nothing is returned; callers add a title and save the
    figure through ``plt``.
    """
    combined = concat_generation_types(freq_series)
    # The concatenated columns are labelled 0..n-1; map them to technique names.
    combined = combined.rename(columns=dict(enumerate(gen_techs)))
    width = 100 if len(combined) > 20 else 30
    combined.plot.bar(rot=0, figsize=(width, 30))
    

def get_accidentals(parts_paths, key_signatures):
    """Count, per piece, the notes that fall outside the piece's major scale.

    Args:
        parts_paths: Passed straight to ``get_notes`` (single-octave mode).
        key_signatures: One key-signature value per piece, in the same order
            as the pieces returned by ``get_notes``; each is passed to
            ``key.KeySignature``.

    Returns:
        A list with one integer per piece: the number of note events whose
        first element is neither a scale note name nor ``"Rest"``.
    """
    pieces, _, _ = get_notes(parts_paths, single_octave=True)
    counted_accidentals = []
    for piece, signature in zip(pieces, key_signatures):
        major_scale = key.KeySignature(signature).getScale("major")
        # Drop the last character of each scale pitch's string form
        # (presumably the octave digit) and de-duplicate the names.
        scale_names = list({str(scale_pitch)[:-1] for scale_pitch in major_scale.getPitches()})
        not_accidental = scale_names + ["Rest"]
        piece_pitches = [event[0] for event in piece]
        accidental_total = sum(1 for name in piece_pitches if name not in not_accidental)
        counted_accidentals.append(accidental_total)
    return counted_accidentals

# Produce the graphs for the pieces generated from the best-"loss" and
# best-"val_loss" checkpoints stored under <grid_search_dataset>/test_results.
#
# The previous version wrapped this in a loop over os.listdir(grid_search_dataset),
# but its format string had a single placeholder, so the loop variable was
# ignored and the very same folder was re-analysed once per directory entry.
# The folder is now processed exactly once.
full_folder_name = "{}/test_results".format(grid_search_dataset)
print(full_folder_name)
if os.path.isdir(full_folder_name):
    for loss_val_loss in ["loss", "val_loss"]:
        generated_scores_folder = "{}/{}/generated_scores".format(full_folder_name, loss_val_loss)
        print(generated_scores_folder)
        produce_graphs(generated_scores_folder)