In [1]:
# importing necessary dependencies
import os
import os.path
# The project-local modules are imported with the working directory temporarily
# set to the repository root; the notebook then returns to notebooks/.
os.chdir('..')
from src.read_pitchscapes import *
from src.key_profile import *
import pandas as pd
from pitchscapes.keyfinding import KeyEstimator
os.chdir('notebooks')
# Explicit imports come after the star imports so these names are guaranteed to
# refer to the real libraries (np is used when saving the embeddings).
import numpy as np
import scipy
import scipy.spatial.distance
from sklearn.manifold import Isomap
import unicodedata as ud

# Pitchscapes, metadata and useful columns
In this notebook, we prepare the pitch class distributions for training, along with the needed metadata and useful info.

In [2]:
# Choose which corpus to process: the composer list and the output folder
# both follow from this single switch.
sample_composers = False

composers = (
    ['Scriabin_Alexander',
     'Brahms_Johannes',
     'Liszt_Franz',
     'Chopin_Frederic',
     'Schubert_Franz',
     'Debussy_Claude',
     'Ravel_Maurice']
    if sample_composers
    else ['Bach_JohannSebastian']
)
dump_path = 'dump/sample_composers/' if sample_composers else 'dump/bach/'

# Files written by the later cells of this notebook.
dump_pieces_path = f'{dump_path}pieces.csv'
dump_embedding_path = f'{dump_path}X_transformed.npy'
dump_landmarks_path = f'{dump_path}landmarks.npy'
dump_landmark_labels_path = f'{dump_path}landmark_labels.npy'

**Note:** To run the next cell, the data must be in a data/ folder at the root of the project.
The dataset is private, but it can be obtained from the DCML; our project supervisor is Dr Robert Lieck (robert.lieck@epfl.ch).

However, we save the necessary files for the rest of the pipeline in .csv files. Therefore, it's not necessary to obtain the data.

In [3]:
# For each composer, read the pitchscape distributions (we use n_times=10)
# and tag every row with the composer's folder name.
composer_frames = [
    read_pitchscapes_in_folder(f'../data/scores/{composer}', 10).assign(composer=composer)
    for composer in composers
]

# Concatenate once instead of growing the frame inside the loop, and renumber
# the rows 0..n-1: without ignore_index the row labels repeat across composers,
# which makes the later label-aligned column assignments ambiguous.
df_pieces = (
    pd.concat(composer_frames, ignore_index=True)
    if composer_frames
    else pd.DataFrame()
)


reading folder: ../data/scores/Bach_JohannSebastian

reading piece: Bach_BWV_795.musicxml
reading piece: 227966-Prelude_No._2_BWV_847_in_C_Minor.mxl
reading piece: 1303106-Fugue_No._11_BWV_880_in_F_Major.mxl
reading piece: 667506-Prelude_No._19_BWV_864_in_A_Major.mxl
reading piece: Bach_BWV_772a.musicxml
reading piece: 971581-Prelude_No._3_BWV_872_in_C_Major.mxl
reading piece: 609331-Fugue_No._17_BWV_862_in_A_Major.mxl
reading piece: Brandenburg-Concerto-no.-5-BWV-1050_I-Allegro_Bach-Johann-Sebastian_file1.xml
reading piece: 571476-Fugue_No._16_BWV_861_in_G_Minor.mxl
reading piece: 1559631-Fugue_No._16_BWV_885_in_G_Minor.mxl
reading piece: 1497201-Prelude_No._15_BWV_884_in_G_Major.mxl
reading piece: 1718101-Fugue_No._19_BWV_888_in_A_Major.mxl
reading piece: 1141201-Fugue_No._7_BWV_876_in_E_Major.mxl
reading piece: 869596-Fugue_No._24_BWV_869_in_B_Minor.mxl
reading piece: 1032401-Prelude_No._4_BWV_873_in_C_Minor.mxl
reading piece: 936016-Prelude_No._2_BWV_871_in_C_Minor.mxl
reading pie

In [4]:
# Preview the first five rows of the loaded pitch-class distributions.
df_pieces.head()

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,ts,te,name,piece,composer
0,0.112179,0.067308,0.083333,0.073718,0.086538,0.152244,0.043269,0.06891,0.086538,0.064103,0.092949,0.06891,0.0,14.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian
1,0.13125,0.069792,0.054167,0.069792,0.097917,0.157292,0.034375,0.083333,0.0875,0.066667,0.095833,0.052083,0.0,28.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian
2,0.123457,0.076389,0.061728,0.075617,0.085648,0.151235,0.040123,0.07716,0.10571,0.055556,0.102623,0.044753,0.0,42.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian
3,0.120711,0.078431,0.066176,0.094363,0.07598,0.135417,0.03799,0.089461,0.110294,0.04902,0.10049,0.041667,0.0,56.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian
4,0.127541,0.07622,0.065041,0.099593,0.072154,0.135671,0.034553,0.09502,0.105183,0.049797,0.102642,0.036585,0.0,70.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian


In [5]:
# The metadata file and the musicXML filenames do not use the same Unicode
# form, so every piece name is brought to NFD here.
def _to_nfd(raw_name):
    """Return `raw_name` converted to str and normalized to Unicode NFD."""
    return ud.normalize('NFD', str(raw_name))


df_pieces['name'] = df_pieces['name'].map(_to_nfd)

In [6]:
# Width of the window behind each data point: `te` minus `ts`.
df_pieces["window_width"] = df_pieces['te'].sub(df_pieces['ts'])

In [7]:
# Preview the frame with the new window_width column.
df_pieces.head(5)

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,ts,te,name,piece,composer,window_width
0,0.112179,0.067308,0.083333,0.073718,0.086538,0.152244,0.043269,0.06891,0.086538,0.064103,0.092949,0.06891,0.0,14.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,14.0
1,0.13125,0.069792,0.054167,0.069792,0.097917,0.157292,0.034375,0.083333,0.0875,0.066667,0.095833,0.052083,0.0,28.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,28.0
2,0.123457,0.076389,0.061728,0.075617,0.085648,0.151235,0.040123,0.07716,0.10571,0.055556,0.102623,0.044753,0.0,42.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,42.0
3,0.120711,0.078431,0.066176,0.094363,0.07598,0.135417,0.03799,0.089461,0.110294,0.04902,0.10049,0.041667,0.0,56.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,56.0
4,0.127541,0.07622,0.065041,0.099593,0.072154,0.135671,0.034553,0.09502,0.105183,0.049797,0.102642,0.036585,0.0,70.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,70.0


In [8]:
# Preparing the relative window width: collect, per piece, the largest and
# smallest window width and attach both to every data point of that piece.
window_width = df_pieces['te'].sub(df_pieces['ts']).to_frame(name='window_width')
window_width['name'] = df_pieces['name']

# One groupby pass yields both extremes, indexed by piece name.
maxs_mins = window_width.groupby('name')['window_width'].agg(['max', 'min'])
maxs = maxs_mins['max']
mins = maxs_mins['min']

merged_max_min = df_pieces.merge(maxs_mins, left_on='name', right_index=True)

In [9]:
# Preview the per-piece maximum and minimum window widths.
maxs_mins.head()

Unnamed: 0_level_0,max,min
name,Unnamed: 1_level_1,Unnamed: 2_level_1
1032401-Prelude_No._4_BWV_873_in_C_Minor.mxl,276.0,27.6
1040681-Fugue_No._4_BWV_873_in_C_Minor.mxl,210.75,21.075
1066276-Prelude_No._5_BWV_874_in_D_Major.mxl,224.0,22.4
1073231-Fugue_No._5_BWV_874_in_D_Major.mxl,199.5,19.95
1098286-Prelude_No._6_BWV_875_in_D_Minor.mxl,183.0,18.3


In [10]:
# Preview the data points with their piece-level max/min widths attached.
merged_max_min.head()

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,ts,te,name,piece,composer,window_width,max,min
0,0.112179,0.067308,0.083333,0.073718,0.086538,0.152244,0.043269,0.06891,0.086538,0.064103,0.092949,0.06891,0.0,14.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,14.0,140.0,14.0
1,0.13125,0.069792,0.054167,0.069792,0.097917,0.157292,0.034375,0.083333,0.0875,0.066667,0.095833,0.052083,0.0,28.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,28.0,140.0,14.0
2,0.123457,0.076389,0.061728,0.075617,0.085648,0.151235,0.040123,0.07716,0.10571,0.055556,0.102623,0.044753,0.0,42.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,42.0,140.0,14.0
3,0.120711,0.078431,0.066176,0.094363,0.07598,0.135417,0.03799,0.089461,0.110294,0.04902,0.10049,0.041667,0.0,56.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,56.0,140.0,14.0
4,0.127541,0.07622,0.065041,0.099593,0.072154,0.135671,0.034553,0.09502,0.105183,0.049797,0.102642,0.036585,0.0,70.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,70.0,140.0,14.0


In [11]:
# Min-max scale each window width within its piece: 0 for the narrowest
# window of the piece, 1 for the widest.
width_above_min = merged_max_min['window_width'] - merged_max_min['min']
width_range = merged_max_min['max'] - merged_max_min['min']
df_pieces['relative_width'] = width_above_min / width_range

In [12]:
# Preview the frame with the new relative_width column.
df_pieces.head()

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,ts,te,name,piece,composer,window_width,relative_width
0,0.112179,0.067308,0.083333,0.073718,0.086538,0.152244,0.043269,0.06891,0.086538,0.064103,0.092949,0.06891,0.0,14.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,14.0,0.0
1,0.13125,0.069792,0.054167,0.069792,0.097917,0.157292,0.034375,0.083333,0.0875,0.066667,0.095833,0.052083,0.0,28.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,28.0,0.111111
2,0.123457,0.076389,0.061728,0.075617,0.085648,0.151235,0.040123,0.07716,0.10571,0.055556,0.102623,0.044753,0.0,42.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,42.0,0.222222
3,0.120711,0.078431,0.066176,0.094363,0.07598,0.135417,0.03799,0.089461,0.110294,0.04902,0.10049,0.041667,0.0,56.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,56.0,0.333333
4,0.127541,0.07622,0.065041,0.099593,0.072154,0.135671,0.034553,0.09502,0.105183,0.049797,0.102642,0.036585,0.0,70.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,70.0,0.444444


In [13]:
# Ask the key estimator for the best-matching key of every data point.
# The following cells read column 0 of `best_match` as the major/minor flag
# and column 1 as the transposition index.
key_estim = KeyEstimator()
data_pts = get_pitchscapes_as_np_array(df_pieces)
best_match = key_estim.get_estimate(data_pts)

In [14]:
# adding the major/minor estimate: 0 in column 0 of best_match is labelled
# "major", any other value "minor".
# The labels are assigned positionally (one per row of df_pieces). Wrapping the
# estimates in a fresh pd.Series would give them a 0..n-1 index and pandas would
# align on those labels, silently mis-assigning rows whenever df_pieces carries
# repeated or non-consecutive row labels.
df_pieces["tonality"] = [
    "major" if mode_flag == 0 else "minor" for mode_flag in best_match[:, 0]
]

In [15]:
# adding transposition: column 1 of best_match indexes into the twelve pitch
# classes listed below.
mapping_transpo = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
# Assigned positionally (one value per row of df_pieces) rather than through a
# freshly indexed pd.Series, so the result does not depend on df_pieces' row
# labels being exactly 0..n-1.
df_pieces["transposition"] = [
    mapping_transpo[pitch_idx] for pitch_idx in best_match[:, 1]
]

In [16]:
# Preview the frame with the tonality and transposition columns.
df_pieces.head()

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,...,B,ts,te,name,piece,composer,window_width,relative_width,tonality,transposition
0,0.112179,0.067308,0.083333,0.073718,0.086538,0.152244,0.043269,0.06891,0.086538,0.064103,...,0.06891,0.0,14.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,14.0,0.0,minor,F
1,0.13125,0.069792,0.054167,0.069792,0.097917,0.157292,0.034375,0.083333,0.0875,0.066667,...,0.052083,0.0,28.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,28.0,0.111111,minor,F
2,0.123457,0.076389,0.061728,0.075617,0.085648,0.151235,0.040123,0.07716,0.10571,0.055556,...,0.044753,0.0,42.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,42.0,0.222222,minor,F
3,0.120711,0.078431,0.066176,0.094363,0.07598,0.135417,0.03799,0.089461,0.110294,0.04902,...,0.041667,0.0,56.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,56.0,0.333333,minor,F
4,0.127541,0.07622,0.065041,0.099593,0.072154,0.135671,0.034553,0.09502,0.105183,0.049797,...,0.036585,0.0,70.0,Bach_BWV_795.musicxml,0,Bach_JohannSebastian,70.0,0.444444,minor,F


In [17]:
# loading the metadata dataframe
# The file is tab-separated; the separator is written as "\t" because a literal
# tab inside the string is invisible and easily turned into spaces by an editor.
metadata_columns = ["composer", "composer_first", "filename", "display_year"]
metadata_df = (
    pd.read_csv("../data/metadata.csv", sep="\t")
    .loc[:, metadata_columns]
    # Same NFD normalization as applied to the piece names, so both sides of
    # the later merge use the same Unicode form.
    .assign(filename=lambda frame: frame["filename"].map(
        lambda x: ud.normalize("NFD", str(x))))
    # "composer" already names a column of df_pieces, hence the rename.
    .rename(columns={"composer": "composer_last"})
)

In [18]:
# Preview the trimmed metadata table.
metadata_df.head()

Unnamed: 0,composer_last,composer_first,filename,display_year
0,Alkan,Charles Valentin,Un_Morceau_Opus_15_No._1__in_A_Minor_Aime-moi.mxl,1837.0
1,Alkan,Charles Valentin,Un_Morceau_Opus_15_No._2_in_B_Minor_Le_Vent.mxl,1837.0
2,Alkan,Charles Valentin,Un_Morceau_Opus_15_No._3_in_E_Minor_Morte.mxl,1837.0
3,Alkan,Charles Valentin,Etude_de_Bravoure_Opus_16_No._3_in_B_Minor.mxl,1837.0
4,Alkan,Charles Valentin,Etude_de_Concert_pour_Piano_Le_Preux_Opus_17.mxl,1844.0


In [19]:
# Attach the metadata to every data point by matching the piece name with the
# metadata filename; the duplicate "name" key column is dropped afterwards.
pieces_final = pd.merge(
    df_pieces,
    metadata_df,
    how='inner',
    left_on="name",
    right_on="filename",
).drop(columns="name")

In [22]:
# sanity check: True if any cell of the merged frame is missing (expected: False)
# `axis` is passed by keyword because DataFrame.any no longer accepts
# positional arguments in pandas >= 2.0.
pieces_final.isnull().any(axis=None)

False

In [23]:
# Preview the final table (data points plus metadata).
pieces_final.head()

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,...,piece,composer,window_width,relative_width,tonality,transposition,composer_last,composer_first,filename,display_year
0,0.112179,0.067308,0.083333,0.073718,0.086538,0.152244,0.043269,0.06891,0.086538,0.064103,...,0,Bach_JohannSebastian,14.0,0.0,minor,F,Bach,Johann Sebastian,Bach_BWV_795.musicxml,1723.0
1,0.13125,0.069792,0.054167,0.069792,0.097917,0.157292,0.034375,0.083333,0.0875,0.066667,...,0,Bach_JohannSebastian,28.0,0.111111,minor,F,Bach,Johann Sebastian,Bach_BWV_795.musicxml,1723.0
2,0.123457,0.076389,0.061728,0.075617,0.085648,0.151235,0.040123,0.07716,0.10571,0.055556,...,0,Bach_JohannSebastian,42.0,0.222222,minor,F,Bach,Johann Sebastian,Bach_BWV_795.musicxml,1723.0
3,0.120711,0.078431,0.066176,0.094363,0.07598,0.135417,0.03799,0.089461,0.110294,0.04902,...,0,Bach_JohannSebastian,56.0,0.333333,minor,F,Bach,Johann Sebastian,Bach_BWV_795.musicxml,1723.0
4,0.127541,0.07622,0.065041,0.099593,0.072154,0.135671,0.034553,0.09502,0.105183,0.049797,...,0,Bach_JohannSebastian,70.0,0.444444,minor,F,Bach,Johann Sebastian,Bach_BWV_795.musicxml,1723.0


In [None]:
# saving the dataframe to a csv file
# to_csv does not create missing folders, so make sure the target folder exists.
dump_dir = os.path.dirname(dump_pieces_path)
if dump_dir:
    os.makedirs(dump_dir, exist_ok=True)
pieces_final.to_csv(dump_pieces_path)

# Isomap Embedding:
**Warning:** This takes a long time to run, especially in the multiple-composers case! We have already computed the embeddings and saved them in ./dump/, so the rest of the pipeline can be run without these cells.

In [None]:
# Build the landmark distributions: per the original note, the Albrecht key
# profile and the uniform distribution, together with their labels.
# The landmarks are embedded alongside the data points and used as reference
# points in the visualisations.
profile_transposed, labels = key_profile_uniform_add()

In [None]:
# Fit a 3-component Isomap (10 neighbours) on the data points, measuring
# dissimilarity with the Jensen-Shannon distance, then project the landmark
# distributions into the same space.
data_pts = get_pitchscapes_as_np_array(pieces_final)
embedding = Isomap(
    n_neighbors=10,
    n_components=3,
    metric=scipy.spatial.distance.jensenshannon,
)
X_transformed = embedding.fit_transform(data_pts)
landmarks = embedding.transform(profile_transposed)

In [None]:
# Persist the embedding, the embedded landmarks and the landmark labels so the
# rest of the pipeline can run without recomputing the Isomap.
for target_path, array in (
    (dump_embedding_path, X_transformed),
    (dump_landmarks_path, landmarks),
    (dump_landmark_labels_path, labels),
):
    np.save(target_path, array)