### Symbolic, unconditioned generation 

That is, train a model that learns a distribution over music, $p(x)$ (presumably from some training set), and then samples from that
distribution. A Markov chain (Module 3) is an example of such a model (though you
should generally avoid models already implemented in homeworks unless you can significantly extend them).

In [1]:
import os
import json
import pandas as pd
import numpy as np
import miditoolkit

In [2]:
# Scan every MAESTRO year folder and collect simple per-file note statistics.
# Each readable ".midi" file contributes one dict to `data`; files that fail to
# parse are reported and skipped, so one bad file does not abort the scan.
data = []
maestro_root = "maestro-v3.0.0"

for year in ["2004", "2006", "2008", "2009", "2011", "2013", "2014", "2015", "2017", "2018"]:
    folder = os.path.join(maestro_root, year)
    # os.listdir returns entries in arbitrary, platform-dependent order;
    # sorting makes the row order of `df` reproducible between runs.
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".midi"):
            continue
        path = os.path.join(folder, fname)
        try:
            midi = miditoolkit.MidiFile(path)
            # Pitches of all notes across all instrument tracks of the file.
            pitches = [n.pitch for inst in midi.instruments for n in inst.notes]

            row = {
                "filename": fname,
                "duration_ticks": midi.max_tick,
                # Use the first tempo event; fall back to 120 when the file has none.
                "tempo_bpm": midi.tempo_changes[0].tempo if midi.tempo_changes else 120,
                "num_notes": len(pitches),
                # Files without notes get 0 for the pitch statistics.
                "avg_pitch": np.mean(pitches) if pitches else 0,
                "pitch_range": max(pitches) - min(pitches) if pitches else 0,
                "unique_pitches": len(set(pitches)),
            }

            data.append(row)

        except Exception as e:
            # Best effort: report the failing file and continue with the next one.
            print("Error in", fname, ":", e)

# One row per successfully parsed MIDI file.
df = pd.DataFrame(data)

In [3]:
# Load the MAESTRO metadata table that ships with the dataset.
# The path is assembled with os.path.join instead of a literal backslash:
# "\m" is an invalid escape sequence in a normal string literal, and a
# backslash separator only resolves on Windows.
orig = pd.read_csv(os.path.join("maestro-v3.0.0", "maestro-v3.0.0.csv"))
orig

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.661160
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508
...,...,...,...,...,...,...,...
1271,Wolfgang Amadeus Mozart,"Sonata in F Major, K280",test,2004,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,241.470442
1272,Wolfgang Amadeus Mozart,"Sonata in F Major, K280",train,2004,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,114.696243
1273,Wolfgang Amadeus Mozart,"Sonata in F Major, K533",validation,2004,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,1139.198478
1274,Wolfgang Amadeus Mozart,"Sonata in F Major, K533/K494",validation,2018,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,1068.751602


In [5]:
# Add a "filename" column holding only the final path component of
# "midi_filename", giving the metadata a key that can be matched against
# the bare file names collected from the year folders.
orig["filename"] = orig["midi_filename"].map(os.path.basename)
orig

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration,filename
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.661160,MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_201...
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471,MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AU...
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split...
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AU...
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MID--AU...
...,...,...,...,...,...,...,...,...
1271,Wolfgang Amadeus Mozart,"Sonata in F Major, K280",test,2004,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,241.470442,MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MID--AU...
1272,Wolfgang Amadeus Mozart,"Sonata in F Major, K280",train,2004,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,114.696243,MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MID--AU...
1273,Wolfgang Amadeus Mozart,"Sonata in F Major, K533",validation,2004,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,1139.198478,MIDI-Unprocessed_SMF_12_01_2004_01-05_ORIG_MID...
1274,Wolfgang Amadeus Mozart,"Sonata in F Major, K533/K494",validation,2018,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,1068.751602,MIDI-Unprocessed_Recital17-19_MID--AUDIO_17_R1...


In [4]:
# Display the per-file statistics frame built from the MIDI scan.
df

Unnamed: 0,filename,duration_ticks,tempo_bpm,num_notes,avg_pitch,pitch_range,unique_pitches
0,MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID...,930262,120.0,7894,64.034837,46,44
1,MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID...,256743,120.0,1181,57.906859,67,59
2,MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID...,186331,120.0,2850,67.609123,81,72
3,MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID...,300151,120.0,6369,65.373528,77,75
4,MIDI-Unprocessed_SMF_05_R1_2004_01_ORIG_MID--A...,1352994,120.0,18947,65.783871,76,76
...,...,...,...,...,...,...,...
1271,MIDI-Unprocessed_Schubert4-6_MID--AUDIO_09_R2_...,1967836,120.0,21435,63.313133,73,73
1272,MIDI-Unprocessed_Schubert4-6_MID--AUDIO_10_R2_...,1861252,120.0,21420,63.317880,73,72
1273,MIDI-Unprocessed_Schubert7-9_MID--AUDIO_11_R2_...,1252962,120.0,16662,60.349058,75,73
1274,MIDI-Unprocessed_Schubert7-9_MID--AUDIO_15_R2_...,1598497,120.0,16853,64.632350,72,72


In [10]:
# Join the dataset metadata (left) with the per-file MIDI statistics (right)
# on the shared "filename" column; only files present in both are kept.
df_combined = pd.merge(orig, df, how="inner", on="filename")
df_combined

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration,filename,duration_ticks,tempo_bpm,num_notes,avg_pitch,pitch_range,unique_pitches
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.661160,MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_201...,540659,120.0,4197,65.149392,75,72
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471,MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AU...,587304,120.0,4206,65.247741,75,74
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split...,447225,120.0,3326,65.996091,75,72
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AU...,837875,120.0,6316,60.581856,81,78
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MID--AU...,306880,120.0,4019,68.433192,83,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1271,Wolfgang Amadeus Mozart,"Sonata in F Major, K280",test,2004,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,241.470442,MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MID--AU...,231819,120.0,875,63.299429,55,50
1272,Wolfgang Amadeus Mozart,"Sonata in F Major, K280",train,2004,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,114.696243,MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MID--AU...,110871,120.0,1231,66.162470,56,53
1273,Wolfgang Amadeus Mozart,"Sonata in F Major, K533",validation,2004,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,1139.198478,MIDI-Unprocessed_SMF_12_01_2004_01-05_ORIG_MID...,1093633,120.0,7557,66.950642,60,59
1274,Wolfgang Amadeus Mozart,"Sonata in F Major, K533/K494",validation,2018,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,1068.751602,MIDI-Unprocessed_Recital17-19_MID--AUDIO_17_R1...,821012,120.0,7533,66.913049,60,59


In [11]:
# Remove the audio path column from the combined table.
# The result is reassigned rather than dropped with inplace=True, and
# errors="ignore" tolerates the column already being gone, so re-running
# this cell no longer raises KeyError.
df_combined = df_combined.drop(columns=["audio_filename"], errors="ignore")
df_combined.head()

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,duration,filename,duration_ticks,tempo_bpm,num_notes,avg_pitch,pitch_range,unique_pitches
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116,MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_201...,540659,120.0,4197,65.149392,75,72
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471,MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AU...,587304,120.0,4206,65.247741,75,74
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split...,447225,120.0,3326,65.996091,75,72
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AU...,837875,120.0,6316,60.581856,81,78
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MID--AU...,306880,120.0,4019,68.433192,83,82


In [12]:
from miditok import REMI, TokenizerConfig
from pathlib import Path

# Build a REMI tokenizer from a TokenizerConfig created with no arguments,
# i.e. with whatever defaults the installed miditok version provides.
config = TokenizerConfig()
tokenizer = REMI(config)


In [14]:
# Tokenize the MIDI file behind every row of df_combined and store the
# resulting id list in a new "token_sequence" column (one list per row).
base_dir = Path("maestro-v3.0.0").resolve()
token_seqs = []

for row in df_combined.itertuples():
    # Files live under <dataset root>/<year>/<filename>.
    abs_path = base_dir / Path(str(row.year)) / row.filename
    try:
        encoded = tokenizer(abs_path)
        if not isinstance(encoded, list):
            # Single sequence object: take its ids directly.
            token_ids = encoded.ids
        else:
            # List of sequences: concatenate their ids in order.
            token_ids = [tok_id for seq in encoded for tok_id in seq.ids]
    except Exception as e:
        # Best effort: report the failure and keep an empty sequence so the
        # list stays aligned with the rows of df_combined.
        print(f"Tokenization failed for {abs_path}: {e}")
        token_ids = []
    token_seqs.append(token_ids)

df_combined["token_sequence"] = token_seqs

In [15]:
df_combined.head()

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,duration,filename,duration_ticks,tempo_bpm,num_notes,avg_pitch,pitch_range,unique_pitches,token_sequence
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116,MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_201...,540659,120.0,4197,65.149392,75,72,"[4, 205, 51, 105, 137, 218, 56, 109, 126, 4, 1..."
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471,MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AU...,587304,120.0,4206,65.247741,75,74,"[4, 205, 51, 105, 138, 216, 56, 107, 131, 4, 1..."
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split...,447225,120.0,3326,65.996091,75,72,"[4, 193, 51, 103, 138, 205, 56, 111, 130, 209,..."
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AU...,837875,120.0,6316,60.581856,81,78,"[4, 206, 54, 106, 152, 50, 102, 156, 207, 42, ..."
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MID--AU...,306880,120.0,4019,68.433192,83,82,"[4, 204, 36, 101, 126, 46, 100, 125, 205, 42, ..."


In [16]:
# df_combined.to_csv("midi_df.csv")