# Importing & Installing Libraries

In [1]:
!pip install pretty_midi
!pip install mido
!pip install music21

Collecting pretty_midi
  Downloading pretty_midi-0.2.9.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25ldone
[?25h  Created wheel for pretty_midi: filename=pretty_midi-0.2.9-py3-none-any.whl size=5591955 sha256=640cbc887151cb65e872ac64cc62264d6e5435df249681608ad0504c53d8bb76
  Stored in directory: /root/.cache/pip/wheels/ad/74/7c/a06473ca8dcb63efb98c1e67667ce39d52100f837835ea18fa
Successfully built pretty_midi
Installing collected packages: mido, pretty_midi
Successfully installed mido-1.2.10 pretty_midi-0.2.9
[0mCollecting music2

In [21]:
import pretty_midi
import pickle
import fractions
import music21
from music21 import *
import glob
import json
import numpy as np
import pathlib
from tensorflow.keras.utils import to_categorical
from typing import List, Tuple, Dict

# Fine-grained encoding approach (abandoned later)

## Functions 

The following functions extract the sequence of notes and rests (each with its pitch and duration) from a given track. The resulting list, however, does not represent chords: it contains only a sequence of individual note events.

In [3]:
def get_duration_mapping(midi_file):
    """Map note-value names to their lengths in MIDI ticks for one file.

    A MIDI file's resolution is its number of ticks per quarter note, so the
    length of every note value is a fixed multiple of the resolution and does
    not depend on the time signature. (Scaling by the time-signature
    denominator yields the length of one *beat*, which only equals a quarter
    note in x/4 signatures, and produces no mapping at all for files that
    carry no time-signature event.)

    Args:
        midi_file: MIDI file to load; passed straight to
            ``pretty_midi.PrettyMIDI``.

    Returns:
        dict: note-value name -> length in ticks (floats). Dotted values are
        named ``'<base>.<extension>'``. Insertion order matters to callers
        that break ties by taking the first closest entry, so it is kept
        stable.
    """
    mid = pretty_midi.PrettyMIDI(midi_file)

    # Ticks per quarter note; kept as a float so every entry has one type.
    quarter_note_tick = float(mid.resolution)
    whole_note_tick = quarter_note_tick * 4
    half_note_tick = quarter_note_tick * 2
    eighth_note_tick = quarter_note_tick / 2
    sixteenth_note_tick = quarter_note_tick / 4

    return {
        'whole': whole_note_tick,
        'half': half_note_tick,
        'quarter': quarter_note_tick,
        'eighth': eighth_note_tick,
        'sixteenth': sixteenth_note_tick,
        # Dotted note values.
        'quarter.eighth': quarter_note_tick + eighth_note_tick,
        # NOTE: the 'eigth' spelling is kept on purpose; this key ends up in
        # the encoded sequences and lookup dictionaries written downstream.
        'eigth.sixteenth': eighth_note_tick + sixteenth_note_tick,
        'half.quarter': half_note_tick + quarter_note_tick,
    }

In [4]:
def extract_tracks_from_midi_ticks(midi_file):
    """Extract each non-drum instrument as a sequence of (pitch, ticks) events.

    For every non-drum instrument the notes are walked in the order stored in
    ``instrument.notes``. Each note becomes ``(pitch, duration_in_ticks)``;
    when a note starts after the previous note ended, a
    ``("rest", gap_in_ticks)`` event is inserted before it.

    Durations are computed as the difference between the absolute tick of the
    end and the absolute tick of the start. ``time_to_tick`` maps an absolute
    time to an absolute tick, so feeding it a duration in seconds is only
    right while the tempo never changes. Working in ticks throughout also
    avoids zero-length rests caused by floating-point round-off.

    Args:
        midi_file: MIDI file to load; passed straight to
            ``pretty_midi.PrettyMIDI``.

    Returns:
        list[list[tuple]]: one event list per non-drum instrument that has at
        least one note. Instruments without notes are omitted.
    """
    mid = pretty_midi.PrettyMIDI(midi_file)
    tracks = []

    for inst in mid.instruments:
        if inst.is_drum:
            continue

        track = []
        # Absolute tick at which the previously emitted note ended.
        prev_end_tick = 0
        for midi_note in inst.notes:
            start_tick = int(mid.time_to_tick(midi_note.start))
            end_tick = int(mid.time_to_tick(midi_note.end))

            # Only a strictly positive gap is a rest; overlapping or
            # back-to-back notes produce none.
            rest_ticks = start_tick - prev_end_tick
            if rest_ticks > 0:
                track.append(("rest", rest_ticks))

            track.append((midi_note.pitch, end_tick - start_tick))
            prev_end_tick = end_tick

        if track:
            tracks.append(track)

    return tracks

In [5]:
def replace_ticks_with_notation(data: List[Tuple[int, float]], values: Dict[str, float]) -> List[Tuple[int, str]]:
    """Swap each event's tick duration for the name of the closest note value.

    Args:
        data: events whose first item is kept as-is and whose second item is
            a duration in ticks.
        values: note-value name -> length in ticks.

    Returns:
        A new list of ``(first_item, name)`` pairs, where ``name`` is the key
        of ``values`` whose length is nearest to the event's duration. On a
        tie, the key that comes first in ``values`` wins.
    """
    def nearest_name(ticks):
        # min() keeps the first minimal key, i.e. dictionary order on ties.
        return min(values, key=lambda name: abs(values[name] - ticks))

    return [(event[0], nearest_name(event[1])) for event in data]

In [6]:
def intlabel_lookup_dictionary(data: List[Tuple]) -> Dict[Tuple, int]:
    """Assign a 1-based integer label to every distinct item of ``data``.

    Labels are handed out in order of first appearance: the first distinct
    item gets 1, the next new one gets 2, and so on.
    """
    labels = {}
    for item in data:
        # setdefault only inserts unseen items, so len(labels) + 1 is always
        # the next free label.
        labels.setdefault(item, len(labels) + 1)
    return labels

In [7]:
def tuples2intlabels(data, lookup_dict):
    """Translate every item of ``data`` through ``lookup_dict``.

    Items that have no entry in ``lookup_dict`` are passed through unchanged,
    so the result always has as many elements as ``data``.
    """
    return [lookup_dict.get(item, item) for item in data]

In [8]:
def has_duplicate_keys(dictionary):
    """Report whether ``dictionary.keys()`` yields the same key more than once.

    For a built-in ``dict`` this is always ``False`` because its keys are
    unique by construction; the check can only fire for objects whose
    ``keys()`` may repeat values.
    """
    all_keys = list(dictionary.keys())
    # A set drops repeats, so a size mismatch means at least one duplicate.
    return len(set(all_keys)) != len(all_keys)

In [9]:
def raw_sequence_pipeline(midifile):
    """Turn one MIDI file into a sequence of (pitch-or-"rest", note-value) pairs.

    Steps: build the file's tick-to-note-value mapping, extract its tracks as
    tick-based events, then replace the tick durations of the FIRST extracted
    track with the closest note-value name. Progress is reported with
    ``print``.

    Args:
        midifile: MIDI file to process.

    Returns:
        list: the converted sequence of the first track. An empty list is
        returned (after printing the reason) when the file yields no track or
        no duration mapping, or when the converted sequence fails validation,
        so callers can always iterate over the result.
    """
    print(f'{midifile} is being processed.')

    duration_mapping = get_duration_mapping(midifile)
    tracks_with_ticks = extract_tracks_from_midi_ticks(midifile)

    print(f'{midifile} has {len(tracks_with_ticks)} tracks.')

    # Without a track there is nothing to index; without a mapping there is
    # nothing to match durations against.
    if not tracks_with_ticks:
        print('no track found-no output produced')
        return []
    if not duration_mapping:
        print('empty duration mapping-no output produced')
        return []

    # Only the first extracted track is converted.
    sequence_notation = replace_ticks_with_notation(tracks_with_ticks[0], duration_mapping)

    # Sanity-check the converted events before handing them on.
    for event in sequence_notation:
        if not isinstance(event, tuple):
            print(event, 'nontuple element-no output produced')
            return []

        if not isinstance(event[1], str):
            print(event[1], 'nonstring duration-no output produced')
            return []

    print(f'The resulting sequence has {len(sequence_notation)} notes.')
    print('Conversion successful.')
    print('-----------------------------------------------------------')

    return sequence_notation

In [10]:
def integer_encoder(raw_sequence):
    """Integer-encode a sequence and return it with its lookup dictionary.

    Builds the item -> integer lookup from ``raw_sequence``, encodes the
    sequence with it, and prints two sanity reports: whether the encoded
    sequence has the same length as the input, and whether the lookup has
    duplicate keys.

    Returns:
        tuple: ``(encoded_sequence, lookup_dictionary)``.
    """
    lookup = intlabel_lookup_dictionary(raw_sequence)
    encoded = tuples2intlabels(raw_sequence, lookup)

    same_length = len(raw_sequence) == len(encoded)
    print('Successful encoding.' if same_length else 'Encoding might be problematic')

    duplicates_found = has_duplicate_keys(lookup)
    print(f'The dictionary has duplicate keys: {duplicates_found}')

    return encoded, lookup

## Generating The Training Dataset

The following cells read each MIDI file from the folder, extract its complex note events (pitch–duration pairs), tokenise them, and integer-encode them.

In [11]:
# Extract and tokenize the complex note events of every MIDI file into one
# flat list.
#
# NOTE: no module-level variable is named `note` here on purpose. A loop
# variable called `note` would overwrite the `note` module that
# `from music21 import *` puts into the notebook namespace, and the notebook
# would then only run if the import cell were executed a second time.

all_notes = []

for file in glob.glob("/kaggle/input/bachbeethoven/*.mid"):
    all_notes.extend(raw_sequence_pipeline(file))

/kaggle/input/bachbeethoven/beethoven_opus22_3.mid is being processed.
/kaggle/input/bachbeethoven/beethoven_opus22_3.mid has 2 tracks.
The resulting sequence has 1235 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/pathetique_1.mid is being processed.
/kaggle/input/bachbeethoven/pathetique_1.mid has 2 tracks.
The resulting sequence has 3479 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/beethoven_opus22_2.mid is being processed.
/kaggle/input/bachbeethoven/beethoven_opus22_2.mid has 2 tracks.
The resulting sequence has 1193 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/Prelude6.mid is being processed.
/kaggle/input/bachbeethoven/Prelude6.mid has 2 tracks.
The resulting sequence has 610 notes.
Conversion successful.
-----------------------------------------------------------
/



/kaggle/input/bachbeethoven/beethoven_les_adieux_3.mid has 2 tracks.
The resulting sequence has 2820 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/beethoven_opus90_2.mid is being processed.
/kaggle/input/bachbeethoven/beethoven_opus90_2.mid has 2 tracks.
The resulting sequence has 2040 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/sonate_30_chisamori.mid is being processed.
/kaggle/input/bachbeethoven/sonate_30_chisamori.mid has 5 tracks.
The resulting sequence has 3042 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/beeth9-2.mid is being processed.
/kaggle/input/bachbeethoven/beeth9-2.mid has 1 tracks.
The resulting sequence has 1614 notes.
Conversion successful.
-----------------------------------------------------------
/kaggle/input/bachbeethoven/Prelude23.mid is being pr

In [12]:
# Persist the raw (non-encoded) sequence of notes as a pickle file

with open('raw_all_notes', 'wb') as raw_notes_file:
    pickle.dump(all_notes, raw_notes_file)

In [13]:
# Integer-encode the full note sequence. `code_dictionary` is the lookup
# returned alongside it, mapping each distinct note event to its integer label.

integer_encoded, code_dictionary = integer_encoder(all_notes)

Successful encoding.
The dictionary has duplicate keys: False


In [15]:
# Make the lookup dictionary JSON-compatible: every key is converted to its
# string form, the integer labels are kept as they are.

json_dictionary = {
    str(note_event): int_label
    for note_event, int_label in code_dictionary.items()
}

In [16]:
# Save the integer-encoded sequence (pickle) and its integer lookup
# dictionary (JSON).

with open("integer_encoded_notes", "wb") as encoded_file:
    pickle.dump(integer_encoded, encoded_file)

with open("intcode_dictionary.json", "w") as dictionary_file:
    json.dump(json_dictionary, dictionary_file)

# Coarse approach (adapted from Kapoor and Skuli's works)

In [17]:
def get_notes(filepath):
    """Collect pitch/chord tokens from every MIDI file matching a glob pattern.

    Each matching file is parsed with music21. Notes are read from the first
    part produced by ``partitionByInstrument``; if that fails, the file's
    flat note list is used instead. A note becomes the string form of its
    pitch; a chord becomes its normal-order values joined with dots. Other
    elements are ignored.

    music21 names are referenced through the ``music21`` package rather than
    through the names bound by ``from music21 import *``, so a notebook
    variable called ``note``, ``chord``, ``instrument`` or ``converter``
    cannot break this function.

    Args:
        filepath: glob pattern selecting the MIDI files to parse.

    Returns:
        list[str]: the tokens of all files, in parsing order.

    Side effects:
        Prints one line per parsed file and pickles the resulting list to
        ``pitchbased_notes`` in the current working directory.
    """
    notes = []

    for file in glob.glob(filepath):
        midi = music21.converter.parse(file)

        print("Parsing %s" % file)

        try:  # file has instrument parts
            s2 = music21.instrument.partitionByInstrument(midi)
            notes_to_parse = s2.parts[0].recurse()
        except Exception:  # file has notes in a flat structure
            # Best-effort fallback, but KeyboardInterrupt/SystemExit are no
            # longer swallowed as they were with a bare `except:`.
            notes_to_parse = midi.flat.notes

        for element in notes_to_parse:
            if isinstance(element, music21.note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, music21.chord.Chord):
                notes.append('.'.join(str(n) for n in element.normalOrder))

    # Separate handle name so the `filepath` argument is not rebound.
    with open('pitchbased_notes', 'wb') as notes_file:
        pickle.dump(notes, notes_file)

    return notes

In [22]:
# Parse every MIDI file in the dataset folder into pitch/chord tokens
# (get_notes also pickles the list to 'pitchbased_notes').
notes = get_notes("/kaggle/input/bachbeethoven/*.mid")



Parsing /kaggle/input/bachbeethoven/beethoven_opus22_3.mid




Parsing /kaggle/input/bachbeethoven/pathetique_1.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus22_2.mid
Parsing /kaggle/input/bachbeethoven/Prelude6.mid
Parsing /kaggle/input/bachbeethoven/sonate_29_(c)hisamori.mid
Parsing /kaggle/input/bachbeethoven/Prelude7.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus22_4.mid




Parsing /kaggle/input/bachbeethoven/mond_3.mid




Parsing /kaggle/input/bachbeethoven/appass_1.mid
Parsing /kaggle/input/bachbeethoven/beeth25.mid
Parsing /kaggle/input/bachbeethoven/bach_minuet.mid




Parsing /kaggle/input/bachbeethoven/elise.mid
Parsing /kaggle/input/bachbeethoven/Prelude11.mid
Parsing /kaggle/input/bachbeethoven/Prelude1.mid
Parsing /kaggle/input/bachbeethoven/invent8.mid
Parsing /kaggle/input/bachbeethoven/goldberg_variations_988_05_(c)grossmann.mid
Parsing /kaggle/input/bachbeethoven/appass_3.mid




Parsing /kaggle/input/bachbeethoven/pathetique_2.mid




Parsing /kaggle/input/bachbeethoven/beethoven_opus90_1.mid
Parsing /kaggle/input/bachbeethoven/bach_inventions_772_free_(c)simonetto.mid




Parsing /kaggle/input/bachbeethoven/beethoven_les_adieux_3.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus90_2.mid
Parsing /kaggle/input/bachbeethoven/sonate_30_chisamori.mid
Parsing /kaggle/input/bachbeethoven/beeth9-2.mid
Parsing /kaggle/input/bachbeethoven/Prelude23.mid
Parsing /kaggle/input/bachbeethoven/Prelude10.mid
Parsing /kaggle/input/bachbeethoven/Prelude8.mid
Parsing /kaggle/input/bachbeethoven/bach_inventions_775_(c)simonetto.mid
Parsing /kaggle/input/bachbeethoven/goldberg_variation1.mid
Parsing /kaggle/input/bachbeethoven/Prelude12.mid
Parsing /kaggle/input/bachbeethoven/sonate_17_chisamori.mid
Parsing /kaggle/input/bachbeethoven/beethoven_les_adieux_2.mid
Parsing /kaggle/input/bachbeethoven/sonate_25_(c)hisamori.mid
Parsing /kaggle/input/bachbeethoven/Prelude4.mid
Parsing /kaggle/input/bachbeethoven/beethoven_les_adieux_1.mid
Parsing /kaggle/input/bachbeethoven/appass_2.mid




Parsing /kaggle/input/bachbeethoven/mond_1.mid




Parsing /kaggle/input/bachbeethoven/mond_2.mid
Parsing /kaggle/input/bachbeethoven/goldberg_variations_988_02_(c)grossmann.mid
Parsing /kaggle/input/bachbeethoven/sonate_20_(c)hisamori.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus10_3.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus10_2.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus10_1.mid
Parsing /kaggle/input/bachbeethoven/orgel_buechlein_bwv-639_(c)luquet.mid
Parsing /kaggle/input/bachbeethoven/beethoven_opus22_1.mid




Parsing /kaggle/input/bachbeethoven/pathetique_3.mid


In [23]:
# Total number of tokens returned by get_notes
len(notes)

48686