import sys
!{sys.executable} -m pip install music21
!{sys.executable} -m pip install tqdm



In [None]:
import music21
import numpy as np
import glob
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import pandas as pd
from keras.utils import np_utils
from collections import defaultdict

from keras.models import Sequential
from keras.layers import Activation, Dense, LSTM, Dropout, Flatten


In [None]:
def encode_song(filename):
    """Encode the song stored in ``filename`` as a flat list of events.

    The file is parsed with music21 and flattened. Every note contributes its
    MIDI pitch number and every chord contributes one MIDI pitch number per
    chord pitch, each tagged with the element's offset.

    For each distinct offset, in ascending order, the encoding lists the
    pitches found at that offset (sorted ascending) followed by a
    ``'gf<delta>'`` ("go forward") string, where ``<delta>`` is the distance
    to the next distinct offset. The pitches at the final offset are emitted
    without a trailing ``'gf'`` token because there is no later offset.
    See the milestone1 report for the encoding.

    Args:
        filename: Path handed to ``music21.converter.parse``.

    Returns:
        list: Pitch numbers interleaved with ``'gf...'`` strings; empty when
        the parsed song contains no notes.
    """
    notesRaw = music21.converter.parse(filename).flat.notes

    pitches = []
    offsets = []
    for element in notesRaw:
        if isinstance(element, music21.note.Note):
            pitches.append(element.pitch.midi)
            offsets.append(element.offset)
        else:
            # Anything that is not a single Note is handled as a chord: each
            # of its pitches becomes its own entry at the chord's offset.
            chordNotes = [int(p.midi) for p in element.pitches]
            pitches.extend(chordNotes)
            offsets.extend([element.offset] * len(chordNotes))

    pitches = np.array(pitches)
    offsets = np.array(offsets)
    # np.unique already returns its result sorted, no extra sort is needed.
    uniqueSortedOffsets = np.unique(offsets)
    lastIndex = len(uniqueSortedOffsets) - 1

    encoding = []
    for i, time in enumerate(uniqueSortedOffsets):
        encoding.extend(list(np.sort(pitches[offsets == time])))
        # Only offsets that have a successor get a go-forward token; the
        # final group of pitches is still emitted instead of being dropped.
        if i < lastIndex:
            encoding.append('gf' + str(uniqueSortedOffsets[i + 1] - time))

    return encoding

In [None]:
def decode_song(song,filename):
    """Decode a song encoded by encode_song() and write it as a MIDI file.

    Args:
        song: Encoded song of type list. Items containing ``'gf'`` advance
            the running offset by the number that follows the two-letter
            prefix; that number may be a decimal (``'gf0.5'``) or a fraction
            (``'gf1/3'``). Every other item is converted with ``int`` and
            turned into a note placed at the current offset.
        filename: Path where the new decoded midi file is to reside.

    Returns:
        int: Always 1.
    """
    notes = []
    offset = 0
    for event in song:
        token = str(event)
        if 'gf' in token:
            amount = token[2:]
            if '/' in amount:
                # Fraction-style tokens such as 'gf1/3' cannot be passed to
                # float() directly, so numerator and denominator are parsed
                # separately.
                numerator, denominator = amount.split('/')
                offset += float(numerator) / float(denominator)
            else:
                offset += float(amount)
        else:
            newNote = music21.note.Note(int(event))
            newNote.offset = offset
            newNote.storedInstrument = music21.instrument.Piano()
            notes.append(newNote)
    midi_stream = music21.stream.Stream(notes)
    midi_stream.write('midi', fp=filename)
    return 1
    
    
    

In [None]:
# Collect the paths of all the songs in our dataset.

# Entries directly under 'Music' (files are sorted via artist).
artists = glob.glob('Music/*')

# The song filenames, gathered artist by artist.
songs = [path for folder in artists for path in glob.glob(folder + '/*')]


In [None]:
# Encode each song and collect the results column by column so they can be
# saved in a dataframe (which is pickled later).
import os

information = defaultdict(list)

for song in tqdm(songs):
    encoded = encode_song(song)
    # The last two path components are <artist>/<song file>. os.path is used
    # so the split works with the platform's separator rather than only with
    # the Windows backslash, and regardless of how deep the root folder is.
    artist_dir, songname = os.path.split(song)
    artist = os.path.basename(artist_dir)
    information['SongName'].append(songname)
    information['Artist'].append(artist)
    information['Encoded'].append(encoded)
    



In [None]:
# Build a dataframe from the collected columns (one list per column name).
DF_to_save = pd.DataFrame(information)
# Preview of the first rows.
DF_to_save.head()


In [None]:
# Keep working under the name DF; DF_to_save is rebound to an empty list.
DF, DF_to_save = DF_to_save, []

In [None]:
# Collect all the unique elements (words) of our vocabulary. A single set is
# filled incrementally instead of rebuilding list -> set -> list for every
# song.
unique_words = set()
for encoding in tqdm(DF.iloc[:, 2]):  # column 2 holds each song's encoding
    unique_words.update(encoding)
vocab = list(unique_words)

# Separate the note words from the go-forward ('gf') words.
notesOnly = []
gfOnly = []

for word in vocab:
    if 'gf' in str(word):
        gfOnly.append(word)
    else:
        notesOnly.append(word)

In [None]:
# Convert every unique go-forward word to its numeric quarter-length value so
# the distinct durations can be inspected. Some words carry a fraction
# (numerator/denominator) instead of a decimal number.
numbers = []
for token in gfOnly:
    amount = token[2:]  # strip the leading 'gf'
    numerator, slash, denominator = amount.partition('/')
    if slash:
        numbers.append(float(numerator) / float(denominator))
    else:
        numbers.append(float(amount))

In [None]:
# Print the numeric go-forward values in ascending order for inspection.
print(sorted(numbers))

From the output above we see that a lot of the unique offsets are essentially the same but differ slightly because of how the author of each MIDI file represented them. We therefore have to fix the representation of the gfs. This could have been fixed in the encode_song function; however, to avoid having to open the files again, we amend the issue in the dataframe.

In [None]:
# Rewrite every go-forward word as 'gf' + its value rounded to 4 decimals, so
# that values written as fractions and as decimals end up in one format.
NewColumn = []
for encoding in tqdm(DF.iloc[:, 2]):  # column 2 holds each song's encoding
    replacement = []
    for word in encoding:
        if 'gf' not in str(word):
            # It is just a note: keep it unchanged.
            replacement.append(word)
            continue
        # We have a gf: isolate the number from the letters 'gf'.
        keep = word[2:]
        if '/' in keep:
            # Fraction form: evaluate numerator / denominator.
            numerator, denominator = keep.split('/')
            value = float(numerator) / float(denominator)
        else:
            value = float(keep)
        replacement.append('gf' + str(np.round(value, 4)))
    NewColumn.append(replacement)

In [None]:
# Store the rewritten encodings in a new 'ModedEncoded' column.
DF['ModedEncoded']=NewColumn


In [None]:
# Remove the original 'Encoded' column; the result of drop() is rebound to DF.
DF=DF.drop(columns='Encoded')

In [None]:

# Serialise the dataframe to disk with pickle (binary mode).
with open('encodings/information.pickle', 'wb') as handle:
    pickle.dump(DF, handle)