In [36]:
import pandas as pd
%pylab inline
import numpy as np
import matplotlib
import music21 as mu
import pymongo
from pymongo import MongoClient
from bs4 import BeautifulSoup
import re

Populating the interactive namespace from numpy and matplotlib


In [37]:
xmlData = BeautifulSoup(open('FinalDataSet/XMLFiles/JazzSolos/KJAllTheThingsYouAre1983_2143_290.xml'))

In [38]:
notesDataFrame = pd.DataFrame
notes = xmlData.find_all('note')

In [39]:
listToHoldAllNoteEvents = []
for note in notes:
    #print note
    theDict = {}
    theDict['restFlag'] = note.rest
    #theDict['durationModified'] = note.tuplet
    theDict['noteName'] = note.step
    theDict['accidental'] = note.alter
    theDict['noteType'] = note.type
    theDict['octave'] = note.octave
    theDict['part'] = note.instrument
    theDict['duration'] = note.duration
    theDict['tieStatus'] = note.tie

    listToHoldAllNoteEvents.append(theDict)
    

In [40]:
df = pd.DataFrame(listToHoldAllNoteEvents)

In [41]:
# first clean of data
def cleanAccidental(data_row):
    accidentalString = str(data_row['accidental'])
    test = re.sub('<[^>]*>', '', accidentalString)
    return str(test) 
    
df['accidental'] = df.apply(cleanAccidental, axis=1) 

def cleanDuration(data_row):
    accidentalString = str(data_row['duration'])
    test = re.sub('<[^>]*>', '', accidentalString)
    return str(test) 
    
df['duration'] = df.apply(cleanDuration, axis=1) 

def cleanNoteName(data_row):
    accidentalString = str(data_row['noteName'])
    test = re.sub('<[^>]*>', '', accidentalString)
    return str(test)
    
df['noteName'] = df.apply(cleanNoteName, axis=1) 

def cleanNoteType(data_row):
    accidentalString = str(data_row['noteType'])
    test = re.sub('<[^>]*>', '', accidentalString)
    return str(test) 
    
df['noteType'] = df.apply(cleanNoteType, axis=1) 

def cleanOctave(data_row):
    accidentalString = str(data_row['octave'])
    test = re.sub('<[^>]*>', '', accidentalString)
    return str(test) 
    
df['octave'] = df.apply(cleanOctave, axis=1) 
        
def cleanRest(data_row):
    restVal = -1
    restAsString = str(data_row['restFlag'])
    if data_row['restFlag'] is not None:
        restVal ='rest'
    else:
        restVal = 'note'

    return str(restVal)
    
df['restFlag'] = df.apply(cleanRest, axis=1)



In [42]:
df.head(1)

Unnamed: 0,accidental,duration,noteName,noteType,octave,part,restFlag,tieStatus
0,,256,,quarter,,"<instrument id=""P1-I1""></instrument>",rest,


In [43]:
#second clean for parts and duration
def cleanPart(data_row):
    accidentalString = str(data_row['part'])
    test = accidentalString[16:18]
    return str(test) 

df['part'] = df.apply(cleanPart, axis=1)

def convertDurationAsPerQuarterNoteIsOneBeat(data_row):
    asFloat = float(data_row['duration'])
    return asFloat / 256

df['quarterNoteDuration'] = df.apply(convertDurationAsPerQuarterNoteIsOneBeat, axis=1)

In [44]:
df.head(1)

Unnamed: 0,accidental,duration,noteName,noteType,octave,part,restFlag,tieStatus,quarterNoteDuration
0,,256,,quarter,,P1,rest,,1


In [45]:
# divide parts to ascertain location
partOne = df[df['part'] == 'P1']
partTwo = df[df['part'] == 'P2']

In [46]:
partOne['noteEventLocation'] = partOne['quarterNoteDuration'].cumsum()
partTwo['noteEventLocation'] = partTwo['quarterNoteDuration'].cumsum()
partOne['noteEventStartingLocation'] = partOne['noteEventLocation'].shift()
partTwo['noteEventStartingLocation'] = partTwo['noteEventLocation'].shift()

In [47]:
df = partOne.append(partTwo)

In [48]:
df = df.sort('noteEventLocation')
df = df.reset_index()

In [49]:
df.head(1)

Unnamed: 0,index,accidental,duration,noteName,noteType,octave,part,restFlag,tieStatus,quarterNoteDuration,noteEventLocation,noteEventStartingLocation
0,0,,256,,quarter,,P1,rest,,1,1,


In [50]:
#create midi frequency value
offsetForC = 0
offsetForD = 2
offsetForE = 4
offsetForF = 5
offsetForG = 7
offsetForA = 9
offsetForB = 11

def midiNumberAssign(data_row):
    
    def adjustForAccidental(accidentalVal):
        if accidentalVal == 'None':
            return 0
        else:
            return int(accidentalVal)

    if data_row['octave'] != "None":
        baseVal = int(data_row['octave']) * 12
        if data_row['noteName'] == 'C':
            return baseVal + adjustForAccidental(data_row['accidental'])
        elif data_row['noteName'] == 'D':
            return baseVal + offsetForD + adjustForAccidental(data_row['accidental'])
        elif data_row['noteName'] == 'E':
            return baseVal + offsetForD + adjustForAccidental(data_row['accidental'])
        elif data_row['noteName'] == 'F':
            return baseVal + offsetForD + adjustForAccidental(data_row['accidental'])
        elif data_row['noteName'] == 'G':
            return baseVal + offsetForD + adjustForAccidental(data_row['accidental'])
        elif data_row['noteName'] == 'A':
            return baseVal + offsetForD + adjustForAccidental(data_row['accidental'])
        elif data_row['noteName'] == 'B':
            return baseVal + offsetForD + adjustForAccidental(data_row['accidental'])



df['midiNumber'] = df.apply(midiNumberAssign, axis=1)

In [51]:
df.head()

Unnamed: 0,index,accidental,duration,noteName,noteType,octave,part,restFlag,tieStatus,quarterNoteDuration,noteEventLocation,noteEventStartingLocation,midiNumber
0,0,,256,,quarter,,P1,rest,,1.0,1.0,,
1,1,-1.0,256,B,quarter,4.0,P1,note,,1.0,2.0,1.0,49.0
2,2058,,1024,G,whole,3.0,P2,note,,4.0,4.0,,38.0
3,2,-1.0,512,A,half,4.0,P1,note,,2.0,4.0,2.0,49.0
4,3,,384,C,quarter,5.0,P1,note,,1.5,5.5,4.0,60.0


In [52]:
#get rid of unused columns
del df['index']
del df['accidental']
del df['duration']
del df['noteType']
del df['octave']
del df['noteEventLocation']
del df['noteName']

In [53]:
df.head(5)

Unnamed: 0,part,restFlag,tieStatus,quarterNoteDuration,noteEventStartingLocation,midiNumber
0,P1,rest,,1.0,,
1,P1,note,,1.0,1.0,49.0
2,P2,note,,4.0,,38.0
3,P1,note,,2.0,2.0,49.0
4,P1,note,,1.5,4.0,60.0
