In [1]:
from music21 import *
import nltk
import pandas as pd
from fractions import Fraction
import numpy as np

In [2]:
# Goal: measure the influence of classic folk songs on popular music

# Functions implementing the 'bag of notes' approach

In [3]:
# first pass - only worry about pitches and rests with no duration. Using actual pitches - can change to pitch classes
# later if we get poor results (i.e. 0-11 for each possible pitch)

def get_notes(inScore):
    """Flatten a score into an ordered list of pitch-name tokens.

    Only one voice is considered: when the score has more than one part, the
    first part that contains measures is used (chordifying or handling
    several voices is left for later).  Repeats are expanded before the
    notes are read.

    Token rules:
      * a note contributes its ``nameWithOctave``;
      * a chord contributes the ``nameWithOctave`` of its root, so different
        voicings of one chord are not distinguished;
      * a run of rests with no note between them contributes one ``'rest'``.
    Durations are ignored.

    Args:
        inScore: score-like object exposing ``parts``, ``flat`` and
            ``expandRepeats()``.

    Returns:
        list of str tokens in score order.
    """
    # Single-voice simplification: switch to the first part that has measures.
    if len(inScore.parts) > 1:
        inScore = next(
            (part for part in inScore.parts if part.hasElementOfClass('Measure')),
            inScore,
        )

    # Unroll repeats so the token sequence matches what is actually played.
    if inScore.flat.hasElementOfClass('Repeat'):
        inScore = inScore.expandRepeats()

    tokens = []
    for item in inScore.flat.notesAndRests:
        if item.isChord:
            tokens.append(item.root().nameWithOctave)
            continue
        if not item.isRest:
            tokens.append(item.nameWithOctave)
            continue
        # Collapse consecutive rests into a single token.
        if not tokens or tokens[-1] != 'rest':
            tokens.append('rest')
    return tokens
    

In [4]:
def get_intervals_and_durations(inScore):
    """Describe a melody as successive chromatic intervals with duration pairs.

    Only one voice is considered: when the score has more than one part, the
    first part that contains measures is used.  Repeats are expanded.  Rests
    are skipped entirely (only ``flat.notes`` is read), and a chord is
    reduced to a single note built from its root with the chord's
    ``quarterLength``.

    Args:
        inScore: score-like object exposing ``parts``, ``flat`` and
            ``expandRepeats()``.

    Returns:
        np.ndarray of dtype object.  For a melody of n >= 2 notes the shape
        is (n - 1, 2): column 0 holds the result of
        ``interval.notesToChromatic(prev, cur)`` and column 1 the tuple
        ``(str(prev quarterLength), str(cur quarterLength))``.  With fewer
        than two notes the array is empty with shape (0,).
    """
    # Single-voice simplification: switch to the first part that has measures.
    if len(inScore.parts) > 1:
        for part in inScore.parts:
            if part.hasElementOfClass('Measure'):
                inScore = part
                break

    # expand repeats in the score
    if inScore.flat.hasElementOfClass('Repeat'):
        inScore = inScore.expandRepeats()

    interval_duration_list = []
    prev_note = None
    for element in inScore.flat.notes:
        cur_note = element
        # for a chord we take the root pitch and construct a note object
        if element.isChord:
            cur_note = note.Note(element.root())
            cur_note.quarterLength = element.quarterLength

        # The first note has no predecessor, so it only seeds prev_note.
        # Identity test on purpose: `== None` would go through the note
        # object's own __eq__.
        if prev_note is not None:
            inter = interval.notesToChromatic(prev_note, cur_note)
            duration = (str(prev_note.duration.quarterLength),
                        str(cur_note.duration.quarterLength))
            interval_duration_list.append((inter, duration))
        prev_note = cur_note

    # Each row mixes a scalar object with a tuple; dtype=object keeps the
    # (n - 1, 2) layout and avoids the ValueError newer NumPy raises for
    # inhomogeneous input when no dtype is given.
    return np.array(interval_duration_list, dtype=object)

In [5]:
def stringify(interval_durations, kind='interval'):
    """Turn one song's interval/duration array into a list of string tokens.

    The tokens are meant as input for the count vectorizer.

    Args:
        interval_durations: 2-D array whose column 0 holds interval objects
            and column 1 holds duration tuples.
        kind: which tokens to emit.
            'interval' - the second whitespace-separated field of
                ``str(interval)`` with its last character removed;
            'duration' - ``str()`` of the duration tuple;
            'both'     - ``str()`` of the whole row with surrounding square
                brackets and every '>' removed, minus its first 36
                characters.
            Any other value yields an empty list.

    Returns:
        list of str, one token per row.
    """
    if kind == 'interval':
        return [str(step).split()[1][:-1] for step in interval_durations[:, 0]]
    if kind == 'duration':
        return [str(pair) for pair in interval_durations[:, 1]]
    if kind == 'both':
        return [
            str(row).strip('[').strip(']').replace('>', '')[36:]
            for row in interval_durations
        ]
    return []

In [6]:
# create tokenizer and preprocessor for count vectorizer. Can add custom analyzer to add skip grams after for better results

from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams, skipgrams

def my_tokenizer(s):
    """Tokenizer for the count vectorizer: split ``s`` on runs of whitespace."""
    tokens = s.split()
    return tokens

def my_tokenizer2(s):
    """Pass-through tokenizer: return ``s`` itself, for input that is already a token sequence."""
    already_tokenized = s
    return already_tokenized

def my_preprocessor(s):
    """Pass-through preprocessor: return ``s`` itself without any modification."""
    untouched = s
    return untouched

## Now to get term matrix for all songs

In [7]:
import os
from collections import defaultdict
# get the list of files to perform analysis

def get_files(path):
    """Collect the song files to analyse, grouped by origin.

    Walks ``path`` recursively and records every ``.krn`` file under the name
    of the directory that directly contains it (the data is organised with
    the origin - country/region/composer - as the directory name).  The
    first entry returned by music21's ``corpus.getComposer('miscFolk')`` is
    then appended to the ``'usa'`` list.

    Args:
        path: root directory of the local ``.krn`` collection.

    Returns:
        defaultdict(list) mapping origin name -> list of file paths.
    """
    dict_of_files = defaultdict(list)

    for dirpath, _dirnames, filenames in os.walk(path):
        # basename/join honour the platform's path separator; splitting on a
        # literal '/' would return the whole path wherever os.sep differs.
        origin = os.path.basename(dirpath)
        for filename in filenames:
            if filename.endswith('.krn'):
                dict_of_files[origin].append(os.path.join(dirpath, filename))

    # now grab USA folk files from music21 library
    usa_path = corpus.getComposer('miscFolk')[0]
    dict_of_files['usa'].append(usa_path)

    return dict_of_files

In [8]:
# files saved locally on my computer - the serialized string dicts are available online

# Root directory of the local folk-song collection (one sub-directory per origin).
FOLK_MUSIC_DIR = '/Users/kzecchini/ds/metis/sheet_music/data/folk_music'
files = get_files(FOLK_MUSIC_DIR)

In [9]:
# Inspect the paths collected under the 'usa' key.
files['usa']

['/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa01.krn',
 '/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa02.krn',
 '/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa03.krn',
 '/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa04.krn',
 '/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa05.krn',
 '/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa06.krn',
 '/Users/kzecchini/ds/metis/sheet_music/data/folk_music/usa/usa07.krn',
 u'/Users/kzecchini/anaconda/lib/python2.7/site-packages/music21/corpus/miscFolk/americanfifeopus.abc']

In [10]:
def parse_song(song_file):
    """Parse one song file with ``converter.parse`` and return the result.

    Args:
        song_file: path of the file to parse.

    Returns:
        Whatever ``converter.parse`` returns for that file.
    """
    return converter.parse(song_file)

In [11]:
def create_parsed_dict_notes(song_dict):
    """Parse every file in ``song_dict`` into a space-joined note-token string.

    Each file is parsed with ``parse_song``.  If the result is a
    ``stream.Opus`` every score it contains is processed separately;
    otherwise the result is treated as a single score.  Each score is
    converted with ``get_notes`` and its tokens are joined with single
    spaces.

    Args:
        song_dict: mapping origin -> list of song file paths.

    Returns:
        defaultdict(list) mapping origin -> list of token strings, one per
        score.  Origins with no songs get no entry.

    Side effects:
        Prints each origin name as it is processed.
    """
    parsed_songs_dict = defaultdict(list)
    # .items() and print(x) behave the same on Python 2 and also run on Python 3.
    for origin, songs in song_dict.items():
        print(origin)
        for song in songs:
            song_score = parse_song(song)
            # Normalise to a list of scores so both cases share one code path;
            # isinstance also accepts subclasses of Opus.
            if isinstance(song_score, stream.Opus):
                scores = song_score.scores
            else:
                scores = [song_score]
            for indv_song in scores:
                parsed_songs_dict[origin].append(' '.join(get_notes(indv_song)))
    return parsed_songs_dict

In [12]:
def create_parsed_dict_intervals(song_dict):
    """Parse every file in ``song_dict`` into interval/duration data.

    Each file is parsed with ``parse_song``.  If the result is a
    ``stream.Opus`` every score it contains is processed separately;
    otherwise the result is treated as a single score.  Each score is
    converted with ``get_intervals_and_durations``.

    Args:
        song_dict: mapping origin -> list of song file paths.

    Returns:
        defaultdict(list) mapping origin -> list of
        ``get_intervals_and_durations`` results, one per score.  Origins
        with no songs get no entry.

    Side effects:
        Prints each origin name as it is processed.
    """
    parsed_songs_dict = defaultdict(list)
    # .items() and print(x) behave the same on Python 2 and also run on Python 3.
    for origin, songs in song_dict.items():
        print(origin)
        for song in songs:
            song_score = parse_song(song)
            # Normalise to a list of scores so both cases share one code path;
            # isinstance also accepts subclasses of Opus.
            if isinstance(song_score, stream.Opus):
                scores = song_score.scores
            else:
                scores = [song_score]
            for indv_song in scores:
                parsed_songs_dict[origin].append(get_intervals_and_durations(indv_song))
    return parsed_songs_dict

In [19]:
# Parse every collected file into space-joined note-token strings, keyed by origin.
# create_parsed_dict_notes prints each origin name as it is processed.
parsed_dict = create_parsed_dict_notes(files)

foster
romania
jugoslav




england
allerkbd




chinese
mexico
czech




scotland
pawnee
lux_r
altdeu1




fink
ukraina




lux_m
danmark




shanxi




han




lux_n
lux_t
lux_s
lorraine
nova
france




kinder
boehme
lux_l
italia
rossiya




ireland
lux_k
gershwin




ballad




sverige
usa
variant
xinhua
zuccal




natmin




schweiz




magyar




luxembrg
ojibway
sioux




polska




lothring




dva




altdeu2




british
elsass
nederlan




oesterrh




african
pentatonic
friuli
erk




In [13]:
# Parse the same files into per-song interval/duration arrays, keyed by origin.
# create_parsed_dict_intervals prints each origin name as it is processed.
parsed_dict2 = create_parsed_dict_intervals(files)

foster
romania
jugoslav
england
allerkbd
chinese
mexico
czech
scotland
pawnee
lux_r
altdeu1
fink
ukraina
lux_m
danmark
shanxi
han
lux_n
lux_t
lux_s
lorraine
nova
france
kinder
boehme
lux_l
italia
rossiya
ireland
lux_k
gershwin
ballad
sverige
usa
variant
xinhua
zuccal
natmin
schweiz
magyar
luxembrg
ojibway
sioux
polska
lothring
dva
altdeu2
british
elsass
nederlan
oesterrh
african
pentatonic
friuli
erk


In [15]:
# Spot-check: the second-to-last parsed song stored under 'romania'.
parsed_dict2['romania'][-2]

array([[<music21.interval.ChromaticInterval 5>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval 2>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval 1>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval 2>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval -2>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval -1>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval -2>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval 0>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval 7>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval 0>, ('1.0', '0.5')],
       [<music21.interval.ChromaticInterval 0>, ('0.5', '0.5')],
       [<music21.interval.ChromaticInterval 0>, ('0.5', '1.0')],
       [<music21.interval.ChromaticInterval 5>, ('1.0', '1.0')],
       [<music21.interval.ChromaticInterval -5>, ('1.0', '2.0')],
       [<music21.interval.ChromaticInterval 0>, ('2.0', '1.0')],
       [<music21.inte

In [26]:
try:
    import cPickle as pickle  # Python 2: C implementation of pickle
except ImportError:
    import pickle  # Python 3: cPickle was merged into pickle

# Persist both parsed dictionaries so the slow parsing step need not be repeated.
# Pickle data is a byte stream, so the files are opened in binary mode ('wb').
with open('parsed_dict_notes.pkl', 'wb') as picklefile:
    pickle.dump(parsed_dict, picklefile)

with open('parsed_dict_intervals.pkl', 'wb') as picklefile:
    pickle.dump(parsed_dict2, picklefile)

# Stringify intervals/durations into three separate dictionaries...

In [16]:
def stringify_dict(interval_dict, kind='interval'):
    """Apply ``stringify`` to every song of every origin.

    Args:
        interval_dict: mapping origin -> list of per-song arrays.
        kind: token kind forwarded unchanged to ``stringify``.

    Returns:
        dict mapping origin -> list of ``stringify`` results, one per song.
        Songs whose array has shape ``(0,)`` are skipped; an origin whose
        songs are all skipped maps to an empty list.
    """
    result_dict = {}
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for origin, interval_durations in interval_dict.items():
        result_dict[origin] = [
            stringify(interval_duration, kind=kind)
            for interval_duration in interval_durations
            # shape (0,) marks a song that produced no intervals; skip it
            if interval_duration.shape != (0,)
        ]
    return result_dict

In [17]:
# Build the three token dictionaries (intervals only, durations only, both combined).
interval_dict, duration_dict, both_dict = (
    stringify_dict(parsed_dict2, kind=token_kind)
    for token_kind in ('interval', 'duration', 'both')
)

In [18]:
# Spot-check the last 'romania' song in each token dictionary.
# print(x) with one argument prints the same on Python 2 and also runs on Python 3.
print(interval_dict['romania'][-1])
print(duration_dict['romania'][-1])
print(both_dict['romania'][-1])

['5', '0', '-1', '-2', '0', '-2', '0', '5', '0', '2', '3', '-1', '-9', '9', '-2', '2', '3', '0', '-2', '-1', '-2', '-7', '0', '10', '0', '-1', '-2', '-2', '-5', '9', '-2', '-2', '-5', '9', '-2', '2', '3', '0', '-2', '-1', '-2', '-7', '0', '10', '0', '-1', '-2', '-2', '-5', '9', '-2', '-2']
["('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '1.0')", "('1.0', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '1.0')", "('1.0', '2.0')", "('2.0', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '1.0')", "('1.0', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '1.0')", "('1.0', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '1.0')", "('1.0', '2.0')", "('2.0', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '0.5')", "('0.5', '1.0')", "('1.0', '0.5')", "('0.5

In [48]:
# Persist the three token dictionaries.
# Pickle data is a byte stream, so the files are opened in binary mode ('wb').
with open('interval_dict.pkl', 'wb') as picklefile:
    pickle.dump(interval_dict, picklefile)
with open('duration_dict.pkl', 'wb') as picklefile:
    pickle.dump(duration_dict, picklefile)
with open('both_dict.pkl', 'wb') as picklefile:
    pickle.dump(both_dict, picklefile)