In [71]:
###############################################

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [72]:
def build_feature_corpus(filenames):
    """Takes MIDI files and creates feature arrays to be used for training."""

    notes_corpus = []
    steps_corpus = []
    mode_corpus = []

    # Iterating through every other file; for early trials using 1/2 files as training set, and other 1/2 to test
    for filename in filenames:

        # Convert MIDI file to score and iterate over notes in score
        # Save relevant individual Note attributes into list, including Note obj (used to calc interval) and name
        score = music21.converter.parse('MIDI test files/Cello solos/' + filename)
        note_attributes = []
        for note in music21.alpha.theoryAnalysis.theoryAnalyzer.getNotes(score, 0):
            if note == None:
                pass
            else:
                note_attributes.append([note, note.name])
        print "Note attributes for file {} completed".format(filename)

        # Creates string of all notes in a score and appends to notes_corpus
        all_notes = ""
        for note in note_attributes:
            all_notes += note[1] + " "

        notes_corpus.append(all_notes)

        # Creates string of all steps in a score and appends to steps_corpus
        all_steps = ""
        for i in range(1, len(note_attributes)):
            interval = music21.interval.Interval(noteStart=note_attributes[i-1][0], noteEnd=note_attributes[i][0])
            step = int((interval.cents)/100)
            all_steps += str(step) + ' '

        steps_corpus.append(all_steps)

        # Determines the mode (major or minor) and assigns to output corpus
        # Note: At the moment, this doesn't take into account changes b/w major + minor w/in a score
        mode_at_measure_0 = music21.alpha.theoryAnalysis.theoryAnalyzer.getKeyAtMeasure(score, 0).mode
        is_major = (mode_at_measure_0 == 'major')
        mode_corpus.append(is_major)

    return notes_corpus, steps_corpus, mode_corpus


In [73]:
def build_notes_feature_vector_and_fit_model(notes_corpus, mode_corpus):

    notes_vectorizer = TfidfVectorizer(min_df=1, analyzer='word', stop_words=None, token_pattern=r'\w#?-?')

    X_train = notes_vectorizer.fit_transform(notes_corpus)
    print X_train.shape
    print X_train
    
    y_train = np.ravel(mode_corpus)
    print y_train.shape
    print y_train

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    print "TRAINING SET SCORE: ", classifier.score(X_train, y_train)

    # notes_pipeline = Pipeline([('notes_vecorizer', notes_vectorizer), ('classifier', classifier)])

    return notes_vectorizer, classifier

In [46]:
# def build_steps_feature_vector_and_fit_model(steps_corpus, mode_corpus):

#     steps_vectorizer = TfidfVectorizer(min_df=1, analyzer='word', stop_words=None, ngram_range=(2,3), token_pattern=r'\d\d?')

#     X_train = steps_vectorizer.fit_transform(steps_corpus)
#     y_train = np.ravel(mode_corpus)

#     classifier = LogisticRegression()
#     classifier.fit(X_train, y_train)
#     print "TRAINING SET SCORE: ", classifier.score(X_train, y_train)

#     # notes_pipeline = Pipeline([('notes_vecorizer', notes_vectorizer), ('classifier', classifier)])

#     return steps_vectorizer, classifier

In [76]:
def build_feature_vector_and_predict(vectorizer, classifier, feature_corpus, mode_corpus):
    """Takes notes_corpus as a list of test scores (each a string of notes)."""
    
    X_test = vectorizer.transform(feature_corpus)
    print type(X_test), X_test.shape

    prediction = classifier.predict(X_test)
    prediction = list(prediction)
    print 'PREDICTION:', prediction
    print ""
    print 'ACTUAL OUTCOMES: ', mode_corpus
    
    count = 0
    for i in range(len(prediction)):
        if prediction[i] == mode_corpus[i]:
            count +=1
        
    print '{} correct predictions out of {} sample test files'.format(count, len(mode_corpus))
    print float(count)/len(outcomes) * 100

In [75]:

# ------------------------------Executable Code --------------------------------

# read test files and construct columns
filenames = open('test_files.txt').read().split('\n')

# Build training features from input midi files
notes_corpus_training, steps_corpus_training, mode_corpus_training = build_feature_corpus(filenames[::2])
notes_vectorizer, notes_classifier = build_notes_feature_vector_and_fit_model(notes_corpus_training, mode_corpus_training)

Note attributes for file cs1-1pre.mid completed
Note attributes for file cs1-3cou.mid completed
Note attributes for file cs1-5men.mid completed
Note attributes for file cs2-1pre.mid completed
Note attributes for file cs2-3cou.mid completed
Note attributes for file cs2-5men.mid completed
Note attributes for file cs3-1pre.mid completed
Note attributes for file cs3-3cou.mid completed
Note attributes for file cs3-5bou.mid completed
Note attributes for file cs4-1pre.mid completed
Note attributes for file cs4-3cou.mid completed
Note attributes for file cs4-5bou.mid completed
Note attributes for file cs5-1pre.mid completed
Note attributes for file cs5-3cou.mid completed
Note attributes for file cs5-5gav.mid completed
Note attributes for file cs6-1pre.mid completed
Note attributes for file cs6-3cou.mid completed
Note attributes for file cs6-5gav.mid completed
(18, 12)
  (0, 2)	0.0146746471739
  (0, 7)	0.0278438545053
  (0, 8)	0.0618816098131
  (0, 11)	0.0271906900465
  (0, 4)	0.0880478830435
 

In [77]:
# Build features for testing set of midi files.
notes_corpus_testing, steps_corpus_testing, mode_corpus_testing = build_feature_corpus(filenames[1::2])
build_feature_vector_and_predict(notes_vectorizer, notes_classifier, notes_corpus_testing, mode_corpus_testing)

Note attributes for file cs1-2all.mid completed
Note attributes for file cs1-4sar.mid completed
Note attributes for file cs1-6gig.mid completed
Note attributes for file cs2-2all.mid completed
Note attributes for file cs2-4sar.mid completed
Note attributes for file cs2-6gig.mid completed
Note attributes for file cs3-2all.mid completed
Note attributes for file cs3-4sar.mid completed
Note attributes for file cs3-6gig.mid completed
Note attributes for file cs4-2all.mid completed
Note attributes for file cs4-4sar.mid completed
Note attributes for file cs4-6gig.mid completed
Note attributes for file cs5-2all.mid completed
Note attributes for file cs5-4sar.mid completed
Note attributes for file cs5-6gig.mid completed
Note attributes for file cs6-2all.mid completed
Note attributes for file cs6-4sar.mid completed
Note attributes for file cs6-6gig.mid completed
<class 'scipy.sparse.csr.csr_matrix'> (18, 12)
PREDICTION: [True, True, True, True, False, True, True, True, True, False, False, False, 

NameError: global name 'outcomes' is not defined

In [44]:
# steps_vectorizer, steps_classifier = build_steps_feature_vector_and_fit_model(steps_corpus_training, mode_corpus_training)
# build_feature_vector_and_predict(steps_vectorizer, steps_classifier, steps_corpus_testing, mode_corpus_testing)

TRAINING SET SCORE:  0.777777777778
<class 'scipy.sparse.csr.csr_matrix'> (18, 1812)
PREDICTION: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]

ACTUAL OUTCOMES:  [True, True, True, False, False, False, True, True, True, False, True, False, False, False, False, False, True, True]
