In [2]:
"""THIS TESTER IS USING THE FOLLOWING:

Dataset: 
Music21 corpus 'ryansMammoth'

Testing features:
1) TF-IDF-weighted note-name frequency feature vector over unigrams and bigrams (ngram_range = (1, 2)).

Classifier: 
Gaussian Naive Bayes (sklearn.naive_bayes.GaussianNB)

"""

###############################################

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
from sklearn import metrics

In [3]:
def build_feature_corpus(corpus):
    """Build per-score note-name strings and major/minor labels.

    Every entry of ``corpus`` is parsed with ``music21.corpus.parse``. The
    names of the notes returned by ``theoryAnalyzer.getNotes(score, 0)`` are
    concatenated into one string, and the mode of the key returned by
    ``theoryAnalyzer.getKeyAtMeasure(score, 0)`` becomes the label.

    Args:
        corpus: iterable of work identifiers accepted by
            ``music21.corpus.parse``.

    Returns:
        A tuple ``(notes_corpus, mode_corpus)`` of two parallel lists.
        ``notes_corpus[i]`` is a string in which every note name is followed
        by a single space (e.g. ``"C F# B- "``); ``mode_corpus[i]`` is ``True``
        when the mode equals ``'major'`` and ``False`` otherwise.
    """
    analyzer = music21.alpha.theoryAnalysis.theoryAnalyzer

    notes_corpus = []
    mode_corpus = []
    for work in corpus:
        score = music21.corpus.parse(work)

        # getNotes may yield None entries; those are skipped.
        # NOTE(review): the second argument (0) is presumably the part index
        # -- confirm against the music21 version in use.
        notes_corpus.append(
            "".join(
                entry.name + " "
                for entry in analyzer.getNotes(score, 0)
                if entry is not None
            )
        )

        # Label is a boolean: True for 'major', False for any other mode.
        mode_at_measure_0 = analyzer.getKeyAtMeasure(score, 0).mode
        mode_corpus.append(mode_at_measure_0 == 'major')

    # Single-argument call prints the same text on Python 2 and Python 3.
    print("Done")
    return notes_corpus, mode_corpus

In [4]:
def build_feature_vector_and_fit_model(training_corpus, outcomes):
    """Fit a TF-IDF vectorizer and a Gaussian Naive Bayes classifier.

    Args:
        training_corpus: iterable of strings, one per training score, each
            holding whitespace-separated note names.
        outcomes: labels for the training scores; flattened with ``np.ravel``
            before fitting.

    Returns:
        A tuple ``(vectorizer, classifier_NB)``: the fitted
        ``TfidfVectorizer`` and the fitted ``GaussianNB``.

    Side effects:
        Prints the classifier's score on the training data itself.
    """
    # The token regex matches one word character optionally followed by one
    # '#' and then one '-'. A token therefore keeps at most one of each
    # accidental sign, so a name such as "F##" yields the token "F#".
    vectorizer = TfidfVectorizer(
        min_df=1,
        analyzer='word',
        stop_words=None,
        ngram_range=(1, 2),
        token_pattern=r'\w#?-?',
    )

    # Convert the vectorizer output to a dense array before fitting.
    X_train = vectorizer.fit_transform(training_corpus).toarray()
    y_train = np.ravel(outcomes)

    classifier_NB = GaussianNB()
    classifier_NB.fit(X_train, y_train)

    # Single-argument print keeps the exact text of the former Python 2
    # statement (label, two spaces, value) and also runs on Python 3.
    print("TRAINING SET SCORE:  {}".format(classifier_NB.score(X_train, y_train)))

    return vectorizer, classifier_NB

In [5]:
def predict(vectorizer, classifier, validation_corpus, outcomes):
    """Classify validation scores and print a comparison with the true labels.

    Args:
        vectorizer: fitted vectorizer; ``transform`` is called on
            ``validation_corpus`` and the result is densified with
            ``toarray()``.
        classifier: fitted classifier exposing ``predict``.
        validation_corpus: list of validation scores, each a string of notes.
        outcomes: true labels, in the same order as ``validation_corpus``.

    Returns:
        None. Everything is reported on standard output: the type and shape
        of the transformed matrix, the predictions, the actual outcomes, the
        number of matches, and the accuracy as a percentage.

    Predictions and outcomes are compared position by position; if the two
    sequences differ in length, only the overlapping prefix is compared. An
    empty ``outcomes`` reports an accuracy of 0.0 instead of dividing by zero.
    """
    X_test = vectorizer.transform(validation_corpus)
    # Single-argument print calls produce the same text on Python 2 and 3.
    print("{} {}".format(type(X_test), X_test.shape))
    X_test = X_test.toarray()

    prediction = list(classifier.predict(X_test))
    print("PREDICTION: {}".format(prediction))
    print("")
    print("ACTUAL OUTCOMES:  {}".format(outcomes))

    count = sum(
        1 for predicted, actual in zip(prediction, outcomes)
        if predicted == actual
    )
    total = len(outcomes)

    print('{} correct predictions out of {} sample test files'.format(count, total))
    # Guard the percentage against an empty validation set.
    accuracy = float(count) / total * 100 if total else 0.0
    print(accuracy)

In [6]:
# ------------------------------Executable Code --------------------------------
# Look up the 'ryansMammoth' entries through music21's corpus module.
scores = music21.corpus.getComposer('ryansMammoth')

# Split by alternating position: even-indexed entries are used for training,
# odd-indexed entries for validation.
training_files = scores[::2]
validation_files = scores[1::2]

In [7]:
# Build the training inputs: one note-name string per training file, plus a
# parallel list of boolean labels (True when the mode is 'major').
training_corpus, training_outcomes = build_feature_corpus(training_files)

Done


In [8]:
# Fit the TF-IDF vectorizer and the Gaussian Naive Bayes classifier on the
# training data; the call also prints the score on that same training data.
vectorizer, classifier = build_feature_vector_and_fit_model(training_corpus, training_outcomes)

TRAINING SET SCORE:  0.422641509434


In [9]:
# Build the validation inputs the same way as the training inputs: one
# note-name string and one boolean label per validation file.
validation_corpus, validation_outcomes = build_feature_corpus(validation_files)

Done


In [10]:
# Classify the validation files and print predictions, actual labels, the
# number of correct predictions and the accuracy percentage.
predict(vectorizer, classifier, validation_corpus, validation_outcomes)

<class 'scipy.sparse.csr.csr_matrix'> (529, 192)
PREDICTION: [True, False, True, False, False, True, True, False, False, False, False, False, True, False, False, False, True, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, True, False, True, False, False, False, True, False, False, True, False, False, True, True, False, True, True, False, False, False, False, True, False, True, False, False, False, False, False, False, True, False, False, True, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, True, True, False, True, T