In [18]:
"""THIS TESTER IS APPLYING NAIVE BAYES TO EXISTING NOTES FEATURE VECTORS.

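Notes frequency feature vectors use the latest trial setting: n-grams in the range 1-2.
Results recorded from an earlier run (the cell outputs further down show a training
score of 0.722 and 13 of 18 correct, so re-record these after the next full run):
    TRAINING SET SCORE:  0.944444444444
    Validation Results: 11 correct predictions out of 18 sample test files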
"""

###############################################

import os

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [2]:
def build_feature_corpus(filenames, midi_dir='MIDI test files/Cello solos/'):
    """Turn MIDI files into note-name documents and major/minor labels.

    Each file is parsed with music21. The names of its notes are joined into
    one whitespace-separated string (a "document" for a text vectorizer), and
    the mode reported at measure 0 becomes the label.

    Note: a score that changes between major and minor part-way through is
    still labelled by its mode at measure 0 only.

    Args:
        filenames: iterable of MIDI file names, relative to ``midi_dir``.
            Empty or whitespace-only entries (e.g. produced by a trailing
            newline in a file list) are skipped.
        midi_dir: directory that contains the MIDI files. The default is the
            directory this notebook has always read from.

    Returns:
        (notes_corpus, mode_corpus): two parallel lists with one entry per
        processed file. ``notes_corpus[i]`` is a string of note names, each
        followed by a single space (e.g. ``"G D B "``). ``mode_corpus[i]`` is
        True when the mode at measure 0 equals ``'major'``, otherwise False.
    """
    analyzer = music21.alpha.theoryAnalysis.theoryAnalyzer

    notes_corpus = []
    mode_corpus = []

    # Every file passed in is processed; splitting the file list into
    # training and validation halves is the caller's responsibility.
    for filename in filenames:
        if not filename.strip():
            continue

        score = music21.converter.parse(os.path.join(midi_dir, filename))

        # The note list may contain None entries (the original code guarded
        # against them), so keep only real notes. Only the name is needed.
        # NOTE(review): the 0 passed to getNotes is presumably the part index - confirm.
        note_names = [
            note.name
            for note in analyzer.getNotes(score, 0)
            if note is not None
        ]
        print("Note attributes for file {} completed".format(filename))

        # Each name keeps its trailing space so the returned strings are
        # identical to what earlier versions of this function produced.
        notes_corpus.append("".join(name + " " for name in note_names))

        mode_at_measure_0 = analyzer.getKeyAtMeasure(score, 0).mode
        mode_corpus.append(mode_at_measure_0 == 'major')

    return notes_corpus, mode_corpus

In [19]:
def build_feature_vector_and_fit_model(training_corpus, outcomes, ngram_range=(1, 2)):
    """Vectorize the training documents with TF-IDF and fit Multinomial Naive Bayes.

    Prints the classifier's score on the training data itself, which is a
    sanity check rather than an estimate of generalization.

    Args:
        training_corpus: list of documents, each a string of note names
            separated by whitespace.
        outcomes: labels, one per document. Flattened with ``np.ravel``, so
            a nested or column-shaped sequence is accepted.
        ngram_range: ``(min_n, max_n)`` passed to the vectorizer. The default
            ``(1, 2)`` is the setting this notebook previously hard-coded.

    Returns:
        (vectorizer, classifier): the fitted ``TfidfVectorizer`` and the
        fitted ``MultinomialNB``. Use the same vectorizer to transform any
        data passed to the classifier later.
    """
    vectorizer = TfidfVectorizer(
        min_df=1,
        analyzer='word',
        stop_words=None,
        ngram_range=ngram_range,
        # A token is one word character plus an optional '#' and an optional '-'.
        # NOTE(review): a doubled accidental such as 'F##' would be tokenized
        # as 'F#' by this pattern - confirm whether that matters for the corpus.
        token_pattern=r'\w#?-?',
    )

    X_train = vectorizer.fit_transform(training_corpus).toarray()
    y_train = np.ravel(outcomes)

    classifier_NB = MultinomialNB()
    classifier_NB.fit(X_train, y_train)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("TRAINING SET SCORE:  {}".format(classifier_NB.score(X_train, y_train)))

    return vectorizer, classifier_NB

In [11]:
def predict(vectorizer, classifier, validation_corpus, outcomes):
    """Classify validation scores and report how many predictions are correct.

    Prints the type and shape of the transformed matrix, the predictions, the
    actual outcomes, and the number of matching predictions.

    Args:
        vectorizer: fitted vectorizer; only its ``transform`` method is used.
        classifier: fitted classifier; only its ``predict`` method is used.
        validation_corpus: list of test scores, each a string of notes.
        outcomes: actual labels, one per entry of ``validation_corpus``.

    Returns:
        The predictions as a list, in the same order as ``validation_corpus``.

    Raises:
        ValueError: if the number of predictions differs from the number of
            outcomes, since the accuracy count would be meaningless.
    """
    X_test = vectorizer.transform(validation_corpus)
    print("{} {}".format(type(X_test), X_test.shape))
    # Densified because the classifier is caller-supplied and may not accept
    # a sparse matrix.
    X_test = X_test.toarray()

    prediction = list(classifier.predict(X_test))
    if len(prediction) != len(outcomes):
        raise ValueError(
            'got {} predictions but {} outcomes'.format(len(prediction), len(outcomes))
        )

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('PREDICTION: {}'.format(prediction))
    print("")
    print('ACTUAL OUTCOMES:  {}'.format(outcomes))

    count = sum(1 for predicted, actual in zip(prediction, outcomes) if predicted == actual)

    print('{} correct predictions out of {} sample test files'.format(count, len(outcomes)))

    return prediction

In [20]:
# ------------------------------Executable Code --------------------------------

# Read the list of MIDI file names (one per line), then split it into training
# and validation sets by taking alternating entries.
with open('test_files.txt') as file_list:
    # splitlines() plus the blank-line filter avoids a bogus '' entry when the
    # file ends with a newline, and copes with Windows line endings.
    filenames = [line for line in file_list.read().splitlines() if line.strip()]

training_files = filenames[::2]     # entries 0, 2, 4, ...
validation_files = filenames[1::2]  # entries 1, 3, 5, ...

In [21]:
# Build training features from input midi files: one string of note names and
# one is-major flag per training file.
training_corpus, training_outcomes = build_feature_corpus(training_files)

Note attributes for file cs1-1pre.mid completed
Note attributes for file cs1-3cou.mid completed
Note attributes for file cs1-5men.mid completed
Note attributes for file cs2-1pre.mid completed
Note attributes for file cs2-3cou.mid completed
Note attributes for file cs2-5men.mid completed
Note attributes for file cs3-1pre.mid completed
Note attributes for file cs3-3cou.mid completed
Note attributes for file cs3-5bou.mid completed
Note attributes for file cs4-1pre.mid completed
Note attributes for file cs4-3cou.mid completed
Note attributes for file cs4-5bou.mid completed
Note attributes for file cs5-1pre.mid completed
Note attributes for file cs5-3cou.mid completed
Note attributes for file cs5-5gav.mid completed
Note attributes for file cs6-1pre.mid completed
Note attributes for file cs6-3cou.mid completed
Note attributes for file cs6-5gav.mid completed


In [7]:
# Inspect the raw training data. Single-argument print() produces the same
# output on Python 2 and also runs on Python 3.
# Note: this prints every note of every training score, so the output is long.
print(training_corpus)
print(training_outcomes)

['G D B A B D B D G D B A B D B D G E C B C E C E G E C B C E C E G F# C B C F# C F# G F# C B C F# C F# G G B A B G B G G G B A B G B F# G E B A B G F# G E G F# G B D C# B C# G A G A G A G C# G A G A G A G F# A D C# D A G A F# A G A D F# E D E B G F# G B G B E B G F# G B G B E C# D E D C# B A G F# E D C# B A G F# E D D A D F# A D E F# A G F# E D G# D F E F D G# D B D F E F D G# D C E A B C A E D C E A B C A F# E E- F# E- F# A F# A F# E- F# E- F# A F# A F# G F# E G F# G A F# G F# E D C B A G F# C D C D C D C F# C D C D C D C G B F E F B F B G B F E F B F B G C E D E C E C G C E D E C E C G F# C B C F# C F# G F# C B C F# C F# G D B A B G F# E D C B A G F# E D C# A E F# G E F# G C# A E F# G E F# G C A D E F# D E F# C A D E F# D E F# C A D F# A C# D A B C D E F# G A F# D E F# G A B C A F# G A B C D E- D C# D D C B C C A F# E D A B C D A D F# A B C A B G D C B G A B D G B D G A B G C# B A B- B- A G# A A G F# G G E C# B A C# E G A C# D C# D A F# E F# A D F# A D C# B A G F# E D C B A G F# E D

In [22]:
# Fit the TF-IDF vectorizer and the Multinomial Naive Bayes classifier on the
# training corpus; the call also prints the training-set score.
vectorizer, classifier = build_feature_vector_and_fit_model(training_corpus, training_outcomes)

TRAINING SET SCORE:  0.722222222222


In [23]:
# Build features for validation data set of midi files: one string of note
# names and one is-major flag per validation file.
validation_corpus, validation_outcomes = build_feature_corpus(validation_files)

Note attributes for file cs1-2all.mid completed
Note attributes for file cs1-4sar.mid completed
Note attributes for file cs1-6gig.mid completed
Note attributes for file cs2-2all.mid completed
Note attributes for file cs2-4sar.mid completed
Note attributes for file cs2-6gig.mid completed
Note attributes for file cs3-2all.mid completed
Note attributes for file cs3-4sar.mid completed
Note attributes for file cs3-6gig.mid completed
Note attributes for file cs4-2all.mid completed
Note attributes for file cs4-4sar.mid completed
Note attributes for file cs4-6gig.mid completed
Note attributes for file cs5-2all.mid completed
Note attributes for file cs5-4sar.mid completed
Note attributes for file cs5-6gig.mid completed
Note attributes for file cs6-2all.mid completed
Note attributes for file cs6-4sar.mid completed
Note attributes for file cs6-6gig.mid completed


In [24]:
# Classify the validation corpus with the fitted model and print the
# predictions, the actual outcomes, and the number of matches.
predict(vectorizer, classifier, validation_corpus, validation_outcomes)

<class 'scipy.sparse.csr.csr_matrix'> (18, 156)
PREDICTION: [True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True]

ACTUAL OUTCOMES:  [True, True, True, False, False, False, True, True, True, False, True, False, False, False, False, False, True, True]
13 correct predictions out of 18 sample test files
