In [2]:
"""THIS TESTER IS USING THE FOLLOWING:

Dataset: 
Music21 corpus 'ryansMammoth'

Testing features:
*Merging two feature vectors using FeatureUnion and Pipelines.
1) Notes frequency feature vector, ngrams = unigrams and bigrams.
2) Steps frequency feature vector, ngrams = bigrams and trigrams.

Classifier: 
SVM

Additional scikit modules:
Cross validation, feature union, metrics

"""

###############################################

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import cross_validation
from sklearn import metrics
import cPickle

In [3]:
def build_feature_corpus(corpus):
    """Build the text documents and labels used to train the mode classifier.

    Every entry of ``corpus`` is handed to ``music21.corpus.parse`` and turned
    into two strings of space-terminated tokens:

    * a notes document: the ``name`` of every note returned by
      ``theoryAnalyzer.getNotes(score, 0)``, in order;
    * a steps document: for each pair of consecutive notes, the signed value
      ``int(interval.cents / 100)`` of the interval between them.

    Args:
        corpus: iterable of work identifiers accepted by
            ``music21.corpus.parse``.

    Returns:
        A ``(features, outcomes)`` tuple.  ``features`` is a dict with the
        keys ``'notes_freq'`` and ``'steps_freq'``, each holding a list with
        one string per score.  ``outcomes`` is a list with one boolean per
        score, True when the mode reported by
        ``theoryAnalyzer.getKeyAtMeasure(score, 0)`` equals ``'major'``.
    """
    analyzer = music21.alpha.theoryAnalysis.theoryAnalyzer
    notes_corpus, steps_corpus, outcomes = [], [], []

    for work in corpus:
        score = music21.corpus.parse(work)

        # Entries that are None are skipped.  The identity test is used on
        # purpose: ``== None`` would be routed through the note's own __eq__.
        notes = [note for note in analyzer.getNotes(score, 0) if note is not None]

        # Each token is followed by one space (the last one too), which keeps
        # the documents identical to what earlier runs produced.
        notes_corpus.append("".join(note.name + " " for note in notes))

        # One step per pair of neighbouring notes; a score with fewer than
        # two notes yields an empty steps document.
        steps = []
        for previous, current in zip(notes, notes[1:]):
            interval = music21.interval.Interval(noteStart=previous, noteEnd=current)
            steps.append(str(int(interval.cents / 100)))
        steps_corpus.append("".join(step + " " for step in steps))

        mode_at_measure_0 = analyzer.getKeyAtMeasure(score, 0).mode
        outcomes.append(mode_at_measure_0 == 'major')

    features = {'notes_freq': notes_corpus, 'steps_freq': steps_corpus}
    return features, outcomes


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single entry out of a keyed container.

    ``transform`` returns ``data_dict[key]`` unchanged, which lets each branch
    of a feature union read its own entry from one shared features dict.
    """

    def __init__(self, key):
        # Key looked up in whatever is later passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Learn nothing; return this transformer so calls can be chained."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

In [11]:
def build_pipeline():
    """Create the (unfitted) feature extractor and classifier.

    The feature extractor is a ``FeatureUnion`` with two branches.  Each
    branch uses ``ItemSelector`` to read one entry of the features dict and a
    ``TfidfVectorizer`` to turn those strings into TF-IDF n-gram vectors:

    * ``'notes'`` reads ``'notes_freq'`` and uses unigrams and bigrams;
    * ``'steps'`` reads ``'steps_freq'`` and uses bigrams and trigrams.

    Returns:
        A ``(feature_vectorizer, classifier)`` tuple: the ``FeatureUnion``
        and an ``SVC`` built with its default arguments.
    """
    # A note token is one word character followed by any run of '#' / '-'
    # accidental marks, so 'C', 'F#', 'B-', 'F##' and 'E--' each stay a single
    # token.  The earlier pattern r'\w#?-?' accepted at most one of each mark
    # and therefore cut 'F##' down to 'F#'.
    # NOTE(review): assumes note names use '-' for flats -- confirm.
    notes_tfidf = TfidfVectorizer(
        min_df=1,
        analyzer='word',
        stop_words=None,
        ngram_range=(1, 2),
        token_pattern=r'\w[#-]*',
    )

    # build_feature_corpus writes steps as signed integers.  The earlier
    # pattern r'\d\d?' matched digits only, so the '-' of a descending step
    # was discarded and '-2' was counted as the same token as '2'.  The
    # optional sign keeps direction; \d+ also keeps values of any width whole.
    steps_tfidf = TfidfVectorizer(
        min_df=1,
        analyzer='word',
        stop_words=None,
        ngram_range=(2, 3),
        token_pattern=r'-?\d+',
    )

    notes_branch = Pipeline([
        ('selector', ItemSelector(key='notes_freq')),
        ('tfidf', notes_tfidf),
    ])
    steps_branch = Pipeline([
        ('selector', ItemSelector(key='steps_freq')),
        ('tfidf', steps_tfidf),
    ])

    feature_vectorizer = FeatureUnion(
        transformer_list=[
            ('notes', notes_branch),
            ('steps', steps_branch),
        ])

    classifier = SVC()

    return feature_vectorizer, classifier

In [12]:
def train_classifier(feature_vectorizer, classifier, features, outcomes):
    """Cross-validate the vectorizer and classifier over 5 stratified folds.

    For each fold the features dict is split by index, the vectorizer is
    fitted on the training documents only and then applied to the held-out
    documents.  Fitting the vectorizer once on the full data set (as was done
    before) lets the held-out documents shape the vocabulary and IDF weights,
    which leaks test information into training.

    Args:
        feature_vectorizer: object providing ``fit_transform`` and
            ``transform`` that accepts the features dict.
        classifier: object providing ``fit(X, y)`` and ``predict(X)``.
        features: dict mapping a feature name to a list with one document
            per score; every list is indexed in the same order as
            ``outcomes``.
        outcomes: sequence with one boolean label per score.

    Returns:
        A ``(classifier, precision, recall)`` tuple.  ``classifier`` is the
        object passed in, left as fitted on the last fold's training split.
        ``precision`` and ``recall`` are lists with one value per fold, both
        for the ``True`` label.
    """
    cv = cross_validation.StratifiedKFold(outcomes, 5)
    outcomes = np.ravel(outcomes)

    precision, recall = [], []

    for train_idx, test_idx in cv:
        # The features are still a dict of document lists at this point, so
        # each list is sliced by fold index before any vectorizing happens.
        train_features = {key: [docs[i] for i in train_idx]
                          for key, docs in features.items()}
        test_features = {key: [docs[i] for i in test_idx]
                         for key, docs in features.items()}

        X_train = feature_vectorizer.fit_transform(train_features)
        X_test = feature_vectorizer.transform(test_features)
        y_train = outcomes[train_idx]
        y_test = outcomes[test_idx]

        classifier.fit(X_train, y_train)
        y_predicted = classifier.predict(X_test)

        # Explicit labels pin index 1 to the True label even when a fold's
        # truth and predictions contain a single label; without them the
        # result arrays would have length 1 and p[1] would raise IndexError.
        p, r, _, _ = metrics.precision_recall_fscore_support(
            y_test, y_predicted, labels=[False, True])
        precision.append(p[1])
        recall.append(r[1])

    return classifier, precision, recall

In [13]:
def show_precision_recall(precision, recall):
    """Print the mean and standard deviation of the per-fold scores.

    Args:
        precision: sequence of precision values, one per fold.
        recall: sequence of recall values, one per fold.

    Prints two lines, ``Precision:  <mean> +/- <std>`` and
    ``Recall:  <mean> +/- <std>``; nothing is returned.
    """
    # A print(...) with a single argument parses as a print statement on
    # Python 2 and as a function call on Python 3, so the cell runs on both.
    # The two spaces after the colon reproduce what the former
    # `print "Precision: ", value, ...` statement emitted.
    print("Precision:  {} +/- {}".format(np.average(precision), np.std(precision)))
    print("Recall:  {} +/- {}".format(np.average(recall), np.std(recall)))

In [16]:
# def predict(pipeline, validation_features):
#     """Takes notes_corpus as a list of test scores (each a string of notes)."""
    
#     predictions = pipeline.predict(validation_features)
#     outcomes = validation_features['outcomes']
#     print 'PREDICTION:', predictions
#     print ""
#     print 'ACTUAL OUTCOMES: ', outcomes
    
#     count = 0
#     for i in range(len(predictions)):
#         if predictions[i] == outcomes[i]:
#             count +=1
        
#     print '{} correct predictions out of {} sample test files'.format(count, len(outcomes))
#     print float(count)/len(outcomes) * 100

In [7]:
# ------------------------------Executable Code --------------------------------

# Look up the 'ryansMammoth' entries of the music21 corpus and build, for every
# score, the notes/steps documents (features) and the major-vs-not label (outcomes).
# NOTE(review): presumably getComposer returns identifiers that
# music21.corpus.parse accepts, since build_feature_corpus passes each one to it -- confirm.
scores = music21.corpus.getComposer('ryansMammoth')
features, outcomes = build_feature_corpus(scores)

In [14]:
# Build the feature extractor and classifier, cross-validate them with
# 5 stratified folds, and print the mean +/- std of per-fold precision and recall.
# Note: despite its name, trained_pipeline is only the classifier object that
# train_classifier returns (fitted on the last fold); the vectorizer is not part of it.
feature_vectorizer, classifier = build_pipeline()
trained_pipeline, precision, recall = train_classifier(feature_vectorizer, classifier, features, outcomes)
show_precision_recall(precision, recall)

Precision:  0.824365747977 +/- 0.00143330832899
Recall:  1.0 +/- 0.0
