In [36]:
"""THIS TESTER IS USING THE FOLLOWING:

Dataset: 
Music21 corpus 'ryansMammoth'
Music21 corpus 'bach'

Testing features:
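*Merging two feature vectors using FeatureUnion and Pipelines
 (the notes vector is currently commented out in build_pipeline(), so only the steps vector is active).
1) Notes frequency feature vector, ngrams = unigrams and bigrams (ngram_range=(1, 2)).
2) Steps frequency feature vector, ngrams = bigrams and trigrams (ngram_range=(2, 3)).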

Classifier: 
Used for all

Additional scikit modules:
Cross validation, feature union, metrics

"""

###############################################

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import cross_validation
from sklearn import metrics
import cPickle

In [11]:
def score_generator(filepaths):
    """Lazily parse corpus works, yielding one parsed score at a time.

    Nothing is parsed until the generator is advanced, and each path is
    parsed only when its score is requested.

    Args:
        filepaths: iterable of corpus paths accepted by
            ``music21.corpus.parse``. Any iterable works (list, tuple,
            generator, ...); ``len()`` and indexing are not required.

    Yields:
        Whatever ``music21.corpus.parse`` returns for each path, in input
        order.
    """
    # Iterate directly instead of indexing with a manual counter: shorter,
    # and it no longer fails on iterables that have no len().
    for filepath in filepaths:
        yield music21.corpus.parse(filepath)

In [4]:
def part_generator(score):
    """Yield the positional index (0, 1, 2, ...) of every part in ``score``.

    Only the indices are produced, not the parts themselves; the caller
    passes each index on to the theory analyzer together with the score.

    Args:
        score: object exposing an iterable ``parts`` attribute.

    Yields:
        int: consecutive indices, one per element of ``score.parts``.
    """
    # The parts are counted once, when the generator is first advanced.
    part_count = sum(1 for _ in score.parts)
    for part_index in range(part_count):
        yield part_index

In [5]:
def build_feature_corpus(scores):
    """Build per-part text features and major/minor labels from corpus scores.

    Every part of every score contributes one sample:

    * a string of note names separated (and terminated) by a space,
    * a string of signed step sizes between consecutive notes, computed as
      ``int(interval.cents / 100)`` and likewise space separated/terminated,
    * a boolean label: True when the mode of the key reported for measure 0
      of the score is ``'major'``.

    Args:
        scores: corpus paths, forwarded to ``score_generator`` for parsing.

    Returns:
        tuple: ``(features, outcomes)`` where ``features`` is a dict with the
        keys ``'notes_freq'`` and ``'steps_freq'`` (each a list of strings,
        one per part) and ``outcomes`` is the parallel list of booleans.
    """
    analyzer = music21.alpha.theoryAnalysis.theoryAnalyzer
    notes_corpus, steps_corpus, outcomes = [], [], []

    for score in score_generator(scores):
        # Single pre-formatted string: prints exactly what the old
        # `print "Beginning new score: ", score` statement printed.
        print("Beginning new score:  %s" % (score,))

        # The key lookup takes only the score, not the part, so its result is
        # computed once per score (on the first part) and reused afterwards.
        is_major = None

        for n in part_generator(score):
            # getNotes may contain None entries (presumably rests - TODO
            # confirm); they are skipped. Identity test, not `== None`.
            notes = [note for note in analyzer.getNotes(score, n)
                     if note is not None]

            # join() replaces repeated `+=` on a growing string; each token
            # keeps its trailing space so the text is unchanged.
            notes_corpus.append("".join(note.name + " " for note in notes))

            # One step per pair of consecutive notes in the part.
            steps = []
            for previous, current in zip(notes, notes[1:]):
                interval = music21.interval.Interval(noteStart=previous,
                                                     noteEnd=current)
                steps.append(str(int(interval.cents / 100)) + ' ')
            steps_corpus.append("".join(steps))

            if is_major is None:
                is_major = (analyzer.getKeyAtMeasure(score, 0).mode == 'major')
            outcomes.append(is_major)

    features = {'notes_freq': notes_corpus, 'steps_freq': steps_corpus}
    return features, outcomes

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a single entry out of a keyed feature collection.

    Used as the first step of a sub-pipeline inside a FeatureUnion so that
    the following step only receives the data stored under ``key``.

    Args:
        key: the key to look up in the object passed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

In [108]:
def build_pipeline():
    """Create the unfitted feature vectorizer and classifier.

    Only the 'steps' branch is active: it selects ``features['steps_freq']``
    and turns it into TF-IDF weighted bigrams and trigrams. The 'notes'
    branch is kept below as a commented-out alternative.

    Returns:
        tuple: ``(feature_vectorizer, classifier)`` - a FeatureUnion and an
        ``SVC(probability=True)``, neither of them fitted.
    """
    # NOTE(review): the token pattern matches digits only, so a minus sign on
    # a step value (if steps can be negative) is not part of the token -
    # confirm that dropping the direction of a step is intended.
    steps_tfidf = TfidfVectorizer(min_df=1, analyzer='word', stop_words=None,
                                  ngram_range=(2, 3), token_pattern=r'\d\d?')
    steps_branch = Pipeline([
        ('selector', ItemSelector(key='steps_freq')),
        ('tfidf', steps_tfidf),
    ])

    transformers = [
        # Disabled notes branch (unigrams and bigrams of note names):
        # ('notes', Pipeline([
        #     ('selector', ItemSelector(key='notes_freq')),
        #     ('tfidf', TfidfVectorizer(min_df=1, analyzer='word', stop_words=None, ngram_range=(1,2), token_pattern=r'\w#?-?')),
        # ])),
        ('steps', steps_branch),
    ]
    feature_vectorizer = FeatureUnion(transformer_list=transformers)

    # Alternative classifiers that were tried:
    #     classifier = LogisticRegression()
    #     classifier = MultinomialNB()
    classifier = SVC(probability=True)

    return feature_vectorizer, classifier

In [75]:
def train_classifier(feature_vectorizer, classifier, features, outcomes):
    """Cross-validate ``classifier`` and return it fitted on the whole dataset.

    The features are vectorized once, the classifier is scored with a 5-fold
    stratified split, and it is finally refitted on every sample so that the
    returned (and later pickled) model is not just the one from the last fold.

    Args:
        feature_vectorizer: unfitted transformer exposing ``fit_transform``;
            it receives ``features`` as-is.
        classifier: unfitted estimator exposing ``fit`` and ``predict``.
        features: feature collection understood by ``feature_vectorizer``
            (here: the dict built by ``build_feature_corpus``).
        outcomes: one label per sample.

    Returns:
        tuple: ``(feature_vectorizer, classifier, precision, recall)`` - the
        fitted vectorizer, the classifier fitted on all samples, and two
        lists holding the per-fold precision and recall of the class at
        index 1 of the sorted labels (True for boolean outcomes).
    """
    cv = cross_validation.StratifiedKFold(outcomes, 5)

    # NOTE(review): the vectorizer is fitted on all samples before the folds
    # are split (``features`` is still a dict here and cannot be indexed by
    # fold), so its vocabulary/idf statistics have seen the held-out samples.
    feature_vectors = feature_vectorizer.fit_transform(features)
    outcomes = np.ravel(outcomes)

    # Pin the label order so the metric arrays always have one entry per
    # class, even if a class is missing from a fold's truth and predictions.
    class_labels = np.unique(outcomes)

    precision, recall = [], []
    for train_index, test_index in cv:
        classifier.fit(feature_vectors[train_index], outcomes[train_index])
        y_predicted = classifier.predict(feature_vectors[test_index])
        p, r, _, _ = metrics.precision_recall_fscore_support(
            outcomes[test_index], y_predicted, labels=class_labels)
        precision.append(p[1])
        recall.append(r[1])

    # After the loop the classifier only knows the last fold's training
    # split; refit on everything so the returned model uses all the data.
    classifier.fit(feature_vectors, outcomes)

    return feature_vectorizer, classifier, precision, recall

In [76]:
def show_precision_recall(precision, recall):
    """Print mean and standard deviation of the per-fold precision and recall.

    Args:
        precision: sequence of per-fold precision values.
        recall: sequence of per-fold recall values.
    """
    for label, values in (("Precision: ", precision), ("Recall: ", recall)):
        # Joining with single spaces reproduces the comma-separated print
        # statement: label, mean, '+/-', standard deviation.
        fields = [label, str(np.average(values)), '+/-', str(np.std(values))]
        print(" ".join(fields))

In [77]:
def save_trained_pipeline(feature_vectorizer, trained_classifier):
    """Combine the vectorizer and the classifier into a single Pipeline.

    Despite the name, nothing is written to disk here; the caller is
    responsible for persisting the returned object.

    Args:
        feature_vectorizer: transformer used as the 'feature_vectorizer' step.
        trained_classifier: estimator used as the final 'classifier' step.

    Returns:
        Pipeline: the two steps chained in that order.
    """
    steps = [
        ('feature_vectorizer', feature_vectorizer),
        ('classifier', trained_classifier),
    ]
    return Pipeline(steps)

In [16]:
# def predict(pipeline, validation_features):
#     """Takes notes_corpus as a list of test scores (each a string of notes)."""
    
#     predictions = pipeline.predict(validation_features)
#     outcomes = validation_features['outcomes']
#     print 'PREDICTION:', predictions
#     print ""
#     print 'ACTUAL OUTCOMES: ', outcomes
    
#     count = 0
#     for i in range(len(predictions)):
#         if predictions[i] == outcomes[i]:
#             count +=1
        
#     print '{} correct predictions out of {} sample test files'.format(count, len(outcomes))
#     print float(count)/len(outcomes) * 100

In [12]:
# ------------------------------Executable Code --------------------------------

# Collect the corpus entries of both composers, concatenate them into one
# list and build the feature/outcome datasets from it.
# build_feature_corpus parses every entry and prints one
# "Beginning new score" line per parsed score.
scores_A = music21.corpus.getComposer('ryansMammoth')
scores_B = music21.corpus.getComposer('bach')
scores = scores_A + scores_B
features, outcomes = build_feature_corpus(scores)

Beginning new score:  <music21.stream.Score 0x116180610>
Beginning new score:  <music21.stream.Score 0x11698dc10>
Beginning new score:  <music21.stream.Score 0x1164e2f10>
Beginning new score:  <music21.stream.Score 0x1169f06d0>
Beginning new score:  <music21.stream.Score 0x1169bf890>
Beginning new score:  <music21.stream.Score 0x11619ff50>
Beginning new score:  <music21.stream.Score 0x116db6e50>
Beginning new score:  <music21.stream.Score 0x1167dcf90>
Beginning new score:  <music21.stream.Score 0x116df3850>
Beginning new score:  <music21.stream.Score 0x116a08cd0>
Beginning new score:  <music21.stream.Score 0x116e68f10>
Beginning new score:  <music21.stream.Score 0x11690bed0>
Beginning new score:  <music21.stream.Score 0x1167eb6d0>
Beginning new score:  <music21.stream.Score 0x116a1ac10>
Beginning new score:  <music21.stream.Score 0x1169e4450>
Beginning new score:  <music21.stream.Score 0x116180fd0>
Beginning new score:  <music21.stream.Score 0x1169bfe90>
Beginning new score:  <music21.

In [109]:
# Build the untrained vectorizer/classifier pair, cross-validate it on the
# extracted features (the K-fold split into training and validation data
# happens inside train_classifier) and report precision vs. recall.
feature_vectorizer, classifier = build_pipeline()
(feature_vectorizer,
 trained_classifier,
 precision,
 recall) = train_classifier(feature_vectorizer, classifier, features, outcomes)
show_precision_recall(precision, recall)

Precision:  0.660544306716 +/- 0.000466156885474
Recall:  1.0 +/- 0.0


In [110]:
# Bundle the fitted vectorizer and classifier into one pipeline object and
# persist it with cPickle.
trained_pipeline = save_trained_pipeline(feature_vectorizer, trained_classifier)

# Pickle data belongs in a file opened in binary mode, and the `with` block
# closes the file even if dump() raises.
with open('pipeline_svc_steps.txt', 'wb') as pipeline_file:
    cPickle.dump(trained_pipeline, pipeline_file)

# Report success only once the file has been written and closed. The file is
# created relative to the current working directory.
print("Pickled the pipeline to /pipeline_svc_steps.txt.")

Pickled the pipeline to /pipeline_svc_steps.txt.
