In [1]:
"""THIS TESTER IS USING LOGISTIC REGRESSION TO NOTES AND STEPS FEATURE VECTORS. USES CORPUS.

Tests feature merging using FeatureUnion and Pipelines.
    Note-frequency feature vectors use the latest trial setting: n-grams in the range 1-2 (unigrams and bigrams).
    Step-frequency feature vectors use bigrams and trigrams.

"""

###############################################

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

In [9]:
def build_feature_corpus(corpus):
    """Build note/step text "documents" and major-mode labels for a set of works.

    Each entry of ``corpus`` is parsed with ``music21.corpus.parse``. For every
    parsed score three values are collected:

    * a string of the note names returned by ``theoryAnalyzer.getNotes(score, 0)``
      (``None`` entries are skipped), each name followed by a single space;
    * a string of the intervals between consecutive notes, each computed as
      ``int(interval.cents / 100)`` and followed by a single space;
    * a boolean that is True when the mode of
      ``theoryAnalyzer.getKeyAtMeasure(score, 0)`` equals ``'major'``.

    Args:
        corpus: iterable of work identifiers accepted by ``music21.corpus.parse``.

    Returns:
        dict with the keys ``'notes_freq'``, ``'steps_freq'`` and ``'outcomes'``;
        each value is a list holding one entry per work, in input order.
    """
    notes_corpus, steps_corpus, mode_corpus = [], [], []

    for work in corpus:
        analyzer = music21.alpha.theoryAnalysis.theoryAnalyzer
        score = music21.corpus.parse(work)

        # Keep the Note objects themselves: they are needed below to compute
        # the interval between neighbouring notes.
        notes = [n for n in analyzer.getNotes(score, 0) if n is not None]

        # Every token keeps its trailing space (including the last one) so the
        # resulting strings are identical to what earlier runs produced.
        notes_corpus.append("".join(n.name + " " for n in notes))

        # One step per pair of consecutive notes; a score with fewer than two
        # notes yields an empty string.
        steps = []
        for prev_note, next_note in zip(notes, notes[1:]):
            interval = music21.interval.Interval(noteStart=prev_note, noteEnd=next_note)
            step = int((interval.cents)/100)
            steps.append(str(step) + ' ')
        steps_corpus.append("".join(steps))

        mode_at_measure_0 = analyzer.getKeyAtMeasure(score, 0).mode
        mode_corpus.append(mode_at_measure_0 == 'major')

    return {
        'notes_freq': notes_corpus,
        'steps_freq': steps_corpus,
        'outcomes': mode_corpus,
    }


In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a single entry from a keyed container.

    ``transform`` returns ``data_dict[key]`` for the ``key`` given at
    construction time, so each branch of a FeatureUnion can pull its own
    field out of one shared input.
    """

    def __init__(self, key):
        # Key looked up in the container handed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns this instance unchanged."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

In [3]:
def build_feature_vector_and_fit_model(features):
    """Assemble the TF-IDF feature union, fit a logistic regression, and return it.

    Two TF-IDF branches are combined with a FeatureUnion and fed to a
    LogisticRegression classifier:

    * ``'notes'``: built from ``features['notes_freq']`` with n-grams of size 1-2;
    * ``'steps'``: built from ``features['steps_freq']`` with n-grams of size 2-3.

    Args:
        features: mapping with the keys ``'notes_freq'``, ``'steps_freq'`` and
            ``'outcomes'``. The first two are read by the ItemSelector steps;
            ``'outcomes'`` supplies the training labels.

    Returns:
        The fitted Pipeline.
    """
    notes_branch = Pipeline([
        ('selector', ItemSelector(key='notes_freq')),
        ('tfidf', TfidfVectorizer(min_df=1, analyzer='word', stop_words=None,
                                  ngram_range=(1, 2), token_pattern=r'\w#?-?')),
    ])

    # NOTE(review): this token pattern matches digits only, so a leading '-'
    # in the step text is not part of the token -- confirm that is intended.
    steps_branch = Pipeline([
        ('selector', ItemSelector(key='steps_freq')),
        ('tfidf', TfidfVectorizer(min_df=1, analyzer='word', stop_words=None,
                                  ngram_range=(2, 3), token_pattern=r'\d\d?')),
    ])

    pipeline = Pipeline([
        ('features', FeatureUnion(
            transformer_list=[
                ('notes', notes_branch),
                ('steps', steps_branch),
            ])),
        ('classifier', LogisticRegression()),
    ])

    # Fit on the argument that was passed in rather than on a notebook-level
    # global, so the function gives the same result regardless of which cells
    # have been executed beforehand.
    pipeline.fit(features, np.ravel(features['outcomes']))

    return pipeline

In [16]:
def predict(pipeline, validation_features):
    """Run ``pipeline`` on the validation features and print an accuracy report.

    Args:
        pipeline: object exposing ``predict(features)``; it is called with
            ``validation_features`` unchanged.
        validation_features: mapping handed to ``pipeline.predict``; its
            ``'outcomes'`` entry holds the expected label for each sample.

    Prints the predictions, the expected outcomes, the number of matching
    predictions and the accuracy as a percentage. Returns None.
    """
    predictions = pipeline.predict(validation_features)
    outcomes = validation_features['outcomes']

    # Single-argument print(...) calls produce the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print('PREDICTION: {}'.format(predictions))
    print("")
    print('ACTUAL OUTCOMES:  {}'.format(outcomes))

    count = sum(1 for predicted, actual in zip(predictions, outcomes)
                if predicted == actual)
    total = len(outcomes)

    print('{} correct predictions out of {} sample test files'.format(count, total))
    # An empty validation set would otherwise divide by zero; report 0.0 instead.
    print(float(count) / total * 100 if total else 0.0)

In [7]:
# ------------------------------Executable Code --------------------------------

# Look up the 'ryansMammoth' entries in the music21 corpus and split them into
# a training set and a validation set.
scores = music21.corpus.getComposer('ryansMammoth')

# Alternate entries: even indices go to training, odd indices to validation.
training_files = scores[::2]
validation_files = scores[1::2]

In [10]:
# Build the training features and outcomes from the training half of the corpus.
features_and_outcomes_dict = build_feature_corpus(training_files)

In [13]:
# Build the pipeline and fit it on the training features.
pipeline = build_feature_vector_and_fit_model(features_and_outcomes_dict)

In [14]:
# Build the features and outcomes for the validation half of the corpus.
validation_features = build_feature_corpus(validation_files)

In [17]:
# Print the predictions and the accuracy on the validation set.
predict(pipeline, validation_features)

PREDICTION: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  T