In [13]:
"""THIS TESTER IS USING LOGISTIC REGRESSION TO NOTES AND STEPS FEATURE VECTORS.

Tests feature merging using FeatureUnion and Pipelines.
    Notes frequency feature vectors use the latest trial setting: unigrams and bigrams (ngram_range = (1, 2)).
    Steps frequency feature vectors use bigrams and trigrams (ngram_range = (2, 3)).

"""

###############################################

import music21
import pandas as pd
import numpy as np
# from pandas.tools.plotting import scatter_matrix
# import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

In [2]:
def build_feature_corpus(filenames, midi_dir='MIDI test files/Cello solos/'):
    """Parse MIDI files and build the text corpora used as model features.

    For every file the score is parsed with music21 and reduced to:
      * a string of note names, each followed by a single space,
      * a string of steps between consecutive notes, each followed by a
        single space (a step is ``int(interval.cents / 100)``),
      * a boolean that is True when the key at measure 0 has mode 'major'.
        Mode changes later in the score are not taken into account.

    Args:
        filenames: iterable of MIDI file names located inside ``midi_dir``.
            Every name given is processed; any train/validation split must
            be done by the caller.
        midi_dir: directory holding the MIDI files. Defaults to the folder
            that used to be hard-coded, so existing calls behave the same.

    Returns:
        dict with three parallel lists (one entry per file, in input order)
        under the keys 'notes_freq', 'steps_freq' and 'outcomes'.
    """
    import os  # function-scope import: keeps this cell runnable on its own

    notes_corpus, steps_corpus, mode_corpus = [], [], []

    for filename in filenames:
        score = music21.converter.parse(os.path.join(midi_dir, filename))
        analyzer = music21.alpha.theoryAnalysis.theoryAnalyzer

        # Keep only real Note objects; None entries returned by getNotes are dropped.
        notes = [note for note in analyzer.getNotes(score, 0) if note is not None]
        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("Note attributes for file {} completed".format(filename))

        # One document per score: every note name followed by one space.
        notes_corpus.append("".join(note.name + " " for note in notes))

        # One document per score: the step between each pair of consecutive notes.
        steps = []
        for previous, current in zip(notes, notes[1:]):
            interval = music21.interval.Interval(noteStart=previous, noteEnd=current)
            steps.append(str(int((interval.cents) / 100)))
        steps_corpus.append("".join(step + " " for step in steps))

        # Label: True for major, False otherwise, based on measure 0 only.
        mode_at_measure_0 = analyzer.getKeyAtMeasure(score, 0).mode
        mode_corpus.append(mode_at_measure_0 == 'major')

    features = {
        'notes_freq': notes_corpus,
        'steps_freq': steps_corpus,
        'outcomes': mode_corpus,
    }
    return features

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks one entry out of a dict-like input.

    Used as the first step of each sub-pipeline so that the following
    step receives only the entry stored under ``key``.
    """

    def __init__(self, key):
        # Key looked up in the object passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

In [15]:
def build_feature_vector_and_fit_model(features):
    """Build the notes/steps TF-IDF feature union and fit a classifier on it.

    Args:
        features: dict-like holding the training data. The entries
            'notes_freq' and 'steps_freq' are selected by the two
            sub-pipelines and fed to their TfidfVectorizer; 'outcomes'
            holds the labels the classifier is fitted against.

    Returns:
        The fitted Pipeline (FeatureUnion of both TF-IDF branches followed
        by LogisticRegression).
    """
    pipeline = Pipeline([
        ('features', FeatureUnion(
            transformer_list=[
                # Notes branch: unigrams and bigrams of note tokens.
                ('notes', Pipeline([
                    ('selector', ItemSelector(key='notes_freq')),
                    ('tfidf', TfidfVectorizer(min_df=1, analyzer='word', stop_words=None, ngram_range=(1,2), token_pattern=r'\w#?-?')),
                ])),
                # Steps branch: bigrams and trigrams of step tokens.
                # NOTE(review): r'\d\d?' matches one or two digits only, so a
                # leading minus sign is not part of the token -- confirm that
                # ignoring the sign of a step is intended.
                ('steps', Pipeline([
                    ('selector', ItemSelector(key='steps_freq')),
                    ('tfidf', TfidfVectorizer(min_df=1, analyzer='word', stop_words=None, ngram_range=(2, 3), token_pattern=r'\d\d?'))
                ])),
            ])),

        ('classifier', LogisticRegression()),
        ])

    # Fit on the argument that was passed in. The previous version read the
    # notebook global `features_and_outcomes_dict`, silently ignoring `features`.
    pipeline.fit(features, np.ravel(features['outcomes']))

    return pipeline

In [25]:
def predict(pipeline, validation_features):
    """Print the pipeline's predictions next to the actual outcomes.

    Args:
        pipeline: fitted object exposing ``predict``; it is called with
            ``validation_features`` unchanged.
        validation_features: dict-like whose 'outcomes' entry holds the
            true labels, in the same order as the predictions.

    Prints the predictions, the actual outcomes, and how many predictions
    match. Returns None.
    """
    predictions = pipeline.predict(validation_features)
    outcomes = validation_features['outcomes']

    # Single-argument print(...) keeps the output identical on Python 2 and 3.
    print('PREDICTION: {}'.format(predictions))
    print("")
    print('ACTUAL OUTCOMES:  {}'.format(outcomes))

    # Count positions where the predicted label equals the actual one.
    count = sum(1 for predicted, actual in zip(predictions, outcomes) if predicted == actual)

    print('{} correct predictions out of {} sample test files'.format(count, len(outcomes)))

In [5]:
# ------------------------------Executable Code --------------------------------

# Read the MIDI file names (one per line). The `with` block closes the file,
# and blank lines / stray line endings are dropped so that no empty name is
# later handed to the MIDI parser.
with open('test_files.txt') as filenames_file:
    filenames = [line.strip() for line in filenames_file if line.strip()]

# Alternate files between the two sets: even positions train, odd positions validate.
training_files = filenames[::2]
validation_files = filenames[1::2]

In [6]:
# Build the training corpora (notes, steps and major/minor outcomes) from the
# training half of the MIDI files.
features_and_outcomes_dict = build_feature_corpus(training_files)

Note attributes for file cs1-1pre.mid completed
Note attributes for file cs1-3cou.mid completed
Note attributes for file cs1-5men.mid completed
Note attributes for file cs2-1pre.mid completed
Note attributes for file cs2-3cou.mid completed
Note attributes for file cs2-5men.mid completed
Note attributes for file cs3-1pre.mid completed
Note attributes for file cs3-3cou.mid completed
Note attributes for file cs3-5bou.mid completed
Note attributes for file cs4-1pre.mid completed
Note attributes for file cs4-3cou.mid completed
Note attributes for file cs4-5bou.mid completed
Note attributes for file cs5-1pre.mid completed
Note attributes for file cs5-3cou.mid completed
Note attributes for file cs5-5gav.mid completed
Note attributes for file cs6-1pre.mid completed
Note attributes for file cs6-3cou.mid completed
Note attributes for file cs6-5gav.mid completed


In [16]:
# Fit the feature-union + logistic-regression pipeline on the training features.
pipeline = build_feature_vector_and_fit_model(features_and_outcomes_dict)

In [17]:
# Build the same corpora (notes, steps, outcomes) for the validation half of
# the MIDI files.
validation_features = build_feature_corpus(validation_files)

Note attributes for file cs1-2all.mid completed
Note attributes for file cs1-4sar.mid completed
Note attributes for file cs1-6gig.mid completed
Note attributes for file cs2-2all.mid completed
Note attributes for file cs2-4sar.mid completed
Note attributes for file cs2-6gig.mid completed
Note attributes for file cs3-2all.mid completed
Note attributes for file cs3-4sar.mid completed
Note attributes for file cs3-6gig.mid completed
Note attributes for file cs4-2all.mid completed
Note attributes for file cs4-4sar.mid completed
Note attributes for file cs4-6gig.mid completed
Note attributes for file cs5-2all.mid completed
Note attributes for file cs5-4sar.mid completed
Note attributes for file cs5-6gig.mid completed
Note attributes for file cs6-2all.mid completed
Note attributes for file cs6-4sar.mid completed
Note attributes for file cs6-6gig.mid completed


In [26]:
# Print predictions vs. actual outcomes for the validation files, plus the
# number of correct predictions.
predict(pipeline, validation_features)

PREDICTION: [ True  True  True False False False  True  True  True False False False
 False False False  True  True  True]

ACTUAL OUTCOMES:  [True, True, True, False, False, False, True, True, True, False, True, False, False, False, False, False, True, True]
16 correct predictions out of 18 sample test files
