In [135]:
import os
import pandas as pd
from mido import MidiFile
from collections import Counter
import numpy as np

def get_all_ngrams(items, upper_bound=7):
    ngrams = []
    for n in range(1, min(upper_bound, len(items)) + 1):
        ngrams.extend(get_ngrams(items, n))
    return ngrams

def get_ngrams(items, n=1):
    ngrams = []
    for i in range(len(items) - n + 1):
        ngrams.append(items[i:i + n])
    return ngrams

def get_intervals(notes):
    return (np.roll(notes, -1) - notes)[:-1].tolist()

def get_melody(path):
    melody = []
    mid = MidiFile(path)
    for track in mid.tracks:
        for msg in track:
            if msg.type == 'note_on':
                if msg.note != 0:
                    melody.append(msg.note)
    return melody

def find_melodies(rootdir):
    melodies = {}
    for subdir, dirs, files in os.walk(rootdir):
        files = [file for file in files if file.endswith('.mid')]
        if len(files) > 0:
            for file in files:
                path = os.path.join(subdir, file)
                melodies[path] = get_melody(path)
    return melodies

def calculate_entropy(events):
    probs = Counter(events)
    num_events = len(events)
    for event in probs:
        probs[event] /= num_events
    p = np.array(list(probs.values()))
    return -np.dot(p, np.log2(p))

def encode_list(l):
    return ','.join([str(s) for s in l])

def decode_list(s):
    return [int(x) for x in s.split(',')]
            
melodies = find_melodies('data/')
note_seqs = list(melodies.values())
interval_seqs = [get_intervals(item) for item in note_seqs]

In [89]:
ngrams = get_ngrams(interval_seqs[0], 2)
ngrams_encoded = [encode_list(ngram) for ngram in ngrams]
entropy = calculate_entropy(ngrams_encoded)
print(entropy)

4.52756901109


In [97]:
print(get_all_ngrams(interval_seqs[0]))

[[2], [2], [-4], [4], [5], [-2], [0], [2], [2], [1], [-1], [-2], [-2], [0], [5], [-8], [5], [0], [0], [0], [-4], [2], [2], [-2], [0], [-3], [-4], [7], [-3], [-2], [-2], [7], [5], [-8], [5], [0], [0], [0], [-4], [2], [2], [-2], [0], [-3], [-4], [7], [-3], [-2], [-2], [2, 2], [2, -4], [-4, 4], [4, 5], [5, -2], [-2, 0], [0, 2], [2, 2], [2, 1], [1, -1], [-1, -2], [-2, -2], [-2, 0], [0, 5], [5, -8], [-8, 5], [5, 0], [0, 0], [0, 0], [0, -4], [-4, 2], [2, 2], [2, -2], [-2, 0], [0, -3], [-3, -4], [-4, 7], [7, -3], [-3, -2], [-2, -2], [-2, 7], [7, 5], [5, -8], [-8, 5], [5, 0], [0, 0], [0, 0], [0, -4], [-4, 2], [2, 2], [2, -2], [-2, 0], [0, -3], [-3, -4], [-4, 7], [7, -3], [-3, -2], [-2, -2], [2, 2, -4], [2, -4, 4], [-4, 4, 5], [4, 5, -2], [5, -2, 0], [-2, 0, 2], [0, 2, 2], [2, 2, 1], [2, 1, -1], [1, -1, -2], [-1, -2, -2], [-2, -2, 0], [-2, 0, 5], [0, 5, -8], [5, -8, 5], [-8, 5, 0], [5, 0, 0], [0, 0, 0], [0, 0, -4], [0, -4, 2], [-4, 2, 2], [2, 2, -2], [2, -2, 0], [-2, 0, -3], [0, -3, -4], [-3, -

In [248]:
def encoder(l):
    return [encode_list(item) for item in l]

def calculate_ngram_counts(interval_corpus, upper_bound):
    counts = {}
    for n in range(1, upper_bound + 1):
        events = []
        for sequence in interval_corpus:
            ngrams = get_ngrams(sequence, n)
            events.extend(ngrams)
        events = encoder(events)
        subcounts = Counter(events)
        counts.update(subcounts)
    return counts

def get_conditional_probability(interval, conditionals, counts):
    total_sum = 0
    numerator = 0
    for key in counts.keys():
        event = decode_list(key)
        match = [interval] + conditionals
        if len(event) == 1 + len(conditionals):
            if len(conditionals) > 0:
                tail = event[-len(conditionals):]
                if tail != conditionals:
                    continue
            total_sum += counts[key]
        if event[0] == interval:
            numerator = counts[key]
    N = 15**(1 + len(conditionals))
    return float(numerator + 1) / float(total_sum + N)

def calculate_melody_logprobability(melody, interval_corpus, model=2):
    counts = calculate_ngram_counts(interval_corpus, model)
    probs = []
    for i in range(len(melody)):
        interval = melody[i]
        conditionals = melody[i + 1:i + model]
        p = get_conditional_probability(interval, conditionals, counts)
        probs += [np.log(p)]
    return np.sum(probs)

def calculate_entropy(logprob):
    return -logprob * np.exp(logprob)

import random
fake = [random.randint(60, 75) for _ in range(24)]
fake = [65, 62, 64, 64, 60, 69, 65, 64, 60, 65, 69, 60, 65, 69, 62, 60, 62, 62, 69, 65, 65, 65, 65, 60]
jacques = [60, 62, 64, 60, 60, 62, 64, 60, 64, 65, 67, 64, 65, 67, 67, 69, 67, 65, 64, 60, 67, 69, 67, 65]
farm = [65, 65, 65, 60, 62, 62, 60, 69, 69, 67, 67, 65, 60, 65, 65, 65, 60, 62, 62, 60, 69, 69, 67, 67]
jacques = get_intervals(jacques)
farm = get_intervals(farm)
fake = get_intervals(fake)

model = 6
p_jacques = calculate_melody_logprobability(jacques, interval_seqs, model)
p_farm = calculate_melody_logprobability(farm, interval_seqs, model)
p_fake = calculate_melody_logprobability(fake, interval_seqs, model)

e_jacques = calculate_entropy(p_jacques)
e_farm = calculate_entropy(p_farm)

print(p_jacques)
print(p_farm)
print()

print(e_jacques)
print(e_farm)
print()

print(e_jacques > e_farm)

-300.700797118
-305.109787782

7.68135235084e-129
9.48328781324e-131

True
