In [55]:
import os
import numpy 
import re
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
from itertools import islice
from random import randint


In [56]:
def filename_to_title(filename):
    """Reduce a file name to a compact, letters-only title.

    Parenthesised segments (e.g. ``(Remastered)``) are removed first, then
    everything from the first ``.`` onwards is discarded, and finally only
    characters that are alphanumeric but not numeric are kept, so digits,
    spaces and punctuation all disappear.

    Args:
        filename: File name to convert.

    Returns:
        The filtered characters joined into one string (possibly empty).
    """
    without_parens = re.sub(r'\([^)]*\)', '', filename)
    # Only the text in front of the first dot contributes to the title.
    stem = without_parens.partition('.')[0]
    return "".join(ch for ch in stem if ch.isalnum() and not ch.isnumeric())
    
def get_files(directory):
    """Collect annotation and audio files from the leaf folders under ``directory``.

    Only directories that contain no sub-directories are inspected. Within
    them, files whose names end in ``.lab``, ``.mp3`` or ``.m4a`` are recorded.

    Args:
        directory: Root folder to walk.

    Returns:
        dict mapping each leaf directory path to a list of
        ``{"filename": ..., "title": ...}`` dicts, in ``os.walk`` order.
        Directories without any matching file do not appear in the result.
    """
    # '.m4a' carries its dot like the other two extensions, so a name that
    # merely ends in the letters "m4a" is no longer mistaken for an audio file.
    wanted_extensions = ('.lab', '.mp3', '.m4a')
    files = {}
    for dirpath, dirnames, filenames in os.walk(directory):
        if dirnames:
            # Not a leaf directory: skip it.
            continue
        for filename in filenames:
            if not filename.endswith(wanted_extensions):
                continue
            files.setdefault(dirpath, []).append({
                "filename": filename,
                "title": filename_to_title(filename),
            })
    return files

def generate_song_labels(label_album_path, labels_dict):
    """Read the chord sequence of every ``.lab`` file in one album folder.

    Each non-blank line of a label file is expected to hold three
    whitespace-separated fields: onset, offset and chord label.

    Args:
        label_album_path: Directory holding the files; also the key used to
            look the album up in ``labels_dict``.
        labels_dict: Mapping from directory path to a list of dicts with
            ``"filename"`` and ``"title"`` keys (the shape ``get_files``
            returns).

    Returns:
        dict mapping each song title to its list of chord labels, in file
        order. Entries whose filename does not end in ``.lab`` are ignored.

    Raises:
        KeyError: ``label_album_path`` is not a key of ``labels_dict``.
        ValueError: An onset or offset field is not a number.
        IndexError: A non-blank line has fewer than three fields.
    """
    song_label_dict = {}
    for file in labels_dict[label_album_path]:
        if not file['filename'].endswith('.lab'):
            continue
        chords = song_label_dict[file['title']] = []
        with open(os.path.join(label_album_path, file['filename'])) as fp:
            for line in fp:
                # split() with no argument accepts spaces and tabs alike and
                # drops the line ending, so a final line without a trailing
                # newline keeps the last character of its chord label.
                tokens = line.split()
                if not tokens:
                    # Blank line (often at the end of a file): nothing to read.
                    continue
                # The timestamps are not stored, but converting them still
                # rejects malformed rows with ValueError.
                for timestamp in tokens[:2]:
                    float(timestamp)
                chords.append(tokens[2])
    return song_label_dict

def random_chunk(li, min_chunk=1, max_chunk=3):
    """Yield consecutive pieces of ``li`` whose lengths are drawn at random.

    Every piece is a list. Its requested length is a fresh
    ``randint(min_chunk, max_chunk)`` draw; the final piece may be shorter
    when the input runs out. Nothing is yielded for an empty input.

    Args:
        li: Any iterable to cut up.
        min_chunk: Smallest length that may be requested (inclusive).
        max_chunk: Largest length that may be requested (inclusive).

    Yields:
        Non-empty lists that, concatenated, reproduce ``li`` in order.
    """
    remaining = iter(li)

    def take_next():
        return list(islice(remaining, randint(min_chunk, max_chunk)))

    # iter(callable, sentinel) keeps calling take_next() until it returns an
    # empty list, i.e. until the input is exhausted.
    for piece in iter(take_next, []):
        yield piece

def load_data(data_path="/Users/jrmylee/Documents/dev/projects/mir/datasets/isophonics/beetles_annotations"):
    """Load the chord sequences of every album found under ``data_path``.

    Args:
        data_path: Root of the annotation tree. The default is the location
            this notebook was originally run against; pass another path to
            run it on a different machine or dataset copy.

    Returns:
        list with one dict per album directory (in the order ``get_files``
        reports them), each mapping song title to its list of chord labels.
    """
    albums_collection = get_files(data_path)
    return [
        generate_song_labels(album_path, albums_collection)
        for album_path in albums_collection
    ]
        
def ngrams(arr, n):
    """Return every contiguous window of ``n`` items from ``arr``, left to right.

    Windows are produced by slicing, so each one has the same type as a slice
    of ``arr``. When ``arr`` holds fewer than ``n`` items the result is empty.

    Args:
        arr: Sliceable sequence.
        n: Window length.

    Returns:
        list of slices ``arr[i:i + n]`` for each valid start index ``i``.
    """
    window_count = len(arr) - n + 1
    return [arr[start:start + n] for start in range(window_count)]

In [57]:
# Parse every album's annotation files once; the counting loop further down
# iterates over this list.
beetles_albums = load_data()

In [58]:
# model[(chord1, chord2)][chord3] holds how often chord3 followed that pair.
# Both levels are defaultdicts, so unseen pairs and unseen followers read as
# an empty mapping and 0 respectively.
model = defaultdict(lambda: defaultdict(int))

In [59]:
for album in beetles_albums:
    for song_title, song in album.items():
        # Each song is cut into random runs of 5-15 chords; trigrams are
        # counted inside a run only, never across two runs.
        for chunk in random_chunk(song, 5, 15):
            for c1, c2, c3 in ngrams(chunk, 3):
                model[(c1, c2)][c3] += 1

In [60]:
# Turn the raw follow-up counts of each chord pair into relative frequencies
# by dividing every count by the pair's total.
for w1_w2, followers in model.items():
    total_count = float(sum(followers.values()))
    for w3 in followers:
        followers[w3] /= total_count

In [61]:
# Inspect the model's entry for the chord pair ("G", "C"): a mapping from each
# chord seen after that pair to its stored value.
dict(model["G", "C"])

{'C:7': 0.013824884792626729,
 'G': 0.391705069124424,
 'Bb': 0.013824884792626729,
 'D': 0.15668202764976957,
 'F': 0.11059907834101383,
 'N': 0.013824884792626729,
 'D/5': 0.004608294930875576,
 'G/3': 0.03686635944700461,
 'A:min': 0.06451612903225806,
 'E:min/5': 0.004608294930875576,
 'D:7': 0.013824884792626729,
 'E': 0.03686635944700461,
 'D:sus4': 0.018433179723502304,
 'D:min7(4)/b7': 0.004608294930875576,
 'F:7': 0.004608294930875576,
 'C#:dim': 0.018433179723502304,
 'C/7': 0.018433179723502304,
 'F:min': 0.03686635944700461,
 'C': 0.018433179723502304,
 'A': 0.013824884792626729,
 'D/b7': 0.004608294930875576}

In [62]:
import random

# Starting chords: the first two entries seed the two-chord context.
text = ["C", "D"]
sentence_finished = False

while not sentence_finished:
    # Look the context up with .get() so an unseen chord pair does not insert
    # an empty entry into the defaultdict-based model.
    candidates = model.get(tuple(text[-2:]))
    if not candidates:
        # Dead end: the model has no continuation for this pair. Stop here;
        # otherwise nothing would ever be appended and the loop would spin
        # forever.
        break

    # Select a random probability threshold and walk the cumulative
    # distribution until it is reached.
    r = random.random()
    accumulator = .0
    for word, probability in candidates.items():
        accumulator += probability
        if accumulator >= r:
            break
    # If floating-point rounding left the cumulative sum just below r, `word`
    # is the last candidate, which is the correct pick for the top of the range.
    text.append(word)

    # The progression is complete once it returns to its starting chord.
    if text[-1] == text[0]:
        sentence_finished = True

print(' '.join([t for t in text if t]))

C D A A E B:7 A:7 E:7 A:7 A:7/5 A:7 E:min A D B:min E B:min F#:min B:min F#:min A D C


In [52]:
# Scratch check: a [-2:] slice keeps only the last two items of a list.
# Note that this rebinds `text`.
text = ["C", "D", "d"]
text[-2:]

['D', 'd']