# Simple Segmentation Model


In [11]:
import os
import pandas as pd
import numpy as np 
def load_chants(test_chants_file = "test-chants.csv", 
                train_chants_file = "train-chants.csv",
                test_repr_pitch_file = "test-representation-pitch.csv",
                train_repr_pitch_file = "train-representation-pitch.csv"):
    test_chants = pd.read_csv(test_chants_file, index_col='id')
    train_chants = pd.read_csv(train_chants_file, index_col='id')
    chants = pd.concat([train_chants, test_chants])
    pitch_repr_test = pd.read_csv(test_repr_pitch_file, index_col='id')
    pitch_repr_train = pd.read_csv(train_repr_pitch_file, index_col='id')
    pitch_representations = pd.concat([pitch_repr_train, pitch_repr_test])

    return chants, pitch_representations

def prepare_dataset():
    chants, pitch_repr = load_chants()
    X, y = [], []
    for segments, mode, id_pitches, id_chant in zip(pitch_repr["1-mer"], 
                                                chants['mode'], 
                                                pitch_repr.index, 
                                                chants.index):
        if not id_pitches == id_chant:
            raise ValueError("IDs of features and modes are not equal!")
        X.append(segments.replace(' ', ''))
        y.append(str(mode))

    return np.array(X), np.array(y)

In [51]:
import random
class SimpleSegmentationModel():
  def __init__(self):
    # dictionary of melody string and its counts over all documents (as integer)
    self.melody_counts = {} 
    # number of all segments, the sum over all counts 
    self.total_segments = 0 
    # dictionaryof melody strings and its hashset of chants that contains
    # this melody
    self.melody_in_chants = {}
    # total number of chants
    self.chant_count = 0


  def predict_segments(self, chants, iterations = 1000, 
                       epsilon = 0.05, mu = 5, sigma = 2, 
                       alpha=0.0001, print_each = 20):
    # Do init segmentation, generate model's dictionaries (melody_counts, ...)
    init_segmentation = self.__gaus_rand_segments(chants, mu, sigma)
    # Update chant_count
    self.chant_count = len(chants)
    chant_segmentation = init_segmentation
    for i in range(iterations):
      chant_segmentation = self.__train_iteration(chant_segmentation, epsilon, alpha)
      if i%print_each == 0:
        print("{}. Iteration".format(i))
        top25_melodies = sorted(self.melody_counts, key=self.melody_counts.get, reverse=True)[:25]
        print("\t\t\t", top25_melodies)
        #for topmel in top25_melodies:
        #  print("\t\t\t{}".format(topmel))
    return chant_segmentation
      

  def __gaus_rand_segments(self, chants, mu, sigma):
    rand_segments = []
    for chant_id, chant in enumerate(chants):
      new_chant_segments = []
      i = 0
      while i != len(chant):
        # Find new segment
        new_len = max(int(random.gauss(mu, sigma)), 1)
        k = min(i+new_len, len(chant))
        new_chant_segments.append(chant[i:k])
        last_added_segment = new_chant_segments[-1]
        # Update melody_counts
        if last_added_segment in self.melody_counts:
          self.melody_counts[last_added_segment] += 1
        else:
          self.melody_counts[last_added_segment] = 1
        # Update total_segments count
        self.total_segments += 1
        # Update melody_in_chants
        if last_added_segment in self.melody_in_chants:
          self.melody_in_chants[last_added_segment].add(chant_id)
        else:
          self.melody_in_chants[last_added_segment] = {chant_id}
        # Update i index
        i = k
      rand_segments.append(new_chant_segments)
    return rand_segments

  def __train_iteration(self, segmented_chants, epsilon, alpha):
    new_segmented_chants = []
    join_prev_melody = None
    for chant_id, segments in enumerate(segmented_chants):
      # reset melody_in_chants
      for melody in segments:
        if chant_id in self.melody_in_chants[melody]:
          self.melody_in_chants[melody].remove(chant_id)


      new_segments = []
      for melody in segments:
        self.total_segments -= 1
        self.melody_counts[melody] -= 1

        if join_prev_melody == None:
          # How many documents contains this melody
          chant_frequency = len(self.melody_in_chants[melody])/self.chant_count

          if chant_frequency > epsilon or len(melody) <= 1: 
            # Do nothing, pass the melody to the next stage for joining
            join_prev_melody = melody
          else:
            # Find the best splitting
            max_prob = 0
            left = ""
            right = ""
            for split_point in range(1, len(melody)):
              new_left = melody[:split_point]
              new_right = melody[split_point:]
              left_freq = alpha
              right_freq = alpha
              if new_left in self.melody_counts:
                left_freq += (self.melody_counts[new_left]/self.total_segments)
              if new_right in self.melody_counts:
                right_freq += (self.melody_counts[new_right]/self.total_segments)
              
              if max_prob < left_freq * right_freq:
                max_prob = left_freq * right_freq
                left = new_left
                right = new_right
            # Joining melody with the previous one
            new_segments.append(left)
            new_segments.append(right)
            # Update total_segments count
            self.total_segments += 2
            # Update melody_counts
            if left in self.melody_counts:
              self.melody_counts[left] += 1
            else:
              self.melody_counts[left] = 1
            if right in self.melody_counts:
              self.melody_counts[right] += 1
            else:
              self.melody_counts[right] = 1
        else:
          # Joining melody with the previous one
          new_segments.append(join_prev_melody + melody)
          # Update total_segments count
          self.total_segments += 1
          # Update melody_counts
          if join_prev_melody + melody in self.melody_counts:
            self.melody_counts[join_prev_melody + melody] += 1
          else:
            self.melody_counts[join_prev_melody + melody] = 1
          join_prev_melody = None
          

      # Update melody_in_chants
      for melody in new_segments:
        if melody in self.melody_in_chants:
          self.melody_in_chants[melody].add(chant_id)
        else:
          self.melody_in_chants[melody] = {chant_id}

      new_segmented_chants.append(new_segments)
    return new_segmented_chants

In [52]:
# Get Data
X, y = prepare_dataset()
# Init model
model = SimpleSegmentationModel()
# Train and Fit model
final_segmentation = model.predict_segments(X)

0. Iteration
			 ['g', 'h', 'f', 'k', 'd', 'l', 'e', 'hg', 'j', 'gh', 'kj', 'fgh', 'fed', 'gf', 'fe', 'c', 'm', 'gg', 'hgf', 'kk', 'cd', 'ff', 'ghg', 'ed', 'dd']
20. Iteration
			 ['g', 'h', 'f', 'k', 'd', 'e', 'l', 'j', 'gf', 'kj', 'gh', 'fgh', 'fed', 'hgg', 'cd', 'hgh', 'c', 'ml', 'dd', 'kjh', 'dcd', 'm', 'hh', 'hgfg', 'kl']
40. Iteration
			 ['g', 'h', 'f', 'k', 'd', 'e', 'l', 'j', 'gf', 'gh', 'kj', 'fgh', 'fed', 'hgg', 'cd', 'hgh', 'c', 'ml', 'dd', 'kjh', 'dcd', 'm', 'hgfg', 'hh', 'kl']
60. Iteration
			 ['g', 'h', 'f', 'k', 'd', 'e', 'l', 'j', 'gf', 'gh', 'kj', 'fgh', 'fed', 'hgg', 'cd', 'hgh', 'c', 'ml', 'dd', 'kjh', 'dcd', 'm', 'hgfg', 'hh', 'kl']
80. Iteration
			 ['g', 'h', 'f', 'k', 'd', 'e', 'l', 'j', 'gf', 'gh', 'kj', 'fgh', 'fed', 'hgg', 'cd', 'hgh', 'c', 'ml', 'dd', 'kjh', 'dcd', 'm', 'hgfg', 'hh', 'kl']
100. Iteration
			 ['g', 'h', 'f', 'k', 'd', 'e', 'l', 'j', 'gf', 'gh', 'kj', 'fgh', 'fed', 'hgg', 'cd', 'hgh', 'c', 'ml', 'dd', 'kjh', 'dcd', 'm', 'hgfg', 'hh', 'kl']
12

In [53]:
print(final_segmentation)

[['hgh', 'g', 'hggg', 'jkl', 'lkkkkj', 'k', 'h', 'gggjghj', 'k', 'llhjk', 'hj', 'gggh', 'k', 'kkkkk'], ['jk', 'hgg', 'g', 'k', 'khghgf', 'ghggg', 'hgfgg', 'h', 'ff', 'g', 'fhjk', 'l', 'kj', 'klk', 'h', 'kjhg', 'g', 'hgfgh'], ['hggfggfgghf', 'f', 'gfff', 'fiihggi', 'kih', 'g', 'hgffggf', 'fii', 'h', 'ghggghgf', 'gfed', 'C', 'f', 'f', 'gfgh', 'fgf', 'fhhfgh'], ['g', 'fdefggef', 'e', 'd', 'dfefg', 'dfe', 'f', 'd', 'cd', 'f', 'f', 'E', 'cf', 'd', 'dffffecd'], ['ddef', 'g', 'ge', 'fEd', 'dfefg', 'g', 'd', 'f', 'efdccdfece', 'dfff', 'ecd'], ['de', 'f', 'g', 'gefE', 'f', 'd', 'efgH', 'gdfefddccdfEccd', 'd', 'fffe'], ['cd', 'd', 'efg', 'gf', 'f', 'e', 'D', 'ddfefggdfe', 'f', 'dcdffEcfd'], ['d', 'cdffde', 'fg', 'gf', 'fedd', 'fefg', 'gd', 'f', 'efdcd', 'fefcfddf', 'ffec'], ['d', 'ddef', 'ggefEddc', 'f', 'ghgf', 'g', 'fgf', 'edc', 'eg', 'efdc', 'dd', 'f', 'ghh', 'h', 'hgfg'], ['h', 'gfdef', 'ggefed', 'dfefg', 'gdf', 'ef', 'd', 'cdfec', 'f', 'ddf', 'ffe'], ['cd', 'ddc', 'f', 'gfhhi', 'hggfgh', 'g

In [55]:
print(dict(sorted(model.melody_counts.items(), key=lambda item: item[1], reverse=True)))

{'g': 19754, 'h': 17143, 'f': 13869, 'k': 12575, 'd': 8899, 'e': 7119, 'l': 5688, 'j': 2392, 'gh': 2328, 'gf': 2165, 'kj': 1924, 'fed': 1487, 'ml': 1454, 'dd': 1396, 'fgh': 1350, 'cd': 1291, 'hgg': 1264, 'kjh': 1163, 'c': 1069, 'ff': 1053, 'hgfg': 1049, 'dcd': 1040, 'hgh': 1024, 'kl': 1021, 'kh': 1006, 'hh': 866, 'kk': 854, 'ee': 848, 'm': 833, 'gg': 783, 'll': 764, 'jk': 744, 'dc': 739, 'ghgf': 720, 'jh': 711, 'ef': 709, 'fgf': 704, 'hhh': 703, 'fghh': 701, 'df': 698, 'hggg': 698, 'fg': 663, 'de': 662, 'hgfgh': 659, 'klk': 650, 'jkl': 648, 'kkk': 645, 'i': 644, 'gfe': 639, 'efg': 635, 'fedd': 619, 'hgf': 616, 'ggh': 612, 'lkj': 611, 'ggg': 610, 'gfed': 607, 'ed': 598, 'lml': 583, 'hj': 578, 'gfg': 545, 'kjk': 541, 'n': 526, 'ddd': 525, 'gfgh': 515, 'hG': 509, 'ghg': 494, 'hghk': 492, 'fghg': 490, 'fff': 489, 'lm': 489, 'hhg': 486, 'fgfe': 481, 'lk': 477, 'kjhg': 467, 'hg': 462, 'G': 455, 'kkj': 450, 'ffe': 433, 'ghj': 431, 'ge': 428, 'hk': 428, 'hghg': 420, 'ghh': 410, 'edcd': 392, 'd