<a href="https://colab.research.google.com/github/ktaniguc/RNNSongGenerator/blob/main/predict_LstmSongGen_pytorch_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 使い方
- 正しくマウント
  ```python
  from google.colab import drive
  drive.mount('/content/drive/')
  !cd "/content/drive/MyDrive/"
  ```
- 数セル下の以下の部分を設定
  ```python
  #IN/OUT/出力結果/ハイパーパラメータの名前を設定
  DIR = "/content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/" #モデルやら生成物やらの出力先
  model_path = DIR + "output_model/batch20_bptt140_hid512_emb512_epoch49.pt" #読み込むモデル
  BPM="120"
  OUT_NUM = 20 #出力midi 数
  INPUT_PHRASE_NUM = 30 #予測時の入力note 数
  MEL_LENGTH = 700 #大体700でBPM=120のとき、30秒くらい
  ```
- あとは実行すると、predict_mid というフォルダに結果が入っています。

### 更新点
- 入力文字の数を指定できるように修正(もとは1note で固定)
- 「予測結果のうちどれを選ぶか」の際に重み付きランダムサンプリングを選択した場合、上位何位までから選ぶかの限定ができるよう変更(明らかに突拍子のないものが出ないように)

### memo
- 出力するnote の長さ(MEL_LENGTH)は、MEL_LENGTH=700 のとき大体BPM=120で30秒くらいの感覚
  - note のdelta_time =0 が多ければもっと短くなる

## setup

In [11]:
# Install the third-party packages this notebook needs (Colab shell magic).
# The commented-out lines are optional installs that are currently disabled.
#!pip install tqdm
!pip install mido keras torch
#!pip install -U torchtext



In [12]:
# Mount Google Drive so the model and the output folders under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive/')
# NOTE(review): `!cd` runs in a throw-away subshell, so it presumably does not
# change the notebook's working directory; the cells below use absolute paths
# built from DIR, so nothing visible here depends on it.
!cd "/content/drive/MyDrive/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [13]:
# Input / output locations and generation settings
DIR = "/content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/" # root folder for the model and everything that is generated
model_path = DIR + "output_model/batch20_bptt140_hid512_emb512_epoch49.pt" # checkpoint to load
BPM="120" # kept as a string; converted with int(BPM) when the MIDI tempo is set
OUT_NUM = 20 # number of MIDI files to generate
INPUT_PHRASE_NUM = 30 # number of notes fed to the model as the starting phrase
MEL_LENGTH = 700 # total number of notes per melody (the usage notes say ~30 s at BPM=120)

## From here on: parameters fixed by the training configuration
augment=12 # data augmentation: how far notes are transposed; +1 is one semitone, so this is +0..+11
argstep=1 # transposition step
# Data locations: training text (input) and prediction results (output)
traintext_path = DIR + "train_txt/" # training MIDI files converted to text
predtext_path = DIR + "predict_txt/"
predmid_path = DIR + "predict_mid/"
# Create the folder that stores predictions as text
!mkdir -p {predtext_path} #予測した結果をtxt で保管
# Create the folder that stores the predictions converted to MIDI
!mkdir -p {predmid_path} #予測結果txt をmidi に変換して保管

BATCH_SIZE=20 # mini-batch size
BPTT_LEN = 140 # length of each text / target chunk
EMBEDDING_DIM = 512 # embedding layer width
HIDDEN_DIM = 512 # hidden layer width
DROPOUT = 0.5

## main

In [14]:
#https://qiita.com/ysit/items/a601cb59523cc1961556
#鬼のimport祭り
import mido
from mido import Message, MidiFile
from pathlib import Path
import sys, os
import random
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())
import torch.nn as nn
import torch.nn.utils.rnn as rnn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from tensorflow import keras
import numpy as np
import mido
from mido import Message, MidiFile, MidiTrack, MetaMessage
import sys
import matplotlib.pyplot as plt
import math
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

1.9.0+cu102 True


main.py

In [15]:
# Object that holds the vocabulary dictionaries
class EncoderDecoder(object):
    """Bidirectional token <-> integer id vocabulary.

    Ids are handed out densely, starting at 0, in the order tokens are first
    seen by ``make_dict``. Id 0 is therefore an ordinary token, not padding.
    """

    def __init__(self):
        # token -> id
        self.w2i = {}
        # id -> token
        self.i2w = {}

    def __call__(self, sentence, maxlength=4000):
        """Shorthand for ``transform(sentence, maxlength)``."""
        return self.transform(sentence, maxlength)

    def make_dict(self, sentences):
        """Register every not-yet-known token of ``sentences`` under the next free id."""
        for sentence in sentences:
            if sentence not in self.w2i:
                new_id = len(self.w2i)
                self.w2i[sentence] = new_id
                self.i2w[new_id] = sentence

    def transform(self, sentences, maxlength=4000):
        """Convert an iterable of tokens into a list of ids.

        ``maxlength`` is accepted for backward compatibility and is not used.
        Raises ``KeyError`` for a token that was never passed to ``make_dict``.
        """
        return [self.w2i[sentence] for sentence in sentences]

    def decode(self, sentence):
        """Convert an iterable of ids back into a list of tokens.

        Every id is decoded, including 0: ``make_dict`` assigns id 0 to the
        first real token, so filtering it out would silently drop that token
        from the decoded sequence.
        Raises ``KeyError`` for an unknown id.
        """
        return [self.i2w[token_id] for token_id in sentence]

In [16]:
#データの読み込み、準備
#print(torch.__version__)
#chunk の辞書化

class MyDataset(object):
  """Serves a 1-D tensor as chunks shaped (time steps, batch size)."""

  def __init__(self, data, batch_size, bptt_len):
    # Keep only the leading elements that split evenly into batch_size rows.
    usable = (data.size(0) // batch_size) * batch_size
    rows = data.narrow(0, 0, usable).view(batch_size, -1)
    # Transpose so that axis 0 walks through the sequence and axis 1 is the batch.
    self.data = rows.t().contiguous()
    self.batch_size = batch_size
    self.bptt_len = bptt_len

  @classmethod
  def splits(cls, datasets, batch_size, bptt_len):
    """Wrap every tensor in ``datasets`` and return the wrappers as a tuple."""
    return tuple(cls(one, batch_size, bptt_len) for one in datasets)

  def __len__(self):
    """Number of chunks ``__iter__`` yields (the last row only serves as a target)."""
    steps = len(self.data) - 1
    return math.ceil(steps / self.bptt_len)

  def __iter__(self):
    """Yield (text, target) pairs; target is text shifted forward by one step."""
    last = len(self.data) - 1
    for begin in range(0, last, self.bptt_len):
      span = min(self.bptt_len, last - begin)
      end = begin + span
      yield self.data[begin:end], self.data[begin + 1:end + 1]

# Data preparation: read every training text file and build the vocabulary.
# NOTE(review): ids are assigned in first-seen order, so they depend on the
# order in which glob yields the files -- presumably this has to match the
# order used when the checkpoint was trained; confirm before sorting/changing.
inputTextList = Path(traintext_path).glob('**/*.txt')
mididata = []
maxlength = 0
for inputText in inputTextList:
  # The context manager closes the handle even if reading fails.
  with open(inputText, 'r') as f:
    texts = f.read()
  text = texts.split(",")
  # Drop the element after the last comma (unconditionally, as before, so the
  # resulting vocabulary is unchanged).
  text.pop(-1)
  # extend() appends in place instead of copying the whole list for every file.
  mididata.extend(text)
  # Longest single file, in tokens.
  maxlength = max(maxlength, len(text))
encTool = EncoderDecoder()
encTool.make_dict(mididata)
data_id = encTool(mididata, maxlength=maxlength)

In [17]:
# Network definition
class RNNLM(nn.Module):
    """Embedding -> LSTM -> LSTM -> linear language model with dropout in between.

    The output projection shares its weight matrix with the embedding, which
    only works when ``emb_dim == h_dim``.
    """

    def __init__(self, vocab_size, emb_dim, h_dim, dropout):
        '''
        vocab_size: number of distinct tokens
        emb_dim: dimension of the embedding vectors
        h_dim: dimension of the hidden layers
        dropout: dropout probability

        Raises ValueError when emb_dim != h_dim (see the weight sharing below).
        '''
        super(RNNLM, self).__init__()
        if emb_dim != h_dim:
            # The tied output weight has shape (vocab_size, emb_dim) but is
            # applied to h_dim-wide LSTM output; fail here with a clear message
            # instead of with a shape error inside forward().
            raise ValueError(
                "weight sharing requires emb_dim == h_dim "
                "(got emb_dim={}, h_dim={})".format(emb_dim, h_dim))
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.drop1 = nn.Dropout(dropout)
        self.lstm1 = nn.LSTM(emb_dim, h_dim)
        self.drop2 = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(h_dim, h_dim)
        self.drop3 = nn.Dropout(dropout)
        self.linear = nn.Linear(h_dim, vocab_size)

        # Weight initialisation
        # (other schemes exist: https://pytorch.org/docs/stable/nn.init.html)
        nn.init.normal_(self.embed.weight, std=0.01)
        nn.init.normal_(self.lstm1.weight_ih_l0, std=1/math.sqrt(emb_dim))
        nn.init.normal_(self.lstm1.weight_hh_l0, std=1/math.sqrt(h_dim))
        nn.init.zeros_(self.lstm1.bias_ih_l0)
        nn.init.zeros_(self.lstm1.bias_hh_l0)
        # lstm2 takes lstm1's h_dim-wide output, so its input weights scale with h_dim
        nn.init.normal_(self.lstm2.weight_ih_l0, std=1/math.sqrt(h_dim))
        nn.init.normal_(self.lstm2.weight_hh_l0, std=1/math.sqrt(h_dim))
        nn.init.zeros_(self.lstm2.bias_ih_l0)
        nn.init.zeros_(self.lstm2.bias_hh_l0)
        self.linear.weight = self.embed.weight  # weight sharing
        nn.init.zeros_(self.linear.bias)

    def forward(self, sentence, hidden1_prev, hidden2_prev):
        '''
        sentence: tensor of token ids
        hidden1_prev / hidden2_prev: previous state of lstm1 / lstm2 (None starts fresh)

        Returns (out, hidden1_next, hidden2_next); out holds one score per
        vocabulary entry for every input position.
        '''
        emb = self.embed(sentence)
        emb = self.drop1(emb)
        lstm1_out, hidden1_next = self.lstm1(emb, hidden1_prev)
        lstm1_out = self.drop2(lstm1_out)
        lstm2_out, hidden2_next = self.lstm2(lstm1_out, hidden2_prev)
        lstm2_out = self.drop3(lstm2_out)
        out = self.linear(lstm2_out)
        # hidden*_next: tuple holding the hidden state and the memory-cell state
        return out, hidden1_next, hidden2_next

In [18]:
#DS = os.sep
#bs = os.path.dirname(__file__) + DS
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
VOCAB_SIZE = len(encTool.i2w)
# Build the network and restore the trained weights
print('read model...')
model = RNNLM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, DROPOUT).to(device)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
modelname = torch.load(model_path, map_location=device)
# strict=False tolerates key mismatches; report them so a wrong checkpoint
# does not pass unnoticed.
load_result = model.load_state_dict(modelname, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
  print('warning: checkpoint keys do not match the model:',
        'missing =', list(load_result.missing_keys),
        'unexpected =', list(load_result.unexpected_keys))

def _sample_index(probs, scale=1.0):
  """Draw one index from the weights in `probs` (list of floats summing to about `scale`)."""
  threshold = random.random() * scale
  p_sum = 0
  for idx, p in enumerate(probs):
    p_sum += p
    if threshold < p_sum:
      return idx
  # Rounding can leave the cumulative sum just below the threshold;
  # fall back to the most probable entry instead of leaving the result undefined.
  return max(range(len(probs)), key=probs.__getitem__)


def make_melody(model, start_ids, length=400, skip_ids=None, prob=True, seed=2021, batch_size=1, top=None):
  '''
  Generate a sequence of note ids by repeatedly feeding the model its own output.

  args:
  model: network called as model(input, hidden1, hidden2) -> (output, hidden1, hidden2)
  start_ids: ids of the notes used as the first input (the whole phrase is fed in one step)
  length: total length of the returned list, start_ids included
  skip_ids: ids that must never be generated (None = no restriction)
  prob: True = weighted random sampling, False = always take the most probable id
  seed: not used at present; accepted for backward compatibility
  batch_size: not used at present; accepted for backward compatibility
  top: when sampling, only the `top` most probable ids are candidates (None = whole vocabulary)

  returns:
  list of ids that starts with start_ids (a copy; start_ids itself is not modified)

  raises:
  ValueError when sampling is requested but no candidate with non-zero probability is left
  '''
  word_ids = list(start_ids)
  skip = set(skip_ids) if skip_ids is not None else set()
  # Build inputs on the device the model actually lives on.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')
  model.eval()  # turn BatchNorm / dropout off
  # no gradients are needed for generation
  with torch.no_grad():
    input_id = start_ids
    hidden1, hidden2 = None, None
    while len(word_ids) < length:
      step_input = torch.tensor(input_id, dtype=torch.long,
                                device=model_device).view(1, -1).t().contiguous()
      output, hidden1, hidden2 = model(step_input, hidden1, hidden2)
      # output is (sequence length, batch size=1, vocabulary size); use the last step
      p_list = F.softmax(output[-1].flatten(), dim=0)
      if prob:
        # weighted random sampling
        if top is not None:
          ranked = p_list.sort(descending=True)  # sort once, reuse values and indices
          top_p = ranked.values[:top]
          cand_ids = ranked.indices[:top].tolist()
          probs = (top_p / top_p.sum()).tolist()
        else:
          probs = p_list.tolist()
          cand_ids = list(range(len(probs)))
        if skip:
          # Drop forbidden ids up front; sampling from the remaining weights
          # gives the same distribution as redrawing until an allowed id comes up,
          # without the risk of looping forever.
          kept = [(cid, p) for cid, p in zip(cand_ids, probs) if cid not in skip]
          cand_ids = [cid for cid, _ in kept]
          probs = [p for _, p in kept]
        total = sum(probs)
        if not probs or not total > 0:
          raise ValueError("no candidate id left to sample from (check skip_ids / top)")
        # Without skip_ids the weights already sum to ~1, so use them unscaled.
        sampled = cand_ids[_sample_index(probs, total if skip else 1.0)]
      else:
        # deterministic choice: most probable allowed id, looked up in
        # vocabulary order so the result is a real id (top only limits sampling)
        if skip:
          p_list[list(skip)] = 0
        sampled = p_list.argmax().item()

      word_ids.append(sampled)
      input_id = sampled
  return word_ids

print("start generating melody....")
for i_out in range(0, OUT_NUM):
  # Start every melody from a random INPUT_PHRASE_NUM-note phrase of the training data.
  start_id = random.randint(0, len(data_id)-INPUT_PHRASE_NUM)
  start_ids = data_id[start_id:start_id + INPUT_PHRASE_NUM]
  word_ids = make_melody(model, start_ids, length=MEL_LENGTH, batch_size = BATCH_SIZE, prob=True, top=100)
  text = ','.join(encTool.decode(word_ids))
  resultTextName = predtext_path + "out{}_batch{}_embed{}.txt".format(i_out, BATCH_SIZE, EMBEDDING_DIM)
  # The context manager flushes and closes the file before the next iteration.
  with open(resultTextName, 'w+', encoding='utf-8') as file:
    file.write(text)


read model...
start generating melody....


chunk2midi.py 

In [19]:
def num2note_on_off(num):
  """Return "note_on" when the decimal string `num` equals 1, otherwise "note_off"."""
  return "note_on" if int(num, 10) == 1 else "note_off"

#args = sys.argv 
# Convert every text file under predtext_path (not only the ones written in
# this run) into a MIDI file of the same name under predmid_path.
resultTextList = Path(predtext_path).glob('**/*.txt')
for resultTextName in tqdm(resultTextList):
  print("converting : ", resultTextName)
  # Read with the encoding the generation step wrote with; close the handle right away.
  with open(resultTextName, 'r', encoding='utf-8') as f:
    data = f.read()
  data_per_sound = data.split(",")
  textName = os.path.split(resultTextName)[1].replace(".txt", "")
  outputMidName = predmid_path + textName + ".mid"
  # token format: type(note_on=1, note_off=0)_note_velocity_time
  mid = MidiFile()
  track = MidiTrack()
  mid.tracks.append(track)
  bpm = int(BPM)
  track.append(MetaMessage('set_tempo', tempo=mido.bpm2tempo(bpm)))
  for i_data in data_per_sound:
    parts = i_data.split("_")
    if len(parts) != 4:
      continue
    try:
      note = int(parts[1], 10)
      velocity = int(parts[2], 10)
      delta = int(parts[3], 10)
      if int(parts[0], 10) > 1:
        continue
    except ValueError:
      # empty or non-numeric field: skip the token like other malformed ones
      continue
    # Skip values a MIDI message cannot carry, so that one bad token does not
    # abort the whole conversion.
    if not 0 <= note <= 127 or not 0 <= velocity <= 127 or delta < 0:
      continue
    track.append(Message(num2note_on_off(parts[0]), note=note, velocity=velocity, time=delta))

  mid.save(outputMidName)


6it [00:00, 52.42it/s]

converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out0_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out1_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out2_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out3_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out4_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out5_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out6_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out7_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/

18it [00:00, 47.93it/s]

converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out9_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out10_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out11_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out12_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out13_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out14_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out15_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predict_txt/out16_batch20_embed512.txt
converting :  /content/drive/MyDrive/Colab_Notebooks/dev_Aug2021_pytorch_v5/predi

20it [00:00, 47.04it/s]
