# Building a Word2Vec model from the data

In [1]:
from konlpy.tag import Okt
import gensim
import torch
import torchvision
import numpy as np
import codecs
import os

# Project root that contains the ./data directory read by the cells below.
# The original absolute path stays the default; set the BILSTM_HOME
# environment variable to run the notebook from another location.
PROJECT_DIR = os.environ.get('BILSTM_HOME', 'C:\\Users\\korra\\Desktop\\BiLSTM')
os.chdir(PROJECT_DIR)



In [None]:
def read_data(filename):
    """Load a tab-separated file from ./data and return its rows without the header.

    Each returned row is the list of fields obtained by splitting one line on
    tab characters; the first line of the file is dropped.
    """
    path = './data/' + filename
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    rows = []
    for line in lines[1:]:
        rows.append(line.split('\t'))
    return rows

# Load the train and test splits (header row already removed by read_data).
train, test = (read_data(split_file) for split_file in ('ratings_train.txt', 'ratings_test.txt'))

# Shared Okt tagger instance; tokenize() calls its pos() method.
tagger = Okt()

def tokenize(doc):
    """Return the tagger's output for `doc` as a list of '/'-joined strings.

    Each element of tagger.pos(doc, norm=True, stem=True) is joined with '/',
    which yields entries of the form seen later in the notebook, e.g. '공포/Noun'.
    """
    tagged = tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

# Train a skip-gram (sg=1) Word2Vec model with 300-dimensional vectors on the
# tokenized training reviews (column 1 of every row).
tokens = [tokenize(row[1]) for row in train]
# Show a small sample only; printing the whole corpus floods the notebook output.
print(tokens[:3])

# Let gensim decay the learning rate linearly from alpha to min_alpha inside a
# single train() call. The previous manual loop subtracted 0.002 from a start
# of 0.025 thirty times, which made the learning rate negative from the 14th
# pass onwards.
# NOTE(review): the old loop ran 30 x model.epochs passes in total; raise
# `epochs` below if that longer schedule was intended.
model = gensim.models.Word2Vec(size=300, sg=1, alpha=0.025, min_alpha=0.0001, seed=23)
model.build_vocab(tokens)

# corpus_count is the number of sentences, so it belongs in total_examples
# (it was previously passed as total_words).
model.train(tokens, total_examples=model.corpus_count, epochs=30)

model.save('Word2Vec.model')
# Query through model.wv, consistent with the Colab cell that reloads this model.
model.wv.most_similar('공포/Noun', topn=20)

# Sentiment analysis with Bi-directional LSTM
The code below is written to run in the Google Colab environment.

In [None]:
# Mount Google Drive at /content/drive/ so later cells can read data and models
# under os.getcwd() + '/drive/My Drive/Colab Notebook/...'.
# NOTE(review): those paths only resolve when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive/')

In [1]:
## IMPORT MODELS
import codecs
import os
import time

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from konlpy.tag import Okt
from tensorflow.contrib.tensorboard.plugins import projector

# Export the trained word vectors for the TensorBoard embedding projector.
model = gensim.models.word2vec.Word2Vec.load(os.getcwd() + '/drive/My Drive/Colab Notebook/Word2Vec.model')
model.wv.most_similar('공포/Noun',topn = 20)

# Number of embedding rows exported (also used as the checkpoint's global step;
# it was previously referenced there without ever being defined).
# NOTE(review): the "- 1" leaves out the last word of index2word, exactly as the
# original code did — confirm whether that is intended.
max_size = len(model.wv.vocab) - 1
w2v = np.zeros((max_size, model.trainables.layer1_size))

# Write the word labels to the same file the projector config points at below;
# before, the labels went to ./metadata.tsv while the config referenced this path.
metadata_path = os.getcwd() + '/drive/My Drive/Colab Notebook/data/metadata.tsv'
with codecs.open(metadata_path, 'w+', encoding='utf-8') as file_metadata:
    for i, word in enumerate(model.wv.index2word[:max_size]):
        w2v[i] = model.wv[word]
        file_metadata.write(word + "\n")

sess = tf.InteractiveSession()

# Create the embedding (2D tensor) that holds the exported word vectors.
with tf.device("/cpu:0"):
    embedding = tf.Variable(w2v, trainable = False,  name = 'embedding')

tf.global_variables_initializer().run()

path = 'word2vec'
saver = tf.train.Saver()
writer = tf.summary.FileWriter(path, sess.graph)

config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.tensor_name = 'embedding'
embed.metadata_path = metadata_path

# Write the projector config next to the checkpoint so TensorBoard can pair them.
projector.visualize_embeddings(writer, config)
saver.save(sess, path + '/model.ckpt' , global_step=max_size)

# Release the event-file handle and the session once the export is on disk.
writer.close()
sess.close()

ModuleNotFoundError: No module named 'gensim'

## Make Word2Vec and BiLSTM classes

In [None]:
# make Word2Vec as a class
class Word2Vec():
    """Helpers that turn raw review text into padded word-vector batches.

    Bundles tokenization, data loading, word-vector lookup against a saved
    gensim Word2Vec model, zero padding and one-hot label encoding.
    """

    def __init__(self):
        # The Okt tagger is created on first use and then reused, instead of
        # being rebuilt for every single document.
        self._tagger = None

    def tokenize(self, doc):
        """Return the tagger output for `doc` as a list of '/'-joined strings."""
        if self._tagger is None:
            self._tagger = Okt()
        return ['/'.join(t) for t in self._tagger.pos(doc, norm=True, stem=True)]

    def read_data(self, filename):
        """Read a tab-separated file and return its rows (lists of fields) without the header line."""
        with open(filename, 'r',encoding='utf-8') as f:
            data = [line.split('\t') for line in f.read().splitlines()]
            data = data[1:]
        return data

    def word2vec_model(self, model_name):
        """Load and return the gensim Word2Vec model stored at `model_name`."""
        model = gensim.models.word2vec.Word2Vec.load(model_name)
        return model

    # Convert corpus to vectors
    def convert2vec(self, model_name, doc):
        """Map every token of every sentence in `doc` to a word vector.

        Args:
            model_name: path of a saved gensim Word2Vec model.
            doc: iterable of sentences, each an iterable of token strings.

        Returns:
            A numpy array with one entry per sentence. When all sentences have
            the same length this is a dense float array; otherwise it is a 1-D
            object array whose entries are lists of per-token vectors.
            Out-of-vocabulary tokens get a fresh uniform(-0.25, 0.25) vector.
        """
        model = gensim.models.word2vec.Word2Vec.load(model_name)
        # Use the model's own dimensionality instead of a hard-coded 300.
        dim = model.wv.vector_size

        word_vec = []
        for sent in doc:
            sub = []
            for word in sent:
                if word in model.wv.vocab:
                    sub.append(model.wv[word])
                else:
                    sub.append(np.random.uniform(-0.25, 0.25, dim))
            word_vec.append(sub)

        if len({len(sub) for sub in word_vec}) <= 1:
            return np.array(word_vec)

        # Ragged input: build the object array explicitly. Implicit ragged
        # creation via np.array() is deprecated and rejected by recent numpy.
        ragged = np.empty(len(word_vec), dtype=object)
        for i, sub in enumerate(word_vec):
            ragged[i] = sub
        return ragged

    def zeropad(self, train_batch_X, batch_size, seq_maxlen, vector_size):
        """Copy a batch of sequences into a zero array of shape (batch_size, seq_maxlen, vector_size).

        At most `batch_size` sequences are used; a shorter batch leaves the
        remaining rows as zeros instead of raising IndexError. Sequences longer
        than `seq_maxlen` are truncated and empty sequences stay all-zero.
        """
        zero_pad = np.zeros((batch_size, seq_maxlen, vector_size))
        for i, seq in enumerate(train_batch_X[:batch_size]):
            seq = np.asarray(seq)
            if seq.size == 0:
                continue
            steps = min(seq.shape[0], seq_maxlen)
            zero_pad[i, :steps, :seq.shape[1]] = seq[:steps]

        return zero_pad

    def onehot(self, data):
        """One-hot encode `data`; column k corresponds to the k-th smallest distinct value.

        Sorting the distinct values makes the value-to-column mapping
        deterministic, so separate calls (e.g. train and test labels) agree
        as long as they contain the same set of values.
        """
        index_dict = {value:index for index,value in enumerate(sorted(set(data)))}
        result = []

        for value in data:
            one_hot = np.zeros(len(index_dict))
            one_hot[index_dict[value]] = 1
            result.append(one_hot)

        return np.array(result)

In [None]:
# make Bi-LSTM class
class Bi_LSTM():
    """Bidirectional LSTM classifier assembled from TensorFlow 1.x graph ops.

    The constructor creates the forward/backward cells and the output
    projection variables; `logits`, `model_build` and `graph_build` add the
    forward pass, the loss/optimizer and the summary ops to the graph.
    """
    def __init__(self, lstm_units, num_class, keep_prob):
        """Create the two LSTM cells and the output weights.

        Args:
            lstm_units: number of units of each LSTM cell.
            num_class: width of the output layer (columns of W, length of b).
            keep_prob: passed to DropoutWrapper as `output_keep_prob` for both cells.
        """
        self.lstm_units = lstm_units
        
        # Forward cell, wrapped so dropout is applied to its outputs.
        with tf.variable_scope('forward', reuse = tf.AUTO_REUSE):
            self.lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(lstm_units, forget_bias=1.0, state_is_tuple=True)
            self.lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(self.lstm_fw_cell, output_keep_prob = keep_prob)
            
        # Backward cell, configured identically to the forward one.
        with tf.variable_scope('backward', reuse = tf.AUTO_REUSE):
            self.lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(lstm_units, forget_bias=1.0, state_is_tuple=True)
            self.lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(self.lstm_bw_cell, output_keep_prob = keep_prob)
        
        # Output projection: W is [2 * lstm_units, num_class] because the
        # forward and backward final states are concatenated in logits().
        with tf.variable_scope('Weights', reuse = tf.AUTO_REUSE):
            self.W = tf.get_variable(name="W", shape=[2 * lstm_units, num_class],
                                dtype=tf.float32, initializer = tf.contrib.layers.xavier_initializer())
            self.b = tf.get_variable(name="b", shape=[num_class], dtype=tf.float32,
                                initializer=tf.zeros_initializer())
            
            
    def logits(self, X, W, b, seq_len):
        """Run the bidirectional RNN over X and project the final states.

        Args:
            X: input tensor fed to bidirectional_dynamic_rnn as `inputs`.
            W, b: projection weights and bias; the caller passes them in
                explicitly, self.W / self.b are not read here.
            seq_len: per-example lengths passed as `sequence_length`.

        Returns:
            `matmul(final_states, W) + b`, the unnormalized class scores.
        """
        (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(self.lstm_fw_cell, self.lstm_bw_cell,dtype=tf.float32,
                                                                            inputs = X, sequence_length = seq_len)
        # Concatenate element [1] of the forward and backward final states
        # (the `h` component of TF's LSTMStateTuple(c, h)); the per-step
        # outputs output_fw / output_bw are not used.
        outputs = tf.concat([states[0][1], states[1][1]], axis=1)
        pred = tf.matmul(outputs, W) + b        
        return pred
        
    def model_build(self, logits, labels, learning_rate = 0.001):
        """Add the loss and the training op to the graph.

        Returns:
            (loss, optimizer): the mean softmax cross-entropy between `logits`
            and `labels`, and the op returned by AdamOptimizer.minimize(loss).
        """
        
        with tf.variable_scope("loss"):    
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits , labels = labels)) # Softmax loss
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) # Adam Optimizer
            
        return loss, optimizer
    
    def graph_build(self):
        """Create placeholders for loss and accuracy and register scalar summaries for them.

        The placeholders are stored as self.loss / self.acc so the caller can
        feed already-computed values. Returns the op produced by
        tf.summary.merge_all().
        """
        self.loss = tf.placeholder(tf.float32)
        self.acc = tf.placeholder(tf.float32)
        tf.summary.scalar('Loss', self.loss)
        tf.summary.scalar('Accuracy', self.acc)
        merged = tf.summary.merge_all()
        
        return merged

In [None]:
## For train data
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

W2V = Word2Vec()
train_data = W2V.read_data(os.getcwd() + "/drive/My Drive/Colab Notebook/data/ratings_train.txt")
test_data = W2V.read_data(os.getcwd() + "/drive/My Drive/Colab Notebook/data/ratings_test.txt")


def _tokenize_rows(rows):
    """Tokenize column 1 of every row and pair it with the integer label in column 2.

    Rows whose text produces no tokens are dropped. Each row is tokenized
    exactly once (the previous comprehension tokenized every row twice).

    Returns:
        (token_lists, labels): two parallel Python lists.
    """
    token_lists, labels = [], []
    for row in rows:
        row_tokens = W2V.tokenize(row[1])
        if row_tokens:
            token_lists.append(row_tokens)
            labels.append(int(row[2]))
    return token_lists, labels


# tokenize train and test data
print("="*10+"Start Tokenizing!\nPlease wait..."+"="*10)
# Tokens and labels are kept as separate lists; packing ragged [tokens, label]
# pairs into one numpy array relies on implicit object-array creation.
train_X, train_Y = _tokenize_rows(train_data)
test_X, test_Y = _tokenize_rows(test_data)

print("="*10+"Tokenize Finished!"+"="*10)

train_Y_ = W2V.onehot(train_Y)
train_X_ = W2V.convert2vec(os.getcwd() + '/drive/My Drive/Colab Notebook/Word2Vec.model',train_X)  ## import word2vec model where you have trained before
test_Y_ = W2V.onehot(test_Y)
test_X_ = W2V.convert2vec(os.getcwd() + '/drive/My Drive/Colab Notebook/Word2Vec.model',test_X)  ## import word2vec model where you have trained before

# Define basic properties
batch_size = 32
vector_size = 300
train_seq_length = [len(x) for x in train_X]
max_seqlen = max(train_seq_length) ## 95 on the original run
# Test reviews can be longer than any training review; cap their reported
# length at the width of the X placeholder.
test_seq_length = [min(len(x), max_seqlen) for x in test_X]
learning_rate = 0.001
lstm_units = 128
num_class = 2
training_epochs = 4
X = tf.placeholder(tf.float32, shape = [None, max_seqlen, vector_size], name = 'X')
Y = tf.placeholder(tf.float32, shape = [None, num_class], name = 'Y')
seq_len = tf.placeholder(tf.int32, shape = [None])
keep_prob = tf.placeholder(tf.float32, shape = None)

BiLSTM = Bi_LSTM(lstm_units, num_class, keep_prob)

with tf.variable_scope("loss", reuse = tf.AUTO_REUSE):
    logits = BiLSTM.logits(X, BiLSTM.W, BiLSTM.b, seq_len)
    loss, optimizer = BiLSTM.model_build(logits, Y, learning_rate)

prediction = tf.nn.softmax(logits)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()

# Only full batches are used; a trailing partial batch is dropped.
total_batch = int(len(train_X) / batch_size)
test_batch = int(len(test_X) / batch_size)

print("Start training!")

model_name = os.getcwd() + '/drive/My Drive/Colab Notebook/BiLSTM_model.ckpt'
saver = tf.train.Saver()

# Per-epoch metric history, filled by the training loop.
train_acc = []
train_loss = []
test_acc = []
test_loss = []


# Dropout keep probability for training steps; evaluation feeds 1.0 (no dropout).
train_keep_prob = 0.75
# Print training progress every `log_every` steps instead of on every step.
log_every = 100

with tf.Session(config = config) as sess:
    start_time = time.time()
    sess.run(init)
    train_writer = tf.summary.FileWriter(os.getcwd() + '/drive/My Drive/Colab Notebook/data/Bidirectional_LSTM', sess.graph)
    merged = BiLSTM.graph_build()

    for epoch in range(training_epochs):
        avg_acc, avg_loss = 0. , 0.
        # Shuffle through an index permutation. The inputs are no longer
        # reordered in place, so sequences, labels and lengths always stay
        # aligned (the old code shuffled X and Y but not the length list).
        order = np.random.permutation(len(train_X_))

        for step in range(total_batch):
            batch_idx = order[step*batch_size : step*batch_size+batch_size]
            # Truncate to the placeholder width and derive lengths from the
            # very sequences that are fed.
            batch_seqs = [train_X_[i][:max_seqlen] for i in batch_idx]
            train_batch_Y = train_Y_[batch_idx]
            batch_seq_length = [len(seq) for seq in batch_seqs]

            train_batch_X = W2V.zeropad(batch_seqs, batch_size, max_seqlen, vector_size)
            # One run performs the update and reports loss/accuracy for the
            # same forward pass. keep_prob must be fed here as well: the
            # placeholder has no default value.
            _, loss_, acc = sess.run(
                [optimizer, loss, accuracy],
                feed_dict={X: train_batch_X, Y: train_batch_Y,
                           seq_len: batch_seq_length, keep_prob: train_keep_prob})
            avg_loss += loss_ / total_batch
            avg_acc += acc / total_batch

            if (step + 1) % log_every == 0:
                print("epoch : {:02d} step : {:04d} loss = {:.6f} accuracy= {:.6f}".format(epoch+1, step+1, loss_, acc))

        summary = sess.run(merged, feed_dict = {BiLSTM.loss : avg_loss, BiLSTM.acc : avg_acc})
        train_writer.add_summary(summary, epoch)

        t_avg_acc, t_avg_loss = 0., 0.
        print("Test batch could take few minutes")
        for step in range(test_batch):
            start = step*batch_size
            # Test reviews may exceed max_seqlen, so truncate before padding.
            batch_seqs = [seq[:max_seqlen] for seq in test_X_[start : start+batch_size]]
            test_batch_Y = test_Y_[start : start+batch_size]
            batch_seq_length = [len(seq) for seq in batch_seqs]
            test_batch_X = W2V.zeropad(batch_seqs, batch_size, max_seqlen, vector_size)

            # Loss and accuracy from a single forward pass without dropout.
            loss2, t_acc = sess.run(
                [loss, accuracy],
                feed_dict={X: test_batch_X, Y: test_batch_Y,
                           seq_len: batch_seq_length, keep_prob : 1.0})
            t_avg_loss += loss2 / test_batch
            t_avg_acc += t_acc / test_batch

        print("<Train> Loss = {:.6f} Accuracy = {:.6f}".format(avg_loss, avg_acc))
        print("<Test> Loss = {:.6f} Accuracy = {:.6f}".format(t_avg_loss, t_avg_acc))
        train_loss.append(avg_loss)
        train_acc.append(avg_acc)
        test_loss.append(t_avg_loss)
        test_acc.append(t_avg_acc)

    # Build the history table in one step; the metric lists keep their type
    # instead of being rebound to DataFrames.
    df = pd.DataFrame(
        {"train_loss": train_loss, "train_acc": train_acc,
         "test_loss": test_loss, "test_acc": test_acc},
        columns=["train_loss", "train_acc", "test_loss", "test_acc"])
    df.to_csv(os.getcwd() + '/drive/My Drive/Colab Notebook/data/loss_accuracy.csv', sep =",", index=False)

    train_writer.close()
    duration = time.time() - start_time
    minute = int(duration / 60)
    second = int(duration) % 60
    print("{}minutes {}seconds".format(minute,second))
    save_path = saver.save(sess, model_name)

In [None]:
# For test data

test_size = len(test_X)
test_batch = int(test_size / batch_size)

# `keep_prob` must keep referring to the graph placeholder: rebinding the name
# to 1.0 (as before) left the placeholder unfed. The value 1.0 is fed below.
model_name = os.getcwd() + "/drive/My Drive/Colab Notebook/BiLSTM_model.ckpt"
saver = tf.train.Saver()


with tf.Session() as sess:
    # restore() assigns every saved variable, so no initializer run is needed.
    saver.restore(sess, model_name) # load the variables from disk.
    print("model restored")

    total_acc = 0

    for step in range(test_batch):
        start = step*batch_size
        # Truncate to the placeholder width; lengths come from the fed
        # sequences (the undefined name `seq_length` was used here before).
        batch_seqs = [seq[:max_seqlen] for seq in test_X_[start : start+batch_size]]
        test_batch_Y = test_Y_[start : start+batch_size]
        batch_seq_length = [len(seq) for seq in batch_seqs]
        test_batch_X = W2V.zeropad(batch_seqs, batch_size, max_seqlen, vector_size)

        acc = sess.run(accuracy , feed_dict={X: test_batch_X, Y: test_batch_Y,
                                             seq_len: batch_seq_length, keep_prob: 1.0})
        print("step :{0} Accuracy :{1}".format(step+1,acc))
        total_acc += acc/test_batch

    print("Total Accuracy : {}".format(total_acc))

## Music generator

In [None]:
from subprocess import Popen, PIPE
import magenta
import os

class MagentaMusic(object):
    """Thin wrapper around Magenta's melody_rnn scripts for one named model.

    Converts a directory of MIDI files to a NoteSequence TFRecord, then trains
    and samples a melody_rnn model by launching Magenta's scripts as
    subprocesses.
    """

    # Interpreter and script directory used to launch the melody_rnn scripts.
    # Class attributes so a different installation can override them without
    # editing the methods.
    python_exe = "C://Anaconda3/envs/base_3.6/python"
    melody_rnn_dir = "C://Anaconda3/envs/base_3.6/Lib/site-packages/magenta/models/melody_rnn"
    # Code page used to decode the scripts' console output.
    output_encoding = "cp949"

    def __init__(self, input_midi_dir, output_dir, model_name):
        """Remember the directories and derive the TFRecord path from `model_name`."""
        self.input_midi_dir = input_midi_dir
        self.output_dir = output_dir
        self.model_name = model_name
        self.default_notesequences_dir = "/magenta/midi_result/notesequences_{0}.tfrecord".format(self.model_name)

    def _script(self, filename):
        """Return the full path of a melody_rnn script."""
        return self.melody_rnn_dir + "/" + filename

    @staticmethod
    def _hparams_flag(batch_size, rnn_layer_sizes):
        """Build the --hparams flag; layer sizes are rendered as a space-free list literal."""
        return "--hparams=batch_size={0},rnn_layer_sizes={1}".format(
            batch_size, str(list(rnn_layer_sizes)).replace(" ", ""))

    def _run(self, param):
        """Run `param` as a subprocess and return its decoded (stdout, stderr)."""
        # The argument list is passed directly: combining a list with
        # shell=True is platform-dependent and adds nothing here.
        process = Popen(param, stdout=PIPE, stderr=PIPE)
        stdoutdata, stderrdata = process.communicate()
        # errors="replace" keeps an undecodable byte from raising and hiding the output.
        return (stdoutdata.decode(self.output_encoding, errors="replace"),
                stderrdata.decode(self.output_encoding, errors="replace"))

    def create_dataset(self):
        """Convert the MIDI files under input_midi_dir (recursively) into the TFRecord file."""
        # Import the submodule explicitly; attribute access on the bare
        # `magenta` package only works if something else already imported it.
        from magenta.scripts import convert_dir_to_note_sequences
        convert_dir_to_note_sequences.convert_directory(self.input_midi_dir, self.default_notesequences_dir, recursive=True)

    def train(self, config="attention_rnn", batch_size=64, rnn_layer_sizes=[64,64], num_training_steps=20000):
        """Launch melody_rnn_train.py and return its (stdout, stderr) as strings.

        NOTE(review): --sequence_example_file receives the NoteSequence TFRecord
        written by create_dataset(); confirm against the melody_rnn docs whether
        a melody_rnn_create_dataset step is expected in between.
        """
        param = [self.python_exe, self._script("melody_rnn_train.py"),
                 "--config="+config,
                 "--run_dir=/magenta/melody_rnn/logdir/run_"+self.model_name,
                 "--sequence_example_file="+self.default_notesequences_dir,
                 self._hparams_flag(batch_size, rnn_layer_sizes),
                 "--num_training_steps={0}".format(num_training_steps)
                 ]
        return self._run(param)

    def generate(self, config="attention_rnn", num_outputs=10, num_steps=128, batch_size=64, rnn_layer_sizes=[64,64], primer_melody=[60]):
        """Launch melody_rnn_generate.py and return its (stdout, stderr) as strings.

        Generated files are written by the script to `self.output_dir`.
        """
        param = [self.python_exe, self._script("melody_rnn_generate.py"),
                 "--config="+config,
                 "--run_dir=/magenta/melody_rnn/logdir/run_"+self.model_name,
                 "--output_dir="+self.output_dir,
                 "--num_outputs={0}".format(num_outputs),
                 "--num_steps={0}".format(num_steps),
                 self._hparams_flag(batch_size, rnn_layer_sizes),
                 "--primer_melody={0}".format(primer_melody)
                 ]
        return self._run(param)

## Train MIDI

In [None]:
# init Magenta music
# One generator per mood: "low" and "high" each have their own MIDI source
# directory, output directory and model name.
mm_high = MagentaMusic("/magenta/midi/scale/classic/high", "/magenta/melody_rnn/generated/high", "high")
mm_low = MagentaMusic("/magenta/midi/scale/classic/low", "/magenta/melody_rnn/generated/low", "low")

# Convert both MIDI collections to NoteSequence records first (low, then high) ...
for music in (mm_low, mm_high):
    music.create_dataset()

# ... then train the two melodyRNN models in the same order.
mm_low.train()
mm_high.train()

## Sentiment classification

In [None]:
sess = tf.Session()
sess.run(init)
saver.restore(sess, model_name)

# Build the argmax op once; creating it inside predict() would add a new node
# to the graph on every call.
predicted_class = tf.argmax(prediction, 1)
w2v_model_path = os.getcwd() + "/drive/My Drive/Colab Notebook/Word2Vec.model"

def predict(sentence):
    """Classify `sentence` and generate music that matches the predicted sentiment.

    Prints "Positive" and calls mm_high.generate(...) when the predicted class
    index is 1; otherwise prints "Negative" and calls mm_low.generate(...).
    """
    # Truncate to the width of the X placeholder.
    tokens = W2V.tokenize(sentence)[:max_seqlen]
    if not tokens:
        print("No tokens found in the sentence.")
        return

    # convert2vec expects a list of sentences, so wrap the single token list.
    # NOTE(review): convert2vec reloads the Word2Vec model from disk on every call.
    embedding = W2V.convert2vec(w2v_model_path, [tokens])
    # A single sentence is a batch of one (X accepts any batch size).
    zero_pad = W2V.zeropad(embedding, 1, max_seqlen, vector_size)
    # keep_prob is a placeholder without default, so it is fed 1.0 (no dropout).
    result = sess.run(predicted_class,
                      feed_dict = {X: zero_pad, seq_len: [len(tokens)], keep_prob: 1.0})
    if result[0] == 1:
        print("Positive")
        mm_high.generate(num_steps=256, primer_melody=[70])
    else:
        print("Negative")
        mm_low.generate(num_steps=64, primer_melody=[45])

# Keep classifying until an empty line is entered.
while True:
    sentence = input("Enter sentence: ")
    if sentence == '':
        break
    predict(sentence)

In [139]:
# Ad-hoc instance built with the same arguments as `mm_low` earlier in the
# notebook; used by the two manual runs in the following cells.
mm = MagentaMusic("/magenta/midi/scale/classic/low", "/magenta/melody_rnn/generated/low", "low")

In [136]:
# Manual training run with default arguments; train() returns (stdout, stderr) as strings.
# NOTE(review): this cell's execution count (136) is lower than that of the cell
# creating `mm` (139), so it ran against earlier kernel state.
mm.train()

('',

In [140]:
# Manual generation run with all default arguments; generate() returns (stdout, stderr) as strings.
mm.generate()

('',