In [1]:
import numpy as np
import pandas as pd
import gensim
from gensim.models.keyedvectors import KeyedVectors
STOP_WRODS=False

In [2]:
#training set and stopwords
train_set = pd.read_csv('labeledmusic.csv')
stop_words = pd.read_csv('stopwords.txt')
stopwords = set(stop_words['stopwords'].values)
lytrain = train_set['text'].values  
Y_train = train_set['mood'].values

In [3]:
def washstr(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [4]:
import nltk
import string
import re

def tokenizer(text):
    lower_txt = washstr(text)
    tokens = lower_txt.split(" ")
    if not STOP_WRODS:
        return tokens
    nonstop = []
    for token in tokens:
        if token not in stopwords:
            nonstop.append(token)
    return nonstop

In [5]:
X_train = []
Y = []

max_len = 0

for i in range(lytrain.shape[0]):
    X_train.append(tokenizer(lytrain[i]))
    max_len = max(max_len, len(X_train[i]))

    if Y_train[i] == "sad":
        Y.append([0,1])
    else:
        Y.append([1,0])

In [14]:
print(len(X_train), max_len)

1199 1266


In [16]:
# fill each lyric, to make it of 1087 tokens
for i in range(len(X_train)):
    while len(X_train[i]) < max_len:
        X_train[i].append(" ")

In [72]:
import itertools
words = list(itertools.chain.from_iterable(X_train))
from collections import Counter

counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

lyrics_ints = []

for each in words:
    lyrics_ints.append([vocab_to_int[word] for word in each.split()])
    

In [73]:
seq_len = 100
features = np.zeros((len(X_train), seq_len), dtype=int)
for i, row in enumerate(lyrics_ints):
    #print("[%d] %s" % (i, row))
    features[i, -len(row):] = np.array(row)[:seq_len]

ValueError: could not broadcast input array from shape (0) into shape (100)

In [74]:
split_frac = 0.8
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = Y[:split_idx], Y[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(959, 100) 
Validation set: 	(120, 100) 
Test set: 		(120, 100)


In [75]:
lstm_size = 32
lstm_layers = 1
batch_size = 110
learning_rate = 0.001

In [76]:
import tensorflow as tf
n_words = len(vocab)

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [77]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 128

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [78]:
with graph.as_default():
    # Your basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [79]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,initial_state=initial_state)


In [80]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [81]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [82]:
def get_batches(x, y, batch_size=110):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [83]:
epochs = 20

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%10==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, 'sentiment')

TypeError: list indices must be integers or slices, not tuple

In [None]:
test_acc = []
tf.reset_default_graph()
saver = tf.train.import_meta_graph('sentiment.meta')
with tf.Session(graph=graph) as sess:
    
    saver.restore(sess, "sentiment")
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))
