In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import utils as utl
from collections import Counter

In [2]:
# Read the gzip-compressed CSV of StockTwits messages; the first column of the
# file becomes the DataFrame index.
data = pd.read_csv(
    "data/StockTwits_SPY_Sentiment_2017.gz",
    index_col=0,
    compression="gzip",
    encoding="utf-8",
)

In [3]:
# Pull the raw text and its sentiment tag out of the frame as plain arrays.
messages, labels = data["message"].values, data["sentiment"].values

In [4]:
# Eyeball the first ten message/sentiment pairs before any preprocessing.
for i in range(10):
    print(
        "Messages: {}...".format(messages[i]),
        "Sentiment: {}".format(labels[i]),
    )

Messages: $SPY crazy day so far!... Sentiment: bearish
Messages: $SPY Will make a new ATH this week. Watch it!... Sentiment: bullish
Messages: $SPY $DJIA white elephant in room is $AAPL. Up 14% since election. Strong headwinds w/Trump trade & Strong dollar. How many 7's do you see?... Sentiment: bearish
Messages: $SPY blocks above. We break above them We should push to double top... Sentiment: bullish
Messages: $SPY Nothing happening in the market today, guess I'll go to the store and spend some $.... Sentiment: bearish
Messages: $SPY What an easy call. Good jobs report: good economy, markets go up.  Bad jobs report: no more rate hikes, markets go up.  Win-win.... Sentiment: bullish
Messages: $SPY BS market.... Sentiment: bullish
Messages: $SPY this rally all the cheerleaders were screaming about this morning is pretty weak. I keep adding 2 my short at all spikes... Sentiment: bearish
Messages: $SPY Dollar ripping higher!... Sentiment: bearish
Messages: $SPY no reason to go down !... S

In [5]:
# Apply the project's StockTwits preprocessing helper to every message and
# collect the results back into an array.
messages = np.array(list(map(utl.preprocess_ST_message, messages)))

In [6]:
# Report how many messages there are, then build the two lookup tables from
# every whitespace-separated token in the corpus.
print(len(messages))
full_lexicon = " ".join(messages).split()
vocab_to_int, int_to_vocab = utl.create_lookup_tables(full_lexicon)
print(len(vocab_to_int), len(int_to_vocab))

96967
31980 31980


In [7]:

# Histogram of message lengths (len() of each preprocessed message), followed
# by three summary figures: empty messages, longest message, mean length.
messages_lens = Counter(map(len, messages))
print("Zero-length messages: {}".format(messages_lens[0]))
print("Maximum message length: {}".format(max(messages_lens.keys())))
print("Average message length: {}".format(np.mean(list(map(len, messages)))))

Zero-length messages: 1
Maximum message length: 244
Average message length: 78.21856920395598


In [8]:
# Remove empty messages (one was counted in the length summary above) together
# with their labels, so the two arrays stay aligned.
# NOTE(review): alignment is presumed from the helper taking and returning both
# arrays — confirm against utils.drop_empty_messages.
messages, labels = utl.drop_empty_messages(messages, labels)

In [9]:
# Encode the messages with the vocab_to_int table (the printed example is a
# list of integer ids) and encode the sentiment labels with the project helper.
messages = utl.encode_ST_messages(messages, vocab_to_int)
labels = utl.encode_ST_labels(labels)
# Spot-check: number of encoded messages and one arbitrary example (index 123).
print(len(messages), messages[123])

96966 [1, 291, 92, 16, 784, 12, 213, 107, 63, 681, 257, 1, 1, 1, 1, 1, 6732, 79, 478, 12, 592, 14031]


In [14]:
# Pad every encoded message to a fixed length of 244 with the project helper
# (the printed example shows zeros filled in at the front).
# NOTE(review): 244 equals the "Maximum message length" reported earlier, which
# was measured with len() on the preprocessed strings, not on the encoded id
# lists — confirm this is the intended sequence length.
messages = utl.zero_pad_messages(messages, seq_len=244)
# Spot-check the same example (index 123) after padding.
print(len(messages), messages[123])

96966 [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0   

In [15]:
# Largest id found in each padded message.
maximums = list(map(max, messages))

In [16]:
# Largest id in the whole dataset; ids are later used as row indices into the
# embedding matrix, so this must stay below its number of rows.
print(max(maximums))

31980


In [17]:

# Partition the padded messages and their labels into train / validation /
# test subsets; split_frac is handed straight to the project helper.
(train_x, val_x, test_x,
 train_y, val_y, test_y) = utl.train_val_test_split(messages, labels, split_frac=0.80)

# Report the shape of each feature matrix.
print("Data Set Size")
print(
    "Train set: \t\t{}".format(train_x.shape),
    "\nValidation set: \t{}".format(val_x.shape),
    "\nTest set: \t\t{}".format(test_x.shape),
)

Data Set Size
Train set: 		(77572, 244) 
Validation set: 	(9697, 244) 
Test set: 		(9697, 244)


In [18]:

def model_inputs():
    """
    Declare the placeholders that are fed when the graph is run.

    Returns:
        A 3-tuple ``(inputs_, labels_, keep_prob_)``:
        an int32 placeholder named 'inputs' with two unspecified dimensions,
        an int32 placeholder named 'labels' with two unspecified dimensions,
        and a float32 placeholder named 'keep_prob' with no declared shape.
    """
    two_free_dims = [None, None]

    token_ids = tf.placeholder(dtype=tf.int32, shape=two_free_dims, name='inputs')
    targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')
    dropout_keep = tf.placeholder(dtype=tf.float32, name='keep_prob')

    return token_ids, targets, dropout_keep

In [27]:
def build_embedding_layer(inputs_, vocab_size, embed_size):
    """
    Create a trainable embedding matrix and look up the rows for ``inputs_``.

    Args:
        inputs_: tensor of integer ids used as row indices into the matrix.
        vocab_size: number of rows in the embedding matrix.
        embed_size: number of columns in the embedding matrix.

    Returns:
        The result of ``tf.nn.embedding_lookup`` on the new matrix.

    Side effects:
        Prints the three arguments on one line before building the layer and
        an empty line afterwards.
    """
    print(inputs_, vocab_size, embed_size)

    # The matrix is initialised uniformly at random in [-1, 1).
    table_shape = (vocab_size, embed_size)
    initial_values = tf.random_uniform(table_shape, minval=-1, maxval=1)
    embedding = tf.Variable(initial_values)

    looked_up = tf.nn.embedding_lookup(embedding, inputs_)
    print()

    return looked_up

In [28]:
def build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size):
    """
    Stack dropout-wrapped LSTM cells and unroll them over ``embed``.

    Args:
        lstm_sizes: iterable of unit counts; one BasicLSTMCell is built per entry.
        embed: input tensor handed to ``tf.nn.dynamic_rnn``.
        keep_prob_: value used as ``output_keep_prob`` of every DropoutWrapper.
        batch_size: batch size used to build the all-zero initial state.

    Returns:
        A 4-tuple ``(initial_state, lstm_outputs, cell, final_state)``.

    Side effects:
        Prints ``embed`` before the RNN is unrolled.
    """
    rnn = tf.contrib.rnn

    # First pass: one plain LSTM cell per requested size.
    base_cells = []
    for num_units in lstm_sizes:
        base_cells.append(rnn.BasicLSTMCell(num_units))

    # Second pass: wrap each cell so dropout is applied to its outputs.
    dropout_cells = []
    for base_cell in base_cells:
        dropout_cells.append(rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob_))

    # Combine the wrapped cells into a single multi-layer cell.
    cell = rnn.MultiRNNCell(dropout_cells)

    # All-zero starting state for a batch of the given size.
    initial_state = cell.zero_state(batch_size, tf.float32)

    print(embed)

    lstm_outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

    return initial_state, lstm_outputs, cell, final_state

In [29]:
def build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate):
    """
    Add the prediction head, the loss and the training op to the graph.

    Args:
        lstm_outputs: RNN output tensor; only index -1 along axis 1 is used.
        labels_: target tensor compared against the predictions.
        learning_rate: learning rate given to the Adadelta optimizer.

    Returns:
        A 3-tuple ``(predictions, loss, train_op)`` where ``predictions`` is
        the sigmoid output of a single-unit fully connected layer, ``loss`` is
        the mean squared error between labels and predictions, and
        ``train_op`` is the op returned by ``minimize(loss)``.
    """
    # Keep only the last slice along axis 1 of the RNN outputs.
    last_step = lstm_outputs[:, -1]
    predictions = tf.contrib.layers.fully_connected(last_step, 1, activation_fn=tf.sigmoid)

    loss = tf.losses.mean_squared_error(labels_, predictions)

    optimizer = tf.train.AdadeltaOptimizer(learning_rate)
    train_op = optimizer.minimize(loss)

    return predictions, loss, train_op

In [30]:

def build_accuracy(predictions, labels_):
    """
    Build the accuracy op.

    Predictions are rounded and cast to int32, compared element-wise with
    ``labels_``, and the matches (cast to float32) are averaged.

    Args:
        predictions: tensor of model outputs to be rounded.
        labels_: tensor of targets the rounded predictions are compared with.

    Returns:
        The mean of the element-wise match indicator.
    """
    predicted_classes = tf.cast(tf.round(predictions), tf.int32)
    hits = tf.equal(predicted_classes, labels_)
    hits_as_float = tf.cast(hits, tf.float32)

    return tf.reduce_mean(hits_as_float)


In [31]:
def build_and_train_network(lstm_sizes, vocab_size, embed_size, epochs, batch_size,
                            learning_rate, keep_prob, train_x, val_x, train_y, val_y):
    """
    Build the model graph and, once the disabled section is restored, train it.

    As written, only the input placeholders and the embedding layer are
    created and the embedded tensor is printed. Everything from the LSTM
    layers through training, validation and checkpointing is commented out
    below, so every parameter other than ``vocab_size`` and ``embed_size`` is
    currently unused and the function returns ``None``.
    """
    
    inputs_, labels_, keep_prob_ = model_inputs()
    embed = build_embedding_layer(inputs_, vocab_size, embed_size)
    print('embed', embed)
    # ---- Disabled from here on: LSTM stack, loss/optimizer, accuracy, training
    # ---- loop with per-epoch validation, and the final checkpoint save.
    # NOTE(review): in the disabled loop `ii` comes from enumerate(..., 1), so
    # `(ii + 1) % n_batches == 0` first holds at ii == n_batches - 1, i.e. one
    # batch before the last if get_batches yields n_batches batches — confirm
    # the intended trigger before re-enabling.
#     initial_state, lstm_outputs, lstm_cell, final_state = build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size)
#     predictions, loss, optimizer = build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate)
#     accuracy = build_accuracy(predictions, labels_)
    
#     saver = tf.train.Saver()
    
#     with tf.Session() as sess:
        
#         sess.run(tf.global_variables_initializer())
#         n_batches = len(train_x)//batch_size
#         for e in range(epochs):
#             state = sess.run(initial_state)
            
#             train_acc = []
#             for ii, (x, y) in enumerate(utl.get_batches(train_x, train_y, batch_size), 1):
#                 feed = {inputs_: x,
#                         labels_: y[:, None],
#                         keep_prob_: keep_prob,
#                         initial_state: state}
#                 loss_, state, _,  batch_acc = sess.run([loss, final_state, optimizer, accuracy], feed_dict=feed)
#                 train_acc.append(batch_acc)
                
#                 if (ii + 1) % n_batches == 0:
                    
#                     val_acc = []
#                     val_state = sess.run(lstm_cell.zero_state(batch_size, tf.float32))
#                     for xx, yy in utl.get_batches(val_x, val_y, batch_size):
#                         feed = {inputs_: xx,
#                                 labels_: yy[:, None],
#                                 keep_prob_: 1,
#                                 initial_state: val_state}
#                         val_batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
#                         val_acc.append(val_batch_acc)
                    
#                     print("Epoch: {}/{}...".format(e+1, epochs),
#                           "Batch: {}/{}...".format(ii+1, n_batches),
#                           "Train Loss: {:.3f}...".format(loss_),
#                           "Train Accruacy: {:.3f}...".format(np.mean(train_acc)),
#                           "Val Accuracy: {:.3f}".format(np.mean(val_acc)))
    
#         saver.save(sess, "checkpoints/sentiment.ckpt")

In [32]:
# Define inputs and hyperparameters handed to build_and_train_network.
# One entry per LSTM layer: build_lstm_layers creates one cell per size.
lstm_sizes = [100, 50]
# Rows of the embedding matrix. Ids run up to len(vocab_to_int) (the maximum id
# printed earlier) and 0 is the padding value, hence the extra row.
vocab_size = len(vocab_to_int) + 1 #add one for padding
# Columns of the embedding matrix.
embed_size = 200
epochs = 50
batch_size = 256
learning_rate = 0.1
# Passed as keep_prob; the (currently disabled) training loop feeds it to the
# keep_prob placeholder.
keep_prob = 0.5

# Sanity check: length of one padded message and the embedding row count.
print(len(messages[0]), vocab_size)

244 31981


In [33]:
# Build the network inside a fresh tf.Graph made the default for this block, so
# re-running the cell does not add ops to a previously used graph.
with tf.Graph().as_default():
    build_and_train_network(lstm_sizes, vocab_size, embed_size, epochs, batch_size,
                            learning_rate, keep_prob, train_x, val_x, train_y, val_y)

Tensor("inputs:0", shape=(?, ?), dtype=int32) 31981 200

embed Tensor("embedding_lookup/Identity:0", shape=(?, ?, 200), dtype=float32)
