# stock2vec

Create vectors for stocks based on their relative volatility.

In [1]:
import datetime
import math
import os
import random
import time

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Read only the price/volume columns used below, order the ticks
# chronologically, and discard rows missing any of the three prices.
df = (
    pd.read_csv(
        'data/wiki_prices.csv',
        usecols=["ticker", "date", "adj_close", "adj_high", "adj_low", "adj_volume"],
        parse_dates=['date'],
    )
    .sort_values('date')
    .dropna(subset=['adj_high', 'adj_low', 'adj_close'])
)

print(df.head())

        ticker       date   adj_high    adj_low  adj_close  adj_volume
7561760     KO 1962-01-02   0.273859   0.266600   0.266600   1612800.0
5620436     GE 1962-01-02   0.341163   0.332214   0.334451   2073600.0
1005749   ARNC 1962-01-02   3.492621   3.472967   3.472967     44800.0
6739482    IBM 1962-01-02  15.738806  15.561965  15.561965    387200.0
1416507     BA 1962-01-02   0.488470   0.480022   0.480022    352200.0


## Preprocessing

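Keep only ticks dated 2000-01-01 or later with volume > 10,000. Each tick's volatility is then computed as (adj_high - adj_low) / adj_close, and ticks whose volatility is not > 0 are dropped.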

In [3]:
# Keep ticks from 2000 onward that traded more than 10000 shares.
# The 'date' column is parsed as datetime64, so the cutoff is a datetime
# rather than a datetime.date: recent pandas releases refuse to order
# datetime64 values against plain date objects.
df = df[(df['date'] >= datetime.datetime(2000, 1, 1)) & (df['adj_volume'] > 10000)]

# Volatility = the day's high-low range relative to the close. `assign` builds
# a new frame instead of writing a column into a filtered slice, and the raw
# price/volume columns are dropped once the ratio has been derived.
df = (
    df.assign(volt=(df['adj_high'] - df['adj_low']) / df['adj_close'])
      .drop(['adj_volume', 'adj_high', 'adj_low', 'adj_close'], axis=1)
)

df = df[df['volt'] > 0]

# Sorting by date and then volatility puts same-day stocks with similar
# volatility on adjacent rows.
df = df.sort_values(['date', 'volt'])

n_stocks = len(df['ticker'].unique())
print('n_stocks', n_stocks)

print(df.head())

n_stocks 3186
         ticker       date      volt
3017202   CNBKA 2000-01-03  0.003679
1872359     BMI 2000-01-03  0.004000
5209067    FLIC 2000-01-03  0.004068
5368021    FRED 2000-01-03  0.004391
12380997    STL 2000-01-03  0.004499


**Get Nearby Stocks**

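For each stock, find the stocks whose volatility is closest to its own on the same day. Because the rows are sorted by date and then volatility, these are the up-to-R neighbouring rows on either side, where R is drawn at random between 1 and the window size.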

In [4]:
# Two-way vocabulary between ticker symbols and dense integer ids.
ticker_to_int = {}
int_to_ticker = {}

def get_ticker_int(ticker):
    """Return the integer id of `ticker`, registering it on first sight.

    Ids are handed out in order of first lookup (0, 1, 2, ...), and both
    lookup tables are updated together.
    """
    if ticker in ticker_to_int:
        return ticker_to_int[ticker]
    new_id = len(ticker_to_int)
    ticker_to_int[ticker] = new_id
    int_to_ticker[new_id] = ticker
    return new_id

def get_stock_date(stocks, idx):
    """Return the value in the second column of row `idx` (positional).

    In this notebook's frame the second column is the tick's date.
    """
    date_column = 1
    single_row = stocks.iloc[[idx], date_column]
    return single_row.values[0]

def get_stock_ticker(stocks, idx):
    """Return the value in the first column of row `idx` (positional).

    In this notebook's frame the first column is the ticker symbol.
    """
    ticker_column = 0
    single_row = stocks.iloc[[idx], ticker_column]
    return single_row.values[0]

def get_stock_int(stocks, idx):
    """Return the integer id of the ticker on row `idx` of `stocks`.

    The ticker is registered in the vocabulary if it has not been seen yet.
    """
    symbol = get_stock_ticker(stocks, idx)
    return get_ticker_int(symbol)

def get_window(stocks, idx, window_size=5):
    """Return the ids of stocks on rows near row `idx` that share its date.

    A radius R is drawn uniformly from 1..window_size and rows
    idx-R..idx+R are scanned, clamped to the bounds of `stocks`. A row
    contributes its ticker id when it has the same date as row `idx` and a
    different ticker. Intended for a frame sorted by date and then
    volatility, so that neighbouring rows are the closest in volatility.

    Args:
        stocks: DataFrame with the ticker in column 0 and the date in column 1.
        idx: positional index of the center row.
        window_size: maximum radius of the window.

    Returns:
        List of integer ticker ids, in row order (possibly empty).
    """
    R = np.random.randint(1, window_size+1)
    start = max(idx - R, 0)
    # Clamp the upper edge as well: without this, rows within R of the end
    # of the frame index past the last row and raise IndexError.
    stop = min(idx + R, len(stocks) - 1)

    stock_int = get_stock_int(stocks, idx)
    stock_date = get_stock_date(stocks, idx)

    window = []

    for i in range(start, stop+1):
        nearby_stock_int = get_stock_int(stocks, i)
        nearby_stock_date = get_stock_date(stocks, i)
        if nearby_stock_int != stock_int and nearby_stock_date == stock_date:
            window.append(nearby_stock_int)

    return window

# Sanity check: print the sampled neighbourhood of rows 0 and 5.
for idx in (0, 5):
    print('window for', get_stock_ticker(df, idx), get_stock_int(df, idx))
    neighbours = get_window(df, idx, 5)
    for nearby_int in neighbours:
        print(nearby_int, int_to_ticker[nearby_int])

window for CNBKA 0
1 BMI
2 FLIC
3 FRED
4 STL
5 WEYS
window for WEYS 5
0 CNBKA
1 BMI
2 FLIC
3 FRED
4 STL
6 UMBF
7 CRRC
8 THO
9 MAC
10 MAS


In [5]:
def get_batches(stocks, batch_size, window_size=5):
    """Yield (inputs, targets) training pairs over `stocks` in row order.

    Rows are consumed in chunks of `batch_size` (the last chunk may be
    shorter). For every row, its ticker id is repeated once per neighbour
    returned by `get_window`, so the two yielded lists are the same length.
    """
    n_rows = len(stocks)
    for first in range(0, n_rows, batch_size):
        last = min(first + batch_size, n_rows)
        centers, contexts = [], []

        for row in range(first, last):
            center_id = get_stock_int(stocks, row)
            nearby_ids = get_window(stocks, row, window_size)
            contexts.extend(nearby_ids)
            centers.extend([center_id] * len(nearby_ids))

        yield centers, contexts

## Build the Graph

In [6]:
n_embedding = 300 # Number of embedding features 

train_graph = tf.Graph()
with train_graph.as_default():
    # Fed at run time: a vector of center-stock ids and a 2-D tensor of
    # target ids.
    inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')

    # One trainable n_embedding-dimensional row per stock, initialised
    # uniformly in [-1, 1); `embed` selects the rows of the fed ids.
    embedding = tf.Variable(
        tf.random_uniform((n_stocks, n_embedding), minval=-1, maxval=1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

## Negative Sampling

In [7]:
# How many negative classes are drawn for each training step
n_sampled = 100

with train_graph.as_default():
    # Output-layer weights (one row per stock) and biases.
    softmax_w = tf.Variable(
        tf.truncated_normal((n_stocks, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_stocks))

    # Sampled softmax: score the true label against n_sampled sampled
    # classes instead of all n_stocks.
    loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                      biases=softmax_b,
                                      labels=labels,
                                      inputs=embed,
                                      num_sampled=n_sampled,
                                      num_classes=n_stocks)

    # Mean loss over the batch, minimised with Adam at default settings.
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

In [8]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16     # how many stock ids to inspect during training
    valid_window = 100  # width of each id range that ids are drawn from
    # Draw half of the ids from [0, 100) and the other half from [1000, 1100).
    valid_examples = np.concatenate([
        np.array(random.sample(range(valid_window), valid_size//2)),
        random.sample(range(1000, 1000+valid_window), valid_size//2),
    ])

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Cosine similarity: scale every embedding row to unit L2 norm, then
    # take dot products of the validation rows with all rows.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

## Training

In [None]:
epochs = 10
batch_size = 1000
window_size = 5

with train_graph.as_default():
    saver = tf.train.Saver()

# Saver.save expects the target directory to exist; create it up front so a
# finished training run is not lost at the final save.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    # Accumulated under its own name so the graph's `loss` tensor, defined in
    # an earlier cell, is not overwritten with a number.
    running_loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        batches = get_batches(df, batch_size, window_size)
        start = time.time()
        for x, y in batches:
            # labels is 2-D, so the flat target list becomes a column vector.
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

            running_loss += train_loss

            # Report the average loss and time per batch every 100 iterations.
            if iteration % 100 == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(running_loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                running_loss = 0
                start = time.time()

            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                top_k = 8 # number of nearest neighbors
                for i in range(valid_size):
                    # Ids are assigned the first time a ticker is read, so an
                    # id may not be in the map yet; fall back to a placeholder
                    # instead of raising KeyError.
                    valid_ticker = int_to_ticker.get(int(valid_examples[i]), '<unseen>')
                    # Position 0 of the ranking is the stock itself; skip it.
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_ticker
                    for k in range(top_k):
                        close_ticker = int_to_ticker.get(int(nearest[k]), '<unseen>')
                        log = '%s %s,' % (log, close_ticker)
                    print(log)

            iteration += 1
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

Epoch 1/10 Iteration: 1 Avg. Training loss: 0.0517 0.0408 sec/batch
Epoch 1/10 Iteration: 2 Avg. Training loss: 0.0482 0.0391 sec/batch
Epoch 1/10 Iteration: 3 Avg. Training loss: 0.0450 0.0376 sec/batch
Epoch 1/10 Iteration: 4 Avg. Training loss: 0.0515 0.0370 sec/batch
Epoch 1/10 Iteration: 5 Avg. Training loss: 0.0503 0.0378 sec/batch
Epoch 1/10 Iteration: 6 Avg. Training loss: 0.0503 0.0377 sec/batch
Epoch 1/10 Iteration: 7 Avg. Training loss: 0.0512 0.0374 sec/batch
Epoch 1/10 Iteration: 8 Avg. Training loss: 0.0506 0.0377 sec/batch
