# stock2vec

Create vectors for stocks based on their relative volatility.

In [1]:
import csv
import datetime
import math
import multiprocessing as mp
import os
import random
import sys
import time
from functools import partial

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

Download the price data (adjusted close, diluted earnings per share and P/E ratio, by date and ticker) if it is not already present locally.

In [2]:
from urllib.request import urlretrieve
from os.path import isfile, isdir

# Folder part of dataset_filename (the 'input' prefix).
dataset_folder_path = 'input'
# Local path of the prices CSV; the download below is skipped when this file exists.
dataset_filename = 'input/prices.csv'
# Label shown on the download progress bar.
dataset_name = 'Prices'

class DLProgress(tqdm):
    """tqdm progress bar whose ``hook`` method is passed to ``urlretrieve`` as its progress callback."""

    # Block number reported by the previous hook call.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """Advance the bar by the blocks received since the previous call.

        :param block_num: number of blocks reported so far.
        :param block_size: size of one block.
        :param total_size: total size; stored as the bar's ``total``.
        """
        self.total = total_size
        new_blocks = block_num - self.last_block
        self.update(new_blocks * block_size)
        self.last_block = block_num

if not isfile(dataset_filename):
    # urlretrieve does not create missing parent folders, so make sure the
    # target folder exists before downloading into it.
    if not isdir(dataset_folder_path):
        os.makedirs(dataset_folder_path)

    # NOTE(review): the previous URL was a pre-signed S3 link
    # (X-Amz-Date=20170325T201144Z, X-Amz-Expires=300) with an embedded security
    # token. It expired minutes after it was generated, so it can no longer be
    # used, and temporary credentials do not belong in the notebook.
    # The bare object URL is used instead; this presumably requires the object
    # to be publicly readable - TODO confirm, or fetch the file another way.
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
        urlretrieve(
            'https://s3.amazonaws.com/perl-ml/prices.csv',
            dataset_filename,
            pbar.hook)

In [3]:
chunksize = 1000000
# Expected number of rows in the CSV; only used to size the progress bar.
price_rows = 9191528

price_reader = pd.read_csv('input/prices.csv',
                           header=None,
                           parse_dates=[1],
                           chunksize=chunksize,
                           iterator=True)

# Collect the chunks and concatenate once at the end: appending to a DataFrame
# inside the loop re-copies all rows read so far on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
price_chunks = []

with tqdm(total=price_rows, desc='rows') as pbar:
    for chunk in price_reader:
        price_chunks.append(chunk)
        # Advance by the rows actually read so the last (partial) chunk does
        # not push the bar past its total.
        pbar.update(len(chunk))

df_prices = pd.concat(price_chunks)
df_prices.columns = ['adj_close', 'date', 'ticker', 'epsdil', 'pe']

# Sort by date, then by P/E ratio, so rows that are adjacent within a day
# have similar P/E values.
df_prices.sort_values(['date', 'pe'], inplace=True)

print(df_prices.head())

# Plain Python lists in the sorted row order, indexed by row position below.
prices = df_prices['adj_close'].values.tolist()
dates = df_prices['date'].values.tolist()
tickers = df_prices['ticker'].values.tolist()
pes = df_prices['pe'].values.tolist()

rows: 10000000it [00:14, 668326.67it/s]                            


         adj_close       date ticker  epsdil         pe
45526    13.745073 2001-06-18   OLED   -0.87 -15.798935
3046709   6.147269 2001-06-18    YUM    0.69   8.909085
1778402  43.419875 2001-06-18    SWY    2.31  18.796483
315864   29.645033 2001-06-18    PEP    1.50  19.763356
1743245  11.213455 2001-06-18    SVU    0.47  23.858415


## Build context

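For each stock, collect the stocks that sit next to it on the same day once rows are sorted by date and P/E ratio, i.e. the stocks with the closest P/E that day, using a randomly sized window around its row.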

In [5]:
# Two-way mapping between ticker symbols and integer ids, filled lazily.
ticker_to_int = {}
int_to_ticker = {}

def get_ticker_int(idx):
    """Return the integer id of the ticker at row ``idx``, assigning the next free id on first sight."""
    symbol = tickers[idx]
    if ticker_to_int.get(symbol, None) is None:
        new_id = len(ticker_to_int)
        ticker_to_int[symbol] = new_id
        int_to_ticker[new_id] = symbol
    return ticker_to_int[symbol]

def get_window(idx, total, window_size=5):
    """Return ids of the tickers around row ``idx`` that share its date.

    A radius R is drawn uniformly from 1..window_size and rows
    idx-R .. idx+R (clipped to [0, total)) are inspected. Rows for the same
    ticker as ``idx`` or for a different date are skipped.

    :param idx: row position of the centre stock.
    :param total: number of rows; upper bound for the window.
    :param window_size: largest radius that may be drawn.
    :return: list of integer ticker ids, in row order.
    """
    R = np.random.randint(1, window_size+1)
    start = max(idx - R, 0)
    # range() excludes its end, so add 1 to include row idx + R itself;
    # without it the window reaches R rows left but only R - 1 rows right.
    stop = min(idx + R + 1, total)

    stock_int = get_ticker_int(idx)
    stock_date = dates[idx]

    window = []

    for i in range(start, stop):
        nearby_stock_int = get_ticker_int(i)
        nearby_stock_date = dates[i]
        if nearby_stock_int != stock_int and nearby_stock_date == stock_date:
            window.append(nearby_stock_int)

    return window

# Print a few sample windows as a sanity check.
for idx in range(0, 20, 9):
    print('window for', idx, tickers[idx], get_ticker_int(idx))
    sample_window = get_window(idx, len(tickers), 5)
    for nearby_int in sample_window:
        print(nearby_int, int_to_ticker[nearby_int])

window for 0 OLED 0
1 YUM
window for 9 SVU 2
3 SWY
4 PEP
window for 18 PEP 4
3 SWY


In [6]:
# Number of consecutive rows turned into one training batch.
batch_size = 1000
# Largest window radius passed to get_window.
window_size = 10

total_prices = len(prices)

# Round up: a final partial batch still triggers one progress update, so
# flooring would leave the bar one step short of the real batch count.
pbar = tqdm(total=math.ceil(total_prices / batch_size))

def get_batch(start):
    """Build the (input, label) id pairs for rows ``start`` .. ``start + batch_size``.

    Each row contributes one pair per id returned by ``get_window``; the row's
    own id is repeated on the input side. Also advances the progress bar once.

    :param start: first row position of the batch.
    :return: ``[inputs, labels]`` as two equally long lists of ids.
    """
    stop = min(start + batch_size, total_prices)

    centre_ids, context_ids = [], []
    for row in range(start, stop):
        centre = get_ticker_int(row)
        neighbours = get_window(row, total_prices, window_size)
        context_ids.extend(neighbours)
        centre_ids.extend([centre] * len(neighbours))

    pbar.update()

    return [centre_ids, context_ids]

def get_batches():
    """Return the list of batches covering every row, in row order."""
    return [get_batch(start) for start in range(0, total_prices, batch_size)]

batches = get_batches()

100%|██████████| 9191/9191 [01:23<00:00, 106.73it/s]

## Build the Graph

In [8]:
# Vocabulary size: one embedding row per distinct ticker.
n_stocks = len(df_prices['ticker'].unique())
n_embedding = 400 # Number of embedding features 

train_graph = tf.Graph()
with train_graph.as_default():
    # Integer ticker ids fed per training pair.
    inputs = tf.placeholder(dtype=tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')

    # Embedding matrix initialised uniformly in [-1, 1), and the lookup for the input ids.
    initial_embedding = tf.random_uniform((n_stocks, n_embedding), minval=-1, maxval=1)
    embedding = tf.Variable(initial_embedding)
    embed = tf.nn.embedding_lookup(embedding, inputs)

# Negative sampling

In [9]:
# Number of negative labels to sample
n_sampled = 100

with train_graph.as_default():
    # Output-layer weights and biases, one row / entry per stock.
    softmax_w = tf.Variable(tf.truncated_normal((n_stocks, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_stocks))

    # Sampled softmax loss per training pair, using n_sampled sampled classes.
    loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                      biases=softmax_b,
                                      labels=labels,
                                      inputs=embed,
                                      num_sampled=n_sampled,
                                      num_classes=n_stocks)

    # Mean loss over the batch, minimised with Adam at its default settings.
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

          9192it [01:40, 106.73it/s]

In [10]:
with train_graph.as_default():
    # Validation setup adapted from Thushan Ganegedara's implementation.
    valid_size = 16 # Number of stock ids to evaluate similarity on.
    valid_window = 100
    # Draw half of the ids from [0, 100) and half from [1000, 1100).
    # NOTE(review): the second range assumes there are at least 1100 stocks - confirm.
    half_size = valid_size//2
    low_ids = random.sample(range(valid_window), half_size)
    high_ids = random.sample(range(1000,1000+valid_window), half_size)
    valid_examples = np.append(np.array(low_ids), high_ids)

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Cosine similarity: L2-normalise every embedding row, then take dot
    # products between the validation rows and all rows.
    squared_sum = tf.reduce_sum(tf.square(embedding), 1, keep_dims=True)
    norm = tf.sqrt(squared_sum)
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

# Training

In [None]:
epochs = 10
checkpoint_path = "checkpoints/text8.ckpt"

with train_graph.as_default():
    saver = tf.train.Saver()

# Create the checkpoint folder up front: saving into a missing folder would
# otherwise fail only after every epoch has already been trained.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    # Running sum of batch costs since the last report. Kept under its own
    # name so the graph's `loss` tensor is not overwritten by a number.
    running_loss = 0
    sess.run(tf.global_variables_initializer())

    # Started once, then reset after each report, so every sec/batch figure
    # covers exactly the 100 batches since the previous report.
    start = time.time()
    for e in range(1, epochs+1):
        for batch in batches:
            x, y = batch

            # labels is a rank-2 placeholder, so the label ids are fed as a column.
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

            running_loss += train_loss

            if iteration % 100 == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(running_loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                running_loss = 0
                start = time.time()

            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                top_k = 8 # number of nearest neighbors
                for i in range(valid_size):
                    valid_stock = int_to_ticker[valid_examples[i]]
                    # Position 0 of the descending sort is skipped (normally the stock itself).
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_stock
                    for k in range(top_k):
                        try:
                            close_stock = int_to_ticker[nearest[k]]
                            log = '%s %s,' % (log, close_stock)
                        except KeyError:
                            # Id without a known ticker: report it and keep going.
                            print('nearest[k]', nearest[k])
                    print(log)

            iteration += 1
    # Fetch the embeddings before saving so a failed save cannot lose them.
    embed_mat = sess.run(normalized_embedding)
    save_path = saver.save(sess, checkpoint_path)

Epoch 1/10 Iteration: 100 Avg. Training loss: 4.7353 0.1324 sec/batch
Epoch 1/10 Iteration: 200 Avg. Training loss: 4.3297 0.1308 sec/batch
Epoch 1/10 Iteration: 300 Avg. Training loss: 3.9655 0.1325 sec/batch
Epoch 1/10 Iteration: 400 Avg. Training loss: 3.5779 0.1372 sec/batch
Epoch 1/10 Iteration: 500 Avg. Training loss: 3.4165 0.1408 sec/batch
Epoch 1/10 Iteration: 600 Avg. Training loss: 3.3865 0.1415 sec/batch
Epoch 1/10 Iteration: 700 Avg. Training loss: 3.4901 0.1380 sec/batch
Epoch 1/10 Iteration: 800 Avg. Training loss: 3.3706 0.1374 sec/batch
Epoch 1/10 Iteration: 900 Avg. Training loss: 3.1838 0.1338 sec/batch
Epoch 1/10 Iteration: 1000 Avg. Training loss: 3.2497 0.1291 sec/batch
Nearest to ACAS: LCI, MCP, AI, AIT, FLR, CTWS, BWLD, GNE,
Nearest to CLFD: A, GRT, USM, ITMN, NMRX, ESIO, CAS, LMIA,
Nearest to CSC: FITB, RFP, JAH, EQT, AME, ACAD, AAMC, CBT,
Nearest to RVLT: AMAG, PES, CPWR, HP, PLUG, DTLK, RNWK, DXLG,
Nearest to FLS: SYK, QGEN, DIS, CUR, INTC, TIF, NEM, ALEX,
Ne