# stock2vec

Create vectors for stocks based on their relative volatility.

In [1]:
import csv
# from dateutil.relativedelta import relativedelta
import datetime
import math
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import time
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

Load the diluted earnings per share by ticker.

In [2]:
chunksize = 1000000
fund_rows = 94011143

fund_reader = pd.read_csv('input/sharadar_fundamentals.csv', 
                      header=None,
                      parse_dates=[1],
                      chunksize=chunksize, 
                      iterator=True)

df_fund = pd.DataFrame()

with tqdm(total=fund_rows, desc='rows') as pbar:
    for chunk in fund_reader:
        # Select diluted earnings per share data
        chunk = chunk[chunk[0].str.contains('EPSDIL_MRT')]
        # Extract the ticker
        chunk[0] = chunk[0].str.extract('(.+?(?=_))', expand=False)
        df_fund = df_fund.append(chunk)
        pbar.update(chunksize);

# Sort by date, then ticker
df_fund.sort_values([1, 0], inplace=True)

earliest_date = df_fund.values[0][1]

df_fund.head()

rows: 95000000it [02:14, 703865.57it/s]                              


Unnamed: 0,0,1,2
65171973,PEP,2001-06-16,1.5
80257284,SVU,2001-06-16,0.47
80572836,SWY,2001-06-16,2.31
93428968,YUM,2001-06-16,0.69
62307830,OLED,2001-06-17,-0.87


Load the ticker prices.

In [3]:
price_rows = 14684263

# Import the prices

prices_reader = pd.read_csv('input/wiki_prices.csv', 
                        chunksize=chunksize,
                        parse_dates=['date'],
                        usecols=['adj_close', 'date', 'ticker'])

df_prices = pd.DataFrame()

with tqdm(total=price_rows, desc='rows') as pbar:
    for chunk in prices_reader:
        chunk = chunk[pd.notnull(chunk['adj_close'])]  
        chunk = chunk[chunk['date'] >= earliest_date]
        df_prices = df_prices.append(chunk)
        pbar.update(chunksize);

df_prices.sort_values(['date', 'ticker'], inplace=True)

print(df_prices.head())

rows: 15000000it [00:21, 693364.20it/s]                              


      ticker       date  adj_close
397        A 2001-06-18  28.510446
12636    AAN 2001-06-18   4.669972
19603   AAON 2001-06-18   1.883050
32593   AAPL 2001-06-18   1.323831
43324   ABAX 2001-06-18   5.458700


Append diluted earnings per share.

In [4]:
def get_fund(date, ticker):
    # Find the most recent fundamentals before the date
    fund = df_fund[(df_fund[0] == ticker) & (df_fund[1] <= date)].tail(1)    
    if fund.shape[0] < 1:
        return None
    return fund.values[0]

def price_to_earnings(price, epsdil):
    if epsdil is None:
        return None
    if epsdil == 0:
        return 0
    return price / epsdil

def get_fund_cols(row, pbar):
    pbar.update()
    
    fund = get_fund(row.date, row.ticker)
    
    if fund is None:
        return pd.Series([None, None, None])
    
    date = fund[1]
    epsdil = fund[2]
    pe = price_to_earnings(row.adj_close, epsdil)
    
    return pd.Series([date, epsdil, pe])

with tqdm(total=df_prices.shape[0], desc='rows') as pbar:
    fund_cols = df_prices.apply(lambda r: get_fund_cols(r, pbar), axis=1)
    fund_cols.columns = ['fund_date', 'epsdil', 'pe']
    df_prices = df_prices.join(fund_cols)

print(df_prices.head())

rows:   0%|          | 1631/9969557 [01:13<106:16:34, 26.05it/s]


KeyboardInterrupt: 

## Preprocessing

Filter ticks for the past ~15 years of stocks with volume > 10000 and volatility > 0.

In [None]:
df = df[df['date'] >= datetime.date(2016,1,1)]
df = df[df['adj_volume'] > 10000]
del df['adj_volume']

df['volt'] = (df['adj_high'] - df['adj_low']) / df['adj_close']
del df['adj_high']
del df['adj_low']
del df['adj_close']

df = df[df['volt'] > 0]

df.sort_values(['date', 'volt'], inplace=True)

n_stocks = len(df['ticker'].unique())
print('n_stocks', n_stocks)

print(df.head())

**Get Nearby Stocks**

For each stock, find C stocks that have the closest volatility to that ticker for that day.

In [None]:
ticker_to_int = {}
int_to_ticker = {}

def get_ticker_int(ticker):
    key = ticker_to_int.get(ticker, None)
    if key is None:
        key = ticker_to_int[ticker] = len(ticker_to_int)
        int_to_ticker[key] = ticker
    return key

def get_stock_date(stocks, idx):
    return stocks.iloc[[idx], 1].values[0]

def get_stock_ticker(stocks, idx):
    return stocks.iloc[[idx], 0].values[0]

def get_stock_int(stocks, idx):
    return get_ticker_int(get_stock_ticker(stocks, idx))

def get_window(stocks, idx, window_size=5):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R if (idx + R) < len(stocks) else len(stocks)

    stock_int = get_stock_int(stocks, idx)
    stock_date = get_stock_date(stocks, idx)
    
    window = []
    
    for i in range(start, stop):
        nearby_stock_int = get_stock_int(stocks, i)
        nearby_stock_date = get_stock_date(stocks, i)
        if nearby_stock_int != stock_int and nearby_stock_date == stock_date:
            window.append(nearby_stock_int)
    
    return window

for idx in range(0, 10, 5):
    print('window for', get_stock_ticker(df, idx), get_stock_int(df, idx))
    for nearby_int in get_window(df, idx, 5):
        print(nearby_int, int_to_ticker[nearby_int])

In [None]:
def get_batches(stocks, batch_size, window_size=5):    
    for start in range(0, len(stocks), batch_size):
        x, y = [], []
        stop = start + batch_size if (start + batch_size) < len(stocks) else len(stocks)
        
        for i in range(start, stop):   
            batch_x = get_stock_int(stocks, i)
            batch_y = get_window(stocks, i, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
            
        yield x, y

## Build the Graph

In [None]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')

n_embedding = 400 # Number of embedding features 

with train_graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_stocks, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

# Negative sampling

In [None]:
# Number of negative labels to sample
n_sampled = 100

with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal((n_stocks, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_stocks))
    
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, 
                                      labels, embed,
                                      n_sampled, n_stocks)
    
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

In [None]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

# Training

In [None]:
epochs = 50
batch_size = 1000
window_size = 5

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        batches = get_batches(df, batch_size, window_size)
        start = time.time()
        for x, y in batches:           
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            loss += train_loss
            
            if iteration % 100 == 0: 
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                loss = 0
                start = time.time()
            
            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_stock = int_to_ticker[valid_examples[i]]
                    top_k = 8 # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_stock
                    for k in range(top_k):
                        try:
                            close_stock = int_to_ticker[nearest[k]]
                            log = '%s %s,' % (log, close_stock)
                        except Exception:
                            print('nearest[k]', nearest[k])
                    print(log)
            
            iteration += 1
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)