# stock2vec

Create vectors for stocks based on their relative volatility.

In [37]:
import datetime
import math
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import time
from tqdm import tqdm

In [41]:
total_fundamentals_rows = 94011143

df_eps = pd.read_csv('input/sharadar_fundamentals.csv', 
                     header=None,
                     parse_dates=[0],
                     chunksize=300000, iterator=True
                    )

df = df_eps.get_chunk()

df = df[df[0].str.contains("EPSDIL_MRT")]

print(df)

# TODO parse chunk and append to DF

with tqdm(total=total_fundamentals_rows) as pbar:
    pbar.update(1000)
    
# df.sort_values([1], inplace=True)

# earliest_date = df.values[0][1]

  0%|          | 1000/94011143 [00:00<00:06, 15141891.70it/s]

                      0           1     2
229     AAAP_EPSDIL_MRT  2012-12-31 -0.76
230     AAAP_EPSDIL_MRT  2013-12-31 -0.44
231     AAAP_EPSDIL_MRT  2014-12-31 -0.30
232     AAAP_EPSDIL_MRT  2015-12-31 -0.50
1732    AABC_EPSDIL_MRT  2003-03-31  0.72
1733    AABC_EPSDIL_MRT  2003-06-30  0.97
1734    AABC_EPSDIL_MRT  2003-09-30  1.09
1735    AABC_EPSDIL_MRT  2003-12-31  0.94
1736    AABC_EPSDIL_MRT  2004-03-31  0.88
1737    AABC_EPSDIL_MRT  2004-06-30  0.84
1738    AABC_EPSDIL_MRT  2004-09-30  0.75
1739    AABC_EPSDIL_MRT  2004-12-31  0.78
1740    AABC_EPSDIL_MRT  2005-03-31  0.82
1741    AABC_EPSDIL_MRT  2005-06-30  0.90
1742    AABC_EPSDIL_MRT  2005-09-30  0.96
7660    AACC_EPSDIL_MRT  2003-09-30  1.16
7661    AACC_EPSDIL_MRT  2003-12-31  1.33
7662    AACC_EPSDIL_MRT  2004-03-31 -0.03
7663    AACC_EPSDIL_MRT  2004-06-30 -0.02
7664    AACC_EPSDIL_MRT  2004-09-30 -0.03
7665    AACC_EPSDIL_MRT  2004-12-31  0.02
7666    AACC_EPSDIL_MRT  2005-03-31  1.41
7667    AACC_EPSDIL_MRT  2005-06-3




In [15]:
# Import the earnings per share

df_eps = pd.read_csv('input/sharadar_fundamentals.csv', 
                        usecols=["datekey", "dimension", "epsdil", "ticker"],
                        parse_dates=["datekey"])

df_eps.sort_values(['datekey', 'ticker', "dimension"], inplace=True)

print(df_eps.head())

  ticker dimension    datekey  epsdil
0   AAPL       ARQ 2001-08-13    0.01
1   AAPL       ARQ 2001-12-21    0.01
2   AAPL       ARQ 2002-02-11    0.01
3   AAPL       ARQ 2002-05-14    0.01
4   AAPL       ARQ 2002-08-09    0.01


In [5]:
# Import the prices

df_prices = pd.read_csv('input/wiki_prices.csv', 
                        usecols=["adj_close", "date", "ticker"],
                        parse_dates=['date'])

df_prices.sort_values(['date', 'ticker'], inplace=True)

df_prices = df_prices[pd.notnull(df_prices['adj_close'])]

print(df_prices.head())

        ticker       date  adj_close
7561760     KO 1962-01-02   0.266600
5620436     GE 1962-01-02   0.334451
1005749   ARNC 1962-01-02   3.472967
6739482    IBM 1962-01-02  15.561965
1416507     BA 1962-01-02   0.480022


## Preprocessing

Filter ticks for the past ~15 years of stocks with volume > 10000 and volatility > 0.

In [3]:
df = df[df['date'] >= datetime.date(2016,1,1)]
df = df[df['adj_volume'] > 10000]
del df['adj_volume']

df['volt'] = (df['adj_high'] - df['adj_low']) / df['adj_close']
del df['adj_high']
del df['adj_low']
del df['adj_close']

df = df[df['volt'] > 0]

df.sort_values(['date', 'volt'], inplace=True)

n_stocks = len(df['ticker'].unique())
print('n_stocks', n_stocks)

print(df.head())

n_stocks 2866
         ticker       date      volt
1581020    BDBD 2016-01-04  0.000911
1879551     BMR 2016-01-04  0.001688
9523109    NTLS 2016-01-04  0.002186
10150364    PCP 2016-01-04  0.002369
14300884    WPP 2016-01-04  0.002446


**Get Nearby Stocks**

For each stock, find C stocks that have the closest volatility to that ticker for that day.

In [4]:
ticker_to_int = {}
int_to_ticker = {}

def get_ticker_int(ticker):
    key = ticker_to_int.get(ticker, None)
    if key is None:
        key = ticker_to_int[ticker] = len(ticker_to_int)
        int_to_ticker[key] = ticker
    return key

def get_stock_date(stocks, idx):
    return stocks.iloc[[idx], 1].values[0]

def get_stock_ticker(stocks, idx):
    return stocks.iloc[[idx], 0].values[0]

def get_stock_int(stocks, idx):
    return get_ticker_int(get_stock_ticker(stocks, idx))

def get_window(stocks, idx, window_size=5):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R if (idx + R) < len(stocks) else len(stocks)

    stock_int = get_stock_int(stocks, idx)
    stock_date = get_stock_date(stocks, idx)
    
    window = []
    
    for i in range(start, stop):
        nearby_stock_int = get_stock_int(stocks, i)
        nearby_stock_date = get_stock_date(stocks, i)
        if nearby_stock_int != stock_int and nearby_stock_date == stock_date:
            window.append(nearby_stock_int)
    
    return window

for idx in range(0, 10, 5):
    print('window for', get_stock_ticker(df, idx), get_stock_int(df, idx))
    for nearby_int in get_window(df, idx, 5):
        print(nearby_int, int_to_ticker[nearby_int])

window for BDBD 0
1 BMR
2 NTLS
3 PCP
4 WPP
window for CCG 5
2 NTLS
3 PCP
4 WPP
6 MDAS
7 PBY


In [5]:
def get_batches(stocks, batch_size, window_size=5):    
    for start in range(0, len(stocks), batch_size):
        x, y = [], []
        stop = start + batch_size if (start + batch_size) < len(stocks) else len(stocks)
        
        for i in range(start, stop):   
            batch_x = get_stock_int(stocks, i)
            batch_y = get_window(stocks, i, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
            
        yield x, y

## Build the Graph

In [6]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')

n_embedding = 400 # Number of embedding features 

with train_graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_stocks, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

# Negative sampling

In [7]:
# Number of negative labels to sample
n_sampled = 100

with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal((n_stocks, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_stocks))
    
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, 
                                      labels, embed,
                                      n_sampled, n_stocks)
    
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

In [None]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

# Training

In [None]:
epochs = 50
batch_size = 1000
window_size = 5

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        batches = get_batches(df, batch_size, window_size)
        start = time.time()
        for x, y in batches:           
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            loss += train_loss
            
            if iteration % 100 == 0: 
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                loss = 0
                start = time.time()
            
            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_stock = int_to_ticker[valid_examples[i]]
                    top_k = 8 # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_stock
                    for k in range(top_k):
                        try:
                            close_stock = int_to_ticker[nearest[k]]
                            log = '%s %s,' % (log, close_stock)
                        except Exception:
                            print('nearest[k]', nearest[k])
                    print(log)
            
            iteration += 1
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

Epoch 1/50 Iteration: 100 Avg. Training loss: 4.4161 3.3119 sec/batch
Epoch 1/50 Iteration: 200 Avg. Training loss: 4.3350 3.1031 sec/batch
Epoch 1/50 Iteration: 300 Avg. Training loss: 4.2741 3.1113 sec/batch
Epoch 1/50 Iteration: 400 Avg. Training loss: 4.2321 3.1114 sec/batch
Epoch 1/50 Iteration: 500 Avg. Training loss: 4.1339 3.0831 sec/batch
Epoch 1/50 Iteration: 600 Avg. Training loss: 4.1440 3.0780 sec/batch
Epoch 1/50 Iteration: 700 Avg. Training loss: 4.0824 3.0774 sec/batch
Epoch 1/50 Iteration: 800 Avg. Training loss: 4.0500 3.0775 sec/batch
Epoch 2/50 Iteration: 900 Avg. Training loss: 3.9751 3.0165 sec/batch
Epoch 2/50 Iteration: 1000 Avg. Training loss: 3.8908 3.0740 sec/batch
Nearest to PG: NLY, FGL, CKP, UMH, ETR, AROW, MDU, HTS,
Nearest to CCG: BLT, CSGP, HW, FUR, PRE, NWBO, GRIF, UTX,
Nearest to PMCS: GK, EAT, LKQ, COST, EXC, SGM, LVNTA, PVTB,
Nearest to SJM: CVG, CNC, SLRC, COF, LMT, HOT, DOV, AVY,
Nearest to POM: ASC, NILE, AOSL, UNT, PAYX, STRA, TFX, GPC,
Nearest 