In [7]:
import networkx as nx
import pandas as pd
import tensorflow as tf
from arctic import Arctic

In [30]:
import numpy as np 

In [5]:
# Open an Arctic connection to the host 'localhost'.
a = Arctic('localhost')

In [6]:
# Select the 'fund_holding' library; the next cell reads one item per symbol from it.
lib = a['fund_holding']

In [8]:
# Read the stored frame of every symbol in the library, tag each frame with
# the symbol it came from, and stack them into a single holdings table.
fund_frames = [
    lib.read(fund).data.assign(symbol=fund)
    for fund in lib.list_symbols()
]
holding = pd.concat(fund_frames)

In [74]:
# Keep only rows with a strictly positive `holding` value; these values are used
# further down as edge weights and normalised into sampling probabilities.
holding = holding.query('holding > 0.0')

In [10]:
# Name the index 'ts_code' and move it into a regular column of that name.
# NOTE: not idempotent - run this cell once per freshly loaded `holding`.
holding = holding.rename_axis('ts_code').reset_index()

In [75]:
# Distinct fund symbols and distinct stock codes found in the holdings table.
all_funds = holding['symbol'].unique()
all_stocks = holding['ts_code'].unique()

In [76]:
# Map every stock code to its position in `all_stocks`; these integer ids are
# what the random walks and the embedding table are indexed by.
stock_to_id = dict(zip(all_stocks, range(len(all_stocks))))

Build a bipartite stock-fund graph from the fund holdings, using each holding value as the edge weight.

In [77]:
# Undirected graph holding both node kinds; the `bipartite` node attribute
# records whether a node is a fund or a stock.
g = nx.Graph()
for side, members in (('fund', all_funds), ('stock', all_stocks)):
    g.add_nodes_from(members, bipartite=side)

# One (fund symbol, stock code, holding) triple per holdings row; the third
# element is stored as the edge's 'weight' attribute.
edge_table = holding[['symbol', 'ts_code', 'holding']]
w_edges = edge_table.to_records(index=False).tolist()
g.add_weighted_edges_from(w_edges)

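Generate random walks on the fund-stock graph: from a stock, step to one of its funds, then to one of that fund's stocks, choosing each step with probability proportional to the edge weight.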

In [78]:
def choose(g, fund_or_stock):
    """Pick one neighbour of `fund_or_stock` at random, weighted by edge weight.

    `g[fund_or_stock]` must be a mapping from neighbour to an attribute dict
    that contains a 'weight' entry. Each neighbour is drawn with probability
    equal to its weight divided by the sum of all neighbour weights.

    Returns the chosen neighbour as produced by `np.random.choice`.
    """
    candidates = []
    weights = []
    for neighbour, attrs in g[fund_or_stock].items():
        candidates.append(neighbour)
        weights.append(attrs['weight'])

    weights = np.array(weights)
    # Normalise so the weights form a probability distribution.
    return np.random.choice(candidates, p=weights / weights.sum())

In [82]:
# Sample 1000 walks. Each walk starts at a uniformly random stock and takes
# 200 stock -> fund -> stock hops, recording the id of every stock visited
# (so each walk holds 201 ids).
paths = []
for walk_no in range(1000):
    stock = np.random.choice(all_stocks)
    path = [stock_to_id[stock]]
    for step in range(200):
        # Hop to a fund holding the current stock, then on to one of the
        # stocks that fund holds; both hops are weight-proportional.
        stock = choose(g, choose(g, stock))
        path.append(stock_to_id[stock])
    paths.append(path)

In [84]:
# Convert the list of walks to a NumPy array (every walk has the same length:
# the starting stock plus 200 steps).
paths = np.array(paths)

In [92]:
def get_target(path, idx, window_size=5):
    """Return the distinct entries of `path` near position `idx`.

    A radius R is drawn uniformly from 1..window_size (inclusive). The result
    contains the unique values found in the R positions before `idx` and the
    R positions after it, clipped to the bounds of `path`. Position `idx`
    itself is skipped, although its value is still returned if the same value
    also occurs elsewhere inside the window.

    Returns a list; its order is that of the underlying set.
    """
    radius = np.random.randint(1, window_size + 1)
    lo = max(idx - radius, 0)
    hi = idx + radius + 1

    context = set(path[lo:idx])
    context.update(path[idx + 1:hi])
    return list(context)

In [93]:
# Flatten the walks into parallel lists of (input id, target id) pairs: for
# every position in every walk, pair the stock at that position with each of
# its sampled context stocks.
x_batches, y_batches = [], []
for walk in paths:
    for pos, centre in enumerate(walk):
        context = get_target(walk, pos, 5)
        x_batches.extend([centre] * len(context))
        y_batches.extend(context)

In [95]:
# Peek at the first ten (input id, target id) pairs.
x_batches[:10], y_batches[:10]

([613, 613, 613, 75, 75, 224, 224, 224, 224, 224],
 [224, 75, 166, 224, 613, 224, 613, 166, 75, 13])

In [97]:
# Reset TensorFlow's default graph so re-running the cells below does not
# pile new ops on top of the ones created by a previous run.
tf.reset_default_graph()

In [98]:
# Graph inputs: a 1-D batch of input stock ids and a 2-D batch of target ids
# (the training loop feeds the targets as a column, one target per row).
inputs = tf.placeholder(dtype=tf.int32, shape=[None], name='inputs')
labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')

In [99]:
n_stocks = len(all_stocks)
n_embedding = 32  # number of embedding features per stock

# Trainable table with one row per stock id, initialised uniformly in [-1, 1).
initial_table = tf.random_uniform((n_stocks, n_embedding), minval=-1, maxval=1)
embedding = tf.Variable(initial_table)

# Rows of the table selected by the ids in the current input batch.
embed = tf.nn.embedding_lookup(params=embedding, ids=inputs)

In [100]:
# Number of negative classes sampled per batch
n_sampled = 100

# Output-layer parameters: one weight row and one bias per stock.
softmax_w = tf.Variable(tf.truncated_normal((n_stocks, n_embedding), stddev=0.1))
softmax_b = tf.Variable(tf.zeros(n_stocks))

# Per-example sampled-softmax loss: only `n_sampled` sampled classes are
# scored for each batch rather than all `n_stocks` classes.
loss = tf.nn.sampled_softmax_loss(
    weights=softmax_w,
    biases=softmax_b,
    labels=labels,
    inputs=embed,
    num_sampled=n_sampled,
    num_classes=n_stocks,
)

# Mean loss over the batch, minimised with Adam at its default settings.
cost = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer().minimize(cost)

In [103]:
import time

In [107]:
epochs = 2
batch_size = 128
print_every = 100  # number of batches between progress reports

with tf.Session() as sess:
    iteration = 1
    # Accumulate the reported loss under its own name so the `loss` tensor
    # created together with the graph is not overwritten by a float.
    running_loss = 0
    sess.run(tf.global_variables_initializer())

    # Start the timer once. It is restarted only after each report, so every
    # "sec/batch" figure covers exactly `print_every` batches, including the
    # reporting window that spans an epoch boundary.
    start = time.time()
    for e in range(1, epochs+1):
        # Walk through the (input, target) pairs in consecutive slices.
        for idx in range(0, len(x_batches), batch_size):
            x = x_batches[idx:idx+batch_size]
            y = y_batches[idx:idx+batch_size]

            # Targets are fed as a column vector to match the 2-D `labels`
            # placeholder.
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

            running_loss += train_loss

            if iteration % print_every == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(running_loss/print_every),
                      "{:.4f} sec/batch".format((end-start)/print_every))
                running_loss = 0
                start = time.time()

            iteration += 1

    # Scale every embedding row to unit L2 norm and pull the result out of
    # the session as a NumPy array.
    # NOTE(review): later TensorFlow releases spell this argument `keepdims`;
    # confirm against the installed version before changing it.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    embed_mat = sess.run(normalized_embedding)

Epoch 1/2 Iteration: 100 Avg. Training loss: 5.3346 0.0054 sec/batch
Epoch 1/2 Iteration: 200 Avg. Training loss: 5.3439 0.0030 sec/batch
Epoch 1/2 Iteration: 300 Avg. Training loss: 5.2803 0.0030 sec/batch
Epoch 1/2 Iteration: 400 Avg. Training loss: 5.1203 0.0031 sec/batch
Epoch 1/2 Iteration: 500 Avg. Training loss: 5.1350 0.0032 sec/batch
Epoch 1/2 Iteration: 600 Avg. Training loss: 4.9596 0.0030 sec/batch
Epoch 1/2 Iteration: 700 Avg. Training loss: 4.7762 0.0033 sec/batch
Epoch 1/2 Iteration: 800 Avg. Training loss: 4.6565 0.0031 sec/batch
Epoch 1/2 Iteration: 900 Avg. Training loss: 4.4653 0.0030 sec/batch
Epoch 1/2 Iteration: 1000 Avg. Training loss: 4.4613 0.0030 sec/batch
Epoch 1/2 Iteration: 1100 Avg. Training loss: 4.3459 0.0037 sec/batch
Epoch 1/2 Iteration: 1200 Avg. Training loss: 4.3773 0.0034 sec/batch
Epoch 1/2 Iteration: 1300 Avg. Training loss: 4.2550 0.0032 sec/batch
Epoch 1/2 Iteration: 1400 Avg. Training loss: 4.2175 0.0030 sec/batch
Epoch 1/2 Iteration: 1500 Avg

In [111]:
# Pairwise dot products between embedding rows. The rows were scaled to unit
# length in the training cell, so each entry is a cosine similarity.
sims = embed_mat @ embed_mat.T

In [117]:
# Inspect the normalised embedding vector of the stock with id 0.
embed_mat[0]

array([-0.21077634,  0.25182042, -0.24591663, -0.20395333,  0.0882552 ,
        0.05960291,  0.06305382, -0.03513731,  0.03025971, -0.31243148,
       -0.19897626,  0.3326856 , -0.08734539,  0.00362529, -0.07794005,
       -0.00431162,  0.0969455 ,  0.27191335,  0.24857497, -0.09293828,
       -0.06077765,  0.0853738 ,  0.18494187,  0.0061985 ,  0.13451672,
        0.22502415, -0.15815932,  0.27767122,  0.3023177 , -0.02005056,
       -0.22100852, -0.02289582], dtype=float32)