In [4]:
import json
import numpy as np
import os

In [2]:
def build_wiki_relation(market_name, connection_file, tic_wiki_file,
                        sel_path_file):
    # readin tickers
    tickers = np.genfromtxt(tic_wiki_file, dtype=str, delimiter=',',
                            skip_header=False)
    print('#tickers selected:', tickers.shape)
    wikiid_ticind_dic = {}
    for ind, tw in enumerate(tickers):
        if not tw[-1] == 'unknown':
            wikiid_ticind_dic[tw[-1]] = ind
    print('#tickers aligned:', len(wikiid_ticind_dic))

    # readin selected paths/connections
    sel_paths = np.genfromtxt(sel_path_file, dtype=str, delimiter=' ',
                              skip_header=False)
    print('#paths selected:', len(sel_paths))
    sel_paths = set(sel_paths[:, 0])

    # readin connections
    with open(connection_file, 'r') as fin:
        connections = json.load(fin)
    print('#connection items:', len(connections))

    # get occured paths
    occur_paths = set()
    for sou_item, conns in connections.items():
        for tar_item, paths in conns.items():
            for p in paths:
                path_key = '_'.join(p)
                if path_key in sel_paths:
                    occur_paths.add(path_key)

    # generate
    valid_path_index = {}
    for ind, path in enumerate(occur_paths):
        valid_path_index[path] = ind
    print('#valid paths:', len(valid_path_index))
    for path, ind in valid_path_index.items():
        print(path, ind)
    # one_hot_path_embedding = np.identity(len(valid_path_index) + 1, dtype=int)
    wiki_relation_embedding = np.zeros(
        [tickers.shape[0], tickers.shape[0], len(valid_path_index) + 1],
        dtype=int
    )
    conn_count = 0
    for sou_item, conns in connections.items():
        for tar_item, paths in conns.items():
            for p in paths:
                path_key = '_'.join(p)
                if path_key in valid_path_index.keys():
                    aaa = wikiid_ticind_dic[sou_item]
                    bbb = wikiid_ticind_dic[tar_item]
                    ccc = valid_path_index[path_key]
                    wiki_relation_embedding[wikiid_ticind_dic[sou_item]][wikiid_ticind_dic[tar_item]][valid_path_index[path_key]] = 1
                    conn_count += 1
    print('connections count:', conn_count, 'ratio:', conn_count / float(tickers.shape[0] * tickers.shape[0]))

    # handle self relation
    for i in range(tickers.shape[0]):
        wiki_relation_embedding[i][i][-1] = 1
    print(wiki_relation_embedding.shape)
#     np.save(market_name + '_wiki_relation', wiki_relation_embedding)


In [11]:
# single thread version
# path = '/home/kikyo/data/qt/fill_data'
path = '/home/kikyo/data/qt/relation/wikidata'
build_wiki_relation('NASDAQ',
                    os.path.join(path, 'NASDAQ_connections.json'),
                    os.path.join(path, 'NASDAQ_wiki.csv'),
                    os.path.join(path, 'selected_wiki_connections.csv'))
print('----------')
build_wiki_relation('NYSE',
                    os.path.join(path, 'NYSE_connections.json'),
                    os.path.join(path, 'NYSE_wiki.csv'),
                    os.path.join(path, 'selected_wiki_connections.csv'))

#tickers selected: (1026, 2)
#tickers aligned: 512
#paths selected: 57
#connection items: 511
#valid paths: 42
P127_P169 0
P1056_P1056 1
P169_P112 2
P1056_P400 3
P169_P127 4
P452_P31 5
P306_P1056 6
P1056_P31 7
P31_P1056 8
P199_P355 9
P166_P166 10
P3320_P169 11
P355 12
P169_P169 13
P121_P121 14
P749 15
P169_P3320 16
P155 17
P127_P112 18
P156 19
P361_P361 20
P1056_P306 21
P31_P366 22
P1344_P1344 23
P2770_P452 24
P452_P452 25
P112_P112 26
P452_P2770 27
P127 28
P463_P463 29
P112_P127 30
P452_P1056 31
P400_P1056 32
P355_P199 33
P31_P452 34
P355_P355 35
P127_P127 36
P1056_P452 37
P112_P169 38
P3320_P127 39
P366_P31 40
P127_P3320 41
connections count: 2171 ratio: 0.0020623629682827386
(1026, 1026, 43)
----------
#tickers selected: (1737, 2)
#tickers aligned: 597
#paths selected: 57
#connection items: 597
#valid paths: 32
P1056_P1056 0
P155_P355 1
P127_P1830 2
P114_P114 3
P452_P31 4
P166_P166 5
P1056_P121 6
P121_P121 7
P169_P169 8
P1830_P749 9
P355 10
P749 11
P355_P127 12
P749_P1830 13
P127_P3