In [41]:
import json
import numpy as np
import h5py

In [47]:
# Use tensorflow 1 behavior to match the Universal Sentence Encoder
# examples (https://tfhub.dev/google/universal-sentence-encoder/2).
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

In [56]:
# --- Input -----------------------------------------------------------
# Line-delimited JSON file; each line is parsed as one document.
DATA_FILE = 'posts.json'

# --- Output ----------------------------------------------------------
# HDF5 file read back and rewritten by the dataset-creation cells.
FILE_NAME = 'stackoverflow-512-angular.hdf5'

# --- Embedding parameters --------------------------------------------
# Width of each embedding vector (second axis of the HDF5 dataset).
dim = 512
# Number of titles embedded and appended to the dataset per batch.
BATCH_SIZE = 1000

In [57]:
def write_batch(dataset, titles):
    """Embed a batch of titles and append the vectors to ``dataset``.

    Relies on the module-level ``session``, ``embeddings`` and ``text_ph``
    objects created in the embedding cell.

    Args:
        dataset: Resizable dataset (growable along axis 0) that receives
            one row per title.
        titles: Sequence of title strings to embed.

    Returns:
        None. ``dataset`` is grown in place by ``len(titles)`` rows.
    """
    size = len(titles)
    if size == 0:
        # Nothing to append. Without this guard a negative-offset slice
        # ``dataset[-0:]`` would address the *entire* dataset rather than
        # an empty tail, and the encoder would be run for no reason.
        return

    title_vectors = session.run(embeddings, feed_dict={text_ph: titles})

    # Address the new rows by explicit start/end offsets so the write can
    # never spill onto rows that were stored by earlier batches.
    start = dataset.shape[0]
    dataset.resize(start + size, axis=0)
    dataset[start:start + size] = title_vectors

In [58]:
# Build the graph: a string placeholder fed through the Universal Sentence
# Encoder module loaded from TensorFlow Hub.
print("Downloading pre-trained embeddings from tensorflow hub...")
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
text_ph = tf.placeholder(tf.string)
embeddings = embed(text_ph)
print("Done.")

print("Creating tensorflow session...")
session = tf.Session()
try:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    print("Done.")

    # NOTE(review): this cell writes to "temp-file.hdf5" while the
    # dataset-creation cell reads FILE_NAME -- confirm which path is intended.
    with open(DATA_FILE) as data_file, h5py.File("temp-file.hdf5", 'w') as h5_file:
        titles = []
        count = 0

        # The encoder emits floating-point vectors, so the dataset must use a
        # float dtype; an integer dtype would truncate every component on
        # write. The dataset starts empty and grows one batch at a time.
        dataset = h5_file.create_dataset(
            'all', (0, dim), maxshape=(None, dim), dtype='f4',
            chunks=(BATCH_SIZE, dim))

        for line in data_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # crashing in json.loads.
                continue

            doc = json.loads(line)
            if doc["type"] != "question":
                continue

            titles.append(doc["title"])

            count += 1
            if count % BATCH_SIZE == 0:
                # A full batch has accumulated: embed it and flush to disk.
                write_batch(dataset, titles)
                titles = []

                print("Wrote {} documents.".format(count))

        if titles:
            # Flush the final, partially filled batch.
            write_batch(dataset, titles)
            print("Wrote {} documents.".format(count))

        print("Done writing.")
finally:
    # Release the session even when embedding or writing fails or the cell
    # is interrupted.
    print("Closing tensorflow session...")
    session.close()
    print("Done.")

Downloading pre-trained embeddings from tensorflow hub...
Done.
Creating tensorflow session...


KeyboardInterrupt: 

In [62]:
def train_test_split(X, test_size=10000):
    """Split ``X`` into train and test parts using scikit-learn.

    Thin wrapper around ``sklearn.model_selection.train_test_split`` that
    logs the shape of ``X`` and pins ``random_state`` to 1.

    Args:
        X: Two-dimensional array-like exposing a ``shape`` attribute.
        test_size: Forwarded to scikit-learn as ``test_size``.

    Returns:
        Whatever ``sklearn.model_selection.train_test_split`` returns for a
        single input (the train part followed by the test part).
    """
    # Imported lazily and aliased so it is not confused with this wrapper.
    from sklearn.model_selection import train_test_split as _sk_split

    banner = 'Splitting %d*%d into train/test' % X.shape
    print(banner)

    parts = _sk_split(X, test_size=test_size, random_state=1)
    return parts

In [68]:
# Everything below this line is related to creating datasets
# You probably never need to do this at home,
# just rely on the prepared datasets at http://ann-benchmarks.com

def write_output(train, test, fn, distance, point_type='float', count=100):
    """Write an ANN benchmark dataset with brute-force ground truth.

    Creates (or overwrites) the HDF5 file ``fn`` with four datasets:
    ``train``, ``test``, ``neighbors`` and ``distances``. For every test
    vector the ``count`` results returned by ``BruteForceBLAS`` are stored
    in order of ascending distance.

    Args:
        train: 2-D array of training vectors (must expose ``shape``/``dtype``).
        test: 2-D array of query vectors (must expose ``shape``/``dtype``).
        fn: Path of the HDF5 file to write.
        distance: Distance metric name; stored as a file attribute and
            passed to ``BruteForceBLAS``.
        point_type: Stored as the ``point_type`` file attribute.
        count: Number of neighbours recorded per query.

    Returns:
        None.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)

    # The context manager guarantees the file is closed even if the
    # brute-force search fails part-way through.
    with h5py.File(fn, 'w') as f:
        f.attrs['distance'] = distance
        f.attrs['point_type'] = point_type

        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
        f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
        neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
        distances = f.create_dataset('distances', (len(test), count), dtype='f')

        bf = BruteForceBLAS(distance, precision=train.dtype)
        bf.fit(train)

        total = test.shape[0]
        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            # Each result is an (index, distance) pair; order by distance.
            res = sorted(bf.query_with_distances(x, count),
                         key=lambda pair: pair[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]




In [70]:
# Read the 'all' dataset fully into memory and split it. The context manager
# closes the input file even if the split raises.
with h5py.File(FILE_NAME, 'r') as f:
    dataset = f['all']
    print(dataset.shape)
    X_train, X_test = train_test_split(np.array(dataset))

# write_output opens FILE_NAME with mode 'w', so the input file must already
# be closed and its contents held in memory before this call.
write_output(np.asarray(X_train), np.asarray(X_test), FILE_NAME, 'angular')

(18848, 512)
Splitting 18848*512 into train/test


ModuleNotFoundError: No module named 'ann_benchmarks'