# Test BasicDataset
---

In [None]:
import time
import json
from tqdm import tqdm
from transformers import InputExample, BertTokenizer
from valerie.datasets import BasicDataset
from valerie.utils import get_logger

In [2]:
# Obtain the project logger from valerie's helper. The `logger` name is not
# referenced by any other visible cell — presumably the call also configures
# log output as a side effect (TODO confirm against valerie.utils.get_logger).
logger = get_logger()

In [3]:
# Parse the phase-1 JSON dump and wrap every record in an InputExample,
# using each record's keys as constructor keyword arguments.
with open("data/phase1/all_examples.json") as fi:
    examples = []
    for record in tqdm(json.load(fi)):
        examples.append(InputExample(**record))

100%|██████████| 101419/101419 [00:00<00:00, 458262.45it/s]


In [4]:
# Display how many examples were loaded.
len(examples)

101419

In [5]:
# Build the tokenizer from the "bert-base-uncased" pretrained name; per the
# log output below, the vocab file is read from the local transformers cache.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

[2020-04-27 13:48:17,961] INFO:transformers.tokenization_utils: loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/jay/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [6]:
# single cpu
# single cpu
# Time feature conversion with BasicDataset's default settings (no nproc).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock, which can be adjusted while
# this multi-minute cell is running.
start = time.perf_counter()
dataset = BasicDataset(examples, tokenizer, label_list=[0,1,2])
print(time.perf_counter() - start)

[2020-04-27 13:48:19,568] INFO:valerie.datasets: ... converting examples to features ...
100%|██████████| 101419/101419 [02:56<00:00, 575.01it/s]

176.5986201763153





In [7]:
# multiprocessing
# multiprocessing
# Time the same feature conversion with nproc=6. Note that this rebinds
# `dataset`, so the cells below operate on this run's features.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock, which can be adjusted mid-run.
start = time.perf_counter()
dataset = BasicDataset(examples, tokenizer, label_list=[0,1,2], nproc=6)
print(time.perf_counter() - start)

[2020-04-27 13:51:16,182] INFO:valerie.datasets: ... converting examples to features ...
100%|██████████| 101419/101419 [00:59<00:00, 1691.50it/s]


62.12821292877197


In [8]:
# tests
# Verify the dataset features against a direct tokenizer call: one feature per
# example, and identical input_ids when each example is encoded on its own with
# padding to the tokenizer's maximum length. The failing index is reported.
n_examples = len(examples)
assert n_examples == len(dataset.features)
for i in tqdm(range(n_examples)):
    example = examples[i]
    expected = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        max_length=tokenizer.max_len,
        pad_to_max_length=True,
    )
    assert dataset.features[i].input_ids == expected.input_ids, i

100%|██████████| 101419/101419 [02:56<00:00, 574.00it/s]


In [11]:
# Write the dataset's features to a cache file (see the log line below); the
# later cell reloads them from this same path.
dataset.save("logs/temp.cache")

[2020-04-27 13:58:24,121] INFO:valerie.datasets: .. saving features to cached file logs/temp.cache


In [12]:
# Report the on-disk size of the cache file (shell command run via IPython's `!`).
!du -sh logs/temp.cache

310M	logs/temp.cache


In [15]:
# Rebuild the dataset from the cache file written above. `examples` is None
# here, so the features presumably come entirely from cached_features_file
# (the log line reports loading from the cache) — TODO confirm in BasicDataset.
# NOTE(review): the loaded features are not compared against `dataset.features`.
from_cache_dataset = BasicDataset(None, tokenizer, label_list=[0,1,2], cached_features_file="logs/temp.cache")

[2020-04-27 14:05:41,256] INFO:valerie.datasets: ... loading features from cached file logs/temp.cache


# heapq.nlargest() returns a sorted result
---

In [None]:
import copy
import heapq
import timeit

import numpy as np

In [None]:
def iterator(l, n=5):
    """Return the ``n`` largest values of ``l`` in descending order.

    Repeatedly locates the current maximum with ``np.argmax`` and removes it
    from a private copy of ``l``, so the caller's sequence is left untouched.

    Args:
        l: Sequence of mutually comparable numbers.
        n: How many values to return. Values above ``len(l)`` are clamped
            instead of raising, as ``heapq.nlargest`` and slicing do.

    Returns:
        A list with at most ``n`` values, largest first.
    """
    # Work on a copy: pop() would otherwise strip the maxima out of the
    # caller's list.
    remaining = list(l)
    # np.argmax raises on an empty sequence, so never pop more than we have.
    count = min(n, len(remaining))
    return [remaining.pop(int(np.argmax(remaining))) for _ in range(count)]
    
def partitioner(l, n=5):
    """Return the ``n`` largest values of ``l`` in descending order.

    Uses ``np.argpartition`` to select the indices of the top ``n`` values and
    then orders only those indices, so the full input is never sorted.

    Args:
        l: Indexable sequence of numbers.
        n: How many values to return. Values above ``len(l)`` are clamped;
            ``n <= 0`` yields an empty list.

    Returns:
        A list with at most ``n`` elements of ``l``, largest first.
    """
    values = np.asarray(l)
    count = min(n, values.size)
    if count <= 0:
        # -0 == 0, so the slice [-0:] would select everything, not nothing.
        return []
    # argpartition only guarantees that the last `count` positions hold the
    # largest values; their relative order is unspecified.
    top = np.argpartition(values, -count)[-count:]
    # Order just the selected indices by value, largest first.
    ordered = top[np.argsort(values[top])[::-1]]
    return [l[i] for i in ordered]
    
def heaper(l, n=5):
    """Return the ``n`` largest items of ``l`` via ``heapq.nlargest``.

    Args:
        l: Iterable of mutually comparable items.
        n: Maximum number of items to return.

    Returns:
        A list of up to ``n`` items, largest first.
    """
    largest = heapq.nlargest(n, l)
    return largest
    
def sorter(l, n=5):
    """Return the first ``n`` items of ``l`` after a full descending sort.

    Args:
        l: Iterable of mutually comparable items; it is not modified.
        n: Slice bound applied to the sorted copy.

    Returns:
        A new list: the descending-sorted copy of ``l`` sliced with ``[:n]``.
    """
    ordered = list(l)
    ordered.sort(reverse=True)
    return ordered[:n]

In [None]:
# 60 uniform random floats used as input for the top-n comparison.
# A fixed seed makes the equality check (and any failure it reports)
# reproducible; a local RandomState leaves NumPy's global random state alone.
l = list(np.random.RandomState(0).rand(60))

In [None]:
# Every strategy receives its own shallow copy of the input and must return
# exactly the same list as the others.
top_by_iterator = iterator(copy.copy(l))
top_by_partition = partitioner(copy.copy(l))
top_by_heap = heaper(copy.copy(l))
top_by_sort = sorter(copy.copy(l))
assert top_by_iterator == top_by_partition == top_by_heap == top_by_sort

In [None]:
# Benchmark four ways of taking the n largest values of a 60-element list.
# The setup seeds its generator so every candidate is timed on the same data.
setup = "import numpy as np; import heapq; l = list(np.random.RandomState(0).rand(60)); n=5"
# A single execution is dominated by call overhead and timer noise, so each
# statement is repeated; the printed value is the total over `number` runs.
number = 1000

statements = {
    # pop() consumes the list, so this variant works on a copy to stay
    # repeatable; the (small) cost of the copy is included in its time.
    "argmax+pop": "c = list(l); [c.pop(np.argmax(c)) for _ in range(n)]",
    # argpartition leaves the top-n slice unordered, so the selected indices
    # are sorted to produce the same descending result as the other variants.
    "argpartition": (
        "a = np.asarray(l); top = np.argpartition(a, -n)[-n:]; "
        "[l[i] for i in top[np.argsort(a[top])[::-1]]]"
    ),
    "heapq.nlargest": "heapq.nlargest(n, l)",
    "sorted": "sorted(l, reverse=True)[:n]",
}

timings = {}
for label, stmt in statements.items():
    timings[label] = timeit.timeit(stmt, setup=setup, number=number)
    print(label, timings[label])

# scipy cosine vs sklearn cosine
---

In [None]:
import timeit

# Both benchmarks use the same data recipe: 60 pairs of 400-dimensional random
# vectors. The generator is seeded so sklearn and scipy are timed on identical
# inputs; setup2 stays available as a name but simply aliases setup1.
setup1 = (
    "import numpy as np; rng = np.random.RandomState(0); "
    "arrs1 = [rng.rand(400) for _ in range(60)];"
    "arrs2 = [rng.rand(400) for _ in range(60)]"
)
setup2 = setup1

import1 = "from sklearn.metrics.pairwise import cosine_similarity"
# cosine_similarity returns a (1, 1) array here. Index its single entry
# explicitly: calling float() on an array with ndim > 0 is deprecated since
# NumPy 1.25.
stmt1 = "[float(cosine_similarity(arr1.reshape(1,-1), arr2.reshape(1,-1))[0, 0]) for arr1, arr2 in zip(arrs1, arrs2)]"

import2 = "from scipy.spatial.distance import cosine"
# scipy's cosine() is a distance, so similarity is 1 - distance.
stmt2 = "[float(1 - cosine(arr1, arr2)) for arr1, arr2 in zip(arrs1, arrs2)]"

# Each printed value is the total time for 1000 passes over all 60 pairs.
print("sklearn: ", timeit.timeit(stmt1, setup=import1 + ";" + setup1, number=1000))
print("scipy:   ", timeit.timeit(stmt2, setup=import2 + ";" + setup2, number=1000))