# Test conversions

In [None]:
#@formatter:off
# Enable IPython's autoreload so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
#@formatter:on

In [8]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
from varname.helpers import Wrapper
from src.utils.Config import Config
from src.data.TextDataset import TextDataset
from src.utils.Utils import Utils

# NOTE(review): presumably silences TensorFlow's log output (judging by the name) — confirm in src.utils.Utils.
Utils.tensorflow_shutup()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:

# Minimal configuration for the conversion checks: a small window/stride over the example text.
# TRAINING is passed empty; the printed TrainingConfig below shows the values Config fills in.
# The sections are passed straight to Config so `config` is never bound to a plain dict first.
config = Config(
    DATA={
        "WINDOW_SIZE": 5,
        "STRIDE": 3,
        "DATA_PATH": os.path.join("data", "example.txt"),
    },
    TRAINING={},
)

DataConfig(WINDOW_SIZE=5, STRIDE=3, DATA_PATH='data/example.txt')
TrainingConfig(HIDDEN_STATE_SIZE=[], EPOCHS=0, BATCH_SIZE=0, DROPOUT=0.5, LR=0.001, BUFFER_SIZE=10000, SAVE_DIR='models', PRED_EVERY=5, PRED_LEN=100, PRED_TEMP=0.75)


## Reading Dataset

In [10]:

# Read the example text and build the ID and one-hot datasets used below (dr.dataset_ids, dr.dataset_oh).
# NOTE(review): the first log line prints EMBED=None, yet later cells call config.EMBED.* —
# read() appears to populate config.EMBED as a side effect; confirm in TextDataset.
dr = TextDataset(config, verbosity=2).read()

# [TextDataset]:	Config(DATA=DataConfig(WINDOW_SIZE=5, STRIDE=3, DATA_PATH='data/example.txt'), TRAINING=TrainingConfig(HIDDEN_STATE_SIZE=[], EPOCHS=0, BATCH_SIZE=0, DROPOUT=0.5, LR=0.001, BUFFER_SIZE=10000, SAVE_DIR='models', PRED_EVERY=5, PRED_LEN=100, PRED_TEMP=0.75), EMBED=None)
# [TextDataset - read]:
1. Reading `data/example.txt`...
	19 total chars in text
# [Vocab]:	label=example	encoding=utf-8	verbosity=2
# [Embeddings - init]:	label=example	encoding=utf-8
2. Converting __items__ to ids...
	ids: (len=19 min=1, max=12)
	data: <TensorSliceDataset shapes: (), types: tf.int64>
3. Creating sequences...
	data: <FlatMapDataset shapes: (6,), types: tf.int64>
4. Splitting into inputs and targets...
	dataset_ids: <MapDataset shapes: ((5,), (5,)), types: (tf.int64, tf.int64)>
5. One-hot encoding...
	dataset_oh: <MapDataset shapes: ((5, 13), (5, 13)), types: (tf.int64, tf.int64)>


## Validating Conversions

In [11]:

# This is equivalent to the first sample in the dataset
text_str = Wrapper("hello")

# Grab the first sample from the dataset in both ID and one-hot representations.
# next(iter(...)) raises immediately on an empty dataset instead of silently leaving
# test_list short, and it avoids rebinding `_` (IPython's last-output variable).
# Each dataset element is an (inputs, targets) pair; only the inputs are needed here.
first_ids_sample = next(iter(dr.dataset_ids.take(1)))
text_ids = Wrapper(first_ids_sample[0])

first_oh_sample = next(iter(dr.dataset_oh.take(1)))
text_oh = Wrapper(first_oh_sample[0])

# Order matters: later cells index test_list[0] / [1] / [2] as string / IDs / one-hot.
test_list = [text_str, text_ids, text_oh]


# Making sure that the conversions are correct
def assert_same(x, expected):
    """Assert that ``x`` matches ``expected`` in type, shape and values.

    Args:
        x: Value produced by the conversion under test.
        expected: Reference value to compare against.

    Raises:
        AssertionError: If the types differ, if both values expose a ``shape``
            attribute and the shapes differ, or if any element differs.
    """
    print("-" * 60)
    print("  - Checking Types")
    assert type(x) is type(expected), f"TYPES: {type(x)} != {type(expected)}"
    print("  - Checking Values")
    # An element-wise `==` broadcasts, so a (2,) result would compare equal to a
    # (2, 2) expectation made of repeated rows. Compare shapes first when both
    # values expose one (arrays, tensors); plain strings/lists skip this check.
    x_shape = getattr(x, "shape", None)
    expected_shape = getattr(expected, "shape", None)
    if x_shape is not None and expected_shape is not None:
        assert x_shape == expected_shape, f"SHAPES: {x_shape} != {expected_shape}"
    assert np.all(x == expected), f"VALUES: {x} != {expected}"


def test_conversion_func(func, xvals: list, expected):
    print(f"# Testing {func.__name__}:")
    for i, x in enumerate(xvals):
        print("=" * 80)
        print(f"{i + 1}. {x.name} => {func.__name__}")
        try:
            x_ = func(x.value)
            assert_same(x_, expected)
        except Exception as e:
            print_exception(e, x)
    print(f"OK: {func.__name__} passed")


def print_exception(e, x):
    """Print a short report about a failed conversion, then re-raise the error.

    Args:
        e: The exception that was caught.
        x: The wrapped input that triggered it; its ``.value`` is shown with its type.

    Raises:
        The same exception object ``e`` that was passed in.
    """
    rule = "=" * 80
    print(f"  - ERROR: {e}")
    print(rule)
    print(f"Type: {type(x.value)}")
    print(x.value)
    print(rule)
    raise e


# Display every wrapped test input under its variable name before running the checks.
for position, wrapped in enumerate(test_list, start=1):
    print(f"{position}. {wrapped.name} =")
    print(wrapped)
    print("-" * 60)

1. text_str =
'hello'
------------------------------------------------------------
2. text_ids =
<tf.Tensor: shape=(5,), dtype=int64, numpy=array([6, 5, 7, 7, 8])>
------------------------------------------------------------
3. text_oh =
<tf.Tensor: shape=(5, 13), dtype=int64, numpy=
array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])>
------------------------------------------------------------


## Conversion to String

In [12]:
# Every representation should convert to the plain string held by test_list[0].
# FIXME: this currently fails on the first input — per the output below, the str input is routed
# through to_ids and then raises AttributeError: 'int' object has no attribute 'decode'.
test_conversion_func(config.EMBED.to_string, test_list, test_list[0].value)

# Testing to_string:
1. text_str => to_string
* to_ids -> input is a String; converting to IDs first...
  - ERROR: 'int' object has no attribute 'decode'
Type: <class 'str'>
hello


AttributeError: 'int' object has no attribute 'decode'

## Conversion to IDs

In [None]:
# Every representation should convert to the ID tensor held by test_list[1].
test_conversion_func(config.EMBED.to_ids, test_list, test_list[1].value)

## Conversion to One-Hot

In [None]:
# Every representation should convert to the one-hot tensor held by test_list[2].
test_conversion_func(config.EMBED.to_onehot, test_list, test_list[2].value)