From d18592ff6f8b1b9049176c8f326c15e74f2f7e77 Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Sun, 16 Feb 2020 19:24:56 +0800 Subject: [PATCH 1/7] checked the compatibility with PyTorch 1.4.0: still need some bug fix from upstream --- .coveragerc | 1 + .travis.yml | 6 +- README.md | 6 + tensorkit/__init__.py | 3 +- tensorkit/backend/{losses.py => optim.py} | 6 +- tensorkit/backend/pytorch_/core.py | 42 +- tensorkit/backend/pytorch_/flows.py | 132 ++-- tensorkit/backend/pytorch_/init.py | 4 +- tensorkit/backend/pytorch_/layers.py | 266 ++++--- tensorkit/backend/pytorch_/losses.py | 19 - tensorkit/backend/pytorch_/nn.py | 2 + tensorkit/backend/pytorch_/optim.py | 131 ++++ tensorkit/backend/pytorch_/train.py | 91 +++ tensorkit/backend/pytorch_/utils.py | 182 +++++ tensorkit/backend/train.py | 9 + tensorkit/backend/utils.py | 9 + tensorkit/distributions/flow.py | 20 +- tensorkit/examples/.gitignore | 1 + .../losses => tensorkit/examples}/__init__.py | 0 tensorkit/examples/classification/__init__.py | 0 tensorkit/examples/classification/mnist.py | 99 +++ .../examples/classification/mnist_resnet.py | 104 +++ tensorkit/examples/utils/__init__.py | 3 + tensorkit/examples/utils/fit_model_.py | 39 ++ tensorkit/examples/utils/ops.py | 9 + tensorkit/examples/utils/prepare_data.py | 69 ++ tensorkit/flows/act_norm.py | 59 +- tensorkit/flows/coupling.py | 28 +- tensorkit/flows/rearrangement.py | 12 +- tensorkit/flows/reshape_.py | 72 +- tensorkit/flows/split_.py | 52 +- tensorkit/init/std_data_init.py | 5 +- tensorkit/layers/__init__.py | 1 + tensorkit/layers/activation.py | 26 +- tensorkit/layers/builder.py | 651 ++++++++++++++++++ tensorkit/layers/contextual.py | 33 +- tensorkit/layers/flow_layer.py | 10 +- tensorkit/layers/gated.py | 5 +- tensorkit/layers/pixelcnn.py | 63 +- tensorkit/layers/pool.py | 20 +- tensorkit/layers/resnet.py | 20 +- tensorkit/layers/shape_.py | 33 +- tensorkit/layers/split_.py | 4 +- tensorkit/losses/core.py | 4 - tensorkit/optim/__init__.py | 2 + tensorkit/optim/core.py | 4 + tensorkit/optim/lr_scheduler.py | 71 ++ tensorkit/tensor/__init__.py | 2 +- tensorkit/tensor/utils.py | 4 + tensorkit/{losses => train}/__init__.py | 0 tensorkit/train/core.py | 4 + tensorkit/utils/__init__.py | 2 + tensorkit/utils/data_utils.py | 97 +++ tensorkit/utils/tensor_stream.py | 48 ++ tests/distributions/test_flow.py | 28 +- tests/flows/test_core.py | 120 ++-- tests/flows/test_coupling.py | 2 +- tests/flows/test_shape_.py | 4 +- tests/flows/test_split_.py | 28 +- tests/init/test_core.py | 2 +- tests/layers/test_contextual.py | 58 -- tests/layers/test_core.py | 39 +- tests/layers/test_flow_layer.py | 14 +- tests/layers/test_pixelcnn.py | 6 +- tests/layers/test_resnet.py | 10 +- tests/losses/test_core.py | 31 - tests/ops.py | 2 +- tests/tensor/test_core.py | 32 +- tests/tensor/test_nn.py | 6 + tests/tensor/test_utils.py | 119 ++++ tests/train/__init__.py | 0 tests/train/test_core.py | 67 ++ 72 files changed, 2472 insertions(+), 681 deletions(-) create mode 100644 README.md rename tensorkit/backend/{losses.py => optim.py} (59%) delete mode 100644 tensorkit/backend/pytorch_/losses.py create mode 100644 tensorkit/backend/pytorch_/optim.py create mode 100644 tensorkit/backend/pytorch_/train.py create mode 100644 tensorkit/backend/pytorch_/utils.py create mode 100644 tensorkit/backend/train.py create mode 100644 tensorkit/backend/utils.py create mode 100644 tensorkit/examples/.gitignore rename {tests/losses => tensorkit/examples}/__init__.py (100%) create mode 100644 
tensorkit/examples/classification/__init__.py create mode 100644 tensorkit/examples/classification/mnist.py create mode 100644 tensorkit/examples/classification/mnist_resnet.py create mode 100644 tensorkit/examples/utils/__init__.py create mode 100644 tensorkit/examples/utils/fit_model_.py create mode 100644 tensorkit/examples/utils/ops.py create mode 100644 tensorkit/examples/utils/prepare_data.py create mode 100644 tensorkit/layers/builder.py delete mode 100644 tensorkit/losses/core.py create mode 100644 tensorkit/optim/__init__.py create mode 100644 tensorkit/optim/core.py create mode 100644 tensorkit/optim/lr_scheduler.py create mode 100644 tensorkit/tensor/utils.py rename tensorkit/{losses => train}/__init__.py (100%) create mode 100644 tensorkit/train/core.py create mode 100644 tensorkit/utils/__init__.py create mode 100644 tensorkit/utils/data_utils.py create mode 100644 tensorkit/utils/tensor_stream.py delete mode 100644 tests/losses/test_core.py create mode 100644 tests/tensor/test_utils.py create mode 100644 tests/train/__init__.py create mode 100644 tests/train/test_core.py diff --git a/.coveragerc b/.coveragerc index e21aa69..33df70b 100644 --- a/.coveragerc +++ b/.coveragerc @@ -15,6 +15,7 @@ ignore_errors = True omit = tests/* scripts/* + tensorkit/examples/* tensorkit/backend/pytorch_/_make_dtypes_mapper.py setup.py *.pyt diff --git a/.travis.yml b/.travis.yml index e7c24ef..c17b9d8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,9 +8,13 @@ install: - pip install --upgrade coverage coveralls - pip install -r requirements-dev.txt script: - # run tests with PyTorch + # run tests with PyTorch 1.3.1 - pip install torch==1.3.1 - TENSORKIT_BACKEND=PyTorch TENSORKIT_DISABLE_JIT=true coverage run -a -m pytest - TENSORKIT_BACKEND=PyTorch TENSORKIT_DISABLE_JIT=false coverage run -a -m pytest +# # run tests with PyTorch 1.4.0 +# - pip install torch==1.4.0 +# - TENSORKIT_BACKEND=PyTorch TENSORKIT_DISABLE_JIT=true coverage run -a -m pytest +# - TENSORKIT_BACKEND=PyTorch TENSORKIT_DISABLE_JIT=false coverage run -a -m pytest after_success: - coveralls diff --git a/README.md b/README.md new file mode 100644 index 0000000..70cf60a --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +## TensorKit + +### Requirements + +* PyTorch >= 1.4.0 + diff --git a/tensorkit/__init__.py b/tensorkit/__init__.py index 6fdcd0e..90191fb 100644 --- a/tensorkit/__init__.py +++ b/tensorkit/__init__.py @@ -1,7 +1,8 @@ __version__ = '0.0.1' -from . import backend, distributions, flows, init, layers, losses, variational +from . 
import (backend, distributions, flows, init, layers, optim, train, + utils, variational) from .bayes import * # from .distributions import * # from .layers import * diff --git a/tensorkit/backend/losses.py b/tensorkit/backend/optim.py similarity index 59% rename from tensorkit/backend/losses.py rename to tensorkit/backend/optim.py index 0ae2540..9f5958d 100644 --- a/tensorkit/backend/losses.py +++ b/tensorkit/backend/optim.py @@ -1,9 +1,9 @@ from ..settings_ import settings if settings.backend == 'PyTorch': - from .pytorch_ import losses - from .pytorch_.losses import * + from .pytorch_ import optim + from .pytorch_.optim import * else: RuntimeError(f'Backend {settings.backend} not supported.') -__all__ = losses.__all__ +__all__ = optim.__all__ diff --git a/tensorkit/backend/pytorch_/core.py b/tensorkit/backend/pytorch_/core.py index 68318ad..8c01417 100644 --- a/tensorkit/backend/pytorch_/core.py +++ b/tensorkit/backend/pytorch_/core.py @@ -59,7 +59,7 @@ # reduce operators 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', - 'log_sum_exp', 'log_mean_exp', + 'argmax', 'argmin', 'log_sum_exp', 'log_mean_exp', # 'all', 'any', 'calculate_mean_and_var', 'norm_except_axis', @@ -1108,6 +1108,16 @@ def reduce_min(input: Tensor, return input +@jit +def argmax(input: Tensor, axis: int, keepdims: bool = False) -> Tensor: + return torch.argmax(input, dim=axis, keepdim=keepdims) + + +@jit +def argmin(input: Tensor, axis: int, keepdims: bool = False) -> Tensor: + return torch.argmin(input, dim=axis, keepdim=keepdims) + + @jit def log_sum_exp(input: Tensor, axis: Optional[List[int]] = None, @@ -1363,33 +1373,47 @@ def matrix_inverse(matrix: Tensor) -> Tensor: # ---- gradient utilities ---- -if settings.disable_jit: +if settings.disable_jit or not torch.__version__.startswith('1.3.'): + @jit def grad(outputs: List[Tensor], inputs: List[Tensor], grad_outputs: Optional[List[Optional[Tensor]]] = None, - keep_graph: Optional[bool] = None, + retain_graph: Optional[bool] = None, create_graph: bool = False, allow_unused: bool = False) -> List[Optional[Tensor]]: - return list( + grad_outs = list( torch.autograd.grad( outputs=outputs, inputs=inputs, grad_outputs=grad_outputs, - retain_graph=keep_graph, + retain_graph=retain_graph, create_graph=create_graph, allow_unused=allow_unused, ) ) + if not allow_unused: + for i in range(len(grad_outs)): + if grad_outs[i] is None: + raise RuntimeError( + 'One of the differentiated Tensors ' + 'appears to not have been used in the graph. ' + 'Set allow_unused=True if this is the desired ' + 'behavior.' 
+ ) + + return grad_outs + + + def is_null_grad(origin: Tensor, grad: Optional[Tensor]) -> bool: + return grad is None - def is_null_grad(origin: Tensor, gradient: Optional[Tensor]) -> bool: - return gradient is None else: @jit def grad(outputs: List[Tensor], inputs: List[Tensor], grad_outputs: Optional[List[Optional[Tensor]]] = None, - keep_graph: Optional[bool] = None, + retain_graph: Optional[bool] = None, create_graph: bool = False, allow_unused: bool = False) -> List[Tensor]: grad_outs = list( @@ -1397,7 +1421,7 @@ def grad(outputs: List[Tensor], outputs=outputs, inputs=inputs, grad_outputs=grad_outputs, - keep_graph=keep_graph, + keep_graph=retain_graph, create_graph=create_graph, allow_unused=allow_unused, ) diff --git a/tensorkit/backend/pytorch_/flows.py b/tensorkit/backend/pytorch_/flows.py index 3f5892b..216bc72 100644 --- a/tensorkit/backend/pytorch_/flows.py +++ b/tensorkit/backend/pytorch_/flows.py @@ -13,17 +13,17 @@ from .nn import * __all__ = [ - 'BaseFlow', 'FeatureMappingFlow', + 'Flow', 'FeatureMappingFlow', 'InverseFlow', 'SequentialFlow', 'LooseInvertibleMatrix', 'StrictInvertibleMatrix', 'InvertibleDense', 'InvertibleConv1d', 'InvertibleConv2d', 'InvertibleConv3d', - 'BaseScale', 'SigmoidScale', 'ExpScale', 'LinearScale', + 'Scale', 'SigmoidScale', 'ExpScale', 'LinearScale', ] # ---- base flow classes ---- -class BaseFlow(BaseLayer): +class Flow(BaseLayer): """ Base class for normalizing flows. @@ -61,7 +61,19 @@ def __init__(self, self.y_event_ndims = int(y_event_ndims) self.explicitly_invertible = bool(explicitly_invertible) - def invert(self) -> 'BaseFlow': + @jit_method + def get_x_event_ndims(self) -> int: + return self.x_event_ndims + + @jit_method + def get_y_event_ndims(self) -> int: + return self.y_event_ndims + + @jit_method + def is_explicitly_invertible(self) -> bool: + return self.explicitly_invertible + + def invert(self) -> 'Flow': """ Get the inverse flow from this flow. 
@@ -78,12 +90,12 @@ def invert(self) -> 'BaseFlow': """ return InverseFlow(self) - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: raise NotImplementedError() def forward(self, @@ -133,7 +145,7 @@ def forward(self, ) # compute the transformed output and log-det - output, output_log_det = self._forward( + output, output_log_det = self._transform( input, input_log_det, inverse, compute_log_det) if output_log_det is not None: @@ -150,10 +162,10 @@ def forward(self, return output, output_log_det -class FeatureMappingFlow(BaseFlow): +class FeatureMappingFlow(Flow): """Base class for flows mapping input features to output features.""" - __constants__ = BaseFlow.__constants__ + ('axis',) + __constants__ = Flow.__constants__ + ('axis',) axis: int """The feature axis (negative index).""" @@ -194,44 +206,48 @@ def __init__(self, explicitly_invertible=explicitly_invertible) self.axis = axis - @property - def event_ndims(self) -> int: + @jit_method + def get_axis(self) -> int: + return self.axis + + @jit_method + def get_event_ndims(self) -> int: """Get the number of event dimensions in both `x` and `y`.""" return self.x_event_ndims # ---- composite flows ---- -class InverseFlow(BaseFlow): +class InverseFlow(Flow): """A flow that inverts another given flow.""" - __constants__ = BaseFlow.__constants__ + ('original_flow',) + __constants__ = Flow.__constants__ + ('original_flow',) original_flow: Module """The original flow, which is inverted by this :class:`InverseFlow`.""" def __init__(self, flow: Module): - if (not isinstance(flow, BaseFlow) and not is_jit_layer(flow)) or \ - not flow.explicitly_invertible: + if (not isinstance(flow, Flow) and not is_jit_layer(flow)) or \ + not flow.is_explicitly_invertible(): raise TypeError( f'`flow` must be an explicitly invertible flow: ' f'got {flow!r}' ) super().__init__( - x_event_ndims=flow.y_event_ndims, - y_event_ndims=flow.x_event_ndims, - explicitly_invertible=flow.explicitly_invertible, + x_event_ndims=flow.get_y_event_ndims(), + y_event_ndims=flow.get_x_event_ndims(), + explicitly_invertible=flow.is_explicitly_invertible(), ) self.original_flow = flow - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return self.original_flow - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: return self.original_flow( input, input_log_det, not inverse, compute_log_det) @@ -247,9 +263,9 @@ def forward(self, raise RuntimeError('Not an explicitly invertible flow.') -class SequentialFlow(BaseFlow): +class SequentialFlow(Flow): - __constants__ = BaseFlow.__constants__ + ('_chain', '_inverse_chain') + __constants__ = Flow.__constants__ + ('_chain', '_inverse_chain') _chain: ModuleList @@ -267,22 +283,22 @@ def __init__(self, *flows: Union[Module, Sequence[Module]]): raise ValueError('`flows` must not be empty.') for i, flow in enumerate(flows): - if not isinstance(flow, BaseFlow) and not is_jit_layer(flow): + if not isinstance(flow, Flow) and not is_jit_layer(flow): raise TypeError(f'`flows[{i}]` is not a flow: got {flow!r}') for i, (flow1, flow2) in 
enumerate(zip(flows[:-1], flows[1:])): - if flow2.x_event_ndims != flow1.y_event_ndims: + if flow2.get_x_event_ndims() != flow1.get_y_event_ndims(): raise ValueError( f'`x_event_ndims` of `flows[{i + 1}]` != ' f'`y_event_ndims` of `flows[{i}]`: ' - f'{flow2.x_event_ndims} vs {flow1.y_event_ndims}.' + f'{flow2.get_x_event_ndims()} vs {flow1.get_y_event_ndims()}.' ) super().__init__( - x_event_ndims=flows[0].x_event_ndims, - y_event_ndims=flows[-1].y_event_ndims, + x_event_ndims=flows[0].get_x_event_ndims(), + y_event_ndims=flows[-1].get_y_event_ndims(), explicitly_invertible=all( - flow.explicitly_invertible for flow in flows) + flow.is_explicitly_invertible() for flow in flows) ) self._chain = ModuleList(flows) @@ -291,12 +307,12 @@ def __init__(self, *flows: Union[Module, Sequence[Module]]): else: self._inverse_chain = ModuleList([_NotInvertibleFlow()]) - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: output, output_log_det = input, input_log_det if inverse: @@ -504,16 +520,16 @@ def __init__(self, def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _linear_transform(self, input: Tensor, weight: Tensor) -> Tensor: + def _affine_transform(self, input: Tensor, weight: Tensor) -> Tensor: raise NotImplementedError() @jit_method - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: # obtain the weight weight, log_det = self.invertible_matrix( inverse=inverse, compute_log_det=compute_log_det) @@ -522,7 +538,7 @@ def _forward(self, # compute the output output, front_shape = flatten_to_ndims(input, spatial_ndims + 2) - output = self._linear_transform(output, weight) + output = self._affine_transform(output, weight) output = unflatten_from_ndims(output, front_shape) # compute the log_det @@ -545,7 +561,7 @@ def _get_spatial_ndims(self) -> int: return 0 @jit_method - def _linear_transform(self, input: Tensor, weight: Tensor) -> Tensor: + def _affine_transform(self, input: Tensor, weight: Tensor) -> Tensor: return torch.nn.functional.linear(input, weight) @@ -556,7 +572,7 @@ def _get_spatial_ndims(self) -> int: return 1 @jit_method - def _linear_transform(self, input: Tensor, weight: Tensor) -> Tensor: + def _affine_transform(self, input: Tensor, weight: Tensor) -> Tensor: return torch.nn.functional.conv1d(input, weight) @@ -567,7 +583,7 @@ def _get_spatial_ndims(self) -> int: return 2 @jit_method - def _linear_transform(self, input: Tensor, weight: Tensor) -> Tensor: + def _affine_transform(self, input: Tensor, weight: Tensor) -> Tensor: return torch.nn.functional.conv2d(input, weight) @@ -578,12 +594,12 @@ def _get_spatial_ndims(self) -> int: return 3 @jit_method - def _linear_transform(self, input: Tensor, weight: Tensor) -> Tensor: + def _affine_transform(self, input: Tensor, weight: Tensor) -> Tensor: return torch.nn.functional.conv3d(input, weight) # ---- scale modules, for transforming input to output by a scale ---- -class BaseScale(BaseLayer): +class Scale(BaseLayer): """Base class for scaling `input`.""" def _scale_and_log_scale(self, @@ -669,7 +685,7 @@ def 
forward(self, return output, output_log_det -class ExpScale(BaseScale): +class ExpScale(Scale): """ Scaling `input` with `exp` activation. @@ -701,7 +717,7 @@ def _scale_and_log_scale(self, return scale, log_scale -class SigmoidScale(BaseScale): +class SigmoidScale(Scale): """ Scaling `input` with `sigmoid` activation. @@ -745,7 +761,7 @@ def _scale_and_log_scale(self, return scale, log_scale -class LinearScale(BaseScale): +class LinearScale(Scale): """ Scaling `input` with `linear` activation. diff --git a/tensorkit/backend/pytorch_/init.py b/tensorkit/backend/pytorch_/init.py index d6bb26b..f7a144d 100644 --- a/tensorkit/backend/pytorch_/init.py +++ b/tensorkit/backend/pytorch_/init.py @@ -271,11 +271,11 @@ def register(self, layer: Module, initialized: bool = False) -> None: _ = DataDependentInitializerForwardPreHook( self, layer, initialized=initialized) - def _forward(self, layer: Module, inputs: List[Tensor]) -> None: + def _init(self, layer: Module, inputs: List[Tensor]) -> None: raise NotImplementedError() def __call__(self, layer: Module, inputs: List[Tensor]) -> None: - self._forward(layer, list(inputs)) + self._init(layer, list(inputs)) def __repr__(self) -> str: buf = [] diff --git a/tensorkit/backend/pytorch_/layers.py b/tensorkit/backend/pytorch_/layers.py index 96dc9b8..53fdc72 100644 --- a/tensorkit/backend/pytorch_/layers.py +++ b/tensorkit/backend/pytorch_/layers.py @@ -14,12 +14,12 @@ 'DEFAULT_GATE_BIAS', 'DEFAULT_WEIGHT_INIT', 'DEFAULT_BIAS_INIT', # utils - 'add_parameter', 'get_parameter', 'get_parameters', - 'add_buffer', 'get_buffer', 'get_buffers', + 'add_parameter', 'get_parameter', 'get_parameters', 'get_named_parameters', + 'add_buffer', 'get_buffer', 'get_buffers', 'get_named_buffers', 'set_train_mode', # parameter store modules - 'BaseParamStore', 'SimpleParamStore', + 'ParamStore', 'SimpleParamStore', 'NormedWeightStore', 'NormedAndScaledWeightStore', 'get_weight_store', 'get_bias_store', @@ -27,10 +27,7 @@ 'Identity', # base layers and composition layers - 'BaseLayer', 'BaseSingleVariateLayer', 'BaseMultiVariateLayer', - 'BaseSplitLayer', 'BaseMergeLayer', - 'ModuleList', 'Sequential', - 'BaseContextualLayer', 'BaseMultiVariateContextualLayer', + 'BaseLayer', 'ModuleList', 'Sequential', # linear layers 'CoreLinear', 'Linear', @@ -70,7 +67,12 @@ def get_parameter(layer: Module, name: str) -> Optional[Variable]: def get_parameters(layer: Module, recursive: bool = True - ) -> Iterator[Tuple[str, Variable]]: + ) -> Iterator[Variable]: + return layer.parameters(recurse=recursive) + + +def get_named_parameters(layer: Module, recursive: bool = True + ) -> Iterator[Tuple[str, Variable]]: return layer.named_parameters(recurse=recursive) @@ -87,7 +89,12 @@ def get_buffer(layer: Module, name: str) -> Optional[Tensor]: def get_buffers(layer: Module, recursive: bool = True - ) -> Iterator[Tuple[str, Tensor]]: + ) -> Iterator[Tensor]: + return layer.buffers(recurse=recursive) + + +def get_named_buffers(layer: Module, recursive: bool = True + ) -> Iterator[Tuple[str, Tensor]]: return layer.named_buffers(recurse=recursive) @@ -97,7 +104,16 @@ def set_train_mode(layer: Module, training: bool = True): # ---- weight wrapper: a simple weight, or a normed weight ---- -class BaseParamStore(Module): +class _NullParamStore(Module): + # This module is actually not used in any context. + # It is just a place-holder module, to gain JIT support. 
+ + def forward(self) -> Tensor: # pragma: no cover + zero_shape: List[int] = [] + return torch.zeros(zero_shape, dtype=torch.float32) + + +class ParamStore(Module): """ Base class for a component that stores a trainable parameter, or a set of trainable parameters that can be used to derive @@ -125,7 +141,7 @@ def set(self, value: TensorOrData) -> None: raise NotImplementedError() -class SimpleParamStore(BaseParamStore): +class SimpleParamStore(ParamStore): """A module that carries a direct variable as the parameter.""" def __init__(self, @@ -165,10 +181,10 @@ def weight_norm_decompose(weight: Tensor, return v, v_norm -class NormedWeightStore(BaseParamStore): +class NormedWeightStore(ParamStore): """A module that carries the weight-normed `weight`, without `g`.""" - __constants__ = BaseParamStore.__constants__ + ('feature_axis', 'epsilon') + __constants__ = ParamStore.__constants__ + ('feature_axis', 'epsilon') norm_axis: int epsilon: float @@ -202,10 +218,10 @@ def set(self, value: TensorOrData) -> None: assign_data(self.v, v) -class NormedAndScaledWeightStore(BaseParamStore): +class NormedAndScaledWeightStore(ParamStore): """A module that carries the weight-normed `weight`, with `v` and `g`.""" - __constants__ = BaseParamStore.__constants__ + ('feature_axis', 'epsilon') + __constants__ = ParamStore.__constants__ + ('feature_axis', 'epsilon') norm_axis: int epsilon: float @@ -245,7 +261,7 @@ def get_weight_store(shape: List[int], initializer: TensorInitArgType = DEFAULT_WEIGHT_INIT, norm_axis: int = 1, weight_norm: WeightNormArgType = False - ) -> BaseParamStore: + ) -> ParamStore: """ Create a module which carries the `weight` parameter. @@ -275,7 +291,7 @@ def get_weight_store(shape: List[int], def get_bias_store(shape: List[int], initializer: TensorInitArgType = DEFAULT_BIAS_INIT, use_bias: bool = True - ) -> Optional[BaseParamStore]: + ) -> Optional[ParamStore]: """ Create a module that carries the `bias` parameter. @@ -300,7 +316,26 @@ def forward(self, input: Tensor) -> Tensor: # ---- base layers and composition layers ---- -class BaseLayer(Module): +class BaseLayerMeta(type): + + def __new__(cls, name, parents, dct): + if torch.__version__ == '1.4.0': + # strange bug, that PyTorch 1.4.0 does not support annotations + # with type `Module` or `ModuleList` + if '__annotations__' in dct: + annotations = dct['__annotations__'] + annotation_keys = list(annotations) + for attr in annotation_keys: + if annotations[attr] in (Module, ModuleList): + annotations.pop(attr) + + return super().__new__(cls, name, parents, dct) + + +class BaseLayer(Module, metaclass=BaseLayerMeta): + + def _is_attr_included_in_repr(self, attr: str, value: Any) -> bool: + return True def extra_repr(self) -> str: buf = [] @@ -314,102 +349,17 @@ def extra_repr(self) -> str: if attr.startswith('_'): continue attr_val = getattr(self, attr, None) - if attr_val is None or isinstance(attr_val, Module) or \ - isinstance(attr_val, Tensor): + if attr_val is None or \ + isinstance(attr_val, Module) or \ + isinstance(attr_val, Tensor) or \ + is_jit_layer(attr_val): continue - buf.append(f'{attr}={attr_val!r}') + if self._is_attr_included_in_repr(attr, attr_val): + buf.append(f'{attr}={attr_val!r}') return ', '.join(buf) -class BaseSingleVariateLayer(BaseLayer): - """ - Base class for single-input, single-output layers. - - Sub-classes should override `_call(input: Tensor) -> Tensor` to - actually implement the module. 
- """ - - def _forward(self, input: Tensor) -> Tensor: - raise NotImplementedError() - - def forward(self, input: Tensor) -> Tensor: - return self._forward(input) - - -class BaseMultiVariateLayer(BaseLayer): - """ - Base class for multiple-input, multiple-output layers. - The inputs and outputs should be given as a list of Tensors. - """ - - def _forward(self, inputs: List[Tensor]) -> List[Tensor]: - raise NotImplementedError() - - def forward(self, inputs: List[Tensor]) -> List[Tensor]: - return self._forward(inputs) - - -class BaseSplitLayer(BaseLayer): - """ - Base class for single-input, multiple-output layers. - The outputs should be given as a list of Tensors. - """ - - def _forward(self, input: Tensor) -> List[Tensor]: - raise NotImplementedError() - - def forward(self, input: Tensor) -> List[Tensor]: - return self._forward(input) - - -class BaseMergeLayer(BaseLayer): - """ - Base class for multiple-input, single-output layers. - The inputs should be given as a list of Tensors. - """ - - def _forward(self, inputs: List[Tensor]) -> Tensor: - raise NotImplementedError() - - def forward(self, inputs: List[Tensor]) -> Tensor: - return self._forward(inputs) - - -class BaseContextualLayer(BaseLayer): - """ - Base class layers that produces the output according to the input tensor - and contextual tensors. - """ - - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: - raise NotImplementedError() - - def forward(self, - input: Tensor, - context: Optional[List[Tensor]] = None) -> Tensor: - if context is None: - context = [] - return self._forward(input, context) - - -class BaseMultiVariateContextualLayer(BaseLayer): - """ - Base class layers that produces the output tensors according to the - input tensors and contextual tensors. - """ - - def _forward(self, inputs: List[Tensor], context: List[Tensor]) -> List[Tensor]: - raise NotImplementedError() - - def forward(self, - inputs: List[Tensor], - context: Optional[List[Tensor]] = None) -> List[Tensor]: - if context is None: - context = [] - return self._forward(inputs, context) - - class Sequential(torch_nn.Sequential): def __init__(self, *layers: Union[Module, Sequence[Module]]): @@ -432,7 +382,8 @@ class CoreLinear(BaseLayer): ) weight_store: Module - bias_store: Optional[Module] + bias_store: Module + use_bias: bool def __init__(self, weight_shape: List[int], @@ -447,6 +398,8 @@ def __init__(self, weight_shape, initializer=weight_init, weight_norm=weight_norm) bias_store = get_bias_store( bias_shape, initializer=bias_init, use_bias=use_bias) + if bias_store is None: + bias_store = _NullParamStore() if data_init is not None: if not isinstance(data_init, init.DataDependentInitializer) and \ @@ -460,40 +413,37 @@ def __init__(self, super().__init__() self.weight_store = weight_store self.bias_store = bias_store + self.use_bias = use_bias if data_init is not None: data_init.register(self) - def __repr__(self) -> str: - attributes = [] - for attr in self.__annotations__: - val = getattr(self, attr, None) - if val is not None: - if attr == 'use_bias': - if not val: - attributes.append(f'{attr}={val}') - elif not isinstance(val, (Module, Tensor)): - attributes.append(f'{attr}={val!r}') - return f'{self.__class__.__qualname__}({", ".join(attributes)})' - - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _is_attr_included_in_repr(self, attr: str, value: Any) -> bool: + return attr != 'use_bias' or not value + + def __repr__(self): + return 
f'{self.__class__.__qualname__}({self.extra_repr()})' + + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: raise NotImplementedError() def forward(self, input: Tensor) -> Tensor: weight = self.weight_store() - if self.bias_store is None: - bias = None + if self.use_bias: + bias: Optional[Tensor] = self.bias_store() else: - bias = self.bias_store() - return self._forward(input, weight, bias) + bias: Optional[Tensor] = None + return self._linear_transform(input, weight, bias) class Linear(CoreLinear): in_features: int out_features: int - use_bias: bool def __init__(self, in_features: int, @@ -509,7 +459,6 @@ def __init__(self, self.in_features = in_features self.out_features = out_features - self.use_bias = use_bias super().__init__( weight_shape=[out_features, in_features], @@ -522,8 +471,11 @@ def __init__(self, ) @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: output, front_shape = flatten_to_ndims(input, 2) output = torch.nn.functional.linear(output, weight, bias) output = unflatten_from_ndims(output, front_shape) @@ -539,7 +491,6 @@ class LinearConvNd(CoreLinear): padding: List[Tuple[int, int]] _symmetric_padding: Optional[List[int]] dilation: List[int] - use_bias: bool def __init__(self, in_channels: int, @@ -570,7 +521,6 @@ def __init__(self, self.dilation = dilation self.padding = padding self._symmetric_padding = _symmetric_padding - self.use_bias = use_bias super().__init__( weight_shape=[out_channels, in_channels] + kernel_size, @@ -592,8 +542,11 @@ def _get_spatial_ndims(self) -> int: return 1 @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: if self._symmetric_padding is not None: return torch.nn.functional.conv1d( input=input, weight=weight, bias=bias, stride=self.stride, @@ -613,8 +566,11 @@ def _get_spatial_ndims(self) -> int: return 2 @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: if self._symmetric_padding is not None: return torch.nn.functional.conv2d( input=input, weight=weight, bias=bias, stride=self.stride, @@ -634,8 +590,11 @@ def _get_spatial_ndims(self) -> int: return 3 @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: if self._symmetric_padding is not None: return torch.nn.functional.conv3d( input=input, weight=weight, bias=bias, stride=self.stride, @@ -659,7 +618,6 @@ class LinearConvTransposeNd(CoreLinear): is_symmetric_padding: bool dilation: List[int] output_padding: List[int] - use_bias: bool def __init__(self, in_channels: int, @@ -694,7 +652,6 @@ def __init__(self, self._symmetric_padding = _symmetric_padding self.output_padding = output_padding self.dilation = dilation - self.use_bias = use_bias super().__init__( weight_shape=[in_channels, out_channels] + kernel_size, @@ -731,8 +688,11 @@ def _get_spatial_ndims(self) -> int: return 1 @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + 
weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: if self._symmetric_padding is not None: return torch.nn.functional.conv_transpose1d( input=input, weight=weight, bias=bias, stride=self.stride, @@ -754,8 +714,11 @@ def _get_spatial_ndims(self) -> int: return 2 @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: if self._symmetric_padding is not None: return torch.nn.functional.conv_transpose2d( input=input, weight=weight, bias=bias, stride=self.stride, @@ -777,8 +740,11 @@ def _get_spatial_ndims(self) -> int: return 3 @jit_method - def _forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor] - ) -> Tensor: + def _linear_transform(self, + input: Tensor, + weight: Tensor, + bias: Optional[Tensor] + ) -> Tensor: if self._symmetric_padding is not None: return torch.nn.functional.conv_transpose3d( input=input, weight=weight, bias=bias, stride=self.stride, @@ -859,7 +825,7 @@ def _check_input_dim(self, input: Tensor): Dropout = torch_nn.Dropout -class Dropout1d(BaseSingleVariateLayer): +class Dropout1d(BaseLayer): """Randomly zero out entire channels of the 1d convolution input.""" __constants__ = ('p', '_keep_prob') @@ -872,7 +838,7 @@ def __init__(self, p: float = 0.5): self.p = p self._keep_prob = 1. - p - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: if input.dim() < 2: # pragma: no cover raise ValueError('`input` must be at least 2d, but the ' 'input shape is {}.'.format(shape(input))) diff --git a/tensorkit/backend/pytorch_/losses.py b/tensorkit/backend/pytorch_/losses.py deleted file mode 100644 index 4013e77..0000000 --- a/tensorkit/backend/pytorch_/losses.py +++ /dev/null @@ -1,19 +0,0 @@ -from .core import * -from .layers import * - - -__all__ = [ - 'BaseSupervisedLossLayer', -] - - -class BaseSupervisedLossLayer(BaseLayer): - - def _forward(self, output: Tensor, target: Tensor) -> Tensor: - raise NotImplementedError() - - def forward(self, output: Tensor, target: Tensor) -> Tensor: - ret = self._forward(output, target) - if ret.numel() > 1: - ret = ret.mean() - return ret diff --git a/tensorkit/backend/pytorch_/nn.py b/tensorkit/backend/pytorch_/nn.py index 7177cf6..a083b39 100644 --- a/tensorkit/backend/pytorch_/nn.py +++ b/tensorkit/backend/pytorch_/nn.py @@ -117,6 +117,8 @@ def cross_entropy_with_logits(logits: Tensor, logits, front_shape = flatten_to_ndims(logits, 2) labels, _ = flatten_to_ndims(labels, 1) + if labels.dtype != torch.int64: + labels = labels.to(torch.int64) ret = torch.nn.functional.cross_entropy( logits, labels, reduction=reduction) diff --git a/tensorkit/backend/pytorch_/optim.py b/tensorkit/backend/pytorch_/optim.py new file mode 100644 index 0000000..70e3ca9 --- /dev/null +++ b/tensorkit/backend/pytorch_/optim.py @@ -0,0 +1,131 @@ +from contextlib import contextmanager +from typing import * + +import torch +from torch.optim import (adam, adadelta, adagrad, adamax, + rmsprop, sgd) +from torch.optim.optimizer import Optimizer as TorchOptimizer + +from .core import * + +__all__ = [ + 'Optimizer', 'SGD', 'Adam', +] + + +class Optimizer(object): + + @property + def lr(self) -> float: + raise NotImplementedError() + + def set_lr(self, lr: float): + raise NotImplementedError() + + def add_params(self, params: List[Variable]): + raise NotImplementedError() + + def clear_grad(self): + raise NotImplementedError() + + @contextmanager + def capture_grad(self) 
-> Generator[None, None, None]: + raise NotImplementedError() + + def minimize(self, loss: Tensor): + raise NotImplementedError() + + def maximize(self, loss: Tensor): + raise NotImplementedError() + + def state_dict(self) -> Dict[str, Any]: + raise NotImplementedError() + + def load_state_dict(self, state_dict: Dict[str, Any]): + raise NotImplementedError() + + +class BackendOptimizer(Optimizer): + + _lr: float = None + torch_optimizer: TorchOptimizer + + def __init__(self, + lr: float, + torch_optimizer: TorchOptimizer): + self.torch_optimizer = torch_optimizer + self.set_lr(lr) + + def lr(self) -> float: + return self._lr + + def set_lr(self, lr: float): + if self._lr != lr: + for group in self.torch_optimizer.param_groups: + group['lr'] = lr + self._lr = lr + + def add_params(self, params: Sequence[Variable]): + self.torch_optimizer.add_param_group({ + 'params': list(params), + 'lr': self._lr, + }) + + def clear_grad(self): + self.torch_optimizer.zero_grad() + + @contextmanager + def capture_grad(self) -> Generator[None, None, None]: + yield + + def minimize(self, loss: Tensor): + loss.backward() + self.torch_optimizer.step() + + def maximize(self, loss: Tensor): + self.minimize(-loss) + + def state_dict(self) -> Dict[str, Any]: + return self.torch_optimizer.state_dict() + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.torch_optimizer.load_state_dict(state_dict) + + +class SGD(BackendOptimizer): + + def __init__(self, + params: Iterable[Variable], + lr: float, + momentum: float = 0., + nesterov: bool = False): + super().__init__( + lr=lr, + torch_optimizer=torch.optim.sgd.SGD( + params=params, + lr=lr, + momentum=momentum, + nesterov=nesterov, + ) + ) + + +class Adam(BackendOptimizer): + + def __init__(self, + params: Iterable[Variable], + lr: float = 1e-3, + beta_1: float = 0.9, + beta_2: float = 0.999, + epsilon: float = 1e-8, + amsgrad: bool = False): + super().__init__( + lr=lr, + torch_optimizer=adam.Adam( + params=params, + lr=lr, + betas=(beta_1, beta_2), + eps=epsilon, + amsgrad=amsgrad, + ) + ) diff --git a/tensorkit/backend/pytorch_/train.py b/tensorkit/backend/pytorch_/train.py new file mode 100644 index 0000000..1632113 --- /dev/null +++ b/tensorkit/backend/pytorch_/train.py @@ -0,0 +1,91 @@ +import os +from typing import * + +import torch +from mltk import StatefulObject, BaseCheckpoint + +__all__ = ['Checkpoint'] + + +class _TorchCheckpointableObject(StatefulObject): + """Wraps a PyTorch checkpointable object into :class:`StatefulObject`.""" + + def __init__(self, torch_object): + self.torch_object = torch_object + + def get_state_dict(self) -> Dict[str, Any]: + return self.torch_object.state_dict() + + def set_state_dict(self, state: Dict[str, Any]): + self.torch_object.load_state_dict(state) + + +class Checkpoint(BaseCheckpoint): + """ + A checkpoint object that supports to save and recover PyTorch checkpointable + objects (i.e., objects with method :meth:`state_dict()` and + :meth:`load_state_dict()`). + + Usage:: + + # create the PyTorch objects + class Net(torch.nn.Module): + ... + + net = Net(...) + optimizer = T.optim.Adam(...) 
+ + # construct the checkpoint object + checkpoint = T.train.Checkpoint(net=net, optimizer=optimizer) + + # save a checkpoint + checkpoint.save(checkpoint_path) + + # restore the checkpoint + checkpoint.restore(checkpoint_path) + """ + + def __init__(self, **torch_objects: Any): + def to_stateful_object(obj) -> StatefulObject: + if isinstance(obj, StatefulObject): + return obj + elif hasattr(obj, 'state_dict') and hasattr(obj, 'load_state_dict'): + return _TorchCheckpointableObject(obj) + else: + raise TypeError( + f'Object must be a :class:`StatefulObject`, or has ' + f'`state_dict()` and `load_state_dict()` methods: ' + f'got {obj!r}' + ) + + self._objects: Dict[str, StatefulObject] = { + k: to_stateful_object(o) + for k, o in torch_objects.items() + } + + def _restore(self, checkpoint_path: str) -> None: + data_path = os.path.join(checkpoint_path, 'data.pth') + state_dict = torch.load(data_path) + + # check whether or not all keys exist + for k in self._objects: + if k not in state_dict: + raise ValueError(f'Key {k!r} does not exist in the ' + f'state dict recovered from: {data_path}') + + # load the state dict + for k, o in self._objects.items(): + o.set_state_dict(state_dict[k]) + + def _save(self, checkpoint_path: str) -> None: + # generate the state dict + state_dict = { + k: o.get_state_dict() + for k, o in self._objects.items() + } + + # save the objects + if not os.path.exists(checkpoint_path): + os.makedirs(checkpoint_path, exist_ok=True) + data_path = os.path.join(checkpoint_path, 'data.pth') + torch.save(state_dict, data_path) diff --git a/tensorkit/backend/pytorch_/utils.py b/tensorkit/backend/pytorch_/utils.py new file mode 100644 index 0000000..c2efeab --- /dev/null +++ b/tensorkit/backend/pytorch_/utils.py @@ -0,0 +1,182 @@ +from typing import * + +from .core import jit + +__all__ = [ + 'split_channel_spatial_shape', 'unsplit_channel_spatial_shape', + 'calculate_deconv_output_padding', + 'calculate_conv_output_size', 'calculate_deconv_output_size', +] + + +@jit +def _check_conv_args(input_size: List[int], + padding: List[Tuple[int, int]], + arg_values: List[List[int]], + arg_names: List[str]) -> int: + spatial_ndims = len(input_size) + if spatial_ndims not in (1, 2, 3): + raise ValueError( + '`input_size` is not a 1d, 2d or 3d convolutional input size: ' + 'got input size {}.'.format(input_size) + ) + + if len(padding) != spatial_ndims: + raise ValueError( + '`padding` is not for {}d convolution: got `padding` {}.'. + format(spatial_ndims, padding) + ) + + for i in range(len(arg_values)): + arg_val = arg_values[i] + if len(arg_val) != spatial_ndims: + arg_name = arg_names[i] + raise ValueError( + '`{}` is not for {}d convolution: got `{}` {}.'. + format(arg_name, spatial_ndims, arg_name, arg_val) + ) + + return spatial_ndims + + +@jit +def split_channel_spatial_shape(shape: List[int]) -> Tuple[int, List[int]]: + if len(shape) not in (2, 3, 4): + raise ValueError('Invalid `shape`: {}'.format(shape)) + return shape[0], shape[1:] + + +@jit +def unsplit_channel_spatial_shape(channels: int, size: List[int]) -> List[int]: + if len(size) not in (1, 2, 3): + raise ValueError('Invalid `size`: {}'.format(size)) + return [channels] + size + + +@jit +def calculate_deconv_output_padding(input_size: List[int], + output_size: List[int], + kernel_size: List[int], + stride: List[int], + padding: List[Tuple[int, int]], + dilation: List[int]): + """ + Calculate the `output_padding` for deconvolution (conv_transpose). + + Args: + input_size: The input size (shape) of the spatial dimensions. 
+ output_size: The output size (shape) of the spatial dimensions. + kernel_size: The kernel size. + stride: The stride. + padding: The padding. + dilation: The dilation. + + Returns: + The output padding, can be used to construct a deconvolution + (conv transpose) layer. + + Raises: + ValueError: If any argument is invalid, or no output padding + can satisfy the specified arguments. + """ + spatial_ndims = _check_conv_args( + input_size, padding, + [output_size, kernel_size, stride, dilation], + ['output_size', 'kernel_size', 'stride', 'dilation'], + ) + + ret: List[int] = [] + for i in range(spatial_ndims): + op = output_size[i] - ( + (input_size[i] - 1) * stride[i] - + (padding[i][0] + padding[i][1]) + + (kernel_size[i] - 1) * dilation[i] + 1 + ) + if op < 0 or (op >= stride[i] and op >= dilation[i]): + raise ValueError( + 'No `output_padding` can satisfy the deconvolution task: ' + 'input_size == {}, output_size == {}, ' + 'kernel_size == {}, stride == {}, ' + 'padding == {}, dilation == {}.'.format( + input_size, output_size, kernel_size, stride, padding, + dilation + ) + ) + ret.append(op) + + return ret + + +@jit +def calculate_conv_output_size(input_size: List[int], + kernel_size: List[int], + stride: List[int], + padding: List[Tuple[int, int]], + dilation: List[int]) -> List[int]: + """ + Calculate the convolution output size for specified arguments. + + Args: + input_size: The input size (shape) of the spatial dimensions. + kernel_size: The kernel size. + stride: The stride. + padding: The padding. + dilation: The dilation. + + Returns: + The output size. + """ + spatial_ndims = _check_conv_args( + input_size, padding, + [input_size, kernel_size, stride, dilation], + ['input_size', 'kernel_size', 'stride', 'dilation'], + ) + + ret: List[int] = [] + for i in range(spatial_ndims): + ret.append( + 1 + (input_size[i] + padding[i][0] + padding[i][1] - + (kernel_size[i] - 1) * dilation[i] - 1) // stride[i] + ) + + return ret + + +@jit +def calculate_deconv_output_size(input_size: List[int], + kernel_size: List[int], + stride: List[int], + padding: List[Tuple[int, int]], + output_padding: List[int], + dilation: List[int]) -> List[int]: + """ + Calculate the deconvolution output size for specified arguments. + + Args: + input_size: The input size (shape) of the spatial dimensions. + kernel_size: The kernel size. + stride: The stride. + padding: The padding. + output_padding: The output padding. + dilation: The dilation. + + Returns: + The output size. 
+ """ + spatial_ndims = _check_conv_args( + input_size, padding, + [input_size, kernel_size, stride, output_padding, dilation], + ['input_size', 'kernel_size', 'stride', 'output_padding', 'dilation'], + ) + + ret: List[int] = [] + for i in range(spatial_ndims): + ret.append( + output_padding[i] + + (input_size[i] - 1) * stride[i] - + (padding[i][0] + padding[i][1]) + + (kernel_size[i] - 1) * dilation[i] + + 1 + ) + + return ret diff --git a/tensorkit/backend/train.py b/tensorkit/backend/train.py new file mode 100644 index 0000000..dbdd318 --- /dev/null +++ b/tensorkit/backend/train.py @@ -0,0 +1,9 @@ +from ..settings_ import settings + +if settings.backend == 'PyTorch': + from .pytorch_ import train + from .pytorch_.train import * +else: + RuntimeError(f'Backend {settings.backend} not supported.') + +__all__ = train.__all__ diff --git a/tensorkit/backend/utils.py b/tensorkit/backend/utils.py new file mode 100644 index 0000000..af30799 --- /dev/null +++ b/tensorkit/backend/utils.py @@ -0,0 +1,9 @@ +from ..settings_ import settings + +if settings.backend == 'PyTorch': + from .pytorch_ import utils + from .pytorch_.utils import * +else: + RuntimeError(f'Backend {settings.backend} not supported.') + +__all__ = utils.__all__ diff --git a/tensorkit/distributions/flow.py b/tensorkit/distributions/flow.py index 7926f96..6064903 100644 --- a/tensorkit/distributions/flow.py +++ b/tensorkit/distributions/flow.py @@ -1,7 +1,7 @@ from typing import * from .. import tensor as T -from ..flows import BaseFlow +from ..flows import Flow from ..stochastic import StochasticTensor from .base import Distribution from .utils import copy_distribution, get_overrided_parameterized @@ -18,7 +18,7 @@ class FlowDistribution(Distribution): _base_distribution: Distribution """The base distribution, which is transform by the `flow`.""" - flow: BaseFlow + flow: Flow """The flow instance, which transforms the `distribution`.""" _base_group_ndims: int @@ -26,7 +26,7 @@ class FlowDistribution(Distribution): def __init__(self, distribution: Distribution, - flow: BaseFlow, + flow: Flow, reparameterized: Optional[bool] = None, event_ndims: Optional[int] = None, validate_tensors: Optional[bool] = None): @@ -34,7 +34,7 @@ def __init__(self, if not isinstance(distribution, Distribution): raise TypeError(f'`distribution` is not an instance of ' f'`Distribution`: got {distribution!r}') - if not isinstance(flow, BaseFlow) and not T.is_jit_layer(flow): + if not isinstance(flow, Flow) and not T.is_jit_layer(flow): raise TypeError(f'`flow` is not a flow: {flow!r}') # `distribution` is required to be continuous and have float dtype. @@ -53,19 +53,19 @@ def __init__(self, # requirement: distribution.event_ndims <= flow.x_event_ndims <= distribution.value_ndims # otherwise the distribution cannot be transformed by the flow - if not (distribution.event_ndims <= flow.x_event_ndims <= + if not (distribution.event_ndims <= flow.get_x_event_ndims() <= distribution.value_ndims): raise ValueError( f'`distribution.event_ndims <= flow.x_event_ndims <= ' f'distribution.value_ndims` is not satisfied: ' f'`distribution.event_ndims` is {distribution.event_ndims}, ' - f'while `flow.x_event_ndims` is {flow.x_event_ndims}.' + f'while `flow.x_event_ndims` is {flow.get_x_event_ndims()}.' 
) # requirement: min_event_ndims <= event_ndims <= max_event_ndims - min_event_ndims = flow.y_event_ndims + min_event_ndims = flow.get_y_event_ndims() max_event_ndims = (distribution.value_ndims + - (flow.y_event_ndims - flow.x_event_ndims)) + (flow.get_y_event_ndims() - flow.get_x_event_ndims())) if event_ndims is not None and \ not (min_event_ndims <= event_ndims <= max_event_ndims): raise ValueError( @@ -76,7 +76,7 @@ def __init__(self, # obtain the arguments if event_ndims is None: - event_ndims = flow.y_event_ndims + event_ndims = flow.get_y_event_ndims() batch_ndims = max_event_ndims - event_ndims batch_shape = distribution.batch_shape[:batch_ndims] reparameterized = get_overrided_parameterized( @@ -87,7 +87,7 @@ def __init__(self, if validate_tensors is None: validate_tensors = distribution.validate_tensors - base_group_ndims = flow.x_event_ndims - distribution.event_ndims + base_group_ndims = flow.get_x_event_ndims() - distribution.event_ndims # now construct the instance super(FlowDistribution, self).__init__( diff --git a/tensorkit/examples/.gitignore b/tensorkit/examples/.gitignore new file mode 100644 index 0000000..68bcbc9 --- /dev/null +++ b/tensorkit/examples/.gitignore @@ -0,0 +1 @@ +results/ \ No newline at end of file diff --git a/tests/losses/__init__.py b/tensorkit/examples/__init__.py similarity index 100% rename from tests/losses/__init__.py rename to tensorkit/examples/__init__.py diff --git a/tensorkit/examples/classification/__init__.py b/tensorkit/examples/classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tensorkit/examples/classification/mnist.py b/tensorkit/examples/classification/mnist.py new file mode 100644 index 0000000..7b43db0 --- /dev/null +++ b/tensorkit/examples/classification/mnist.py @@ -0,0 +1,99 @@ +import mltk +import tensorkit as tk +from tensorkit import tensor as T +from tensorkit.examples import utils + + +class Config(mltk.Config): + max_epoch: int = 10 + batch_size: int = 32 + test_batch_size: int = 64 + lr: float = 0.001 + lr_anneal_ratio: float = 0.5 + lr_anneal_epochs: int = 5 + + +def main(exp: mltk.Experiment[Config]): + # prepare the data + train_stream, val_stream, test_stream = utils.get_mnist_streams( + batch_size=exp.config.batch_size, + test_batch_size=exp.config.test_batch_size, + val_batch_size=exp.config.test_batch_size, + val_portion=0.2, + flatten=True, + x_range=(-1., 1.), + ) + + # build the network + net: T.Module = tk.layers.SequentialBuilder(784). \ + set_args('dense', + activation=tk.layers.LeakyReLU, + data_init=tk.init.StdDataInit()). \ + dense(500). \ + dense(500). \ + linear(10). \ + log_softmax(). \ + build() + + # define the train and evaluate functions + def train_step(x, y): + logits = net(x) + loss = T.nn.cross_entropy_with_logits(logits, y, reduction='mean') + acc = utils.calculate_acc(logits, y) + return {'loss': loss, 'acc': acc} + + def evaluate(x, y): + with T.no_grad(): + logits = net(x) + acc = utils.calculate_acc(logits, y) + return {'acc': acc} + + # build the optimizer and the train loop + loop = mltk.TrainLoop(max_epoch=exp.config.max_epoch) + optimizer = tk.optim.Adam(tk.layers.get_parameters(net)) + lr_scheduler = tk.optim.lr_scheduler.AnnealingLR( + loop=loop, + optimizer=optimizer, + initial_lr=exp.config.lr, + ratio=exp.config.lr_anneal_ratio, + epochs=exp.config.lr_anneal_epochs + ) + + # add a callback to do early-stopping on the network parameters + # according to the validation metric. 
+ loop.add_callback( + mltk.callbacks.EarlyStopping( + checkpoint=tk.train.Checkpoint(net=net), + root_dir=exp.abspath('./checkpoint/early-stopping'), + # note for `loop.validation()`, the prefix "val_" will be + # automatically prepended to any metrics generated by the + # `evaluate` function. + metric_name='val_acc', + smaller_is_better=False, + ) + ) + + # run validation after every epoch + if val_stream is not None: + loop.run_after_every( + lambda: loop.validation().run(evaluate, val_stream), + epochs=1, + ) + + # run test after every epoch + loop.run_after_every( + lambda: loop.test().run(evaluate, test_stream), + epochs=1, + ) + + # train the model + utils.fit_model(loop=loop, optimizer=optimizer, fn=train_step, + stream=train_stream) + + # do the final test with the best network parameters (according to validation) + results = mltk.TestLoop().run(evaluate, test_stream) + + +if __name__ == '__main__': + with mltk.Experiment(Config) as exp: + main(exp) diff --git a/tensorkit/examples/classification/mnist_resnet.py b/tensorkit/examples/classification/mnist_resnet.py new file mode 100644 index 0000000..4b5aacd --- /dev/null +++ b/tensorkit/examples/classification/mnist_resnet.py @@ -0,0 +1,104 @@ +import mltk +import tensorkit as tk +from tensorkit import tensor as T +from tensorkit.examples import utils + + +class Config(mltk.Config): + max_epoch: int = 10 + batch_size: int = 32 + test_batch_size: int = 64 + lr: float = 0.01 + lr_anneal_ratio: float = 0.5 + lr_anneal_epochs: int = 2 + + +def main(exp: mltk.Experiment[Config]): + # prepare the data + train_stream, val_stream, test_stream = utils.get_mnist_streams( + batch_size=exp.config.batch_size, + test_batch_size=exp.config.test_batch_size, + val_batch_size=exp.config.test_batch_size, + val_portion=0.2, + x_range=(-1., 1.), + ) + + # build the network + net: T.Module = tk.layers.SequentialBuilder(train_stream.data_shapes[0]). \ + set_args('res_block2d', + kernel_size=3, + activation=tk.layers.LeakyReLU, + normalizer=tk.layers.BatchNorm2d, + dropout=0.5, + data_init=tk.init.StdDataInit()). \ + res_block2d(16). \ + res_block2d(32, stride=2). \ + res_block2d(32). \ + res_block2d(64, stride=2). \ + res_block2d(64). \ + global_avg_pool2d(). \ + linear(10). \ + log_softmax(). \ + build() + + # the train, test and validate functions + def train_step(x, y): + logits = net(x) + loss = T.nn.cross_entropy_with_logits(logits, y, reduction='mean') + acc = utils.calculate_acc(logits, y) + return {'loss': loss, 'acc': acc} + + def evaluate(x, y): + with T.no_grad(): + logits = net(x) + acc = utils.calculate_acc(logits, y) + return {'acc': acc} + + # build the optimizer and the train loop + loop = mltk.TrainLoop(max_epoch=exp.config.max_epoch) + optimizer = tk.optim.Adam(tk.layers.get_parameters(net)) + lr_scheduler = tk.optim.lr_scheduler.AnnealingLR( + loop=loop, + optimizer=optimizer, + initial_lr=exp.config.lr, + ratio=exp.config.lr_anneal_ratio, + epochs=exp.config.lr_anneal_epochs + ) + + # add a callback to do early-stopping on the network parameters + # according to the validation metric. + loop.add_callback( + mltk.callbacks.EarlyStopping( + checkpoint=tk.train.Checkpoint(net=net), + root_dir=exp.abspath('./checkpoint/early-stopping'), + # note for `loop.validation()`, the prefix "val_" will be + # automatically prepended to any metrics generated by the + # `evaluate` function. 
+ metric_name='val_acc', + smaller_is_better=False, + ) + ) + + # run validation after every epoch + loop.run_after_every( + lambda: loop.validation().run(evaluate, val_stream), + epochs=1, + ) + + # run test after every epoch + loop.run_after_every( + lambda: loop.test().run(evaluate, test_stream), + epochs=1, + ) + + # train the model + utils.fit_model(loop=loop, optimizer=optimizer, fn=train_step, + stream=train_stream) + + # do the final test with the best network parameters (according to validation) + results = mltk.TestLoop().run(evaluate, test_stream) + + +if __name__ == '__main__': + with mltk.Experiment(Config) as exp: + main(exp) diff --git a/tensorkit/examples/utils/__init__.py b/tensorkit/examples/utils/__init__.py new file mode 100644 index 0000000..b6d3a03 --- /dev/null +++ b/tensorkit/examples/utils/__init__.py @@ -0,0 +1,3 @@ +from .fit_model_ import * +from .ops import * +from .prepare_data import * diff --git a/tensorkit/examples/utils/fit_model_.py b/tensorkit/examples/utils/fit_model_.py new file mode 100644 index 0000000..eb6d3a7 --- /dev/null +++ b/tensorkit/examples/utils/fit_model_.py @@ -0,0 +1,39 @@ +from typing import * + +import mltk + +import tensorkit as tk +from tensorkit import tensor as T +from tensorkit.typing_ import TensorOrData + +__all__ = ['fit_model'] + + +def fit_model(loop: mltk.TrainLoop, + optimizer: tk.optim.Optimizer, + fn: Callable[..., Dict[str, TensorOrData]], + stream: mltk.DataStream, + loss_metric: str = 'loss', + minimize_loss: bool = True): + def step(*train_data): + optimizer.clear_grad() + with optimizer.capture_grad(): + metrics = fn(*train_data) + try: + loss = metrics[loss_metric] + if not isinstance(loss, T.Tensor): + raise TypeError() + except Exception: + raise ValueError( + f'`train_fn` is expected to return a dict, carrying ' + f'the train loss in the "{loss_metric}" entry: got ' + f'{metrics!r}.' + ) + else: + if minimize_loss: + optimizer.minimize(loss) + else: + optimizer.maximize(loss) + return metrics + + loop.run(step, stream) diff --git a/tensorkit/examples/utils/ops.py b/tensorkit/examples/utils/ops.py new file mode 100644 index 0000000..1fcf83b --- /dev/null +++ b/tensorkit/examples/utils/ops.py @@ -0,0 +1,9 @@ +from tensorkit import tensor as T + +__all__ = ['calculate_acc'] + + +def calculate_acc(logits: T.Tensor, y: T.Tensor) -> T.Tensor: + with T.no_grad(): + out_y = T.argmax(logits, axis=-1) + return T.reduce_mean(T.cast(T.equal(out_y, y), dtype=T.float32)) diff --git a/tensorkit/examples/utils/prepare_data.py b/tensorkit/examples/utils/prepare_data.py new file mode 100644 index 0000000..2f3950f --- /dev/null +++ b/tensorkit/examples/utils/prepare_data.py @@ -0,0 +1,69 @@ +from typing import * + +import mltk +import numpy as np + +import tensorkit as tk +from tensorkit import tensor as T + +__all__ = [ + 'get_mnist_streams' +] + + +def _scale_pixels_to_range(x, x_min, x_max): + scale = (x_max - x_min) / 255. 
+ return np.minimum(np.maximum(x * scale + x_min, x_min), x_max) + + +def get_mnist_streams(batch_size: int, + test_batch_size: Optional[int] = None, + val_batch_size: Optional[int] = None, + val_portion: Optional[float] = None, + flatten: bool = False, + x_range: Optional[Tuple[float, float]] = None, + y_dtype: Union[str, np.dtype] = np.int32, + as_tensor_stream: bool = True, + prefetch: Optional[int] = 5, + ) -> Tuple[mltk.DataStream, Optional[mltk.DataStream], mltk.DataStream]: + # check the arguments + if test_batch_size is None: + test_batch_size = batch_size + if val_batch_size is None: + val_batch_size = batch_size + + # load data + x_shape = [784] if flatten else [28, 28, 1] + (train_x, train_y), (test_x, test_y) = mltk.data.load_mnist( + x_shape=x_shape, x_dtype=np.float32, y_dtype=y_dtype) + + if not flatten: + train_x = tk.utils.numpy_channel_from_last_to_default2d(train_x) + test_x = tk.utils.numpy_channel_from_last_to_default2d(test_x) + + # scale pixels to the desired range + if x_range is not None: + train_x = _scale_pixels_to_range(train_x, *x_range) + test_x = _scale_pixels_to_range(test_x, *x_range) + + # split train & valid set, and construct the streams + def make_stream(arrays, **kwargs): + stream = mltk.DataStream.arrays(arrays, **kwargs) + if as_tensor_stream: + stream = tk.utils.as_tensor_stream(stream, prefetch=prefetch) + return stream + + if val_portion is not None: + (train_x, train_y), (val_x, val_y) = \ + mltk.utils.split_numpy_arrays([train_x, train_y], portion=val_portion) + val_stream = make_stream([val_x, val_y], batch_size=val_batch_size) + else: + val_stream = None + + train_stream = make_stream( + [train_x, train_y], batch_size=batch_size, shuffle=True, + skip_incomplete=True) + test_stream = make_stream([test_x, test_y], batch_size=test_batch_size) + + # return the streams + return train_stream, val_stream, test_stream diff --git a/tensorkit/flows/act_norm.py b/tensorkit/flows/act_norm.py index f2ec1fc..239da4b 100644 --- a/tensorkit/flows/act_norm.py +++ b/tensorkit/flows/act_norm.py @@ -2,7 +2,9 @@ from typing import * from .. import init, tensor as T -from ..tensor import Tensor, Module, reshape +from ..tensor import (Tensor, Module, reshape, shape, int_range, + calculate_mean_and_var, assert_finite, + as_tensor_backend, maximum, log, sqrt) from ..layers import * from ..typing_ import * from .core import * @@ -103,61 +105,64 @@ def __init__(self, def set_initialized(self, initialized: bool = True) -> None: self.initialized = initialized - @T.jit_ignore - def initialize_with_input(self, input: Tensor) -> bool: + @T.jit_method + def calculate_bias_and_pre_scale_for_init(self, input: Tensor) -> Tuple[Tensor, Tensor]: # PyTorch 1.3.1 bug: cannot mark this method as returning `None`. input_rank = T.rank(input) - if not isinstance(input, Tensor) or input_rank < self.event_ndims + 1: + if not isinstance(input, Tensor) or input_rank < self.x_event_ndims + 1: raise ValueError( - f'`input` is required to be a tensor with ' - f'at least {self.event_ndims + 1} dimensions: got input shape ' - f'{T.shape(input)!r}, while `event_ndims` of ' - f'the ActNorm layer {self!r} is {self.event_ndims}.') + '`input` is required to be a tensor with at least {} ' + 'dimensions: got input shape {}.'. 
+ format(self.x_event_ndims, shape(input)) + ) # calculate the axis to reduce feature_axis = input_rank + self.axis reduce_axis = ( - T.int_range(0, feature_axis) + - T.int_range(feature_axis + 1, input_rank) + int_range(0, feature_axis) + + int_range(feature_axis + 1, input_rank) ) # calculate sample mean and variance - input_mean, input_var = T.calculate_mean_and_var( + input_mean, input_var = calculate_mean_and_var( input, axis=reduce_axis, unbiased=True) - input_var = T.assert_finite(input_var, 'input_var') + input_var = assert_finite(input_var, 'input_var') # calculate the initial_value for `bias` bias = -input_mean # calculate the initial value for `pre_scale` - epsilon = T.as_tensor_backend(self.epsilon, dtype=input_var.dtype) + epsilon = as_tensor_backend(self.epsilon, dtype=input_var.dtype) if self.scale_type == 'exp': - pre_scale = -0.5 * T.log(T.maximum(input_var, epsilon)) + pre_scale = -0.5 * log(maximum(input_var, epsilon)) else: - pre_scale = 1. / T.sqrt(T.maximum(input_var, epsilon)) + pre_scale = 1. / sqrt(maximum(input_var, epsilon)) - # assign the initial values to the layer parameters + return bias, pre_scale + + @T.jit_ignore + def _initialize_act_norm(self, input: Tensor) -> None: + bias, pre_scale = self.calculate_bias_and_pre_scale_for_init(input) with T.no_grad(): T.assign(get_parameter(self, 'bias'), bias) T.assign(get_parameter(self, 'pre_scale'), pre_scale) - self.set_initialized(True) - return True @T.jit_method - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: # initialize the parameters if not self.initialized: if inverse: raise RuntimeError( '`ActNorm` must be initialized with `inverse = False`.') - self.initialize_with_input(input) + # self.initialize_with_input(input) + self._initialize_act_norm(input) self.set_initialized(True) # do transformation @@ -169,7 +174,7 @@ def _forward(self, output, output_log_det = self.scale( input=input, pre_scale=pre_scale, - event_ndims=self.event_ndims, + event_ndims=self.x_event_ndims, input_log_det=input_log_det, compute_log_det=compute_log_det, inverse=True, @@ -179,7 +184,7 @@ def _forward(self, output, output_log_det = self.scale( input=input + shift, pre_scale=pre_scale, - event_ndims=self.event_ndims, + event_ndims=self.x_event_ndims, input_log_det=input_log_det, compute_log_det=compute_log_det, inverse=False, diff --git a/tensorkit/flows/coupling.py b/tensorkit/flows/coupling.py index 6caad02..4d1729a 100644 --- a/tensorkit/flows/coupling.py +++ b/tensorkit/flows/coupling.py @@ -2,7 +2,7 @@ from .. 
import tensor as T from ..tensor import Tensor, Module, concat, split -from .core import (FeatureMappingFlow, BaseScale, ExpScale, SigmoidScale, +from .core import (FeatureMappingFlow, Scale, ExpScale, SigmoidScale, LinearScale) __all__ = [ @@ -53,8 +53,8 @@ def __init__(self, shift_and_pre_scale: Module, axis: int = -1, event_ndims: int = 1, - scale: Union[str, BaseScale, Type[BaseScale], - Callable[[], BaseScale]] = 'exp', + scale: Union[str, Scale, Type[Scale], + Callable[[], Scale]] = 'exp', secondary: bool = False, sigmoid_scale_bias: float = 2., epsilon: float = T.EPSILON): @@ -106,7 +106,7 @@ def __init__(self, scale = INVALID if isinstance(scale, Module): - if not isinstance(scale, BaseScale) and not T.is_jit_layer(scale): + if not isinstance(scale, Scale) and not T.is_jit_layer(scale): scale = INVALID elif isinstance(scale, type) or callable(scale): if scale is SigmoidScale: @@ -131,12 +131,12 @@ def __init__(self, self.sigmoid_scale_bias = sigmoid_scale_bias self.epsilon = epsilon - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: # split the tensor n_features = input.shape[self.axis] n1 = n_features // 2 @@ -152,7 +152,7 @@ def _forward(self, y2, output_log_det = self.scale( input=x2, pre_scale=pre_scale, - event_ndims=self.event_ndims, + event_ndims=self.x_event_ndims, input_log_det=input_log_det, compute_log_det=compute_log_det, inverse=True, @@ -162,7 +162,7 @@ def _forward(self, y2, output_log_det = self.scale( input=x2 + shift, pre_scale=pre_scale, - event_ndims=self.event_ndims, + event_ndims=self.x_event_ndims, input_log_det=input_log_det, compute_log_det=compute_log_det, inverse=False, @@ -180,8 +180,8 @@ class CouplingLayerNd(CouplingLayer): def __init__(self, shift_and_pre_scale: Module, - scale: Union[str, BaseScale, Type[BaseScale], - Callable[[], BaseScale]] = 'exp', + scale: Union[str, Scale, Type[Scale], + Callable[[], Scale]] = 'exp', secondary: bool = False, sigmoid_scale_bias: float = 2., epsilon: float = T.EPSILON): diff --git a/tensorkit/flows/rearrangement.py b/tensorkit/flows/rearrangement.py index 91d5ffb..539771c 100644 --- a/tensorkit/flows/rearrangement.py +++ b/tensorkit/flows/rearrangement.py @@ -54,12 +54,12 @@ def __init__(self, add_parameter(self, 'inv_permutation', inv_permutation, requires_grad=False) - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: if inverse: output = index_select(input, self.inv_permutation, axis=self.axis) else: diff --git a/tensorkit/flows/reshape_.py b/tensorkit/flows/reshape_.py index 5f90b46..9f1eec8 100644 --- a/tensorkit/flows/reshape_.py +++ b/tensorkit/flows/reshape_.py @@ -11,7 +11,7 @@ ] -class ReshapeFlow(BaseFlow): +class ReshapeFlow(Flow): """ A flow which reshapes the last `x_event_ndims` of `x` into `y_event_shape`. 
@@ -26,7 +26,7 @@ class ReshapeFlow(BaseFlow): # log_det == tf.zeros([2]) """ - __constants__ = BaseFlow.__constants__ + ('x_event_shape', 'y_event_shape') + __constants__ = Flow.__constants__ + ('x_event_shape', 'y_event_shape') x_event_shape: List[int] y_event_shape: List[int] @@ -71,12 +71,12 @@ def check_shape(name, event_shape): self.x_event_shape = x_event_shape self.y_event_shape = y_event_shape - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: if inverse: output = reshape_tail(input, self.y_event_ndims, self.x_event_shape) else: @@ -88,7 +88,7 @@ def _forward(self, return output, output_log_det -class SpaceDepthTransformFlow(BaseFlow): +class SpaceDepthTransformFlow(Flow): __constants__ = ('block_size',) @@ -115,21 +115,21 @@ def __init__(self, block_size: int): def _get_spatial_ndim(self) -> int: raise NotImplementedError() - def _transform(self, input: Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: raise NotImplementedError() - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: raise NotImplementedError() - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: if inverse: - output = self._inv_transform(input) + output = self._transform_inverse(input) else: - output = self._transform(input) + output = self._transform_forward(input) output_log_det = input_log_det if compute_log_det and output_log_det is None: @@ -145,14 +145,14 @@ def _get_spatial_ndim(self) -> int: return 1 @jit_method - def _transform(self, input: Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: return space_to_depth1d(input, self.block_size) @jit_method - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: return depth_to_space1d(input, self.block_size) - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return DepthToSpace1d(self.block_size) @@ -163,14 +163,14 @@ def _get_spatial_ndim(self) -> int: return 2 @jit_method - def _transform(self, input: Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: return space_to_depth2d(input, self.block_size) @jit_method - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: return depth_to_space2d(input, self.block_size) - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return DepthToSpace2d(self.block_size) @@ -181,14 +181,14 @@ def _get_spatial_ndim(self) -> int: return 3 @jit_method - def _transform(self, input: Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: return space_to_depth3d(input, self.block_size) @jit_method - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: return depth_to_space3d(input, self.block_size) - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return DepthToSpace3d(self.block_size) @@ -199,14 +199,14 @@ def _get_spatial_ndim(self) -> int: return 1 @jit_method - def _transform(self, input: 
Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: return depth_to_space1d(input, self.block_size) @jit_method - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: return space_to_depth1d(input, self.block_size) - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return SpaceToDepth1d(self.block_size) @@ -217,14 +217,14 @@ def _get_spatial_ndim(self) -> int: return 2 @jit_method - def _transform(self, input: Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: return depth_to_space2d(input, self.block_size) @jit_method - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: return space_to_depth2d(input, self.block_size) - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return SpaceToDepth2d(self.block_size) @@ -235,12 +235,12 @@ def _get_spatial_ndim(self) -> int: return 3 @jit_method - def _transform(self, input: Tensor) -> Tensor: + def _transform_forward(self, input: Tensor) -> Tensor: return depth_to_space3d(input, self.block_size) @jit_method - def _inv_transform(self, input: Tensor) -> Tensor: + def _transform_inverse(self, input: Tensor) -> Tensor: return space_to_depth3d(input, self.block_size) - def invert(self) -> BaseFlow: + def invert(self) -> Flow: return SpaceToDepth3d(self.block_size) diff --git a/tensorkit/flows/split_.py b/tensorkit/flows/split_.py index 9788f08..b055bfa 100644 --- a/tensorkit/flows/split_.py +++ b/tensorkit/flows/split_.py @@ -9,7 +9,7 @@ ] -class SplitFlow(BaseFlow): +class SplitFlow(Flow): """ A flow which splits input `x` into halves, apply different flows on each half, then concat the output together. @@ -26,7 +26,7 @@ class SplitFlow(BaseFlow): log_det = log_det1 + log_det2 """ - __constants__ = BaseFlow.__constants__ + ( + __constants__ = Flow.__constants__ + ( 'left', 'right', 'x_sections', 'x_axis', 'y_sections', 'y_axis', ) @@ -39,8 +39,8 @@ class SplitFlow(BaseFlow): def __init__(self, x_sections: Sequence[int], - left: BaseFlow, - right: Optional[BaseFlow] = None, + left: Flow, + right: Optional[Flow] = None, y_sections: Optional[Sequence[int]] = None, x_axis: int = -1, y_axis: Optional[int] = None): @@ -79,23 +79,23 @@ def __init__(self, f'two positive integers: got {y_sections!r}.') y_sections = list(map(int, y_sections)) - if not isinstance(left, BaseFlow) and not T.is_jit_layer(left): + if not isinstance(left, Flow) and not T.is_jit_layer(left): raise TypeError(f'`left` is not a flow: got {left!r}.') - x_event_ndims = left.x_event_ndims - y_event_ndims = left.y_event_ndims + x_event_ndims = left.get_x_event_ndims() + y_event_ndims = left.get_y_event_ndims() if right is not None: - if not isinstance(right, BaseFlow) and not T.is_jit_layer(right): + if not isinstance(right, Flow) and not T.is_jit_layer(right): raise TypeError(f'`right` is not a flow: got {right!r}.') - if right.x_event_ndims != x_event_ndims or \ - right.y_event_ndims != y_event_ndims: + if right.get_x_event_ndims() != x_event_ndims or \ + right.get_y_event_ndims() != y_event_ndims: raise ValueError( f'`left` and `right` flows must have same `x_event_ndims` ' f'and `y_event_ndims`: ' - f'got `left.x_event_ndims` == {left.x_event_ndims!r}, ' - f'`left.y_event_ndims` == {left.y_event_ndims}, ' - f'`right.x_event_ndims` == {right.x_event_ndims}, ' - f'and `right.y_event_ndims` == {right.y_event_ndims}.' 
+ f'got `left.x_event_ndims` == {left.get_x_event_ndims()!r}, ' + f'`left.y_event_ndims` == {left.get_y_event_ndims()}, ' + f'`right.x_event_ndims` == {right.get_x_event_ndims()}, ' + f'and `right.y_event_ndims` == {right.get_y_event_ndims()}.' ) if x_event_ndims != y_event_ndims: @@ -124,12 +124,12 @@ def __init__(self, self.y_sections = y_sections self.y_axis = y_axis - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: if inverse: out_left, out_right = split( input, sections=self.y_sections, axis=self.y_axis) @@ -159,8 +159,8 @@ class SplitFlowNd(SplitFlow): def __init__(self, x_sections: Sequence[int], - left: BaseFlow, - right: Optional[BaseFlow] = None, + left: Flow, + right: Optional[Flow] = None, y_sections: Optional[Sequence[int]] = None): """ Construct a new convolutional split flow. @@ -181,13 +181,13 @@ def __init__(self, # type error deferred to the base class, thus we only check # the event ndims if `arg` looks like a flow. if arg is not None and hasattr(arg, 'x_event_ndims'): - if arg.x_event_ndims != event_ndims or \ - arg.y_event_ndims != event_ndims: + if arg.get_x_event_ndims() != event_ndims or \ + arg.get_y_event_ndims() != event_ndims: raise ValueError( f'The `x_event_ndims` and `y_event_ndims` of ' f'`{arg_name}` are required to be {event_ndims}: ' - f'got `x_event_ndims` == {arg.x_event_ndims}, ' - f'and `y_event_ndims` == {arg.y_event_ndims}.' + f'got `x_event_ndims` == {arg.get_x_event_ndims()}, ' + f'and `y_event_ndims` == {arg.get_y_event_ndims()}.' ) super().__init__( diff --git a/tensorkit/init/std_data_init.py b/tensorkit/init/std_data_init.py index f970148..0527771 100644 --- a/tensorkit/init/std_data_init.py +++ b/tensorkit/init/std_data_init.py @@ -21,7 +21,7 @@ def __init__(self, epsilon: float = T.EPSILON): super().__init__() self.epsilon = epsilon - def _forward(self, layer: Module, inputs: List[Tensor]) -> None: + def _init(self, layer: Module, inputs: List[Tensor]) -> None: if T.is_jit_layer(layer): raise TypeError(f'JIT compiled layer is not supported: got {layer!r}') if not isinstance(layer, CoreLinear): @@ -31,8 +31,9 @@ def _forward(self, layer: Module, inputs: List[Tensor]) -> None: f'{inputs!r}') # get the weight and bias + use_bias = layer.use_bias weight = layer.weight_store() - bias = layer.bias_store() if layer.bias_store is not None else None + bias = layer.bias_store() if use_bias else None is_conv_transpose = isinstance(layer, (LinearConvTranspose1d, LinearConvTranspose2d, LinearConvTranspose3d)) diff --git a/tensorkit/layers/__init__.py b/tensorkit/layers/__init__.py index 0d66afb..436ccd8 100644 --- a/tensorkit/layers/__init__.py +++ b/tensorkit/layers/__init__.py @@ -1,4 +1,5 @@ from .activation import * +from .builder import * from .composed import * from .contextual import * from .core import * diff --git a/tensorkit/layers/activation.py b/tensorkit/layers/activation.py index e86a875..fb5cd03 100644 --- a/tensorkit/layers/activation.py +++ b/tensorkit/layers/activation.py @@ -1,19 +1,19 @@ from ..tensor import Tensor, tanh -from ..tensor.nn import LEAKY_RELU_DEFAULT_SLOPE, relu, leaky_relu, sigmoid +from ..tensor.nn import * from .core import * __all__ = [ - 'ReLU', 'LeakyReLU', 'Tanh', 'Sigmoid', + 'ReLU', 'LeakyReLU', 'Tanh', 'Sigmoid', 'LogSoftmax', ] -class 
ReLU(BaseSingleVariateLayer): +class ReLU(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return relu(input) -class LeakyReLU(BaseSingleVariateLayer): +class LeakyReLU(BaseLayer): __constants__ = ('negative_slope',) @@ -23,17 +23,23 @@ def __init__(self, negative_slope=LEAKY_RELU_DEFAULT_SLOPE): super().__init__() self.negative_slope = negative_slope - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return leaky_relu(input, negative_slope=self.negative_slope) -class Tanh(BaseSingleVariateLayer): +class Tanh(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return tanh(input) -class Sigmoid(BaseSingleVariateLayer): +class Sigmoid(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return sigmoid(input) + + +class LogSoftmax(BaseLayer): + + def forward(self, input: Tensor) -> Tensor: + return log_softmax(input) diff --git a/tensorkit/layers/builder.py b/tensorkit/layers/builder.py new file mode 100644 index 0000000..9cae176 --- /dev/null +++ b/tensorkit/layers/builder.py @@ -0,0 +1,651 @@ +import re +from contextlib import contextmanager +from typing import * + +from mltk.utils import NOT_SET + +from .activation import * +from .composed import * +from .core import * +from .pool import * +from .resnet import * +from .shape_ import * +from .. import tensor as T +from ..arg_check import * +from ..typing_ import * + +__all__ = ['SequentialBuilder'] + + +def _get_layer_class(name: str) -> type: + if not _cached_layer_class_names_map: + # map the standard names of the layers to the layer classes + import tensorkit as tk + for attr in dir(tk.layers): + val = getattr(tk.layers, attr) + if isinstance(val, type) and issubclass(val, T.Module): + _cached_layer_class_names_map[attr.lower()] = val + + # aliases to XXXTransposeNd + for spatial_ndims in (1, 2, 3): + for prefix in ('LinearConv', 'Conv'): + # the original name and the layer class + orig_name = f'{prefix}Transpose{spatial_ndims}d' + layer_cls = getattr(tk.layers, orig_name) + + # the new name + alias_name = orig_name + alias_name = alias_name.replace('ConvTranspose', 'DeConv') + _cached_layer_class_names_map[alias_name.lower()] = layer_cls + + canonical_name = name.lower().replace('_', '') + if canonical_name not in _cached_layer_class_names_map: + raise ValueError(f'Unsupported layer class: {name!r}.') + return _cached_layer_class_names_map[canonical_name] + + +_cached_layer_class_names_map = {} + + +def _calculate_conv_output_size(in_size, kernel_size, stride, padding, dilation): + out_size = [] + for i, k, s, p, d in zip(in_size, kernel_size, stride, padding, dilation): + if i is None: + out_size.append(None) + else: + l = T.utils.calculate_conv_output_size([i], [k], [s], [p], [d])[0] + out_size.append(l) + return out_size + + +def _calculate_deconv_output_size(in_size, kernel_size, stride, padding, output_padding, dilation): + out_size = [] + for i, k, s, p, op, d in zip(in_size, kernel_size, stride, padding, output_padding, dilation): + if i is None: + out_size.append(None) + else: + l = T.utils.calculate_deconv_output_size(d[i], [k], [s], [p], [op], [d])[0] + out_size.append(l) + return out_size + + +if T.IS_CHANNEL_LAST: + def _split_channel_spatial(shape): + return shape[-1], shape[:-1] + + + def _unsplit_channel_spatial(channel, spatial): + return list(spatial) + [channel] + +else: + def _split_channel_spatial(shape): + return 
shape[0], shape[1:] + + + def _unsplit_channel_spatial(channel, spatial): + return [channel] + list(spatial) + + +class LayerArgs(object): + """A class that manages the default arguments for constructing layers.""" + + args: Dict[type, Dict[str, Any]] + + def __init__(self, layer_args: Optional['LayerArgs'] = None): + """ + Construct a new :class:`LayerArgs` instance. + + Args: + layer_args: Clone from this :class:`LayerArgs` instance. + """ + if layer_args is not None: + self.args = {type_: {key: val for key, val in type_args.items()} + for type_, type_args in layer_args.args.items()} + else: + self.args = {} + + def set_args(self, + type_or_types_: Union[ + str, Type[T.Module], Sequence[Union[str, Type[T.Module]]]], + **kwargs): + """ + Set default arguments for the specified layer types. + + Args: + type_or_types_: The layer type or types. + **kwargs: The default arguments to be set. + """ + if isinstance(type_or_types_, (str, type)): + type_or_types_ = [type_or_types_] + + for type_ in type_or_types_: + if isinstance(type_, str): + type_ = _get_layer_class(type_) + if type_ not in self.args: + self.args[type_] = {} + self.args[type_].update(kwargs) + + def get_kwargs(self, type_: Union[str, type], **kwargs) -> Dict[str, Any]: + """ + Get the merged keyword arguments for the specified layer type. + + Args: + type_: The layer type. + **kwargs: The overrided keyword arguments. + + Returns: + The merged keyword arguments. + """ + if isinstance(type_, str): + type_ = _get_layer_class(type_) + layer_args = self.args.get(type_) + if layer_args: + for key, val in layer_args.items(): + kwargs.setdefault(key, val) + return kwargs + + def build(self, type_: Union[str, type], *args, **kwargs): + """ + Build the layer with default arguments. + + Args: + type_: The layer type. + *args: The positional arguments. + **kwargs: The named arguments, which may override the default + arguments. + + Returns: + The built layer object. + """ + return type_(*args, **self.get_kwargs(type_, **kwargs)) + + +class SequentialBuilder(object): + """A class that helps to build a sequence layers.""" + + in_shape: List[Optional[int]] + out_shape: List[Optional[int]] + layer_args: LayerArgs + layers: List[T.Module] + + def __init__(self, + in_spec: Union[ + Optional[int], + Sequence[Optional[int]], + 'SequentialBuilder'] = NOT_SET, + *, + in_shape: Sequence[Optional[int]] = NOT_SET, + in_channels: Optional[int] = NOT_SET, + in_spatial_shape: List[int] = NOT_SET, + in_builder: 'SequentialBuilder' = NOT_SET): + """ + Construct a new :class:`SequentialBuilder`. + + Args: + in_spec: Positional argument, maybe the input shape, the number + of input channels, or another instance of `SequentialBuilder`, + whose layer arguments will be cloned and `out_shape` will be + used as the `in_shape` of this :class:`SequentialBuilder`. + in_shape: The input shape. + in_channels: The number of input channels. + in_spatial_shape: The input spatial shape. Can be specified + only if `in_channels` is specified, or `in_spec` is a int. + in_builder: Explicitly specify the previous sequential builder. + """ + + # parse the argument + if int(in_spec is not NOT_SET) + int(in_shape is not NOT_SET) + \ + int(in_channels is not NOT_SET) + int(in_builder is not NOT_SET) != 1: + raise ValueError( + 'One and only one of `in_spec`, `in_shape`, `in_channels` and ' + '`in_builder` should be specified.' 
+ ) + + if isinstance(in_spec, SequentialBuilder): + in_builder = in_spec + layer_args = LayerArgs(in_builder.layer_args) + elif hasattr(in_spec, '__iter__'): + in_shape = in_spec + layer_args = LayerArgs() + else: + in_channels = in_spec + layer_args = LayerArgs() + + if in_spatial_shape is not NOT_SET and in_channels is NOT_SET: + raise ValueError( + '`in_spatial_shape` can be specified only when `in_channels` ' + 'is specified, or `in_spec` is None or an integer.' + ) + + if in_shape is not NOT_SET: + in_shape = list(in_shape) + elif in_channels is not NOT_SET: + if in_spatial_shape is NOT_SET: + in_spatial_shape = [] + in_shape = _unsplit_channel_spatial(in_channels, in_spatial_shape) + else: + in_shape = list(in_builder.out_shape) + + # create the object + self.in_shape = in_shape + self.out_shape = in_shape + self.layer_args = layer_args + self.layers = [] + + def _assert_out_shape(self, + shape: Optional[Sequence[bool]] = None, + channel: Optional[bool] = None, + spatial: Optional[Sequence[bool]] = None, + at_least: bool = False) -> List[Optional[int]]: + if shape is None: + if channel is None: + raise ValueError('`channel` must be specified when `shape` is not.') + shape = _unsplit_channel_spatial(channel, spatial or []) + + ndims = len(shape) + if at_least: + if len(self.out_shape) < ndims: + raise ValueError( + f'The previous output shape is expected to be ' + f'at least {ndims}d: got output shape {self.out_shape}.' + ) + else: + if len(self.out_shape) != ndims: + raise ValueError( + f'The previous output shape is expected to be ' + f'exactly {ndims}d: got output shape {self.out_shape}.' + ) + + for i, (d, s) in enumerate( + zip(shape[::-1], self.out_shape[::-1]), 1): + if d and s is None: + raise ValueError( + f'Axis {-i} of the previous output shape is expected ' + f'to be deterministic: got output shape {self.out_shape}.' + ) + + return self.out_shape + + def _split_out_shape(self, + channel: Optional[bool] = None, + spatial: Optional[Sequence[bool]] = None + ) -> Tuple[Optional[int], List[Optional[int]]]: + out_shape = self._assert_out_shape(channel=channel, spatial=spatial) + return _split_channel_spatial(out_shape) + + def set_args(self, + type_or_types_: Union[str, type, Sequence[Union[str, type]]], + **kwargs) -> 'SequentialBuilder': + """ + Set layer default arguments. + + Args: + type_or_types_: The layer type or types. + **kwargs: The default arguments. + + Returns: + This sequential builder object. + """ + self.layer_args.set_args(type_or_types_, **kwargs) + return self + + @contextmanager + def arg_scope(self, + type_or_types_: Union[str, type, Sequence[Union[str, type]]], + **kwargs) -> Generator[None, None, None]: + """ + Set layer default arguments within a scope, which will be restore to + the previous values after exiting the scope. + + Args: + type_or_types_: The layer type or types. + **kwargs: The default arguments. + """ + old_layer_args = self.layer_args + layer_args = LayerArgs(old_layer_args) + layer_args.set_args(type_or_types_, **kwargs) + self.layer_args = layer_args + try: + yield + finally: + self.layer_args = old_layer_args + + def add(self, + layer: T.Module, + out_shape: List[Optional[int]] = NOT_SET, + *, + out_channels: Optional[int] = NOT_SET, + out_spatial_shape: List[Optional[int]] = NOT_SET + ) -> 'SequentialBuilder': + """ + Manually add a layer to this builder. + + Args: + layer: The layer to be added. + out_shape: The new output shape. + out_channels: The new output channels. 
Should be specified and + only be specified when `out_shape` is not. + out_spatial_shape: The new spatial shape. Should only be specified + when `out_channels` is specified. + + Returns: + This sequential builder object. + """ + if (out_shape is NOT_SET) == (out_channels is NOT_SET): + raise ValueError('Either `out_shape` or `out_channels` should be ' + 'specified, but not both.') + if out_spatial_shape is not NOT_SET and out_channels is NOT_SET: + raise ValueError('`out_spatial_shape` can only be specified when ' + '`out_channels` is specified.') + + if out_channels is not NOT_SET: + if out_spatial_shape is NOT_SET: + out_spatial_shape = [] + out_shape = _unsplit_channel_spatial(out_channels, out_spatial_shape) + + self.layers.append(layer) + self.out_shape = out_shape + return self + + def build(self, + flatten_to_ndims: bool = True, + disable_jit: bool = False) -> T.Module: + """ + Build the sequential layer. + + Args: + flatten_to_ndims: Whether or not to wrap the sequential layer + with a :class:`FlattenToNDims` layer? + disable_jit: Whether or not to disable JIT? + + Returns: + The built sequential layer. + """ + if not self.layers: + raise RuntimeError('No layer has been added.') + elif len(self.layers) == 1: + layer = self.layers[0] + else: + layer = Sequential(self.layers) + + if flatten_to_ndims: + layer = FlattenToNDims(layer, ndims=len(self.in_shape) + 1) + if not disable_jit: + layer = T.jit_compile(layer) + return layer + + # ---- activation ---- + def _make_activation(self, type_): + self._assert_out_shape((False,), at_least=True) + layer = self.layer_args.build(type_) + return self.add(layer, self.out_shape) + + def relu(self): + return self._make_activation(ReLU) + + def leaky_relu(self): + return self._make_activation(LeakyReLU) + + def sigmoid(self): + return self._make_activation(Sigmoid) + + def tanh(self): + return self._make_activation(Tanh) + + def log_softmax(self): + return self._make_activation(LogSoftmax) + + # ---- fully-connected layers ---- + def _fully_connected(self, layer_cls, out_features, **kwargs): + in_features, _ = self._split_out_shape(True) + layer = self.layer_args.build(layer_cls, in_features, out_features, **kwargs) + return self.add(layer, [out_features]) + + def linear(self, out_features: int, **kwargs): + return self._fully_connected(Linear, out_features, **kwargs) + + def dense(self, out_features: int, **kwargs): + return self._fully_connected(Dense, out_features, **kwargs) + + # ---- convolution layers ---- + def _conv_nd(self, spatial_ndims, conv_cls, out_channels, **kwargs): + in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) + + # validate the arguments + kwargs = self.layer_args.get_kwargs(conv_cls, **kwargs) + kernel_size = validate_conv_size('kernel_size', kwargs['kernel_size'], spatial_ndims) + stride = validate_conv_size('stride', kwargs.get('stride', 1), spatial_ndims) + dilation = validate_conv_size('dilation', kwargs.get('dilation', 1), spatial_ndims) + padding = validate_padding( + kwargs.get('padding', PaddingMode.DEFAULT), kernel_size, dilation, spatial_ndims) + + # calculate the output shape + out_size = _calculate_conv_output_size(in_size, kernel_size, stride, padding, dilation) + out_shape = _unsplit_channel_spatial(out_channels, out_size) + + # build the layer + layer = conv_cls(in_channels, out_channels, **kwargs) + return self.add(layer, out_shape) + + def linear_conv1d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(1, LinearConv1d, out_channels, **kwargs) + 
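+    # A minimal usage sketch (illustration only, not part of the original code;
+    # the shapes shown assume a channel-first layout and a padding mode that
+    # keeps the spatial size unchanged at stride 1):
+    #
+    #   builder = SequentialBuilder([1, 28, 28])
+    #   builder.conv2d(16, kernel_size=3)            # out_shape ~ [16, 28, 28]
+    #   builder.conv2d(32, kernel_size=3, stride=2)  # out_shape ~ [32, 14, 14]
+    #   builder.global_avg_pool2d().linear(10)       # out_shape == [10]
+    #   classifier = builder.build()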
+ def linear_conv2d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(2, LinearConv2d, out_channels, **kwargs) + + def linear_conv3d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(3, LinearConv3d, out_channels, **kwargs) + + def conv1d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(1, Conv1d, out_channels, **kwargs) + + def conv2d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(2, Conv2d, out_channels, **kwargs) + + def conv3d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(3, Conv3d, out_channels, **kwargs) + + def res_block1d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(1, ResBlock1d, out_channels, **kwargs) + + def res_block2d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(2, ResBlock2d, out_channels, **kwargs) + + def res_block3d(self, + out_channels: int, + **kwargs) -> 'SequentialBuilder': + return self._conv_nd(3, ResBlock3d, out_channels, **kwargs) + + # ---- deconvolution layers ---- + def _deconv_nd(self, spatial_ndims, deconv_cls, out_channels, output_size, **kwargs): + in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) + + # validate the arguments + kwargs = self.layer_args.get_kwargs(deconv_cls, **kwargs) + kernel_size = validate_conv_size('kernel_size', kwargs['kernel_size'], spatial_ndims) + stride = validate_conv_size('stride', kwargs.get('stride', 1), spatial_ndims) + dilation = validate_conv_size('dilation', kwargs.get('dilation', 1), spatial_ndims) + padding = validate_padding( + kwargs.get('padding', PaddingMode.DEFAULT), kernel_size, dilation, spatial_ndims) + + if 'output_padding' in kwargs and output_size is not NOT_SET: + raise ValueError('`output_padding` and `out_shape` cannot be both specified.') + elif output_size is not NOT_SET: + if len(output_size) != spatial_ndims: + raise ValueError( + f'`output_size` is expected to be {spatial_ndims}d: ' + f'got {output_size}.' + ) + if any(i is None for i in in_size): + raise ValueError( + f'Specifying `output_size` instead of `output_padding` ' + f'is supported only when the previous output shape ' + f'is all deterministic.' 
+ ) + out_size = output_size + output_padding = [ + T.utils.calculate_deconv_output_padding(*args) + for args in zip( + in_size, output_size, kernel_size, stride, padding, dilation) + ] + elif 'output_padding' in kwargs: + output_padding = validate_output_padding( + kwargs.get('output_padding', 0), stride, dilation, spatial_ndims) + out_size = None + else: + output_padding = [0] * spatial_ndims + out_size = None + + # calculate the output shape if not specified + if out_size is None: + out_size = _calculate_deconv_output_size( + in_size, kernel_size, stride, padding, output_padding, dilation) + out_shape = _unsplit_channel_spatial(out_channels, out_size) + + # build the layer + kwargs['output_padding'] = output_padding + layer = deconv_cls(in_channels, out_channels, **kwargs) + return self.add(layer, out_shape) + + def linear_conv_transpose1d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 1, LinearConvTranspose1d, out_channels, output_size, **kwargs) + + def linear_conv_transpose2d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 2, LinearConvTranspose2d, out_channels, output_size, **kwargs) + + def linear_conv_transpose3d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 3, LinearConvTranspose3d, out_channels, output_size, **kwargs) + + def conv_transpose1d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 1, ConvTranspose1d, out_channels, output_size, **kwargs) + + def conv_transpose2d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 2, ConvTranspose2d, out_channels, output_size, **kwargs) + + def conv_transpose3d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 3, ConvTranspose3d, out_channels, output_size, **kwargs) + + def res_block_transpose1d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 1, ResBlockTranspose1d, out_channels, output_size, **kwargs) + + def res_block_transpose2d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 2, ResBlockTranspose2d, out_channels, output_size, **kwargs) + + def res_block_transpose3d(self, + out_channels: int, + output_size: List[int] = NOT_SET, + **kwargs) -> 'SequentialBuilder': + return self._deconv_nd( + 3, ResBlockTranspose3d, out_channels, output_size, **kwargs) + + # aliases for the deconvolution layers + linear_deconv1d = linear_conv_transpose1d + linear_deconv2d = linear_conv_transpose2d + linear_deconv3d = linear_conv_transpose3d + deconv1d = conv_transpose1d + deconv2d = conv_transpose2d + deconv3d = conv_transpose3d + + # ---- pool layers ---- + def _pool_nd(self, spatial_ndims, pool_cls, **kwargs): + in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) + + # validate the arguments + kwargs = self.layer_args.get_kwargs(pool_cls, **kwargs) + kernel_size = validate_conv_size('kernel_size', kwargs['kernel_size'], spatial_ndims) + stride = validate_conv_size('stride', kwargs.get('stride', kernel_size), spatial_ndims) + dilation = [1] * spatial_ndims + padding = validate_padding(kwargs.get('padding', 
PaddingMode.DEFAULT), kernel_size, dilation, spatial_ndims) + + # calculate the output shape + out_size = _calculate_conv_output_size(in_size, kernel_size, stride, padding, dilation) + out_shape = _unsplit_channel_spatial(in_channels, out_size) + + # build the layer + layer = pool_cls(**kwargs) + return self.add(layer, out_shape) + + def avg_pool1d(self, **kwargs) -> 'SequentialBuilder': + return self._pool_nd(1, AvgPool1d, **kwargs) + + def avg_pool2d(self, **kwargs) -> 'SequentialBuilder': + return self._pool_nd(2, AvgPool2d, **kwargs) + + def avg_pool3d(self, **kwargs) -> 'SequentialBuilder': + return self._pool_nd(3, AvgPool3d, **kwargs) + + def max_pool1d(self, **kwargs) -> 'SequentialBuilder': + return self._pool_nd(1, MaxPool1d, **kwargs) + + def max_pool2d(self, **kwargs) -> 'SequentialBuilder': + return self._pool_nd(2, MaxPool2d, **kwargs) + + def max_pool3d(self, **kwargs) -> 'SequentialBuilder': + return self._pool_nd(3, MaxPool3d, **kwargs) + + def _global_avg_pool_nd(self, spatial_ndims, pool_cls, **kwargs): + in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) + keepdims = kwargs.get('keepdims', False) + if keepdims: + out_shape = _unsplit_channel_spatial(in_channels, [1] * spatial_ndims) + else: + out_shape = [in_channels] + layer = pool_cls(**self.layer_args.get_kwargs(pool_cls, **kwargs)) + return self.add(layer, out_shape) + + def global_avg_pool1d(self, **kwargs) -> 'SequentialBuilder': + return self._global_avg_pool_nd(1, GlobalAvgPool1d, **kwargs) + + def global_avg_pool2d(self, **kwargs) -> 'SequentialBuilder': + return self._global_avg_pool_nd(2, GlobalAvgPool2d, **kwargs) + + def global_avg_pool3d(self, **kwargs) -> 'SequentialBuilder': + return self._global_avg_pool_nd(3, GlobalAvgPool3d, **kwargs) diff --git a/tensorkit/layers/contextual.py b/tensorkit/layers/contextual.py index 5b4769f..1251830 100644 --- a/tensorkit/layers/contextual.py +++ b/tensorkit/layers/contextual.py @@ -1,6 +1,6 @@ from typing import * -from ..tensor import Tensor, jit_method +from ..tensor import Tensor from .core import * __all__ = [ @@ -8,37 +8,42 @@ ] -class IgnoreContext(BaseContextualLayer): +class IgnoreContext(BaseLayer): """ A module which simply returns the input, ignoring any context. """ - @jit_method - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: return input -class AddContext(BaseContextualLayer): +class AddContext(BaseLayer): """ A module which adds the input with the contexts. """ - @jit_method - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: output = input - for t in context: - output = output + t + if context is not None: + for t in context: + output = output + t return output -class MultiplyContext(BaseContextualLayer): +class MultiplyContext(BaseLayer): """ A module which multiplies the input with the contexts. 
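    For example, `layer(x, [a, b])` computes `x * a * b`; when `context` is
    None or empty, the input is returned unchanged.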
""" - @jit_method - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: output = input - for t in context: - output = output * t + if context is not None: + for t in context: + output = output * t return output diff --git a/tensorkit/layers/flow_layer.py b/tensorkit/layers/flow_layer.py index d780b0d..0ad547e 100644 --- a/tensorkit/layers/flow_layer.py +++ b/tensorkit/layers/flow_layer.py @@ -1,4 +1,4 @@ -from ..backend.flows import BaseFlow +from ..backend.flows import Flow from ..tensor import Tensor, Module, is_jit_layer from .core import * @@ -8,7 +8,7 @@ ] -class FlowLayer(BaseSingleVariateLayer): +class FlowLayer(BaseLayer): """ Wrap a :class:`tk.flows.BaseFlow` into a single-input, single-output layer. """ @@ -17,13 +17,13 @@ class FlowLayer(BaseSingleVariateLayer): flow: Module - def __init__(self, flow: BaseFlow): - if not is_jit_layer(flow) and not isinstance(flow, BaseFlow): + def __init__(self, flow: Flow): + if not is_jit_layer(flow) and not isinstance(flow, Flow): raise TypeError(f'`flow` must be a flow: got {flow!r}') super().__init__() self.flow = flow - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: output, output_log_det = self.flow(input, compute_log_det=False) return output diff --git a/tensorkit/layers/gated.py b/tensorkit/layers/gated.py index 99f3241..83bdaa3 100644 --- a/tensorkit/layers/gated.py +++ b/tensorkit/layers/gated.py @@ -7,7 +7,7 @@ ] -class BaseGated(BaseSingleVariateLayer): +class BaseGated(BaseLayer): __constants__ = ('feature_axis', 'num_features', 'gate_bias', 'activation') @@ -27,8 +27,7 @@ def __init__(self, def _apply_activation(self, input: Tensor) -> Tensor: raise NotImplementedError() - @jit_method - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: if input.shape[self.feature_axis] != self.num_features * 2: raise ValueError( 'The shape of the pre-gated output is invalid: ' diff --git a/tensorkit/layers/pixelcnn.py b/tensorkit/layers/pixelcnn.py index 309ac4b..52a551c 100644 --- a/tensorkit/layers/pixelcnn.py +++ b/tensorkit/layers/pixelcnn.py @@ -72,7 +72,7 @@ def shifted_deconv(deconv_cls, dilation=dilation, padding=padding, **kwargs) -class SpatialShift(BaseSingleVariateLayer): +class SpatialShift(BaseLayer): __constants__ = ('shift',) @@ -85,11 +85,11 @@ def __init__(self, shift: Sequence[int]): else: self.shift = list(shift) - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return shift(input, self.shift) -class BranchAndAdd(BaseSingleVariateLayer): +class BranchAndAdd(BaseLayer): __constants__ = ('branches',) @@ -99,7 +99,7 @@ def __init__(self, *branches: Union[Module, Sequence[Module]]): super().__init__() self.branches = ModuleList(flatten_nested_layers(branches)) - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: branch_outputs: List[Tensor] = [] for branch in self.branches: branch_outputs.append(branch(input)) @@ -109,7 +109,7 @@ def _forward(self, input: Tensor) -> Tensor: return output -class AddOnesChannelNd(BaseSingleVariateLayer): +class AddOnesChannelNd(BaseLayer): __constants__ = ('_channel_axis', '_spatial_ndims') @@ -128,7 +128,7 @@ def __init__(self): def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: channel_shape = shape(input) 
channel_shape[self._channel_axis] = 1 @@ -154,7 +154,7 @@ def _get_spatial_ndims(self) -> int: return 3 -class AddLeadingContext(BaseContextualLayer): +class AddLeadingContext(BaseLayer): __constants__ = ('first_n',) @@ -162,14 +162,18 @@ def __init__(self, first_n: int): super().__init__() self.first_n = first_n - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: + if context is None: # pragma: no cover + raise RuntimeError('`context` is required.') output = input for i in range(self.first_n): output = output + context[i] return output -class IgnoreLeadingContext(BaseContextualLayer): +class IgnoreLeadingContext(BaseLayer): __constants__ = ('wrapped', 'first_n',) @@ -181,7 +185,11 @@ def __init__(self, wrapped: Module, first_n: int): self.wrapped = wrapped self.first_n = first_n - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: + if context is None: # pragma: no cover + raise RuntimeError('`context` is required.') return self.wrapped(input, context[self.first_n:]) @@ -226,7 +234,7 @@ def validate_pixelcnn_kernel_size(kernel_size, spatial_ndims: int) -> List[int]: # ---- pixelcnn input layer, which constructs the multiple pixelcnn stacks ---- -class PixelCNNInputNd(BaseSplitLayer): +class PixelCNNInputNd(BaseLayer): __constants__ = ('_spatial_ndims', 'add_ones_channel', 'stacks',) @@ -320,7 +328,7 @@ def __init__(self, def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _forward(self, input: Tensor) -> List[Tensor]: + def forward(self, input: Tensor) -> List[Tensor]: if rank(input) != self._spatial_ndims + 2: raise ValueError( '`input` is expected to be {}d: got input shape {}.'. @@ -371,7 +379,7 @@ def _get_spatial_ndims(self) -> int: # ---- pixelcnn output layer, which obtains the final output from the stacks ---- -class PixelCNNOutputNd(BaseMergeLayer): +class PixelCNNOutputNd(BaseLayer): __constants__ = ('_spatial_ndims',) @@ -384,7 +392,7 @@ def __init__(self): def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _forward(self, inputs: List[Tensor]) -> Tensor: + def forward(self, inputs: List[Tensor]) -> Tensor: if len(inputs) != self._spatial_ndims: raise ValueError( '`len(inputs)` is expected to be {}: got {} tensors.'. 
@@ -424,7 +432,7 @@ def _get_spatial_ndims(self) -> int: # ---- pixelcnn layers ---- -class PixelCNNResBlockNd(BaseMultiVariateContextualLayer): +class PixelCNNResBlockNd(BaseLayer): __constants__ = ('resnet_layers',) @@ -549,7 +557,12 @@ def __init__(self, super().__init__() self.resnet_layers = ModuleList(resnet_layers) - def _forward(self, inputs: List[Tensor], context: List[Tensor]) -> List[Tensor]: + def forward(self, + inputs: List[Tensor], + context: Optional[List[Tensor]] = None) -> List[Tensor]: + if context is None: + context = [] + resnet_outputs: List[Tensor] = [] i = 0 for resnet_layer in self.resnet_layers: @@ -585,7 +598,7 @@ def _get_spatial_ndims(self) -> int: # ---- pixelcnn down-sampling conv layers and up-sampling deconv layers ---- -class PixelCNNConvNd(BaseMultiVariateContextualLayer): +class PixelCNNConvNd(BaseLayer): __constants__ = ('conv_layers',) @@ -654,7 +667,9 @@ def __init__(self, def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _forward(self, inputs: List[Tensor], context: List[Tensor]) -> List[Tensor]: + def forward(self, + inputs: List[Tensor], + context: Optional[List[Tensor]] = None) -> List[Tensor]: conv_outputs: List[Tensor] = [] i = 0 for conv_layer in self.conv_layers: @@ -703,7 +718,7 @@ def _get_spatial_ndims(self) -> int: return 3 -class PixelCNNConvTransposeNd(BaseMultiVariateContextualLayer): +class PixelCNNConvTransposeNd(BaseLayer): __constants__ = ('deconv_layers',) @@ -775,7 +790,9 @@ def __init__(self, def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _forward(self, inputs: List[Tensor], context: List[Tensor]) -> List[Tensor]: + def forward(self, + inputs: List[Tensor], + context: Optional[List[Tensor]] = None) -> List[Tensor]: deconv_outputs: List[Tensor] = [] i = 0 for conv_layer in self.deconv_layers: @@ -825,7 +842,7 @@ def _get_spatial_ndims(self) -> int: # ---- pixelcnn network composer ---- -class PixelCNNNd(BaseContextualLayer): +class PixelCNNNd(BaseLayer): __constants__ = ('input_layer', 'layers', 'output_layer') @@ -865,7 +882,9 @@ def __init__(self, def _get_spatial_ndims(self) -> int: raise NotImplementedError() - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: outputs = self.input_layer(input) for block in self.layers: outputs = block(outputs, context) diff --git a/tensorkit/layers/pool.py b/tensorkit/layers/pool.py index f1ffd17..21a09aa 100644 --- a/tensorkit/layers/pool.py +++ b/tensorkit/layers/pool.py @@ -15,7 +15,7 @@ # ---- average pooling ---- -class AvgPoolNd(BaseSingleVariateLayer): +class AvgPoolNd(BaseLayer): __constants__ = ('kernel_size', 'stride', 'padding', 'count_padded_zeros') @@ -67,7 +67,7 @@ class AvgPool1d(AvgPoolNd): def _get_spatial_ndims(self) -> int: return 1 - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return avg_pool1d( input, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, count_padded_zeros=self.count_padded_zeros, @@ -79,7 +79,7 @@ class AvgPool2d(AvgPoolNd): def _get_spatial_ndims(self) -> int: return 2 - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return avg_pool2d( input, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, count_padded_zeros=self.count_padded_zeros, @@ -91,7 +91,7 @@ class AvgPool3d(AvgPoolNd): def _get_spatial_ndims(self) -> int: return 3 - def _forward(self, input: Tensor) -> Tensor: + 
def forward(self, input: Tensor) -> Tensor: return avg_pool3d( input, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, count_padded_zeros=self.count_padded_zeros, @@ -99,7 +99,7 @@ def _forward(self, input: Tensor) -> Tensor: # ---- max pooling ---- -class MaxPoolNd(BaseSingleVariateLayer): +class MaxPoolNd(BaseLayer): __constants__ = ('kernel_size', 'stride', 'padding') @@ -146,7 +146,7 @@ class MaxPool1d(MaxPoolNd): def _get_spatial_ndims(self) -> int: return 1 - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return max_pool1d( input, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding @@ -158,7 +158,7 @@ class MaxPool2d(MaxPoolNd): def _get_spatial_ndims(self) -> int: return 2 - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return max_pool2d( input, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding @@ -170,7 +170,7 @@ class MaxPool3d(MaxPoolNd): def _get_spatial_ndims(self) -> int: return 3 - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return max_pool3d( input, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding @@ -178,7 +178,7 @@ def _forward(self, input: Tensor) -> Tensor: # ---- global average pooling ---- -class GlobalAvgPoolNd(BaseSingleVariateLayer): +class GlobalAvgPoolNd(BaseLayer): __constants__ = ('spatial_ndims', 'reduce_axis', 'keepdims') @@ -208,7 +208,7 @@ def _get_spatial_ndims(self) -> int: def __repr__(self) -> str: return f'{self.__class__.__qualname__}(keepdims={self.keepdims})' - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: if len(input.shape) < self.spatial_ndims + 1: raise ValueError( '`rank(input)` is too low: expected to be at least ' diff --git a/tensorkit/layers/resnet.py b/tensorkit/layers/resnet.py index 7a5ffda..9f4d5bd 100644 --- a/tensorkit/layers/resnet.py +++ b/tensorkit/layers/resnet.py @@ -13,7 +13,7 @@ ] -class ResBlockNd(BaseContextualLayer): +class ResBlockNd(BaseLayer): """ A general implementation of ResNet block. @@ -24,7 +24,8 @@ class ResBlockNd(BaseContextualLayer): .. 
code-block:: python shortcut = input - if strides != 1 or in_channels != out_channels or use_shortcut: + if strides != 1 or (kernel_size != 1 and padding != 'half') or \ + in_channels != out_channels or use_shortcut: shortcut_layer = shortcut( in_channels=in_channels, out_channels=out_channels, @@ -211,6 +212,7 @@ def compile_layer_list(layers: List[Module]) -> Module: kernel_size = validate_conv_size('kernel_size', kernel_size, spatial_ndims) stride = validate_conv_size('strides', stride, spatial_ndims) dilation = validate_conv_size('dilation', dilation, spatial_ndims) + is_half_padding = padding == PaddingMode.HALF.value padding = validate_padding(padding, kernel_size, dilation, spatial_ndims) if output_padding != 0 and \ @@ -240,8 +242,10 @@ def compile_layer_list(layers: List[Module]) -> Module: if shortcut is not None: use_shortcut = True if use_shortcut is None: - use_shortcut = (any(s != 1 for s in stride) or - in_channels != out_channels) + use_shortcut = ( + any(s != 1 for s in stride) or + (not is_half_padding and any(k != 1 for k in stride)) or + in_channels != out_channels) if activation is not None: activation_factory = validate_layer_factory('activation', activation) @@ -391,8 +395,12 @@ def _default_conv_factory(self) -> LayerFactory: def _add_output_padding_to_kwargs(self, output_padding, kwargs): return kwargs - @jit_method - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, + input: Tensor, + context: Optional[List[Tensor]] = None) -> Tensor: + if context is None: + context = [] + # feed the input into both the shortcut and the residual path residual = shortcut = input diff --git a/tensorkit/layers/shape_.py b/tensorkit/layers/shape_.py index 44bbc8c..ea7266a 100644 --- a/tensorkit/layers/shape_.py +++ b/tensorkit/layers/shape_.py @@ -14,11 +14,10 @@ # ---- FlattenToNDims ---- -class FlattenToNDims(BaseSingleVariateLayer): +class FlattenToNDims(BaseLayer): __constants__ = ('layer', 'ndims') - layer: Module ndims: int def __init__(self, layer: Module, ndims: int): @@ -26,7 +25,7 @@ def __init__(self, layer: Module, ndims: int): self.layer = layer self.ndims = ndims - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: # validate the shape of input input_rank = rank(input) expected_rank = self.ndims @@ -44,7 +43,7 @@ def _forward(self, input: Tensor) -> Tensor: # ---- pad ---- -class ConstantPad(BaseSingleVariateLayer): +class ConstantPad(BaseLayer): __constants__ = ('padding', 'value') @@ -73,7 +72,7 @@ def check_int_tuple(t): self.padding = padding self.value = value - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return pad(input, self.padding, value=self.value) @@ -130,37 +129,37 @@ def _get_spatial_ndims(self): # ---- channel swap ---- -class ChannelFirstToLast1d(BaseSingleVariateLayer): +class ChannelFirstToLast1d(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return channel_first_to_last1d(input) -class ChannelFirstToLast2d(BaseSingleVariateLayer): +class ChannelFirstToLast2d(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return channel_first_to_last2d(input) -class ChannelFirstToLast3d(BaseSingleVariateLayer): +class ChannelFirstToLast3d(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return channel_first_to_last3d(input) -class ChannelLastToFirst1d(BaseSingleVariateLayer): 
+class ChannelLastToFirst1d(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return channel_last_to_first1d(input) -class ChannelLastToFirst2d(BaseSingleVariateLayer): +class ChannelLastToFirst2d(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return channel_last_to_first2d(input) -class ChannelLastToFirst3d(BaseSingleVariateLayer): +class ChannelLastToFirst3d(BaseLayer): - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return channel_last_to_first3d(input) diff --git a/tensorkit/layers/split_.py b/tensorkit/layers/split_.py index eee4c80..8945443 100644 --- a/tensorkit/layers/split_.py +++ b/tensorkit/layers/split_.py @@ -6,7 +6,7 @@ __all__ = ['Branch'] -class Branch(BaseSplitLayer): +class Branch(BaseLayer): """ A module that maps the input tensor into multiple tensors via sub-modules. @@ -38,7 +38,7 @@ def __init__(self, self.branches = ModuleList(list(branches)) self.shared = shared - def _forward(self, input: Tensor) -> List[Tensor]: + def forward(self, input: Tensor) -> List[Tensor]: outputs: List[Tensor] = [] shared_output = self.shared(input) for branch in self.branches: diff --git a/tensorkit/losses/core.py b/tensorkit/losses/core.py deleted file mode 100644 index e938efa..0000000 --- a/tensorkit/losses/core.py +++ /dev/null @@ -1,4 +0,0 @@ -from ..backend import losses -from ..backend.losses import * - -__all__ = losses.__all__ diff --git a/tensorkit/optim/__init__.py b/tensorkit/optim/__init__.py new file mode 100644 index 0000000..525c576 --- /dev/null +++ b/tensorkit/optim/__init__.py @@ -0,0 +1,2 @@ +from . import lr_scheduler +from .core import * diff --git a/tensorkit/optim/core.py b/tensorkit/optim/core.py new file mode 100644 index 0000000..da59b02 --- /dev/null +++ b/tensorkit/optim/core.py @@ -0,0 +1,4 @@ +from ..backend import optim +from ..backend.optim import * + +__all__ = optim.__all__ diff --git a/tensorkit/optim/lr_scheduler.py b/tensorkit/optim/lr_scheduler.py new file mode 100644 index 0000000..8dfb597 --- /dev/null +++ b/tensorkit/optim/lr_scheduler.py @@ -0,0 +1,71 @@ +from typing import * + +import mltk + +from .core import * + +__all__ = [ + 'LRScheduler', 'AnnealingLR', +] + + +class LRScheduler(object): + """ + Base class that schedules the learning rate of an optimizer + during a :class:`mltk.TrainLoop`. 
+ """ + + loop: mltk.TrainLoop + optimizer: Optimizer + + def __init__(self, + loop: mltk.TrainLoop, + optimizer: Optimizer): + self.loop = loop + self.optimizer = optimizer + self._bind_events(loop) + self.update_lr() + + def update_lr(self): + """Update the learning rate of the optimizer according to the loop.""" + raise NotImplementedError() + + def close(self): + """Close this scheduler, such that it will no longer affect the optimizer.""" + self._unbind_events(self.loop) + + def _bind_events(self, loop: mltk.TrainLoop): + raise NotImplementedError() + + def _unbind_events(self, loop: mltk.TrainLoop): + raise NotImplementedError() + + +class AnnealingLR(LRScheduler): + + initial_lr: float + ratio: float + epochs: int + + def __init__(self, + loop: mltk.TrainLoop, + optimizer: Optimizer, + initial_lr: float, + ratio: float, + epochs: int + ): + self.initial_lr = float(initial_lr) + self.ratio = float(ratio) + self.epochs = int(epochs) + super().__init__(loop, optimizer) + + def _bind_events(self, loop: mltk.TrainLoop): + loop.on_epoch_end.do(self.update_lr) + + def _unbind_events(self, loop: mltk.TrainLoop): + loop.on_epoch_end.cancel_do(self.update_lr) + + def update_lr(self): + n_cycles = int(self.loop.epoch // self.epochs) + lr_discount = self.ratio ** n_cycles + self.optimizer.set_lr(self.initial_lr * lr_discount) diff --git a/tensorkit/tensor/__init__.py b/tensorkit/tensor/__init__.py index 2d649ea..21aced9 100644 --- a/tensorkit/tensor/__init__.py +++ b/tensorkit/tensor/__init__.py @@ -1,2 +1,2 @@ -from . import linalg, nn, random +from . import linalg, nn, random, utils from .core import * diff --git a/tensorkit/tensor/utils.py b/tensorkit/tensor/utils.py new file mode 100644 index 0000000..1a13f13 --- /dev/null +++ b/tensorkit/tensor/utils.py @@ -0,0 +1,4 @@ +from ..backend import utils +from ..backend.utils import * + +__all__ = utils.__all__ diff --git a/tensorkit/losses/__init__.py b/tensorkit/train/__init__.py similarity index 100% rename from tensorkit/losses/__init__.py rename to tensorkit/train/__init__.py diff --git a/tensorkit/train/core.py b/tensorkit/train/core.py new file mode 100644 index 0000000..cd13464 --- /dev/null +++ b/tensorkit/train/core.py @@ -0,0 +1,4 @@ +from ..backend import train +from ..backend.train import * + +__all__ = train.__all__ diff --git a/tensorkit/utils/__init__.py b/tensorkit/utils/__init__.py new file mode 100644 index 0000000..101c343 --- /dev/null +++ b/tensorkit/utils/__init__.py @@ -0,0 +1,2 @@ +from .data_utils import * +from .tensor_stream import * diff --git a/tensorkit/utils/data_utils.py b/tensorkit/utils/data_utils.py new file mode 100644 index 0000000..a6a0168 --- /dev/null +++ b/tensorkit/utils/data_utils.py @@ -0,0 +1,97 @@ +import mltk + +import numpy as np + +from tensorkit import tensor as T + +__all__ = [ + 'numpy_channel_from_last_to_first1d', + 'numpy_channel_from_last_to_first2d', + 'numpy_channel_from_last_to_first3d', + + 'numpy_channel_from_first_to_last1d', + 'numpy_channel_from_first_to_last2d', + 'numpy_channel_from_first_to_last3d', + + 'numpy_channel_from_last_to_default1d', + 'numpy_channel_from_last_to_default2d', + 'numpy_channel_from_last_to_default3d', + + 'numpy_channel_from_default_to_last1d', + 'numpy_channel_from_default_to_last2d', + 'numpy_channel_from_default_to_last3d', +] + + +def numpy_channel_from_last_to_first_nd(input: np.ndarray, + spatial_ndims: int + ) -> np.ndarray: + if len(input.shape) < spatial_ndims + 2: + raise ValueError( + f'`input` is expected to be at least {spatial_ndims + 2}d: ' 
+ f'got `input.shape` {input.shape}.' + ) + axis = list(range(len(input.shape))) + transpose_axis = ( + axis[: -(spatial_ndims + 1)] + [-1] + + [i for i in range(-spatial_ndims - 1, -1)] + ) + return np.transpose(input, transpose_axis) + + +def numpy_channel_from_last_to_first1d(input: np.ndarray) -> np.ndarray: + return numpy_channel_from_last_to_first_nd(input, 1) + + +def numpy_channel_from_last_to_first2d(input: np.ndarray) -> np.ndarray: + return numpy_channel_from_last_to_first_nd(input, 2) + + +def numpy_channel_from_last_to_first3d(input: np.ndarray) -> np.ndarray: + return numpy_channel_from_last_to_first_nd(input, 3) + + +def numpy_channel_from_first_to_last_nd(input: np.ndarray, + spatial_ndims: int + ) -> np.ndarray: + if len(input.shape) < spatial_ndims + 2: + raise ValueError( + f'`input` is expected to be at least {spatial_ndims + 2}d: ' + f'got `input.shape` {input.shape}.' + ) + axis = list(range(len(input.shape))) + transpose_axis = ( + axis[: -(spatial_ndims + 1)] + + [i for i in range(-spatial_ndims, 0)] + + [-(spatial_ndims + 1)] + ) + return np.transpose(input, transpose_axis) + + +def numpy_channel_from_first_to_last1d(input: np.ndarray) -> np.ndarray: + return numpy_channel_from_first_to_last_nd(input, 1) + + +def numpy_channel_from_first_to_last2d(input: np.ndarray) -> np.ndarray: + return numpy_channel_from_first_to_last_nd(input, 2) + + +def numpy_channel_from_first_to_last3d(input: np.ndarray) -> np.ndarray: + return numpy_channel_from_first_to_last_nd(input, 3) + + +if T.IS_CHANNEL_LAST: + numpy_channel_from_last_to_default1d = \ + numpy_channel_from_last_to_default2d = \ + numpy_channel_from_last_to_default3d = \ + numpy_channel_from_default_to_last1d = \ + numpy_channel_from_default_to_last2d = \ + numpy_channel_from_default_to_last3d = \ + (lambda x: x) +else: + numpy_channel_from_last_to_default1d = numpy_channel_from_last_to_first1d + numpy_channel_from_last_to_default2d = numpy_channel_from_last_to_first2d + numpy_channel_from_last_to_default3d = numpy_channel_from_last_to_first3d + numpy_channel_from_default_to_last1d = numpy_channel_from_first_to_last1d + numpy_channel_from_default_to_last2d = numpy_channel_from_first_to_last2d + numpy_channel_from_default_to_last3d = numpy_channel_from_first_to_last3d diff --git a/tensorkit/utils/tensor_stream.py b/tensorkit/utils/tensor_stream.py new file mode 100644 index 0000000..e154a93 --- /dev/null +++ b/tensorkit/utils/tensor_stream.py @@ -0,0 +1,48 @@ +from typing import * + +import mltk +from mltk import ArrayTuple + +from .. 
import tensor as T + +__all__ = [ + 'TensorStream', + 'as_tensor_stream', +] + + +class TensorStream(mltk.DataStream): + + source: mltk.DataStream + + def __init__(self, source: mltk.DataStream): + super().__init__( + batch_size=source.batch_size, + array_count=source.array_count, + data_shapes=source.data_shapes, + data_length=source.data_length, + random_state=source.random_state, + ) + self.source = source + + def copy(self, **kwargs): + return TensorStream(source=self.source, **kwargs) + + def _minibatch_iterator(self) -> Generator[ArrayTuple, None, None]: + g = iter(self.source) + try: + for batch_data in g: + with T.no_grad(): + batch_data = tuple(T.from_numpy(arr) for arr in batch_data) + yield batch_data + finally: + g.close() + + +def as_tensor_stream(source: mltk.DataStream, + prefetch: Optional[int] = None + ) -> mltk.DataStream: + stream = TensorStream(source) + if prefetch is not None: + stream = stream.threaded(prefetch) + return stream diff --git a/tests/distributions/test_flow.py b/tests/distributions/test_flow.py index 5889b82..179154b 100644 --- a/tests/distributions/test_flow.py +++ b/tests/distributions/test_flow.py @@ -14,14 +14,14 @@ from tests.helper import * -class _MyFlow(tk.flows.BaseFlow): - - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: +class _MyFlow(tk.flows.Flow): + + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: if inverse: output = input * 2.0 + 1 event_ndims = self.x_event_ndims @@ -49,9 +49,9 @@ def _forward(self, def check_flow_distribution(ctx, distribution, flow): - min_event_ndims = flow.y_event_ndims + min_event_ndims = flow.get_y_event_ndims() max_event_ndims = (distribution.value_ndims + - (flow.y_event_ndims - flow.x_event_ndims)) + (flow.get_y_event_ndims() - flow.get_x_event_ndims())) def fn(event_ndims, reparameterized, validate_tensors): # construct the instance @@ -65,7 +65,7 @@ def fn(event_ndims, reparameterized, validate_tensors): if event_ndims is not None: kwargs['event_ndims'] = event_ndims else: - event_ndims = flow.y_event_ndims + event_ndims = flow.get_y_event_ndims() if validate_tensors is not None: kwargs['validate_tensors'] = validate_tensors @@ -82,11 +82,11 @@ def log_prob_fn(t): assert_allclose(y, t.tensor, atol=1e-4, rtol=1e-6) ctx.assertEqual( T.rank(log_det), - T.rank(log_px) - (flow.x_event_ndims - distribution.event_ndims) + T.rank(log_px) - (flow.get_x_event_ndims() - distribution.event_ndims) ) return -log_det + T.reduce_sum( log_px, T.int_range( - -(flow.x_event_ndims - distribution.event_ndims), + -(flow.get_x_event_ndims() - distribution.event_ndims), 0 ) ) @@ -100,7 +100,7 @@ def log_prob_fn(t): max_event_ndims=max_event_ndims, log_prob_fn=log_prob_fn, transform_origin_distribution=distribution, - transform_origin_group_ndims=flow.x_event_ndims - distribution.event_ndims, + transform_origin_group_ndims=flow.get_x_event_ndims() - distribution.event_ndims, # other attributes base_distribution=distribution, flow=flow, diff --git a/tests/flows/test_core.py b/tests/flows/test_core.py index 26cf5c1..8aeb32c 100644 --- a/tests/flows/test_core.py +++ b/tests/flows/test_core.py @@ -14,18 +14,18 @@ from tests.ops import * -class _MyFlow(BaseFlow): +class _MyFlow(Flow): def __init__(self): super().__init__(x_event_ndims=1, y_event_ndims=2, explicitly_invertible=True) - def _forward(self, - input: Tensor, - 
input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: if inverse: output = reshape_tail(0.5 * (input - 1.), 2, [-1]) else: @@ -47,18 +47,18 @@ def _forward(self, return output, output_log_det -class _MyBadFlow(BaseFlow): +class _MyBadFlow(Flow): def __init__(self): super().__init__(x_event_ndims=1, y_event_ndims=1, explicitly_invertible=True) - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool) -> Tuple[Tensor, Optional[Tensor]]: output = input output_log_det = input_log_det if compute_log_det: @@ -72,19 +72,19 @@ def _forward(self, class BaseFlowTestCase(unittest.TestCase): def test_constructor(self): - flow = BaseFlow(x_event_ndims=1, - y_event_ndims=2, - explicitly_invertible=True) - self.assertEqual(flow.x_event_ndims, 1) - self.assertEqual(flow.y_event_ndims, 2) - self.assertEqual(flow.explicitly_invertible, True) - - flow = BaseFlow(x_event_ndims=3, - y_event_ndims=1, - explicitly_invertible=False) - self.assertEqual(flow.x_event_ndims, 3) - self.assertEqual(flow.y_event_ndims, 1) - self.assertEqual(flow.explicitly_invertible, False) + flow = Flow(x_event_ndims=1, + y_event_ndims=2, + explicitly_invertible=True) + self.assertEqual(flow.get_x_event_ndims(), 1) + self.assertEqual(flow.get_y_event_ndims(), 2) + self.assertEqual(flow.is_explicitly_invertible(), True) + + flow = Flow(x_event_ndims=3, + y_event_ndims=1, + explicitly_invertible=False) + self.assertEqual(flow.get_x_event_ndims(), 3) + self.assertEqual(flow.get_y_event_ndims(), 1) + self.assertEqual(flow.is_explicitly_invertible(), False) def test_invert(self): flow = _MyFlow() @@ -93,9 +93,9 @@ def test_invert(self): def test_call(self): flow = T.jit_compile(_MyFlow()) - self.assertEqual(flow.x_event_ndims, 1) - self.assertEqual(flow.y_event_ndims, 2) - self.assertEqual(flow.explicitly_invertible, True) + self.assertEqual(flow.get_x_event_ndims(), 1) + self.assertEqual(flow.get_y_event_ndims(), 2) + self.assertEqual(flow.is_explicitly_invertible(), True) # test call x = T.random.randn([2, 3, 4]) @@ -138,13 +138,14 @@ def test_constructor(self): flow = FeatureMappingFlow(axis=-1, event_ndims=2, explicitly_invertible=True) - self.assertEqual(flow.event_ndims, 2) - - flow = T.jit_compile(flow) + self.assertEqual(flow.get_event_ndims(), 2) self.assertEqual(flow.axis, -1) - self.assertEqual(flow.x_event_ndims, 2) - self.assertEqual(flow.y_event_ndims, 2) - self.assertEqual(flow.explicitly_invertible, True) + flow = T.jit_compile(flow) + + self.assertEqual(flow.get_axis(), -1) + self.assertEqual(flow.get_x_event_ndims(), 2) + self.assertEqual(flow.get_y_event_ndims(), 2) + self.assertEqual(flow.is_explicitly_invertible(), True) with pytest.raises(ValueError, match='`event_ndims` must be at least 1'): @@ -168,9 +169,9 @@ def test_InverseFlow(self): self.assertIs(flow.invert(), original_flow) flow = T.jit_compile(flow) - self.assertEqual(flow.x_event_ndims, 2) - self.assertEqual(flow.y_event_ndims, 1) - self.assertTrue(flow.explicitly_invertible) + self.assertEqual(flow.get_x_event_ndims(), 2) + self.assertEqual(flow.get_y_event_ndims(), 1) + self.assertTrue(flow.is_explicitly_invertible()) x = T.random.randn([2, 3, 
4, 1]) expected_y = T.reshape((x - 1.) * 0.5, [2, 3, 4]) @@ -191,18 +192,18 @@ def test_InverseFlow(self): _ = InverseFlow(T.jit_compile(base_flow)) -class _MyFlow1(BaseFlow): +class _MyFlow1(Flow): def __init__(self): super().__init__(x_event_ndims=1, y_event_ndims=1, explicitly_invertible=True) - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: if inverse: output = (input - 1.) * 0.5 else: @@ -230,15 +231,15 @@ class SequentialFlowTestCase(unittest.TestCase): def test_constructor(self): flows = [T.jit_compile(_MyFlow1()), T.jit_compile(_MyFlow())] flow = T.jit_compile(SequentialFlow(flows)) - self.assertEqual(flow.x_event_ndims, 1) - self.assertEqual(flow.y_event_ndims, 2) - self.assertTrue(flow.explicitly_invertible) + self.assertEqual(flow.get_x_event_ndims(), 1) + self.assertEqual(flow.get_y_event_ndims(), 2) + self.assertTrue(flow.is_explicitly_invertible()) flow2 = _MyFlow() flow2.explicitly_invertible = False flows = [T.jit_compile(_MyFlow1()), T.jit_compile(flow2)] flow = T.jit_compile(SequentialFlow(flows)) - self.assertFalse(flow.explicitly_invertible) + self.assertFalse(flow.is_explicitly_invertible()) with pytest.raises(ValueError, match='`flows` must not be empty'): @@ -281,19 +282,19 @@ def test_call(self): _ = flow(x, inverse=True) -def check_invertible_matrix(ctx, m): +def check_invertible_matrix(ctx, m, size): matrix, log_det = m(inverse=False, compute_log_det=False) ctx.assertIsNone(log_det) matrix, log_det = m(inverse=False, compute_log_det=True) - ctx.assertEqual(T.shape(matrix), [m.size, m.size]) + ctx.assertEqual(T.shape(matrix), [size, size]) assert_allclose(T.matrix_inverse(T.matrix_inverse(matrix)), matrix, rtol=1e-4, atol=1e-6) assert_allclose(T.linalg.slogdet(matrix)[1], log_det, rtol=1e-4, atol=1e-6) inv_matrix, inv_log_det = m(inverse=True, compute_log_det=True) - ctx.assertEqual(T.shape(inv_matrix), [m.size, m.size]) + ctx.assertEqual(T.shape(inv_matrix), [size, size]) assert_allclose(T.matrix_inverse(inv_matrix), matrix, rtol=1e-4, atol=1e-6) assert_allclose(T.matrix_inverse(T.matrix_inverse(inv_matrix)), @@ -310,9 +311,9 @@ def test_invertible_matrices(self): for n in [1, 3, 5]: m = cls(np.random.randn(n, n)) self.assertEqual(repr(m), f'{cls.__qualname__}(size={n})') + self.assertEqual(m.size, n) m = T.jit_compile(m) - self.assertEqual(m.size, n) # check the initial value is an orthogonal matrix matrix, _ = m(inverse=False, compute_log_det=False) @@ -323,19 +324,20 @@ def test_invertible_matrices(self): rtol=1e-4, atol=1e-6) # check the invertibility - check_invertible_matrix(self, m) + check_invertible_matrix(self, m, n) # check the gradient matrix, log_det = m(inverse=False, compute_log_det=True) - params = [v for _, v in tk.layers.get_parameters(m)] - grads = T.grad([T.reduce_sum(matrix), T.reduce_sum(log_det)], params) + params = list(tk.layers.get_parameters(m)) + grads = T.grad( + [T.reduce_sum(matrix), T.reduce_sum(log_det)], params) # update with gradient, then check the invertibility if cls is StrictInvertibleMatrix: for param, grad in zip(params, grads): with T.no_grad(): T.assign(param, param + 0.001 * grad) - check_invertible_matrix(self, m) + check_invertible_matrix(self, m, n) def check_invertible_linear(ctx, @@ -405,7 +407,7 @@ def test_invertible_conv_nd(self): def check_scale(ctx, - 
scale: BaseScale, + scale: Scale, x, pre_scale, expected_y, @@ -479,7 +481,7 @@ def check_scale(ctx, rtol=1e-4, atol=1e-6) -class _BadScale1(BaseScale): +class _BadScale1(Scale): def _scale_and_log_scale(self, pre_scale: Tensor, @@ -494,7 +496,7 @@ def _scale_and_log_scale(self, return scale, log_scale -class _BadScale2(BaseScale): +class _BadScale2(Scale): def _scale_and_log_scale(self, pre_scale: Tensor, diff --git a/tests/flows/test_coupling.py b/tests/flows/test_coupling.py index 3958773..aa1b21f 100644 --- a/tests/flows/test_coupling.py +++ b/tests/flows/test_coupling.py @@ -51,7 +51,7 @@ def do_check(secondary, scale_type): scale = SigmoidScale(pre_scale_bias=sigmoid_scale_bias) elif scale_type == 'linear' or scale_type is LinearScale: scale = LinearScale() - elif isinstance(scale_type, BaseScale) or T.is_jit_layer(scale_type): + elif isinstance(scale_type, Scale) or T.is_jit_layer(scale_type): scale = scale_type else: raise ValueError(f'Invalid value for `scale`: {scale_type}') diff --git a/tests/flows/test_shape_.py b/tests/flows/test_shape_.py index e347a44..b972c73 100644 --- a/tests/flows/test_shape_.py +++ b/tests/flows/test_shape_.py @@ -16,8 +16,8 @@ def test_ReshapeFlow(self): flow = ReshapeFlow([4, -1], [-1]) self.assertEqual(flow.x_event_shape, [4, -1]) self.assertEqual(flow.y_event_shape, [-1]) - self.assertEqual(flow.x_event_ndims, 2) - self.assertEqual(flow.y_event_ndims, 1) + self.assertEqual(flow.get_x_event_ndims(), 2) + self.assertEqual(flow.get_y_event_ndims(), 1) self.assertIn('x_event_shape=[4, -1]', repr(flow)) self.assertIn('y_event_shape=[-1]', repr(flow)) flow = T.jit_compile(flow) diff --git a/tests/flows/test_split_.py b/tests/flows/test_split_.py index 0cd9508..8250a6c 100644 --- a/tests/flows/test_split_.py +++ b/tests/flows/test_split_.py @@ -91,10 +91,8 @@ def test_SplitFlow(self): T.random.seed(1234) # x and y with the same event ndims - left = T.jit_compile(ActNorm(2)) - right = T.jit_compile(ActNorm(3)) - _ = left(T.random.randn([5, 2])) - _ = right(T.random.randn([5, 3])) + left = T.jit_compile(InvertibleDense(2)) + right = T.jit_compile(InvertibleDense(3)) check_split_flow( ctx=self, @@ -110,20 +108,20 @@ def test_SplitFlow(self): with pytest.raises(ValueError, match=f'`left` and `right` flows must have same ' f'`x_event_ndims` and `y_event_ndims`: ' - f'got `left.x_event_ndims` == {left.x_event_ndims}, ' - f'`left.y_event_ndims` == {left.y_event_ndims}, ' - f'`right.x_event_ndims` == {left.x_event_ndims}, ' + f'got `left.x_event_ndims` == {left.get_x_event_ndims()}, ' + f'`left.y_event_ndims` == {left.get_y_event_ndims()}, ' + f'`right.x_event_ndims` == {left.get_x_event_ndims()}, ' f'and `right.y_event_ndims` == 6'): - _ = SplitFlow([2, 3], left, ReshapeFlow([1] * left.x_event_ndims, [1] * 6)) + _ = SplitFlow([2, 3], left, ReshapeFlow([1] * left.get_x_event_ndims(), [1] * 6)) with pytest.raises(ValueError, match=f'`left` and `right` flows must have same ' f'`x_event_ndims` and `y_event_ndims`: ' - f'got `left.x_event_ndims` == {left.x_event_ndims}, ' - f'`left.y_event_ndims` == {left.y_event_ndims}, ' + f'got `left.x_event_ndims` == {left.get_x_event_ndims()}, ' + f'`left.y_event_ndims` == {left.get_y_event_ndims()}, ' f'`right.x_event_ndims` == 6, ' - f'and `right.y_event_ndims` == {left.y_event_ndims}'): - _ = SplitFlow([2, 3], left, ReshapeFlow([1] * 6, [1] * left.y_event_ndims)) + f'and `right.y_event_ndims` == {left.get_y_event_ndims()}'): + _ = SplitFlow([2, 3], left, ReshapeFlow([1] * 6, [1] * left.get_y_event_ndims())) # x and y with 
different event ndims left = ReshapeFlow([-1], [-1, 2]) @@ -159,14 +157,10 @@ def test_SplitFlowNd(self): for spatial_ndims in (1, 2, 3): cls = getattr(tk.flows, f'SplitFlow{spatial_ndims}d') - sub_cls = getattr(tk.flows, f'ActNorm{spatial_ndims}d') + sub_cls = getattr(tk.flows, f'InvertibleConv{spatial_ndims}d') left = T.jit_compile(sub_cls(2)) right = T.jit_compile(sub_cls(3)) - _ = left(T.random.randn( - make_conv_shape([5], 2, [6, 7, 8][:spatial_ndims]))) - _ = right(T.random.randn( - make_conv_shape([5], 3, [6, 7, 8][:spatial_ndims]))) check_split_flow( ctx=self, diff --git a/tests/init/test_core.py b/tests/init/test_core.py index 2caf524..0039cdf 100644 --- a/tests/init/test_core.py +++ b/tests/init/test_core.py @@ -325,7 +325,7 @@ class _MyDataDependentInitializer(tk.init.DataDependentInitializer): def __init__(self, watcher): self.watcher = watcher - def _forward(self, layer: T.Module, inputs: List[T.Tensor]) -> None: + def _init(self, layer: T.Module, inputs: List[T.Tensor]) -> None: _ = layer(inputs[0]) self.watcher.append((layer, inputs)) diff --git a/tests/layers/test_contextual.py b/tests/layers/test_contextual.py index 373bc3e..7dd093b 100644 --- a/tests/layers/test_contextual.py +++ b/tests/layers/test_contextual.py @@ -1,70 +1,12 @@ import unittest -from typing import List import tensorkit as tk from tensorkit import tensor as T -from tensorkit.backend import Tensor from tests.helper import * -class _MyContextualLayer(tk.layers.BaseContextualLayer): - - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: - output = input - base = -1. - for t in context: - output = output + t * base - base = base * 10. - return output - - -class _MyMultiVariateContextualLayer(tk.layers.BaseMultiVariateContextualLayer): - - def _forward(self, inputs: List[Tensor], context: List[Tensor]) -> List[Tensor]: - outputs: List[Tensor] = [] - input_base = 1. - for input in inputs: - output = input * input_base - base = -1. - for t in context: - output = output + t * base - base = base * 10. - outputs.append(output) - input_base *= 10. - return outputs - - class ContextualTestCase(unittest.TestCase): - def test_BaseContextualLayer(self): - x = T.random.randn([2, 3, 4]) - context = [T.random.randn([2, 3, 4]), - T.random.randn([2, 3, 4])] - layer = T.jit_compile(_MyContextualLayer()) - assert_allclose(layer(x), x) - assert_allclose(layer(x, context), x - context[0] - 10. * context[1]) - - def test_BaseMultiVariateContextualLayer(self): - inputs = [T.random.randn([2, 3, 4]), - T.random.randn([2, 3, 4])] - context = [T.random.randn([2, 3, 4]), - T.random.randn([2, 3, 4])] - layer = T.jit_compile(_MyMultiVariateContextualLayer()) - - for k in range(len(inputs)): - outputs = layer(inputs[:k]) - self.assertEqual(len(outputs), k) - for j, (input, output) in enumerate(zip(inputs, outputs)): - assert_allclose(output, input * (10 ** j)) - - outputs = layer(inputs[:k], context) - self.assertEqual(len(outputs), k) - for j, (input, output) in enumerate(zip(inputs, outputs)): - assert_allclose( - output, - input * (10 ** j) - context[0] - 10. 
* context[1] - ) - def test_IgnoreContext(self): x = T.random.randn([2, 3, 4]) context = [T.random.randn([2, 3, 4]), diff --git a/tests/layers/test_core.py b/tests/layers/test_core.py index 15450d0..99a84c9 100644 --- a/tests/layers/test_core.py +++ b/tests/layers/test_core.py @@ -14,7 +14,7 @@ from tests.ops import * -class _MyWrapper(BaseSingleVariateLayer): +class _MyWrapper(BaseLayer): __constants__ = ('wrapped',) @@ -24,7 +24,7 @@ def __init__(self, wrapped: Module): super().__init__() self.wrapped = wrapped - def _forward(self, input: Tensor) -> Tensor: + def forward(self, input: Tensor) -> Tensor: return self.wrapped(input) @@ -67,14 +67,20 @@ def test_param_and_buffer(self): c = get_buffer(layer, 'c') c2 = get_buffer(layer, 'c2') - self.assertDictEqual(dict(get_parameters(layer)), {'w': w, 'w2': w2}) - self.assertDictEqual(dict(get_buffers(layer)), {'c': c, 'c2': c2}) + self.assertListEqual(list(get_parameters(layer)), [w, w2]) + self.assertDictEqual(dict(get_named_parameters(layer)), {'w': w, 'w2': w2}) + self.assertListEqual(list(get_buffers(layer)), [c, c2]) + self.assertDictEqual(dict(get_named_buffers(layer)), {'c': c, 'c2': c2}) seq = _MyWrapper(layer) - self.assertDictEqual(dict(get_parameters(seq)), {'wrapped.w': w, 'wrapped.w2': w2}) - self.assertDictEqual(dict(get_parameters(seq, recursive=False)), {}) - self.assertDictEqual(dict(get_buffers(seq)), {'wrapped.c': c, 'wrapped.c2': c2}) - self.assertDictEqual(dict(get_buffers(seq, recursive=False)), {}) + self.assertListEqual(list(get_parameters(seq)), [w, w2]) + self.assertListEqual(list(get_parameters(seq, recursive=False)), []) + self.assertDictEqual(dict(get_named_parameters(seq)), {'wrapped.w': w, 'wrapped.w2': w2}) + self.assertDictEqual(dict(get_named_parameters(seq, recursive=False)), {}) + self.assertListEqual(list(get_buffers(seq)), [c, c2]) + self.assertListEqual(list(get_buffers(seq, recursive=False)), []) + self.assertDictEqual(dict(get_named_buffers(seq)), {'wrapped.c': c, 'wrapped.c2': c2}) + self.assertDictEqual(dict(get_named_buffers(seq, recursive=False)), {}) def test_SimpleParamStore(self): initial_value = np.random.randn(2, 3, 4) @@ -173,7 +179,7 @@ def test_identity(self): assert_equal(x, layer(x)) -class _MySingleVariateLayer(BaseSingleVariateLayer): +class _MySingleVariateLayer(BaseLayer): bias: float @@ -190,29 +196,28 @@ def _add_numpy_array(self, x: Tensor) -> Tensor: return x + T.from_numpy(np.arange(x.shape[-1]), dtype=T.get_dtype(x)) - @T.jit_method - def _forward(self, x: Tensor) -> Tensor: + def forward(self, x: Tensor) -> Tensor: return self._add_numpy_array(x * 11. 
+ self.bias) -class _MyMultiVariateLayer(BaseMultiVariateLayer): +class _MyMultiVariateLayer(BaseLayer): - def _forward(self, inputs: List[Tensor]) -> List[Tensor]: + def forward(self, inputs: List[Tensor]) -> List[Tensor]: ret: List[Tensor] = [] for i in range(len(inputs) - 1): ret.append(inputs[i] + inputs[i + 1]) return ret -class _MySplitLayer(BaseSplitLayer): +class _MySplitLayer(BaseLayer): - def _forward(self, input: Tensor) -> List[Tensor]: + def forward(self, input: Tensor) -> List[Tensor]: return [input, input + 1, input + 2] -class _MyMergeLayer(BaseMergeLayer): +class _MyMergeLayer(BaseLayer): - def _forward(self, inputs: List[Tensor]) -> Tensor: + def forward(self, inputs: List[Tensor]) -> Tensor: return T.add_n(inputs) diff --git a/tests/layers/test_flow_layer.py b/tests/layers/test_flow_layer.py index 995e86e..bb0e654 100644 --- a/tests/layers/test_flow_layer.py +++ b/tests/layers/test_flow_layer.py @@ -10,15 +10,15 @@ from tests.ops import make_conv_shape -class _MyFlow(tk.flows.BaseFlow): +class _MyFlow(tk.flows.Flow): @T.jit_method - def _forward(self, - input: Tensor, - input_log_det: Optional[Tensor], - inverse: bool, - compute_log_det: bool - ) -> Tuple[Tensor, Optional[Tensor]]: + def _transform(self, + input: Tensor, + input_log_det: Optional[Tensor], + inverse: bool, + compute_log_det: bool + ) -> Tuple[Tensor, Optional[Tensor]]: if inverse: raise RuntimeError('Not invertible.') output = input * 2. diff --git a/tests/layers/test_pixelcnn.py b/tests/layers/test_pixelcnn.py index b312f53..59ede6f 100644 --- a/tests/layers/test_pixelcnn.py +++ b/tests/layers/test_pixelcnn.py @@ -103,9 +103,9 @@ def ensure_full_receptive_field(ctx, ) -class _MyAddContext(tk.layers.BaseContextualLayer): +class _MyAddContext(tk.layers.BaseLayer): - def _forward(self, input: Tensor, context: List[Tensor]) -> Tensor: + def forward(self, input: Tensor, context: List[Tensor]) -> Tensor: if len(context) == 0: return input elif len(context) == 1: @@ -266,7 +266,7 @@ def test_pixelcnn_network(self): deconv_layer_cls = getattr( tk.layers, f'PixelCNNConvTranspose{spatial_ndims}d') normalizer_cls = getattr( - tk.layers, f'ActNorm{spatial_ndims}d') + tk.layers, f'BatchNorm{spatial_ndims}d') dropout_cls = getattr( tk.layers, f'Dropout{spatial_ndims}d') diff --git a/tests/layers/test_resnet.py b/tests/layers/test_resnet.py index abf95e9..79742c2 100644 --- a/tests/layers/test_resnet.py +++ b/tests/layers/test_resnet.py @@ -48,8 +48,8 @@ def check_resblock(ctx, # force `use_bias` = False layer = resblock_cls(in_channels=5, out_channels=5, kernel_size=1, use_bias=False) - ctx.assertIsNone(layer.conv0.bias_store) - ctx.assertIsNone(layer.conv1.bias_store) + ctx.assertFalse(layer.conv0.use_bias) + ctx.assertFalse(layer.conv1.use_bias) layer = T.jit_compile(layer) assert_allclose( @@ -63,7 +63,7 @@ def check_resblock(ctx, use_shortcut=True) ctx.assertIsInstance(layer.shortcut, linear_cls) ctx.assertIsInstance(layer.shortcut.weight_store, tk.layers.SimpleParamStore) - ctx.assertIsNone(layer.shortcut.bias_store) + ctx.assertFalse(layer.shortcut.use_bias) ctx.assertEqual(layer.shortcut.kernel_size, [1] * spatial_ndims) ctx.assertEqual(layer.shortcut.stride, [1] * spatial_ndims) ctx.assertEqual(layer.shortcut.padding, [(0, 0)] * spatial_ndims) @@ -101,7 +101,7 @@ def check_resblock(ctx, **output_padding_arg ) ctx.assertIsInstance(layer.shortcut, linear_cls) - ctx.assertIsNone(layer.shortcut.bias_store) + ctx.assertFalse(layer.shortcut.use_bias) ctx.assertEqual(layer.shortcut.kernel_size, kernel_size) 
ctx.assertEqual(layer.shortcut.stride, stride) ctx.assertEqual(layer.shortcut.padding, padding) @@ -158,7 +158,7 @@ def check_resblock(ctx, tk.layers.set_train_mode(layer, True) _ = layer(x) # initialize the normalizers tk.layers.set_train_mode(layer, False) - ctx.assertIsNone(layer.conv0.bias_store) + ctx.assertFalse(layer.conv0.use_bias) ctx.assertIsInstance(layer.pre_conv0, tk.layers.Sequential) ctx.assertIsInstance(layer.pre_conv0[0], normalizer_cls) ctx.assertIsInstance(layer.pre_conv0[1], tk.layers.LeakyReLU) diff --git a/tests/losses/test_core.py b/tests/losses/test_core.py deleted file mode 100644 index 6d17cce..0000000 --- a/tests/losses/test_core.py +++ /dev/null @@ -1,31 +0,0 @@ -import unittest - -import tensorkit as tk -from tensorkit import tensor as T -from tensorkit.tensor import Tensor - -from tests.helper import * - - -class _MySupervisedLoss1(tk.losses.BaseSupervisedLossLayer): - - def _forward(self, output: Tensor, target: Tensor) -> Tensor: - return output + target - - -class _MySupervisedLoss2(tk.losses.BaseSupervisedLossLayer): - - def _forward(self, output: Tensor, target: Tensor) -> Tensor: - return (output + target).mean() - - -class BaseLossesTestCase(unittest.TestCase): - - def test_supervised(self): - output = T.random.randn([2, 3, 4]) - target = T.random.randn([3, 4]) - - l = T.jit_compile(_MySupervisedLoss1()) - assert_allclose(l(output, target), (output + target).mean()) - l = T.jit_compile(_MySupervisedLoss2()) - assert_allclose(l(output, target), (output + target).mean()) diff --git a/tests/ops.py b/tests/ops.py index c917ac2..3dc8d17 100644 --- a/tests/ops.py +++ b/tests/ops.py @@ -14,7 +14,7 @@ # convolution shape ops 'get_spatial_axis', 'get_channel_axis', 'channel_to_last_nd', 'channel_to_first_nd', 'space_to_depth_nd', - 'make_conv_shape', + 'make_conv_shape', 'get_conv_output_size', 'get_deconv_output_size', # convolution ops 'dense', 'conv_nd', 'conv_transpose_nd', diff --git a/tests/tensor/test_core.py b/tests/tensor/test_core.py index c43763a..f5d5124 100644 --- a/tests/tensor/test_core.py +++ b/tests/tensor/test_core.py @@ -1080,6 +1080,24 @@ def log_f_exp(f, x, axis=None, keepdims=False): match='`axis` must not be an empty list'): _ = T_op(t, axis=[]) + # test argmax, argmin + def np_argmaxmin(fn, x, axis, keepdims=False): + r_shape = list(x.shape) + r_shape[axis] = 1 + r = fn(x, axis) + if keepdims: + r = r.reshape(r_shape) + return r + + for name in ['argmax', 'argmin']: + T_op = getattr(T, name, getattr(T, name, None)) + np_op = partial(np_argmaxmin, getattr(np, name)) + + for axis in (0, 1, 2, -1, -2, -3): + assert_allclose(T_op(t, axis=axis), np_op(x, axis=axis)) + assert_allclose(T_op(t, axis=axis, keepdims=True), + np_op(x, axis=axis, keepdims=True)) + # test calculate_mean_and_var x = np.random.randn(3, 4, 5) for dtype in float_dtypes: @@ -1362,7 +1380,7 @@ def test_gradient(self): [l_sum, l_squares], [xt, yt], grad_outputs=[None, T.ones_like(l_squares)], - keep_graph=True, + retain_graph=True, create_graph=True ) assert_allclose(x_grad, y + 21 * x ** 2) @@ -1373,7 +1391,7 @@ def test_gradient(self): [x_grad, y_grad], [xt, yt], grad_outputs=[T.ones_like(xt), T.ones_like(yt)], - keep_graph=True, + retain_graph=True, create_graph=False ) assert_allclose(x_grad_2, 42. * x + 1.) 
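The `keep_graph` to `retain_graph` rename exercised by these tests matches the keyword accepted by `torch.autograd.grad`. A minimal sketch of a wrapper with the renamed keywords, assuming it simply forwards them to PyTorch (illustrative only, not the library's actual implementation):

from typing import List, Optional
import torch

def grad(outputs: List[torch.Tensor],
         inputs: List[torch.Tensor],
         grad_outputs: Optional[List[Optional[torch.Tensor]]] = None,
         retain_graph: Optional[bool] = None,
         create_graph: bool = False,
         allow_unused: bool = False) -> List[torch.Tensor]:
    # Forward the renamed keywords to `torch.autograd.grad`, which uses
    # `retain_graph` rather than the old `keep_graph` name.
    return list(torch.autograd.grad(
        outputs=outputs, inputs=inputs, grad_outputs=grad_outputs,
        retain_graph=retain_graph, create_graph=create_graph,
        allow_unused=allow_unused,
    ))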
@@ -1384,7 +1402,7 @@ def test_gradient(self): [l_sum, l_squares], [xt], grad_outputs=[None, T.ones_like(l_squares)], - keep_graph=True, + retain_graph=True, create_graph=True ) assert_allclose(x_grad, y + 21 * x ** 2) @@ -1393,7 +1411,7 @@ def test_gradient(self): [l_sum, l_squares], [yt], grad_outputs=[None, T.ones_like(l_squares)], - keep_graph=True, + retain_graph=True, create_graph=True ) assert_allclose(y_grad, x + 33 * y ** 2) @@ -1403,7 +1421,7 @@ def test_gradient(self): [x_grad, y_grad] = T.grad( [l_sum], [xt, yt], - keep_graph=False, + retain_graph=False, create_graph=False, allow_unused=True, ) @@ -1417,11 +1435,11 @@ def test_gradient(self): # stop_grad, but `allow_unused` is False l_sum = T.reduce_sum(T.stop_grad(xt ** 2) * yt) - with pytest.raises(Exception, match='Set allow_unused=True'): + with pytest.raises(Exception): _ = T.grad( [l_sum], [xt, yt], - keep_graph=False, + retain_graph=False, create_graph=False, allow_unused=False, ) diff --git a/tests/tensor/test_nn.py b/tests/tensor/test_nn.py index 3285e20..78798d5 100644 --- a/tests/tensor/test_nn.py +++ b/tests/tensor/test_nn.py @@ -203,6 +203,12 @@ def cross_entropy(logits, labels, reduction, negative): _f(logits), _f(labels), reduction, negative) assert_allclose(ans, out) + # test cross_entropy with int32 labels + ans = cross_entropy(logits, labels, reduction, negative) + out = T.nn.cross_entropy_with_logits( + _f(logits), T.cast(_f(labels), dtype=T.int32), reduction, negative) + assert_allclose(ans, out) + # test cross_entropy on 2d ans = cross_entropy( logits[0, 0, 0], labels[0, 0], reduction, negative) diff --git a/tests/tensor/test_utils.py b/tests/tensor/test_utils.py new file mode 100644 index 0000000..5bd668e --- /dev/null +++ b/tests/tensor/test_utils.py @@ -0,0 +1,119 @@ +import unittest +from itertools import product + +import pytest + +import tensorkit as tk +from tensorkit import tensor as T +from tests.ops import * + + +class UtilsTestCase(unittest.TestCase): + + def test_split_channel_spatial_shape(self): + for spatial_ndims in (1, 2, 3): + conv_shape = make_conv_shape([], 6, [7, 8, 9][:spatial_ndims]) + self.assertEqual( + T.utils.split_channel_spatial_shape(conv_shape), + (6, [7, 8, 9][:spatial_ndims]) + ) + with pytest.raises(Exception, match='Invalid `shape`'): + _ = T.utils.split_channel_spatial_shape([]) + + def test_unsplit_channel_spatial_shape(self): + for spatial_ndims in (1, 2, 3): + conv_shape = make_conv_shape([], 6, [7, 8, 9][:spatial_ndims]) + self.assertEqual( + T.utils.unsplit_channel_spatial_shape(6, [7, 8, 9][:spatial_ndims]), + conv_shape + ) + with pytest.raises(Exception, match='Invalid `size`'): + _ = T.utils.unsplit_channel_spatial_shape(1, []) + + def test_conv_deconv_output_shape_and_args(self): + for input_size, kernel_size, stride, padding, dilation in product( + ([8, 9, 10], [16, 21, 32], [30, 31, 32]), + ([1] * 3, [2] * 3, [3] * 3, [1, 2, 3]), + ([1] * 3, [2] * 3, [3] * 3, [1, 2, 3]), + ([(0, 0)] * 3, [(1, 1)] * 3, [(2, 2)] * 3, [(3, 3)] * 3, + [(1, 2), (2, 3), (3, 4)]), + ([1] * 3, [2] * 3, [3] * 3, [1, 2, 3]), + ): + args = (input_size, kernel_size, stride, padding, dilation) + + # calculate_conv_output_size + output_size = [get_conv_output_size(*a) for a in zip(*args)] + self.assertEqual( + T.utils.calculate_conv_output_size( + input_size=input_size, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, + ), + output_size + ) + layer1 = tk.layers.LinearConv3d( + 1, 1, kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, + 
) + x = T.zeros(make_conv_shape([1], 1, input_size)) + y = layer1(x) + self.assertEqual( + T.utils.split_channel_spatial_shape(T.shape(y)[1:])[1], + output_size, + ) + + # calculate_deconv_output_padding + output_padding = T.utils.calculate_deconv_output_padding( + input_size=output_size, output_size=input_size, + kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, + ) + layer2 = tk.layers.LinearConvTranspose3d( + 1, 1, kernel_size=kernel_size, stride=stride, padding=padding, + output_padding=output_padding, dilation=dilation, + ) + z = layer2(y) + self.assertEqual( + T.utils.split_channel_spatial_shape(T.shape(z)[1:])[1], + input_size, + ) + + # calculate_deconv_output_size + self.assertEqual( + T.utils.calculate_deconv_output_size( + input_size=output_size, kernel_size=kernel_size, + stride=stride, padding=padding, + output_padding=output_padding, dilation=dilation, + ), + input_size + ) + + # test error + kwargs = dict(kernel_size=[1], stride=[1], dilation=[1], padding=[(0, 0)]) + for input_size in ([], [1, 2, 3, 4]): + with pytest.raises(Exception, + match='`input_size` is not a 1d, 2d or 3d ' + 'convolutional input size'): + _ = T.utils.calculate_conv_output_size(input_size, **kwargs) + with pytest.raises(Exception, + match='`input_size` is not a 1d, 2d or 3d ' + 'convolutional input size'): + _ = T.utils.calculate_deconv_output_size(input_size, output_padding=[0], **kwargs) + + for arg_name in ('kernel_size', 'stride', 'dilation', 'padding'): + kwargs2 = dict(kwargs) + if arg_name == 'padding': + kwargs2[arg_name] = [(0, 0)] * 2 + else: + kwargs2[arg_name] = [1, 1] + with pytest.raises(Exception, match='`.*` is not for .*d convolution'): + _ = T.utils.calculate_conv_output_size([11], **kwargs2) + with pytest.raises(Exception, match='`.*` is not for .*d convolution'): + _ = T.utils.calculate_deconv_output_size([11], output_padding=[0], **kwargs2) + + with pytest.raises(Exception, match='`.*` is not for .*d convolution'): + _ = T.utils.calculate_deconv_output_size([11], output_padding=[0, 0], **kwargs) + + with pytest.raises(Exception, + match='No `output_padding` can satisfy the ' + 'deconvolution task'): + _ = T.utils.calculate_deconv_output_padding([2], [1], [1], [1], [(0, 0)], [1]) diff --git a/tests/train/__init__.py b/tests/train/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/train/test_core.py b/tests/train/test_core.py new file mode 100644 index 0000000..2b1f793 --- /dev/null +++ b/tests/train/test_core.py @@ -0,0 +1,67 @@ +import os +import unittest +from tempfile import TemporaryDirectory + +import numpy as np +import pytest +import torch + +from mltk import SimpleStatefulObject + +import tensorkit as tk + + +class TorchCheckpointTestCase(unittest.TestCase): + + def test_invalid_type(self): + with pytest.raises(TypeError, + match=r'Object must be a :class:`StatefulObject`, ' + r'or has `state_dict\(\)` and ' + r'`load_state_dict\(\)` methods: got 123'): + _ = tk.train.Checkpoint(obj=123) + + def test_save_restore(self): + x = torch.from_numpy(np.random.normal(size=[2, 5]).astype(np.float32)) + + with TemporaryDirectory() as temp_dir: + root_dir = os.path.join(temp_dir, 'ckpt') + + # test save + layer = torch.nn.Linear(5, 3) + optimizer = tk.optim.Adam(tk.layers.get_parameters(layer)) + + obj = SimpleStatefulObject() + obj.value = 123456 + ckpt = tk.train.Checkpoint(obj=obj, optimizer=optimizer, layer=layer) + ckpt.save(root_dir) + + # test restore + layer2 = torch.nn.Linear(5, 3) + optimizer2 = 
tk.optim.Adam(tk.layers.get_parameters(layer2)) + obj2 = SimpleStatefulObject() + ckpt2 = tk.train.Checkpoint(obj=obj2, optimizer=optimizer2, layer=layer2) + ckpt2.restore(root_dir) + + # todo: check the state of the optimizer + + # compare two objects + out = layer(x) + out2 = layer2(x) + self.assertTrue(torch.allclose(out2, out)) + self.assertEqual(obj2.value, 123456) + + # test partial restore + layer3 = torch.nn.Linear(5, 3) + ckpt3 = tk.train.Checkpoint(layer=layer3) + ckpt3.restore(root_dir) + self.assertTrue(torch.allclose(layer3(x), out)) + + # test restore error + ckpt4 = tk.train.Checkpoint(layer=layer3, xyz=SimpleStatefulObject()) + with pytest.raises(ValueError, + match=f'Key \'xyz\' does not exist in ' + f'the state dict recovered from: ' + f'{root_dir}'): + ckpt4.restore(root_dir) + + From 3c08c4c5fbfde90608129e47f45d9737a208825e Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Mon, 17 Feb 2020 17:38:50 +0800 Subject: [PATCH 2/7] add gpu device support --- tensorkit/arg_check.py | 2 +- tensorkit/backend/pytorch_/core.py | 202 +++++++++++++----- tensorkit/backend/pytorch_/flows.py | 54 +++-- tensorkit/backend/pytorch_/layers.py | 132 ++++++++++-- tensorkit/backend/pytorch_/random.py | 41 +++- tensorkit/distributions/base.py | 8 +- tensorkit/distributions/bernoulli.py | 9 +- tensorkit/distributions/categorical.py | 20 +- tensorkit/distributions/discretized.py | 10 +- tensorkit/distributions/flow.py | 6 +- tensorkit/distributions/mixture.py | 18 +- tensorkit/distributions/normal.py | 38 +++- tensorkit/distributions/uniform.py | 15 +- tensorkit/distributions/utils.py | 40 +++- tensorkit/examples/classification/mnist.py | 6 +- .../examples/classification/mnist_resnet.py | 6 +- tensorkit/flows/act_norm.py | 38 ++-- tensorkit/flows/coupling.py | 5 +- tensorkit/flows/rearrangement.py | 11 +- tensorkit/flows/reshape_.py | 6 +- tensorkit/flows/split_.py | 5 +- tensorkit/init/std_data_init.py | 6 +- tensorkit/layers/builder.py | 7 +- tensorkit/layers/composed.py | 6 + tensorkit/layers/flow_layer.py | 2 +- tensorkit/layers/pixelcnn.py | 26 ++- tensorkit/layers/resnet.py | 37 +++- tensorkit/layers/utils.py | 23 +- tensorkit/tensor/core.py | 4 +- tensorkit/tensor/core_extras.py | 1 - tensorkit/tensor/random_extras.py | 33 +-- tensorkit/utils/tensor_stream.py | 14 +- tests/distributions/test_flow.py | 6 +- tests/flows/test_act_norm.py | 2 +- tests/flows/test_core.py | 42 ++-- tests/flows/test_coupling.py | 11 +- tests/flows/test_rearrangement.py | 2 +- tests/flows/test_shape_.py | 2 +- tests/flows/test_split_.py | 12 +- tests/init/test_core.py | 2 +- tests/init/test_std_data_init.py | 2 +- tests/layers/test_composed.py | 12 +- tests/layers/test_contextual.py | 6 +- tests/layers/test_core.py | 24 +-- tests/layers/test_flow_layer.py | 8 +- tests/layers/test_gated.py | 4 +- tests/layers/test_pixelcnn.py | 14 +- tests/layers/test_pool.py | 4 +- tests/layers/test_resnet.py | 18 +- tests/layers/test_shape_.py | 12 +- tests/layers/test_split_.py | 4 +- tests/tensor/test_core.py | 171 +++++++-------- tests/tensor/test_nn.py | 6 +- tests/tensor/test_random.py | 22 +- tests/test_arg_check.py | 4 +- 55 files changed, 803 insertions(+), 418 deletions(-) delete mode 100644 tensorkit/tensor/core_extras.py diff --git a/tensorkit/arg_check.py b/tensorkit/arg_check.py index 67edf5c..154d7ac 100644 --- a/tensorkit/arg_check.py +++ b/tensorkit/arg_check.py @@ -22,7 +22,7 @@ def validate_positive_int(arg_name: str, arg_value) -> int: # layer argument validators def validate_layer(arg_name: str, layer) -> 
'Module': - from tensorkit.tensor import is_jit_layer + from tensorkit.layers import is_jit_layer if isinstance(layer, Module) or is_jit_layer(layer): return layer else: diff --git a/tensorkit/backend/pytorch_/core.py b/tensorkit/backend/pytorch_/core.py index 8c01417..c1a2aa0 100644 --- a/tensorkit/backend/pytorch_/core.py +++ b/tensorkit/backend/pytorch_/core.py @@ -1,4 +1,5 @@ import math +from contextlib import contextmanager from typing import * import numpy as np @@ -10,14 +11,18 @@ __all__ = [ # constants - 'IS_CHANNEL_LAST', 'EPSILON', + 'IS_CHANNEL_LAST', 'EPSILON', 'CPU_DEVICE', # typing 'Tensor', 'Variable', 'Module', # ordinary module base classes # jit - 'jit', 'jit_ignore', 'jit_method', 'jit_compile', 'is_jit_layer', + 'jit', 'jit_ignore', 'jit_method', + + # device + 'get_device', 'to_device', 'current_device', 'use_device', + 'gpu_device_list', 'first_gpu_device', # utilities 'int_range', 'identity', @@ -27,7 +32,8 @@ 'is_floating_point_dtype', # tensor constructors - 'as_tensor_backend', 'as_tensor', 'from_numpy', 'float_scalar', 'int_scalar', + 'as_tensor', 'from_numpy', + 'float_scalar', 'float_scalar_like', 'int_scalar', 'int_scalar_like', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', 'arange', 'one_hot', @@ -92,6 +98,9 @@ EPSILON = 1e-6 """The small infinitesimal constant to avoid diving by zero of taking logarithm of zero.""" +CPU_DEVICE = 'cpu' +"""The constant that represents the local CPU device.""" + # ---- typing ---- Tensor = torch.Tensor @@ -118,18 +127,55 @@ def jit_method(fn): return fn -def jit_compile(m): - if isinstance(m, Module): - if not settings.disable_jit: - m = torch.jit.script(m) +# ---- device ---- +@jit +def get_device(t: Tensor) -> str: + return str(t.device) + + +@jit +def to_device(t: Tensor, device: str) -> Tensor: + if str(t.device) != device: + t = t.to(device=device) + return t + + +_current_device = [CPU_DEVICE] + + +@jit_ignore +def current_device() -> str: + return _current_device[0] + + +@contextmanager +def use_device(device: str): + if not torch.cuda.is_available(): + if device != CPU_DEVICE: + raise RuntimeError('GPU is not available.') + yield else: - raise TypeError(f'Not supported by `jit_compile`: {m!r}') - return m + old_device = _current_device[0] + try: + with torch.cuda.device(device): + _current_device[0] = device + yield + finally: + _current_device[0] = old_device -def is_jit_layer(layer: Module) -> bool: - """Check whether or not `layer` is a JIT compiled layer.""" - return isinstance(layer, torch.jit.ScriptModule) +def gpu_device_list() -> List[str]: + return [f'cuda:{index}' for index in range(torch.cuda.device_count())] + + +def first_gpu_device(fallback_to_cpu: bool = True) -> str: + gpu_list = gpu_device_list() + if not gpu_list: + if not fallback_to_cpu: # pragma: no cover + raise RuntimeError('No GPU is available.') + else: + return CPU_DEVICE + return gpu_list[0] # ---- utilities ---- @@ -147,22 +193,28 @@ def int_range(start: int, end: int, step: int = 1) -> List[int]: # ---- dtypes ---- @jit -def cast(input: Tensor, dtype: str) -> Tensor: +def cast(input: Tensor, dtype: str, device: Optional[str] = None) -> Tensor: if dtype == 'float32': target_dtype = torch.float32 elif dtype == 'int32': target_dtype = torch.int32 else: target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] - if target_dtype != input.dtype: + + if target_dtype != input.dtype and device is not None: + 
input = input.to(dtype=target_dtype, device=device) + elif target_dtype != input.dtype: input = input.to(dtype=target_dtype) + elif device is not None: + input = input.to(device=device) + return input @jit -def cast_like(input: Tensor, dtype_as: Tensor) -> Tensor: - if dtype_as.dtype != input.dtype: - input = input.to(dtype=dtype_as.dtype) +def cast_like(input: Tensor, like: Tensor) -> Tensor: + if like.dtype != input.dtype: + input = input.to(dtype=like.dtype, device=like.device) return input @@ -193,18 +245,10 @@ def is_floating_point_dtype(dtype: str) -> bool: # ---- tensor constructors ---- -as_tensor_backend = torch.as_tensor -""" -``T.as_tensor`` with JIT support. - -This should be an alias of the backend function ``as_tensor(data, dtype=None)``. -Use only ``(data) -> torch.Tensor``, or ``(data, dtype=another_tensor.dtype) -> torch.Tensor``. -""" - - @jit_ignore def as_tensor(data, dtype: Optional[Union[torch.dtype, str]] = None, + device: Optional[str] = None, force_copy: bool = False) -> Tensor: """ Construct a new tensor from `data`. @@ -217,6 +261,7 @@ def as_tensor(data, another tensor, a :class:`~tensorkit.StochasticTensor`, or anything else that the backend supports. dtype: The expected dtype of the constructed tensor. + device: Where to put the new tensor. force_copy: Force to copy `data` even if it is not necessary. The gradient propagation will not be stopped from the copied tensor to the original tensor. The caller may need to use `T.stop_grad()` @@ -243,27 +288,38 @@ def as_tensor(data, else: target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] + # check the device argument + if device is None: + device = current_device() + # if `data` is already a tensor if isinstance(data, StochasticTensor): data = data.tensor if isinstance(data, Tensor): # input `data` may be `StochasticTensor`, `Tensor` or `numpy.ndarray` + kwargs = {} if data.dtype != target_dtype: - data = data.to(target_dtype) + kwargs['dtype'] = target_dtype + if str(data.device) != device: + kwargs['device'] = device + if kwargs: + data = data.to(**kwargs) if force_copy: data = data.clone() return data # or if `data` is other types - ret = torch.as_tensor(data, dtype=target_dtype) + ret = torch.as_tensor(data, dtype=target_dtype, device=device) if force_copy: ret = ret.clone() return ret @jit_ignore -def from_numpy(data, dtype: Optional[Union[torch.dtype, str]] = None) -> Tensor: +def from_numpy(data, + dtype: Optional[Union[torch.dtype, str]] = None, + device: Optional[str] = None) -> Tensor: """ Construct a new tensor from given numpy array `data`. @@ -271,40 +327,68 @@ def from_numpy(data, dtype: Optional[Union[torch.dtype, str]] = None) -> Tensor: data: The numpy array, which will always be copied, even if the backend supports share memory between a numpy array and a tensor. dtype: The expected dtype of the constructed tensor. + device: Where to put the new tensor. Returns: The constructed tensor. 
""" - return as_tensor(data, dtype=dtype, force_copy=True) + if device is None: + device = current_device() + return as_tensor(data, dtype=dtype, device=device, force_copy=True) @jit -def float_scalar(data: float, dtype: str = settings.float_x) -> Tensor: +def float_scalar(data: float, + dtype: str = settings.float_x, + device: Optional[str] = None) -> Tensor: if dtype == 'float32': real_dtype = torch.float32 else: real_dtype = {'float16': torch.float16, 'float64': torch.float64}[dtype] - return torch.tensor(data, dtype=real_dtype) + + if device is None: + device = current_device() + return torch.tensor(data, dtype=real_dtype, device=device) + + +@jit +def float_scalar_like(data: float, like: Tensor) -> Tensor: + return torch.tensor(data, dtype=like.dtype, device=like.device) @jit -def int_scalar(data: int, dtype: str = 'int32') -> Tensor: +def int_scalar(data: int, + dtype: str = 'int32', + device: Optional[str] = None) -> Tensor: if dtype == 'int32': int_dtype = torch.int32 else: int_dtype = {'int8': torch.int8, 'int16': torch.int16, 'int64': torch.int64}[dtype] - return torch.tensor(data, dtype=int_dtype) + + if device is None: + device = current_device() + return torch.tensor(data, dtype=int_dtype, device=device) @jit -def zeros(shape: List[int], dtype: str = settings.float_x) -> Tensor: +def int_scalar_like(data: int, like: Tensor) -> Tensor: + return torch.tensor(data, dtype=like.dtype, device=like.device) + + +@jit +def zeros(shape: List[int], + dtype: str = settings.float_x, + device: Optional[str] = None) -> Tensor: if dtype == 'float32': target_dtype = torch.float32 elif dtype == 'int32': target_dtype = torch.int32 else: target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] - return torch.zeros(shape, dtype=target_dtype) + + if device is None: + device = current_device() + return torch.zeros(shape, dtype=target_dtype, device=device) @jit @@ -322,18 +406,23 @@ def zeros_like(input: Tensor, target_dtype = input.dtype if shape is None: shape = list(input.shape) - return torch.zeros(shape, dtype=target_dtype) + return torch.zeros(shape, dtype=target_dtype, device=input.device) @jit -def ones(shape: List[int], dtype: str = settings.float_x) -> Tensor: +def ones(shape: List[int], + dtype: str = settings.float_x, + device: Optional[str] = None) -> Tensor: if dtype == 'float32': target_dtype = torch.float32 elif dtype == 'int32': target_dtype = torch.int32 else: target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] - return torch.ones(shape, dtype=target_dtype) + + if device is None: + device = current_device() + return torch.ones(shape, dtype=target_dtype, device=device) @jit @@ -351,20 +440,24 @@ def ones_like(input: Tensor, target_dtype = input.dtype if shape is None: shape = list(input.shape) - return torch.ones(shape, dtype=target_dtype) + return torch.ones(shape, dtype=target_dtype, device=input.device) @jit def full(shape: List[int], fill_value: float, - dtype: str = settings.float_x) -> Tensor: + dtype: str = settings.float_x, + device: Optional[str] = None) -> Tensor: if dtype == 'float32': target_dtype = torch.float32 elif dtype == 'int32': target_dtype = torch.int32 else: target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 
'bool': torch.bool}[dtype] - return torch.full(shape, fill_value, dtype=target_dtype) + + if device is None: + device = current_device() + return torch.full(shape, fill_value, dtype=target_dtype, device=device) @jit @@ -383,18 +476,22 @@ def full_like(input: Tensor, target_dtype = input.dtype if shape is None: shape = list(input.shape) - return torch.full(shape, fill_value, dtype=target_dtype) + return torch.full(shape, fill_value, dtype=target_dtype, device=input.device) @jit -def arange(start: int, end: int, step: int = 1, dtype: str = 'int32') -> Tensor: +def arange(start: int, end: int, step: int = 1, dtype: str = 'int32', + device: Optional[str] = None) -> Tensor: if dtype == 'float32': target_dtype = torch.float32 elif dtype == 'int32': target_dtype = torch.int32 else: target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] - return torch.arange(start, end, step, dtype=target_dtype) + + if device is None: + device = current_device() + return torch.arange(start, end, step, dtype=target_dtype, device=device) @jit @@ -430,6 +527,7 @@ def to_numpy(input: Tensor) -> np.ndarray: # ---- variable and initializer ---- def variable(shape: List[int], dtype: Union[str, torch.dtype] = settings.float_x, + device: Optional[str] = None, initializer: Optional[ Union[ int, float, np.ndarray, Tensor, @@ -444,6 +542,7 @@ def variable(shape: List[int], Args: shape: Shape of the variable. dtype: Dtype of the variable. + device: The device where to place new tensors and variables. initializer: The variable initializer. It may be a scalar (which will be filled into the new variable), an array or another `Tensor` with the same shape as specified `shape`, or a callable @@ -467,28 +566,31 @@ def variable(shape: List[int], else: target_dtype = dtype + if device is None: + device = current_device() + if isinstance(initializer, (int, float)): ret = torch.full(shape, float(initializer), dtype=target_dtype, - requires_grad=requires_grad) + device=device, requires_grad=requires_grad) elif isinstance(initializer, np.ndarray) and initializer.shape == (): ret = torch.full(shape, initializer.tolist(), dtype=target_dtype, - requires_grad=requires_grad) + device=device, requires_grad=requires_grad) elif isinstance(initializer, (np.ndarray, Tensor)): if list(initializer.shape) != shape: raise ValueError(f'`initializer.shape` != `shape`: ' f'{list(initializer.shape)} vs {shape}') ret = as_tensor(initializer, dtype=target_dtype, - force_copy=force_copy) + device=device, force_copy=force_copy) if requires_grad: ret.requires_grad_(True) elif isinstance(initializer, Callable): - ret = zeros(shape, dtype=dtype) + ret = zeros(shape, device=device, dtype=dtype) with torch.no_grad(): initializer(ret) if requires_grad: ret.requires_grad_(True) elif initializer is None: - ret = torch.zeros(shape, dtype=target_dtype, + ret = torch.zeros(shape, dtype=target_dtype, device=device, requires_grad=requires_grad) else: raise TypeError(f'Unsupported initializer: {initializer!r}') diff --git a/tensorkit/backend/pytorch_/flows.py b/tensorkit/backend/pytorch_/flows.py index 216bc72..df28aec 100644 --- a/tensorkit/backend/pytorch_/flows.py +++ b/tensorkit/backend/pytorch_/flows.py @@ -353,7 +353,8 @@ class LooseInvertibleMatrix(InvertibleMatrix): def __init__(self, seed_matrix: np.ndarray, - dtype: str = settings.float_x): + dtype: str = settings.float_x, + device: Optional[str] = None): """ Construct a new 
:class:`LooseInvertibleMatrix`. @@ -361,11 +362,16 @@ def __init__(self, seed_matrix: A matrix that is used as a seed to obtain the initial invertible and orthogonal matrix. dtype: The dtype of the matrix. + device: The device where to place new tensors and variables. """ + device = device or current_device() initial_matrix = la.qr(seed_matrix)[0] super().__init__(initial_matrix.shape[0]) - add_parameter(self, 'matrix', from_numpy(initial_matrix, dtype=dtype)) + add_parameter( + self, 'matrix', + from_numpy(initial_matrix, dtype=dtype, device=device) + ) def forward(self, inverse: bool, @@ -392,6 +398,7 @@ class StrictInvertibleMatrix(InvertibleMatrix): def __init__(self, seed_matrix: np.ndarray, dtype: str = settings.float_x, + device: Optional[str] = None, epsilon: float = EPSILON): """ Construct a new :class:`StrictInvertibleMatrix`. @@ -400,8 +407,12 @@ def __init__(self, seed_matrix: A matrix that is used as a seed to obtain the initial invertible and orthogonal matrix. dtype: The dtype of the matrix. + device: The device where to place new tensors and variables. + epsilon: The infinitesimal constant to avoid dividing by zero or + taking logarithm of zero. """ initial_matrix = la.qr(seed_matrix)[0] + device = device or current_device() super().__init__(initial_matrix.shape[0]) matrix_shape = list(initial_matrix.shape) @@ -413,25 +424,29 @@ def __init__(self, initial_log_s = np.log(np.maximum(np.abs(initial_s), epsilon)) initial_U = np.triu(initial_U, k=1) - add_buffer(self, 'P', from_numpy(initial_P, dtype=dtype)) + add_buffer(self, 'P', from_numpy(initial_P, dtype=dtype, device=device)) assert_finite( - add_parameter(self, 'pre_L', from_numpy(initial_L, dtype=dtype)), + add_parameter( + self, 'pre_L', from_numpy(initial_L, dtype=dtype, device=device)), 'pre_L', ) add_buffer( - self, 'L_mask', - from_numpy(np.tril(np.ones(matrix_shape), k=-1), dtype=dtype)) + self, 'L_mask', from_numpy( + np.tril(np.ones(matrix_shape), k=-1), dtype=dtype, device=device) + ) assert_finite( - add_parameter(self, 'pre_U', from_numpy(initial_U, dtype=dtype)), + add_parameter(self, 'pre_U', from_numpy( + initial_U, dtype=dtype, device=device)), 'pre_U', ) add_buffer( - self, 'U_mask', - from_numpy(np.triu(np.ones(matrix_shape), k=1), dtype=dtype)) + self, 'U_mask', from_numpy( + np.triu(np.ones(matrix_shape), k=1), dtype=dtype, device=device)) add_buffer( - self, 'sign', from_numpy(initial_sign, dtype=dtype)) + self, 'sign', from_numpy(initial_sign, dtype=dtype, device=device)) assert_finite( - add_parameter(self, 'log_s', from_numpy(initial_log_s, dtype=dtype)), + add_parameter(self, 'log_s', from_numpy( + initial_log_s, dtype=dtype, device=device)), 'log_s', ) @@ -441,7 +456,7 @@ def forward(self, ) -> Tuple[Tensor, Optional[Tensor]]: P = self.P L = (self.L_mask * self.pre_L + - torch.eye(self.size, dtype=P.dtype)) + torch.eye(self.size, dtype=P.dtype, device=self.P.device)) U = self.U_mask * self.pre_U + torch.diag(self.sign * exp(self.log_s)) log_det: Optional[Tensor] = None @@ -476,6 +491,7 @@ def __init__(self, strict: bool = False, weight_init: TensorInitArgType = init.kaming_uniform, dtype: str = settings.float_x, + device: Optional[str] = None, epsilon: float = EPSILON): """ Construct a new linear transformation flow. @@ -489,9 +505,12 @@ def __init__(self, and :class:`StrictInvertibleMatrix`. weight_init: The weight initializer for the seed matrix. dtype: The dtype of the invertible matrix. + device: The device where to place new tensors and variables. 
epsilon: The infinitesimal constant to avoid having numerical issues. """ spatial_ndims = self._get_spatial_ndims() + device = device or current_device() + super().__init__( axis=-(spatial_ndims + 1), event_ndims=(spatial_ndims + 1), @@ -506,16 +525,17 @@ def __init__(self, # will allow the backend random seed to have effect on the initialization # step of the invertible matrix. seed_matrix = variable( - shape=[num_features, num_features], dtype=dtype, + shape=[num_features, num_features], dtype=dtype, device='cpu', initializer=weight_init, requires_grad=False, ) + seed_matrix = to_numpy(seed_matrix) if strict: self.invertible_matrix = StrictInvertibleMatrix( - to_numpy(seed_matrix), dtype=dtype, epsilon=epsilon) + seed_matrix, dtype=dtype, device=device, epsilon=epsilon) else: self.invertible_matrix = LooseInvertibleMatrix( - to_numpy(seed_matrix), dtype=dtype) + seed_matrix, dtype=dtype, device=device) def _get_spatial_ndims(self) -> int: raise NotImplementedError() @@ -792,12 +812,12 @@ def _scale_and_log_scale(self, if inverse: scale = 1. / pre_scale if compute_log_scale: - epsilon = as_tensor_backend(self.epsilon, dtype=pre_scale.dtype) + epsilon = float_scalar_like(self.epsilon, pre_scale) log_scale = -log(maximum(abs(pre_scale), epsilon)) else: scale = pre_scale if compute_log_scale: - epsilon = as_tensor_backend(self.epsilon, dtype=pre_scale.dtype) + epsilon = float_scalar_like(self.epsilon, pre_scale) log_scale = log(maximum(abs(pre_scale), epsilon)) return scale, log_scale diff --git a/tensorkit/backend/pytorch_/layers.py b/tensorkit/backend/pytorch_/layers.py index 53fdc72..4f16167 100644 --- a/tensorkit/backend/pytorch_/layers.py +++ b/tensorkit/backend/pytorch_/layers.py @@ -2,21 +2,24 @@ import torch from torch import nn as torch_nn +from torch.jit import script as torch_script from torch.nn import ModuleList -from . import init -from .core import * +from ...settings_ import settings from ...typing_ import * from ...arg_check import * +from . import init +from .core import * __all__ = [ # constants 'DEFAULT_GATE_BIAS', 'DEFAULT_WEIGHT_INIT', 'DEFAULT_BIAS_INIT', # utils + 'jit_compile', 'is_jit_layer', 'layer_to_device', 'add_parameter', 'get_parameter', 'get_parameters', 'get_named_parameters', 'add_buffer', 'get_buffer', 'get_buffers', 'get_named_buffers', - 'set_train_mode', + 'set_train_mode', 'set_eval_mode', # parameter store modules 'ParamStore', 'SimpleParamStore', @@ -49,6 +52,36 @@ # ---- utils ---- +def jit_compile(m: Module) -> Module: + if not settings.disable_jit: + m = torch_script(m) + return m + + +def is_jit_layer(layer: Module) -> bool: + """Check whether or not `layer` is a JIT compiled layer.""" + return isinstance(layer, torch.jit.ScriptModule) + + +def layer_to_device(layer: Module, device: Optional[str] = None) -> Module: + """ + Move the specified module or layer to the given device. + The module or layer may be changed in-place. + + Args: + layer: The module or layer to be moved. + device: The device to move the module or layer to. + If not specified, will move to ``T.current_device()``. + + Returns: + The layer instance.
+ """ + if device is None: + device = current_device() + layer = layer.to(device=torch.device(device)) + return layer + + def add_parameter(layer: Module, name: str, value: Optional[Tensor], @@ -103,14 +136,27 @@ def set_train_mode(layer: Module, training: bool = True): return layer +def set_eval_mode(layer: Module): + layer.train(False) + return layer + + # ---- weight wrapper: a simple weight, or a normed weight ---- class _NullParamStore(Module): # This module is actually not used in any context. # It is just a place-holder module, to gain JIT support. + __constants__ = ('device',) + + device: str + + def __init__(self, device: Optional[str] = None): + super().__init__() + self.device = device or current_device() + def forward(self) -> Tensor: # pragma: no cover zero_shape: List[int] = [] - return torch.zeros(zero_shape, dtype=torch.float32) + return zeros(zero_shape, dtype='float32', device=self.device) class ParamStore(Module): @@ -146,9 +192,12 @@ class SimpleParamStore(ParamStore): def __init__(self, shape: List[int], - initializer: TensorInitArgType): + initializer: TensorInitArgType, + device: Optional[str] = None): + device = device or current_device() super().__init__(shape) - add_parameter(self, 'value', variable(shape, initializer=initializer)) + add_parameter(self, 'value', variable( + shape, initializer=initializer, device=device)) @jit_method def get(self) -> Tensor: @@ -177,7 +226,7 @@ def weight_norm_decompose(weight: Tensor, A tuple of `(v, v_norm)`. """ v_norm = norm_except_axis(weight, axis=[norm_axis], keepdims=True) - v = weight / torch.max(v_norm, torch.as_tensor(epsilon, dtype=v_norm.dtype)) + v = weight / torch.max(v_norm, float_scalar_like(epsilon, v_norm)) return v, v_norm @@ -193,12 +242,15 @@ def __init__(self, shape: List[int], initializer: TensorInitArgType, norm_axis: int = 1, + device: Optional[str] = None, epsilon: float = EPSILON): + device = device or current_device() + super().__init__(shape) self.norm_axis = norm_axis self.epsilon = epsilon - weight = variable(shape, initializer=initializer) + weight = variable(shape, initializer=initializer, device=device) with no_grad(): v, _ = weight_norm_decompose(weight, norm_axis, epsilon) add_parameter(self, 'v', v) @@ -211,7 +263,7 @@ def get(self) -> Tensor: def set(self, value: TensorOrData) -> None: with no_grad(): v, _ = weight_norm_decompose( - as_tensor(value, dtype=get_dtype(self.v)), + as_tensor(value, dtype=get_dtype(self.v), device=get_device(self.v)), self.norm_axis, self.epsilon, ) @@ -230,12 +282,15 @@ def __init__(self, shape: List[int], initializer: TensorInitArgType, norm_axis: int = 1, + device: Optional[str] = None, epsilon: float = EPSILON): + device = device or current_device() + super().__init__(shape) self.norm_axis = norm_axis self.epsilon = epsilon - weight = variable(shape, initializer=initializer) + weight = variable(shape, initializer=initializer, device=device) with no_grad(): v, g = weight_norm_decompose(weight, norm_axis, epsilon) add_parameter(self, 'v', v) @@ -249,7 +304,7 @@ def get(self) -> Tensor: def set(self, value: TensorOrData) -> None: with no_grad(): v, g = weight_norm_decompose( - as_tensor(value, dtype=get_dtype(self.v)), + as_tensor(value, dtype=get_dtype(self.v), device=get_device(self.v)), self.norm_axis, self.epsilon, ) @@ -260,7 +315,8 @@ def set(self, value: TensorOrData) -> None: def get_weight_store(shape: List[int], initializer: TensorInitArgType = DEFAULT_WEIGHT_INIT, norm_axis: int = 1, - weight_norm:
WeightNormArgType = False, + device: Optional[str] = None, ) -> ParamStore: """ Create a module which carries the `weight` parameter. @@ -273,16 +329,18 @@ def get_weight_store(shape: List[int], Use `NormedAndScaledWeightStore` if `True` or `WeightNormMode.FULL`. Use `NormedWeightStore` if `WeightNormMode.NO_SCALE`. Use `WeightStore` if `False` or `WeightNormMode.NONE`. + device: The device where to place new tensors and variables. Returns: The weight object. """ + device = device or current_device() if weight_norm is True or weight_norm == WeightNormMode.FULL: - return NormedAndScaledWeightStore(shape, initializer, norm_axis) + return NormedAndScaledWeightStore(shape, initializer, norm_axis, device) elif weight_norm == WeightNormMode.NO_SCALE: - return NormedWeightStore(shape, initializer, norm_axis) + return NormedWeightStore(shape, initializer, norm_axis, device) elif weight_norm is False or weight_norm == WeightNormMode.NONE: - return SimpleParamStore(shape, initializer) + return SimpleParamStore(shape, initializer, device) else: raise ValueError(f'Invalid value for argument `weight_norm`: ' f'{weight_norm!r}.') @@ -290,7 +348,8 @@ def get_weight_store(shape: List[int], def get_bias_store(shape: List[int], initializer: TensorInitArgType = DEFAULT_BIAS_INIT, - use_bias: bool = True + use_bias: bool = True, + device: Optional[str] = None ) -> Optional[ParamStore]: """ Create a module that carries the `bias` parameter. @@ -300,12 +359,14 @@ def get_bias_store(shape: List[int], initializer: The initializer for the bias. use_bias: Whether or not to use the bias? If `False`, will return :obj:`None`. + device: The device where to place new tensors and variables. Returns: The bias object, or :obj:`None` if `use_bias` is False. """ + device = device or current_device() if use_bias: - return SimpleParamStore(shape, initializer) + return SimpleParamStore(shape, initializer, device) # ---- identity layer ---- @@ -393,13 +454,17 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): + device = device or current_device() weight_store = get_weight_store( - weight_shape, initializer=weight_init, weight_norm=weight_norm) + weight_shape, initializer=weight_init, weight_norm=weight_norm, + device=device + ) bias_store = get_bias_store( - bias_shape, initializer=bias_init, use_bias=use_bias) + bias_shape, initializer=bias_init, use_bias=use_bias, device=device) if bias_store is None: - bias_store = _NullParamStore() + bias_store = _NullParamStore(device=device) if data_init is not None: if not isinstance(data_init, init.DataDependentInitializer) and \ @@ -453,6 +518,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): in_features = validate_positive_int('in_features', in_features) out_features = validate_positive_int('out_features', out_features) @@ -468,6 +534,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ) @jit_method @@ -504,6 +571,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): spatial_ndims = self._get_spatial_ndims() in_channels = validate_positive_int('in_channels', in_channels) 
@@ -530,6 +598,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ) def _get_spatial_ndims(self) -> int: @@ -632,6 +701,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): spatial_ndims = self._get_spatial_ndims() in_channels = validate_positive_int('in_channels', in_channels) @@ -661,6 +731,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ) def _get_spatial_ndims(self) -> int: @@ -767,8 +838,12 @@ class BatchNorm(torch_nn.BatchNorm1d): def __init__(self, num_features: int, momentum: float = 0.1, + device: Optional[str] = None, epsilon: float = EPSILON): + device = device or current_device() super().__init__(num_features, eps=epsilon, momentum=momentum) + if device != CPU_DEVICE: + self.to(device=device) def _check_input_dim(self, input: Tensor): if rank(input) != 2: @@ -782,8 +857,12 @@ class BatchNorm1d(torch_nn.BatchNorm1d): def __init__(self, num_features: int, momentum: float = 0.1, + device: Optional[str] = None, epsilon: float = EPSILON): + device = device or current_device() super().__init__(num_features, eps=epsilon, momentum=momentum) + if device != CPU_DEVICE: + self.to(device=device) def _check_input_dim(self, input: Tensor): if rank(input) != 3: @@ -797,8 +876,12 @@ class BatchNorm2d(torch_nn.BatchNorm2d): def __init__(self, num_features: int, momentum: float = 0.1, + device: Optional[str] = None, epsilon: float = EPSILON): + device = device or current_device() super().__init__(num_features, eps=epsilon, momentum=momentum) + if device != CPU_DEVICE: + self.to(device=device) def _check_input_dim(self, input: Tensor): if input.dim() != 4: @@ -812,8 +895,12 @@ class BatchNorm3d(torch_nn.BatchNorm3d): def __init__(self, num_features: int, momentum: float = 0.1, + device: Optional[str] = None, epsilon: float = EPSILON): + device = device or current_device() super().__init__(num_features, eps=epsilon, momentum=momentum) + if device != CPU_DEVICE: + self.to(device=device) def _check_input_dim(self, input: Tensor): if rank(input) != 5: @@ -843,11 +930,12 @@ def forward(self, input: Tensor) -> Tensor: raise ValueError('`input` must be at least 2d, but the ' 'input shape is {}.'.format(shape(input))) + device = input.device output = input if self.training: noise_shape = output.shape[:-1] + (1,) - noise = torch.zeros(noise_shape, dtype=output.dtype) - keep_prob = torch.as_tensor(self._keep_prob, dtype=output.dtype) + noise = torch.zeros(noise_shape, dtype=output.dtype, device=device) + keep_prob = torch.as_tensor(self._keep_prob, dtype=output.dtype, device=device) noise = torch.bernoulli(keep_prob.expand(noise_shape), out=noise) noise = noise.detach() output = output * noise / keep_prob diff --git a/tensorkit/backend/pytorch_/random.py b/tensorkit/backend/pytorch_/random.py index b46cf25..aec0f2b 100644 --- a/tensorkit/backend/pytorch_/random.py +++ b/tensorkit/backend/pytorch_/random.py @@ -37,26 +37,34 @@ def seed(seed: int): torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # ---- uniform distribution ---- @jit -def rand(shape: List[int], dtype: str = settings.float_x) -> Tensor: +def rand(shape: List[int], + dtype: str = settings.float_x, + device: Optional[str] = None) -> Tensor: if dtype == 'float32': real_dtype = torch.float32 else: real_dtype = {'float16': torch.float16, 
'float64': torch.float64}[dtype] - return torch.rand(shape, dtype=real_dtype) + + if device is None: + device = current_device() + return torch.rand(shape, dtype=real_dtype, device=device) @jit def uniform(shape: List[int], low: float, high: float, - dtype: str = settings.float_x) -> Tensor: + dtype: str = settings.float_x, + device: Optional[str] = None) -> Tensor: if low >= high: raise ValueError('`low` < `high` does not hold: low == {}, high == {}'. format(low, high)) scale = high - low - return rand(shape, dtype) * scale + low + return rand(shape, dtype, device=device) * scale + low # ---- shuffle and random permutation ---- @@ -64,7 +72,8 @@ def uniform(shape: List[int], low: float, high: float, def shuffle(input: Tensor, axis: int = 0) -> Tensor: input_shape = input.shape shuffle_size = input_shape[axis] - permutation = torch.randperm(shuffle_size, dtype=torch.long) + permutation = torch.randperm( + shuffle_size, dtype=torch.long, device=input.device) if axis == 0: return input[permutation] else: @@ -72,22 +81,32 @@ def shuffle(input: Tensor, axis: int = 0) -> Tensor: @jit -def random_permutation(n: int, dtype: str = 'int32') -> Tensor: +def random_permutation(n: int, + dtype: str = 'int32', + device: Optional[str] = None) -> Tensor: if dtype == 'int32': int_dtype = torch.int32 else: int_dtype = {'int8': torch.int8, 'int16': torch.int16, 'int64': torch.int64}[dtype] - return torch.randperm(n, dtype=int_dtype) + + if device is None: + device = current_device() + return torch.randperm(n, dtype=int_dtype, device=device) # ---- normal distribution ---- @jit -def randn(shape: List[int], dtype: str = settings.float_x) -> Tensor: +def randn(shape: List[int], + dtype: str = settings.float_x, + device: Optional[str] = None,) -> Tensor: if dtype == 'float32': real_dtype = torch.float32 else: real_dtype = {'float16': torch.float16, 'float64': torch.float64}[dtype] - return torch.randn(shape, dtype=real_dtype) + + if device is None: + device = current_device() + return torch.randn(shape, dtype=real_dtype, device=device) @jit @@ -116,7 +135,7 @@ def normal(mean: Tensor, param_shape = broadcast_shape(shape(mean), shape(std)) if n_samples is not None: param_shape = [n_samples] + param_shape - r = std * torch.randn(param_shape, dtype=mean.dtype) + mean + r = std * torch.randn(param_shape, dtype=mean.dtype, device=mean.device) + mean if not reparameterized: r = r.detach() return r @@ -181,7 +200,7 @@ def bernoulli(probs: Tensor, if n_samples is not None: sample_shape = (n_samples,) + sample_shape probs = probs.unsqueeze(dim=0).expand(sample_shape) - out = torch.zeros(sample_shape, dtype=target_dtype) + out = torch.zeros(sample_shape, dtype=target_dtype, device=probs.device) return torch.bernoulli(probs, out=out).detach() diff --git a/tensorkit/distributions/base.py b/tensorkit/distributions/base.py index 7adf2bd..81ed490 100644 --- a/tensorkit/distributions/base.py +++ b/tensorkit/distributions/base.py @@ -70,6 +70,9 @@ class Distribution(metaclass=DocInherit): ``value_shape == batch_shape + event_shape``. """ + device: str + """Device, where the parameters of this distribution is placed.""" + validate_tensors: bool """ Whether or not to perform time-consuming validation on argument tensors @@ -85,6 +88,7 @@ def __init__(self, reparameterized: Optional[bool] = None, event_ndims: Optional[int] = None, min_event_ndims: Optional[int] = None, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): # either `value_shape` or `batch_shape` should be specified, but not both. 
if value_shape is None and batch_shape is None: @@ -158,6 +162,7 @@ def __init__(self, self.batch_shape = batch_shape self.event_shape = event_shape self.event_ndims = event_ndims + self.device = device or T.current_device() self.validate_tensors = ( settings.validate_tensors if validate_tensors is None else bool(validate_tensors) @@ -251,7 +256,8 @@ def log_prob(self, Returns: The computed log-prob or log-density. """ - given = T.as_tensor(given) + if not isinstance(given, T.Tensor): + given = T.as_tensor(given, device=self.device) reduce_ndims = get_prob_reduce_ndims( # here `given` might have lower rank than `len(value_shape)`, # in which case `given` should be broadcasted to match `value_shape`. diff --git a/tensorkit/distributions/bernoulli.py b/tensorkit/distributions/bernoulli.py index a13b98b..3000bdf 100644 --- a/tensorkit/distributions/bernoulli.py +++ b/tensorkit/distributions/bernoulli.py @@ -40,6 +40,7 @@ def __init__(self, dtype: str = T.int32, event_ndims: int = 0, epsilon: float = T.EPSILON, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`Bernoulli` distribution object. @@ -53,12 +54,14 @@ def __init__(self, event_ndims: The number of dimensions in the samples to be considered as an event. epsilon: The infinitesimal constant, used for computing `logits`. + device: The device where to place new tensors and variables. validate_tensors: Whether or not to check the numerical issues? Defaults to ``settings.validate_tensors``. """ # validate the arguments (logits, probs), = check_tensor_arg_types([('logits', logits), - ('probs', probs)]) + ('probs', probs)], + device=device) if logits is not None: value_shape = T.shape(logits) mutual_params = {'logits': logits} @@ -72,6 +75,7 @@ def __init__(self, dtype=dtype, value_shape=value_shape, event_ndims=event_ndims, + device=device or T.get_device(logits), validate_tensors=validate_tensors, ) for k, v in mutual_params.items(): @@ -130,7 +134,8 @@ def copy(self, **overrided_params): return copy_distribution( cls=Bernoulli, base=self, - attrs=('dtype', 'event_ndims', 'validate_tensors', 'epsilon'), + attrs=('dtype', 'device', 'event_ndims', 'validate_tensors', + 'epsilon'), mutual_attrs=(('logits', 'probs'),), compute_deps={'logits': ('epsilon',)}, original_mutual_params=self._mutual_params, diff --git a/tensorkit/distributions/categorical.py b/tensorkit/distributions/categorical.py index ec7eda1..1140474 100644 --- a/tensorkit/distributions/categorical.py +++ b/tensorkit/distributions/categorical.py @@ -36,9 +36,11 @@ def __init__(self, dtype: str, event_ndims: int, epsilon: float = T.EPSILON, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): (logits, probs), = check_tensor_arg_types([('logits', logits), - ('probs', probs)]) + ('probs', probs)], + device=device) if logits is not None: param_shape = T.shape(logits) mutual_params = {'logits': logits} @@ -59,6 +61,7 @@ def __init__(self, dtype=dtype, value_shape=value_shape, event_ndims=event_ndims, + device=device or T.get_device(logits), validate_tensors=validate_tensors, ) for k, v in mutual_params.items(): @@ -117,7 +120,7 @@ def copy(self, **overrided_params): return copy_distribution( cls=self.__class__, base=self, - attrs=('dtype', 'event_ndims', 'validate_tensors', 'epsilon'), + attrs=('dtype', 'event_ndims', 'epsilon', 'device', 'validate_tensors'), mutual_attrs=(('logits', 'probs'),), compute_deps={'logits': ('epsilon',)}, original_mutual_params=self._mutual_params, @@ -143,6 +146,7 @@ def __init__(self, 
dtype: str = T.categorical_dtype, event_ndims: int = 0, epsilon: float = T.EPSILON, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`Categorical` distribution object. @@ -165,8 +169,9 @@ def __init__(self, probs=probs, dtype=dtype, event_ndims=event_ndims, - validate_tensors=validate_tensors, epsilon=epsilon, + device=device, + validate_tensors=validate_tensors, ) def _sample(self, @@ -195,7 +200,7 @@ def to_one_hot(self, dtype: str = T.int32) -> 'OneHotCategorical': return copy_distribution( cls=OneHotCategorical, base=self, - attrs=('dtype', 'validate_tensors', 'event_ndims', 'epsilon'), + attrs=('dtype', 'event_ndims', 'epsilon', 'device', 'validate_tensors'), mutual_attrs=(('logits', 'probs'),), compute_deps={'logits': ('epsilon',)}, original_mutual_params=self._mutual_params, @@ -223,6 +228,7 @@ def __init__(self, dtype: str = T.int32, event_ndims: int = 1, epsilon: float = T.EPSILON, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`OneHotCategorical` distribution object. @@ -234,6 +240,7 @@ def __init__(self, probs: The probability `p` of being each possible value. ``p = softmax(logits)``. dtype: The dtype of the samples. + device: The device where to place new tensors and variables. event_ndims: The number of dimensions in the samples to be considered as an event. epsilon: The infinitesimal constant, used for computing `logits`. @@ -245,8 +252,9 @@ def __init__(self, probs=probs, dtype=dtype, event_ndims=event_ndims, - validate_tensors=validate_tensors, epsilon=epsilon, + device=device, + validate_tensors=validate_tensors, ) def _sample(self, @@ -272,7 +280,7 @@ def to_indexed(self, dtype: str = T.categorical_dtype) -> 'Categorical': return copy_distribution( cls=Categorical, base=self, - attrs=('dtype', 'validate_tensors', 'event_ndims', 'epsilon'), + attrs=('dtype', 'event_ndims', 'epsilon', 'device', 'validate_tensors'), mutual_attrs=(('logits', 'probs'),), compute_deps={'logits': ('epsilon',)}, original_mutual_params=self._mutual_params, diff --git a/tensorkit/distributions/discretized.py b/tensorkit/distributions/discretized.py index dad8f19..549cc8a 100644 --- a/tensorkit/distributions/discretized.py +++ b/tensorkit/distributions/discretized.py @@ -65,6 +65,7 @@ def __init__(self, reparameterized: bool = False, event_ndims: int = 0, epsilon: float = T.EPSILON, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`DiscretizedLogistic`. @@ -87,6 +88,7 @@ def __init__(self, considered as an event. epsilon: An infinitesimal constant to avoid dividing by zero or taking logarithm of zero. + device: The device where to place new tensors and variables. validate_tensors: Whether or not to check the numerical issues? Defaults to ``settings.validate_tensors``. 
""" @@ -96,8 +98,8 @@ def __init__(self, '`discretize_sample` is True.') mean, log_scale = check_tensor_arg_types( - ('mean', mean), ('log_scale', log_scale)) - log_scale = T.as_tensor_backend(log_scale, dtype=mean.dtype) + ('mean', mean), ('log_scale', log_scale), device=device) + log_scale = T.cast_like(log_scale, mean) dtype = T.get_dtype(mean) if min_val is not None and max_val is not None: @@ -127,6 +129,7 @@ def __init__(self, continuous=not discretize_sample, reparameterized=reparameterized, event_ndims=event_ndims, + device=device or T.get_device(mean), validate_tensors=validate_tensors, ) self.mean = mean @@ -188,7 +191,8 @@ def copy(self, **overrided_params): attrs=( 'mean', 'log_scale', 'bin_size', 'min_val', 'max_val', 'biased_edges', 'discretize_given', 'discretize_sample', - 'reparameterized', 'event_ndims', 'epsilon', 'validate_tensors' + 'reparameterized', 'event_ndims', 'epsilon', 'device', + 'validate_tensors', ), overrided_params=overrided_params, ) diff --git a/tensorkit/distributions/flow.py b/tensorkit/distributions/flow.py index 6064903..714d268 100644 --- a/tensorkit/distributions/flow.py +++ b/tensorkit/distributions/flow.py @@ -2,6 +2,7 @@ from .. import tensor as T from ..flows import Flow +from ..layers import is_jit_layer from ..stochastic import StochasticTensor from .base import Distribution from .utils import copy_distribution, get_overrided_parameterized @@ -34,7 +35,7 @@ def __init__(self, if not isinstance(distribution, Distribution): raise TypeError(f'`distribution` is not an instance of ' f'`Distribution`: got {distribution!r}') - if not isinstance(flow, Flow) and not T.is_jit_layer(flow): + if not isinstance(flow, Flow) and not is_jit_layer(flow): raise TypeError(f'`flow` is not a flow: {flow!r}') # `distribution` is required to be continuous and have float dtype. @@ -93,7 +94,8 @@ def __init__(self, super(FlowDistribution, self).__init__( dtype=dtype, batch_shape=batch_shape, continuous=continuous, reparameterized=reparameterized, event_ndims=event_ndims, - min_event_ndims=min_event_ndims, validate_tensors=validate_tensors, + min_event_ndims=min_event_ndims, device=distribution.device, + validate_tensors=validate_tensors, ) self._base_distribution = distribution self.flow = flow diff --git a/tensorkit/distributions/mixture.py b/tensorkit/distributions/mixture.py index 648be13..59c6ae1 100644 --- a/tensorkit/distributions/mixture.py +++ b/tensorkit/distributions/mixture.py @@ -98,7 +98,7 @@ def __init__(self, validate_tensors = True # attributes of `components` - for attr in ('dtype', 'continuous', 'event_ndims', 'batch_shape'): + for attr in ('dtype', 'continuous', 'event_ndims', 'batch_shape', 'device'): c0_val = getattr(components[0], attr) for i, c in enumerate(components[1:], 1): c_val = getattr(c, attr) @@ -108,15 +108,18 @@ def __init__(self, f'{c_val} vs {c0_val}.' ) dtype = components[0].dtype + device = components[0].device continuous = components[0].continuous batch_shape = components[0].batch_shape - # categorical `batch_shape` must be broadcastable to `batch_shape` - if categorical.batch_shape != batch_shape: - raise ValueError( - f'`categorical.batch_shape` != the `batch_shape` of ' - f'`components`: {categorical.batch_shape} vs {batch_shape}.' 
- ) + # categorical `batch_shape` and `device` must match the components + for attr in ('batch_shape', 'device'): + if getattr(categorical, attr) != getattr(components[0], attr): + raise ValueError( + f'`categorical.{attr}` != the `{attr}` of ' + f'`components`: {getattr(categorical, attr)} vs ' + f'{getattr(components[0], attr)}.' + ) # infer the `min_event_shape` and `min_event_ndims` min_event_shape = components[0].event_shape @@ -159,6 +162,7 @@ def __init__(self, reparameterized=reparameterized, event_ndims=event_ndims, min_event_ndims=min_event_ndims, + device=device, validate_tensors=validate_tensors, ) self.categorical = categorical.to_indexed() diff --git a/tensorkit/distributions/normal.py b/tensorkit/distributions/normal.py index 3ea34f0..b5f1fb0 100644 --- a/tensorkit/distributions/normal.py +++ b/tensorkit/distributions/normal.py @@ -34,6 +34,7 @@ def __init__(self, dtype: str = T.float_x(), reparameterized: bool = True, event_ndims: int = 0, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`UnitNormal` distribution. @@ -44,6 +45,7 @@ def __init__(self, reparameterized: Whether the distribution should be reparameterized? event_ndims: The number of dimensions in the samples to be considered as an event. + device: The device where to place new tensors and variables. validate_tensors: Whether or not to check the numerical issues? Defaults to ``settings.validate_tensors``. """ @@ -52,6 +54,7 @@ def __init__(self, value_shape=shape, reparameterized=reparameterized, event_ndims=event_ndims, + device=device or T.current_device(), validate_tensors=validate_tensors, ) @@ -59,21 +62,24 @@ def __init__(self, def mean(self) -> T.Tensor: """The mean of the normal distribution.""" if self._mean is None: - self._mean = T.zeros(self.value_shape, self.dtype) + self._mean = T.zeros( + self.value_shape, dtype=self.dtype, device=self.device) return self._mean @property def std(self) -> T.Tensor: """The standard deviation (std) of the normal distribution.""" if self._std is None: - self._std = T.ones(self.value_shape, self.dtype) + self._std = T.ones( + self.value_shape, dtype=self.dtype, device=self.device) return self._std @property def logstd(self) -> T.Tensor: """The log-std of the normal distribution.""" if self._logstd is None: - self._logstd = T.zeros(self.value_shape, self.dtype) + self._logstd = T.zeros( + self.value_shape, dtype=self.dtype, device=self.device) return self._logstd def _sample(self, @@ -86,6 +92,7 @@ def _sample(self, shape=([n_samples] + self.value_shape if n_samples is not None else self.value_shape), dtype=self.dtype, + device=self.device, ), distribution=self, n_samples=n_samples, @@ -104,12 +111,12 @@ def copy(self, **overrided_params): cls=UnitNormal, base=self, attrs=(('shape', 'value_shape'), 'dtype', 'reparameterized', - 'event_ndims', 'validate_tensors'), + 'event_ndims', 'device', 'validate_tensors'), cached_attrs=('mean', 'std', 'logstd'), compute_deps={ - 'mean': ('dtype', 'value_shape'), - 'std': ('dtype', 'value_shape'), - 'logstd': ('dtype', 'value_shape'), + 'mean': ('dtype', 'value_shape', 'device'), + 'std': ('dtype', 'value_shape', 'device'), + 'logstd': ('dtype', 'value_shape', 'device'), }, overrided_params=overrided_params, ) @@ -147,10 +154,13 @@ def __init__(self, logstd: Optional[TensorOrData] = None, reparameterized: bool = True, event_ndims: int = 0, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): # validate the arguments mean, (std, logstd) = check_tensor_arg_types( - 
('mean', mean), [('std', std), ('logstd', logstd)]) + ('mean', mean), [('std', std), ('logstd', logstd)], + device=device, + ) if std is not None: mutual_params = {'std': std} stdx = std @@ -168,6 +178,7 @@ def __init__(self, reparameterized=reparameterized, value_shape=value_shape, event_ndims=event_ndims, + device=device or T.get_device(mean), validate_tensors=validate_tensors, ) for k, v in mutual_params.items(): @@ -208,7 +219,8 @@ def copy(self, **overrided_params): cls=self.__class__, base=self, attrs=( - ('mean', 'reparameterized', 'event_ndims', 'validate_tensors') + + ('mean', 'reparameterized', 'event_ndims', 'device', + 'validate_tensors') + self._extra_args ), mutual_attrs=(('std', 'logstd'),), @@ -227,6 +239,7 @@ def __init__(self, logstd: Optional[TensorOrData] = None, reparameterized: bool = True, event_ndims: int = 0, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`Normal` distribution instance. @@ -238,12 +251,13 @@ def __init__(self, reparameterized: Whether the distribution should be reparameterized? event_ndims: The number of dimensions in the samples to be considered as an event. + device: The device where to place new tensors and variables. validate_tensors: Whether or not to check the numerical issues? Defaults to ``settings.validate_tensors``. """ super().__init__( mean=mean, std=std, logstd=logstd, reparameterized=reparameterized, - event_ndims=event_ndims, validate_tensors=validate_tensors, + event_ndims=event_ndims, device=device, validate_tensors=validate_tensors, ) def _sample(self, @@ -304,6 +318,7 @@ def __init__(self, event_ndims: int = 0, epsilon: float = T.EPSILON, log_zero: float = T.random.LOG_ZERO_VALUE, + device: Optional[str] = None, validate_tensors: Optional[bool] = None): """ Construct a new :class:`TruncatedNormal` distribution instance. @@ -321,6 +336,7 @@ def __init__(self, log_zero: The value to represent ``log(0)`` in the result of :meth:`log_prob()`, instead of using ``-math.inf``, to avoid potential numerical issues. + device: The device where to place new tensors and variables. validate_tensors: Whether or not to check the numerical issues? Defaults to ``settings.validate_tensors``. """ @@ -330,7 +346,7 @@ def __init__(self, f'and high == {high}.') super().__init__( mean=mean, std=std, logstd=logstd, reparameterized=reparameterized, - event_ndims=event_ndims, validate_tensors=validate_tensors, + event_ndims=event_ndims, device=device, validate_tensors=validate_tensors, ) self.low = low self.high = high diff --git a/tensorkit/distributions/uniform.py b/tensorkit/distributions/uniform.py index a1c5250..547c0c6 100644 --- a/tensorkit/distributions/uniform.py +++ b/tensorkit/distributions/uniform.py @@ -19,10 +19,10 @@ class Uniform(Distribution): _shape: Optional[List[int]] """The original `shape` argument for constructor.""" - low: Union[T.Tensor] + low: T.Tensor """The lower-bound of the uniform distribution.""" - high: Union[T.Tensor] + high: T.Tensor """The upper-bound of the uniform distribution (exclusive).""" log_zero: float @@ -37,6 +37,7 @@ def __init__(self, dtype: str = T.float_x(), reparameterized: bool = True, event_ndims: int = 0, + device: Optional[str] = None, log_zero: float = T.random.LOG_ZERO_VALUE, validate_tensors: Optional[bool] = None): """ @@ -55,6 +56,7 @@ def __init__(self, reparameterized: Whether the distribution should be reparameterized? event_ndims: The number of dimensions in the samples to be considered as an event. 
+ device: The device where to place new tensors and variables. log_zero: The value to represent ``log(0)`` in the result of :meth:`log_prob()`, instead of using ``-math.inf``, to avoid potential numerical issues. @@ -79,7 +81,9 @@ def __init__(self, range_checked = True low, high = check_tensor_arg_types( - ('low', low), ('high', high), default_dtype=dtype) + ('low', low), ('high', high), default_dtype=dtype, + device=device, + ) dtype = T.get_dtype(low) value_shape = (value_shape + @@ -90,6 +94,7 @@ def __init__(self, value_shape=value_shape, reparameterized=reparameterized, event_ndims=event_ndims, + device=device or T.get_device(low), validate_tensors=validate_tensors, ) @@ -118,7 +123,7 @@ def _sample(self, reparameterized: bool) -> StochasticTensor: sample_shape = ([n_samples] + self.value_shape if n_samples is not None else self.value_shape) - samples = T.random.rand(sample_shape, dtype=self.dtype) + samples = T.random.rand(sample_shape, dtype=self.dtype, device=self.device) if self.low is not None and self.high is not None: scale = self.high - self.low samples = samples * scale + self.low @@ -157,6 +162,6 @@ def copy(self, **overrided_params): base=self, attrs=(('shape', '_shape'), 'low', 'high', 'dtype', 'reparameterized', 'event_ndims', 'log_zero', - 'validate_tensors'), + 'device', 'validate_tensors'), overrided_params=overrided_params, ) diff --git a/tensorkit/distributions/utils.py b/tensorkit/distributions/utils.py index 95f8e68..6fc3478 100644 --- a/tensorkit/distributions/utils.py +++ b/tensorkit/distributions/utils.py @@ -3,8 +3,8 @@ import numpy as np -from ..tensor import (jit, Tensor, where, as_tensor_backend, as_tensor, get_dtype, - float_x) +from ..tensor import (jit, Tensor, where, float_scalar_like, as_tensor, + get_dtype, get_device, current_device, float_x) __all__ = [ 'get_overrided_parameterized', @@ -84,12 +84,14 @@ def log_pdf_mask(condition: Tensor, out remaining positions (i.e., set log-pdf of these locations to `log_zero`). """ - return where(condition, log_pdf, as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + return where(condition, log_pdf, float_scalar_like(log_zero, log_pdf)) def check_tensor_arg_types(*args, dtype: Optional[str] = None, - default_dtype: str = float_x() + device: Optional[str] = None, + default_dtype: str = float_x(), + default_device: Optional[str] = None, ) -> Tuple[Union[Tensor, Tuple[Tensor, ...]], ...]: """ Validate tensor argument types. @@ -104,9 +106,13 @@ def check_tensor_arg_types(*args, while the others must be None. dtype: If specified, all arguments must be tensors of this dtype, or Python numbers (which can be casted into this dtype). + device: If specified, all tensor arguments must be placed on this device. default_dtype: The default dtype to cast Python numbers into, if `dtype` is not specified, and all arguments are Python numbers (thus no dtype can be inferred). + default_device: The default device where to place new tensors and + variables, if `device` is not specified, and all arguments are + Python numbers (thus no device can be inferred). Returns: A list of validated tensors. 
@@ -130,12 +136,25 @@ def check_dtype(name, data): raise ValueError(f'`{name}.dtype` != `{inferred_dtype[0]}`: ' f'{data_dtype} vs {inferred_dtype[1]}') + def check_device(name, data): + if isinstance(data, StochasticTensor): + data = data.tensor + if isinstance(data, Tensor): + data_device = get_device(data) + if inferred_device[1] is None: + inferred_device[0] = f'{name}.device' + inferred_device[1] = data_device + elif inferred_device[1] != data_device: + raise ValueError(f'`{name}.device` != `{inferred_device[0]}`: ' + f'{data_device} vs {inferred_device[1]}') + def check_arg(arg): if isinstance(arg, tuple): name, data = arg if data is None: raise ValueError(f'`{name}` must be specified.') check_dtype(name, data) + check_device(name, data) else: not_none_count = 0 for i, (name, data) in enumerate(arg): @@ -144,6 +163,7 @@ def check_arg(arg): if not_none_count != 1: break check_dtype(name, data) + check_device(name, data) if not_none_count != 1: names = [f'`{n}`' for n, _ in arg] if len(names) == 2: @@ -161,18 +181,26 @@ def check_arg(arg): else: inferred_dtype = [None, None] + if device is not None: + inferred_device = ['device', device] + else: + inferred_device = [None, None] + for a in args: check_arg(a) # do cast the tensors + default_device = default_device or current_device() target_dtype = inferred_dtype[1] or default_dtype + target_device = inferred_device[1] or default_device + ret: List[Union[Tensor, Tuple[Tensor, ...]]] = [] for arg in args: if isinstance(arg, tuple): - ret.append(as_tensor(arg[1], dtype=target_dtype)) + ret.append(as_tensor(arg[1], dtype=target_dtype, device=target_device)) else: ret.append(tuple( - (as_tensor(data, dtype=target_dtype) + (as_tensor(data, dtype=target_dtype, device=target_device) if data is not None else None) for _, data in arg )) diff --git a/tensorkit/examples/classification/mnist.py b/tensorkit/examples/classification/mnist.py index 7b43db0..b91a8bb 100644 --- a/tensorkit/examples/classification/mnist.py +++ b/tensorkit/examples/classification/mnist.py @@ -43,7 +43,7 @@ def train_step(x, y): return {'loss': loss, 'acc': acc} def evaluate(x, y): - with T.no_grad(): + with tk.layers.scoped_eval_mode(net), T.no_grad(): logits = net(x) acc = utils.calculate_acc(logits, y) return {'acc': acc} @@ -87,6 +87,7 @@ def evaluate(x, y): ) # train the model + tk.layers.set_train_mode(net, True) utils.fit_model(loop=loop, optimizer=optimizer, fn=train_step, stream=train_stream) @@ -96,4 +97,5 @@ def evaluate(x, y): if __name__ == '__main__': with mltk.Experiment(Config) as exp: - main(exp) + with T.use_device(T.first_gpu_device()): + main(exp) diff --git a/tensorkit/examples/classification/mnist_resnet.py b/tensorkit/examples/classification/mnist_resnet.py index 4b5aacd..374c6a5 100644 --- a/tensorkit/examples/classification/mnist_resnet.py +++ b/tensorkit/examples/classification/mnist_resnet.py @@ -49,7 +49,7 @@ def train_step(x, y): return {'loss': loss, 'acc': acc} def evaluate(x, y): - with T.no_grad(): + with tk.layers.scoped_eval_mode(net), T.no_grad(): logits = net(x) acc = utils.calculate_acc(logits, y) return {'acc': acc} @@ -92,6 +92,7 @@ def evaluate(x, y): ) # train the model + tk.layers.set_train_mode(net, True) utils.fit_model(loop=loop, optimizer=optimizer, fn=train_step, stream=train_stream) @@ -101,4 +102,5 @@ def evaluate(x, y): if __name__ == '__main__': with mltk.Experiment(Config) as exp: - main(exp) + with T.use_device(T.first_gpu_device()): + main(exp) diff --git a/tensorkit/flows/act_norm.py b/tensorkit/flows/act_norm.py index 
239da4b..5a99a71 100644 --- a/tensorkit/flows/act_norm.py +++ b/tensorkit/flows/act_norm.py @@ -4,7 +4,7 @@ from .. import init, tensor as T from ..tensor import (Tensor, Module, reshape, shape, int_range, calculate_mean_and_var, assert_finite, - as_tensor_backend, maximum, log, sqrt) + float_scalar_like, maximum, log, sqrt) from ..layers import * from ..typing_ import * from .core import * @@ -43,8 +43,9 @@ def __init__(self, event_ndims: int = 1, scale: Union[str, ActNormScaleType] = 'exp', initialized: bool = False, - epsilon: float = T.EPSILON, - dtype: str = T.float_x()): + dtype: str = T.float_x(), + device: Optional[str] = None, + epsilon: float = T.EPSILON): """ Construct a new :class:`ActNorm` instance. @@ -64,9 +65,10 @@ def __init__(self, initialized: Whether or not the variables have been initialized? Defaults to :obj:`False`, where the first input `x` in the forward pass will be used to initialize the variables. + dtype: Dtype of the parameters. + device: The device where to place new tensors and variables. epsilon: The infinitesimal constant to avoid dividing by zero or taking logarithm of zero. - dtype: Dtype of the parameters. """ # validate the arguments scale_type = ActNormScaleType(scale) @@ -81,6 +83,8 @@ def __init__(self, else: # pragma: no cover raise ValueError(f'Unsupported `scale_type`: {scale_type}') + device = device or T.current_device() + # construct the layer super().__init__(axis=axis, event_ndims=event_ndims, @@ -94,11 +98,13 @@ def __init__(self, add_parameter( self, 'pre_scale', - T.variable([num_features], dtype=dtype, initializer=pre_scale_init), + T.variable([num_features], dtype=dtype, initializer=pre_scale_init, + device=device), ) add_parameter( self, 'bias', - T.variable([num_features], dtype=dtype, initializer=init.zeros), + T.variable([num_features], dtype=dtype, initializer=init.zeros, + device=device), ) @T.jit_method @@ -133,7 +139,7 @@ def calculate_bias_and_pre_scale_for_init(self, input: Tensor) -> Tuple[Tensor, bias = -input_mean # calculate the initial value for `pre_scale` - epsilon = as_tensor_backend(self.epsilon, dtype=input_var.dtype) + epsilon = float_scalar_like(self.epsilon, input_var) if self.scale_type == 'exp': pre_scale = -0.5 * log(maximum(input_var, epsilon)) else: @@ -142,12 +148,13 @@ def calculate_bias_and_pre_scale_for_init(self, input: Tensor) -> Tuple[Tensor, return bias, pre_scale @T.jit_ignore - def _initialize_act_norm(self, input: Tensor) -> None: + def _initialize_act_norm(self, input: Tensor) -> bool: bias, pre_scale = self.calculate_bias_and_pre_scale_for_init(input) with T.no_grad(): - T.assign(get_parameter(self, 'bias'), bias) - T.assign(get_parameter(self, 'pre_scale'), pre_scale) + T.assign(self.bias, bias) + T.assign(self.pre_scale, pre_scale) self.set_initialized(True) + return False @T.jit_method def _transform(self, @@ -199,8 +206,9 @@ def __init__(self, num_features: int, scale: Union[str, ActNormScaleType] = 'exp', initialized: bool = False, - epsilon: float = T.EPSILON, - dtype: str = T.float_x()): + dtype: str = T.float_x(), + device: Optional[str] = None, + epsilon: float = T.EPSILON): """ Construct a new convolutional :class:`ActNorm` instance. @@ -213,9 +221,10 @@ def __init__(self, initialized: Whether or not the variables have been initialized? Defaults to :obj:`False`, where the first input `x` in the forward pass will be used to initialize the variables. + dtype: Dtype of the parameters. + device: The device where to place new tensors and variables. 
epsilon: The infinitesimal constant to avoid dividing by zero or taking logarithm of zero. - dtype: Dtype of the parameters. """ spatial_ndims = self._get_spatial_ndims() feature_axis = -1 if T.IS_CHANNEL_LAST else -(spatial_ndims + 1) @@ -226,8 +235,9 @@ def __init__(self, event_ndims=spatial_ndims + 1, scale=scale, initialized=initialized, - epsilon=epsilon, dtype=dtype, + device=device, + epsilon=epsilon, ) def _get_spatial_ndims(self) -> int: diff --git a/tensorkit/flows/coupling.py b/tensorkit/flows/coupling.py index 4d1729a..49d0c81 100644 --- a/tensorkit/flows/coupling.py +++ b/tensorkit/flows/coupling.py @@ -1,6 +1,7 @@ from typing import * from .. import tensor as T +from ..layers import is_jit_layer from ..tensor import Tensor, Module, concat, split from .core import (FeatureMappingFlow, Scale, ExpScale, SigmoidScale, LinearScale) @@ -106,11 +107,13 @@ def __init__(self, scale = INVALID if isinstance(scale, Module): - if not isinstance(scale, Scale) and not T.is_jit_layer(scale): + if not isinstance(scale, Scale) and not is_jit_layer(scale): scale = INVALID elif isinstance(scale, type) or callable(scale): if scale is SigmoidScale: scale = scale(pre_scale_bias=sigmoid_scale_bias) + elif scale is LinearScale: + scale = scale(epsilon=epsilon) else: scale = scale() else: diff --git a/tensorkit/flows/rearrangement.py b/tensorkit/flows/rearrangement.py index 539771c..c075298 100644 --- a/tensorkit/flows/rearrangement.py +++ b/tensorkit/flows/rearrangement.py @@ -1,7 +1,7 @@ from typing import * from .. import tensor as T -from ..tensor import Tensor, argsort, index_select, as_tensor_backend +from ..tensor import Tensor, argsort, index_select, float_scalar_like from ..tensor.random import random_permutation from ..layers import * from .core import * @@ -30,7 +30,8 @@ class FeatureShufflingFlow(FeatureMappingFlow): def __init__(self, num_features: int, axis: int = -1, - event_ndims: int = 1): + event_ndims: int = 1, + device: Optional[str] = None): """ Construct a new :class:`FeatureShufflingFlow`. @@ -39,13 +40,15 @@ def __init__(self, axis: The feature axis, to apply the transformation. event_ndims: Number of dimensions to be considered as the event dimensions. `x.ndims - event_ndims == log_det.ndims`. + device: The device where to place new tensors and variables. 
""" super().__init__(axis=int(axis), event_ndims=event_ndims, explicitly_invertible=True) self.num_features = num_features # initialize the permutation variable, and the inverse permutation - permutation = random_permutation(num_features, dtype=T.index_dtype) + permutation = random_permutation(num_features, dtype=T.index_dtype, + device=device) inv_permutation = argsort(permutation) # register the permutation as layer parameter, such that it could be @@ -66,7 +69,7 @@ def _transform(self, output = index_select(input, self.permutation, axis=self.axis) output_log_det = input_log_det if compute_log_det and output_log_det is None: - output_log_det = as_tensor_backend(0., dtype=input.dtype) + output_log_det = float_scalar_like(0., input) return output, output_log_det diff --git a/tensorkit/flows/reshape_.py b/tensorkit/flows/reshape_.py index 9f1eec8..f287ec1 100644 --- a/tensorkit/flows/reshape_.py +++ b/tensorkit/flows/reshape_.py @@ -1,6 +1,6 @@ from typing import * -from ..tensor import Tensor, reshape_tail, as_tensor_backend, jit_method +from ..tensor import Tensor, reshape_tail, float_scalar_like, jit_method from ..tensor.nn import * from .core import * @@ -84,7 +84,7 @@ def _transform(self, output_log_det = input_log_det if compute_log_det and output_log_det is None: - output_log_det = as_tensor_backend(0., dtype=input.dtype) + output_log_det = float_scalar_like(0., input) return output, output_log_det @@ -133,7 +133,7 @@ def _transform(self, output_log_det = input_log_det if compute_log_det and output_log_det is None: - output_log_det = as_tensor_backend(0., dtype=input.dtype) + output_log_det = float_scalar_like(0., input) return output, output_log_det diff --git a/tensorkit/flows/split_.py b/tensorkit/flows/split_.py index b055bfa..6cf9111 100644 --- a/tensorkit/flows/split_.py +++ b/tensorkit/flows/split_.py @@ -1,6 +1,7 @@ from typing import * from .. import tensor as T +from ..layers import is_jit_layer from ..tensor import Tensor, Module, split, concat from .core import * @@ -79,13 +80,13 @@ def __init__(self, f'two positive integers: got {y_sections!r}.') y_sections = list(map(int, y_sections)) - if not isinstance(left, Flow) and not T.is_jit_layer(left): + if not isinstance(left, Flow) and not is_jit_layer(left): raise TypeError(f'`left` is not a flow: got {left!r}.') x_event_ndims = left.get_x_event_ndims() y_event_ndims = left.get_y_event_ndims() if right is not None: - if not isinstance(right, Flow) and not T.is_jit_layer(right): + if not isinstance(right, Flow) and not is_jit_layer(right): raise TypeError(f'`right` is not a flow: got {right!r}.') if right.get_x_event_ndims() != x_event_ndims or \ right.get_y_event_ndims() != y_event_ndims: diff --git a/tensorkit/init/std_data_init.py b/tensorkit/init/std_data_init.py index 0527771..2b6429c 100644 --- a/tensorkit/init/std_data_init.py +++ b/tensorkit/init/std_data_init.py @@ -1,12 +1,12 @@ from typing import List from .. 
import tensor as T +from ..backend import Module, Tensor from ..layers import * from .core import * __all__ = ['StdDataInit'] -from ..backend import Module, Tensor class StdDataInit(DataDependentInitializer): @@ -22,7 +22,7 @@ def __init__(self, epsilon: float = T.EPSILON): self.epsilon = epsilon def _init(self, layer: Module, inputs: List[Tensor]) -> None: - if T.is_jit_layer(layer): + if is_jit_layer(layer): raise TypeError(f'JIT compiled layer is not supported: got {layer!r}') if not isinstance(layer, CoreLinear): raise TypeError(f'`layer` is not a core linear layer: got {layer!r}') @@ -62,7 +62,7 @@ def _init(self, layer: Module, inputs: List[Tensor]) -> None: out_std = T.sqrt( T.maximum( out_var, - T.as_tensor_backend(self.epsilon, dtype=out_var.dtype) + T.float_scalar_like(self.epsilon, out_var) ) ) weight_scale = out_std diff --git a/tensorkit/layers/builder.py b/tensorkit/layers/builder.py index 9cae176..240d500 100644 --- a/tensorkit/layers/builder.py +++ b/tensorkit/layers/builder.py @@ -347,16 +347,13 @@ def add(self, self.out_shape = out_shape return self - def build(self, - flatten_to_ndims: bool = True, - disable_jit: bool = False) -> T.Module: + def build(self, flatten_to_ndims: bool = True) -> T.Module: """ Build the sequential layer. Args: flatten_to_ndims: Whether or not to wrap the sequential layer with a :class:`FlattenToNDims` layer? - disable_jit: Whether or not to disable JIT? Returns: The built sequential layer. @@ -370,8 +367,6 @@ def build(self, if flatten_to_ndims: layer = FlattenToNDims(layer, ndims=len(self.in_shape) + 1) - if not disable_jit: - layer = T.jit_compile(layer) return layer # ---- activation ---- diff --git a/tensorkit/layers/composed.py b/tensorkit/layers/composed.py index b17a49a..6a9fde8 100644 --- a/tensorkit/layers/composed.py +++ b/tensorkit/layers/composed.py @@ -61,6 +61,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): # check the arguments if use_bias is None: @@ -76,6 +77,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ), out_features=out_features, out_feature_axis=-1, @@ -105,6 +107,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): spatial_ndims = self._get_spatial_ndims() linear_factory = self._get_linear_factory() @@ -132,6 +135,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ), out_features=out_channels, out_feature_axis=-1 if T.IS_CHANNEL_LAST else -(spatial_ndims + 1), @@ -195,6 +199,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): spatial_ndims = self._get_spatial_ndims() linear_factory = self._get_linear_factory() @@ -225,6 +230,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ), out_features=out_channels, out_feature_axis=-1 if T.IS_CHANNEL_LAST else -(spatial_ndims + 1), diff --git a/tensorkit/layers/flow_layer.py b/tensorkit/layers/flow_layer.py index 0ad547e..11b2941 100644 --- a/tensorkit/layers/flow_layer.py +++ b/tensorkit/layers/flow_layer.py @@ -1,5 +1,5 @@ from ..backend.flows 
import Flow -from ..tensor import Tensor, Module, is_jit_layer +from ..tensor import Tensor, Module from .core import * __all__ = [ diff --git a/tensorkit/layers/pixelcnn.py b/tensorkit/layers/pixelcnn.py index 52a551c..d3c5e2f 100644 --- a/tensorkit/layers/pixelcnn.py +++ b/tensorkit/layers/pixelcnn.py @@ -1,13 +1,13 @@ from functools import partial from typing import * -from . import resnet, core, composed -from .core import * -from .utils import flatten_nested_layers from .. import tensor as T from ..arg_check import * from ..tensor import Tensor, Module, rank, shift, shape, concat, ones_like from ..typing_ import * +from . import resnet, core, composed +from .core import * +from .utils import flatten_nested_layers __all__ = [ 'PixelCNNInput1d', 'PixelCNNInput2d', 'PixelCNNInput3d', @@ -250,7 +250,8 @@ def __init__(self, weight_norm: WeightNormArgType = False, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, - data_init: Optional[DataInitArgType] = None): + data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None): """ Construct a new pixelcnn input layer. @@ -274,6 +275,7 @@ def __init__(self, weight_init: The weight initializer for the convolutional layers. bias_init: The bias initializer for the convolutional layers. data_init: The data-dependent initializer for the convolutional layers. + device: The device where to place new tensors and variables. """ super().__init__() @@ -314,6 +316,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ), SpatialShift(spatial_shift) ) @@ -453,6 +456,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): """ Construct a new PixelCNN resnet block. @@ -490,6 +494,7 @@ def __init__(self, weight_init: The weight initializer for the convolutional layers. bias_init: The bias initializer for the convolutional layers. data_init: The data-dependent initializer for the convolutional layers. + device: The device where to place new tensors and variables. 
""" spatial_ndims = self._get_spatial_ndims() @@ -551,6 +556,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ) ) @@ -618,7 +624,9 @@ def __init__(self, gate_bias: float = DEFAULT_GATE_BIAS, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, - data_init: Optional[DataInitArgType] = None): + data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, + ): spatial_ndims = self._get_spatial_ndims() # validate the arguments @@ -658,6 +666,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ) ) @@ -739,7 +748,9 @@ def __init__(self, gate_bias: float = DEFAULT_GATE_BIAS, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, - data_init: Optional[DataInitArgType] = None): + data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, + ): spatial_ndims = self._get_spatial_ndims() # validate the arguments @@ -781,6 +792,7 @@ def __init__(self, weight_init=weight_init, bias_init=bias_init, data_init=data_init, + device=device, ) ) @@ -865,7 +877,7 @@ def __init__(self, input_cls_name = f'PixelCNNInput{spatial_ndims}d' if not isinstance(input_layer, global_dict[input_cls_name]) and \ - not T.is_jit_layer(input_layer): + not is_jit_layer(input_layer): raise TypeError( f'`input_layer` must be an instance of `{input_cls_name}`: ' f'got {input_layer!r}.' diff --git a/tensorkit/layers/resnet.py b/tensorkit/layers/resnet.py index 9f4d5bd..d2bb078 100644 --- a/tensorkit/layers/resnet.py +++ b/tensorkit/layers/resnet.py @@ -130,6 +130,7 @@ def __init__(self, weight_init: TensorInitArgType = DEFAULT_WEIGHT_INIT, bias_init: TensorInitArgType = DEFAULT_BIAS_INIT, data_init: Optional[DataInitArgType] = None, + device: Optional[str] = None, ): """ Construct a new resnet block. @@ -189,6 +190,7 @@ def __init__(self, weight_init: The weight initializer for the convolutional layers. bias_init: The bias initializer for the convolutional layers. data_init: The data-dependent initializer for the convolutional layers. + device: The device where to place new tensors and variables. 
""" def use_bias_or_else(default_val: bool): if use_bias is None: @@ -204,6 +206,7 @@ def compile_layer_list(layers: List[Module]) -> Module: return Sequential(layers) spatial_ndims = self._get_spatial_ndims() + is_deconv = self._is_deconv() # validate arguments in_channels = int(in_channels) @@ -212,13 +215,11 @@ def compile_layer_list(layers: List[Module]) -> Module: kernel_size = validate_conv_size('kernel_size', kernel_size, spatial_ndims) stride = validate_conv_size('strides', stride, spatial_ndims) dilation = validate_conv_size('dilation', dilation, spatial_ndims) - is_half_padding = padding == PaddingMode.HALF.value padding = validate_padding(padding, kernel_size, dilation, spatial_ndims) - if output_padding != 0 and \ - self._add_output_padding_to_kwargs(output_padding, {}) == {}: + if output_padding != 0 and not is_deconv: raise ValueError(f'The `output_padding` argument is not allowed ' - f'by a {self.__class__.__qualname__} layer.') + f'by {self.__class__.__qualname__}.') output_padding = validate_output_padding( output_padding, stride, dilation, spatial_ndims) @@ -244,7 +245,8 @@ def compile_layer_list(layers: List[Module]) -> Module: if use_shortcut is None: use_shortcut = ( any(s != 1 for s in stride) or - (not is_half_padding and any(k != 1 for k in stride)) or + any(p[0] + p[1] != (k - 1) * d + for p, k, d in zip(padding, kernel_size, dilation)) or in_channels != out_channels) if activation is not None: @@ -270,7 +272,7 @@ def compile_layer_list(layers: List[Module]) -> Module: ) kwargs = {'weight_init': weight_init, 'bias_init': bias_init, - 'data_init': data_init} + 'data_init': data_init, 'device': device} # build the shortcut path if use_shortcut: @@ -392,8 +394,11 @@ def _get_spatial_ndims(self) -> int: def _default_conv_factory(self) -> LayerFactory: raise NotImplementedError() + def _is_deconv(self) -> bool: + raise NotImplementedError() + def _add_output_padding_to_kwargs(self, output_padding, kwargs): - return kwargs + raise NotImplementedError() def forward(self, input: Tensor, @@ -422,7 +427,16 @@ def forward(self, return output -class ResBlock1d(ResBlockNd): +class ResBlockConvNd(ResBlockNd): + + def _add_output_padding_to_kwargs(self, output_padding, kwargs): + return kwargs + + def _is_deconv(self) -> bool: + return False + + +class ResBlock1d(ResBlockConvNd): """1D ResNet convolution block.""" def _get_spatial_ndims(self) -> int: @@ -432,7 +446,7 @@ def _default_conv_factory(self) -> LayerFactory: return LinearConv1d -class ResBlock2d(ResBlockNd): +class ResBlock2d(ResBlockConvNd): """2D ResNet convolution block.""" def _get_spatial_ndims(self) -> int: @@ -442,7 +456,7 @@ def _default_conv_factory(self) -> LayerFactory: return LinearConv2d -class ResBlock3d(ResBlockNd): +class ResBlock3d(ResBlockConvNd): """3D ResNet convolution block.""" def _get_spatial_ndims(self) -> int: @@ -459,6 +473,9 @@ def _add_output_padding_to_kwargs(self, output_padding, kwargs=None): kwargs['output_padding'] = output_padding return kwargs + def _is_deconv(self) -> bool: + return True + class ResBlockTranspose1d(ResBlockTransposeNd): """1D ResNet de-convolution block.""" diff --git a/tensorkit/layers/utils.py b/tensorkit/layers/utils.py index 906cda7..d04b837 100644 --- a/tensorkit/layers/utils.py +++ b/tensorkit/layers/utils.py @@ -1,13 +1,15 @@ +from contextlib import contextmanager from typing import * from ..arg_check import * from ..tensor import Module from ..typing_ import * from .activation import * +from .core import * __all__ = [ 'flatten_nested_layers', 
'get_activation_class', - 'get_deconv_output_padding', + 'get_deconv_output_padding', 'scoped_eval_mode', ] @@ -129,3 +131,22 @@ def f(i, o, k, s, d, p): return [f(*args) for args in zip( input_size, output_size, kernel_size, stride, dilation, padding)] + + +@contextmanager +def scoped_eval_mode(*layer_or_layers: Union[Module, Sequence[Module]]): + """ + Set the layers to evaluation mode when entering the context, and + set to training mode when exiting the context. + + Args: + layer_or_layers: The layer or layers to be set. + """ + layer_or_layers = flatten_nested_layers(layer_or_layers) + try: + for layer in layer_or_layers: + set_eval_mode(layer) + yield + finally: + for layer in layer_or_layers: + set_train_mode(layer) diff --git a/tensorkit/tensor/core.py b/tensorkit/tensor/core.py index 1fab558..7891f94 100644 --- a/tensorkit/tensor/core.py +++ b/tensorkit/tensor/core.py @@ -1,6 +1,4 @@ from ..backend import core from ..backend.core import * -from . import core_extras -from .core_extras import * -__all__ = core.__all__ + core_extras.__all__ +__all__ = core.__all__ diff --git a/tensorkit/tensor/core_extras.py b/tensorkit/tensor/core_extras.py deleted file mode 100644 index a9a2c5b..0000000 --- a/tensorkit/tensor/core_extras.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = [] diff --git a/tensorkit/tensor/random_extras.py b/tensorkit/tensor/random_extras.py index 04d97cd..a222f7d 100644 --- a/tensorkit/tensor/random_extras.py +++ b/tensorkit/tensor/random_extras.py @@ -35,10 +35,11 @@ def truncated_randn(shape: List[int], low: Optional[float] = None, high: Optional[float] = None, dtype: str = float_x(), + device: Optional[str] = None, epsilon: float = EPSILON) -> Tensor: # fast routine: low is None and high is None, use standard randn if low is None and high is None: - return randn(shape, dtype) + return randn(shape, dtype, device) # compute cdf(low) and cdf(high) if low is None: @@ -52,7 +53,7 @@ def truncated_randn(shape: List[int], high_cdf = _unit_normal_cdf_float(high) # sample u ~ uniform(0, 1) - u = rand(shape, dtype) + u = rand(shape, dtype, device) # transform uniform random variable into truncated normal if low_cdf == 0.: @@ -111,17 +112,17 @@ def truncated_randn_log_pdf(given: Tensor, log_pdf = where( logical_and(low <= given, given <= high), log_pdf, - as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + float_scalar_like(log_zero, log_pdf)) elif low is not None: log_pdf = where( low <= given, log_pdf, - as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + float_scalar_like(log_zero, log_pdf)) elif high is not None: log_pdf = where( given <= high, log_pdf, - as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + float_scalar_like(log_zero, log_pdf)) else: log_pdf = log_pdf # do nothing, but JIT requires this branch @@ -145,10 +146,10 @@ def truncated_normal(mean: Tensor, if n_samples is not None: param_shape = [n_samples] + param_shape r = truncated_randn(param_shape, low=low, high=high, dtype=get_dtype(mean), - epsilon=epsilon) + epsilon=epsilon, device=get_device(mean)) r = r * std + mean if not reparameterized: - r = r.detach() + r = stop_grad(r) return r @@ -182,17 +183,17 @@ def truncated_normal_log_pdf(given: Tensor, logical_and((low * std + mean) <= given, given <= (high * std + mean)), log_pdf, - as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + float_scalar_like(log_zero, log_pdf)) elif low is not None: log_pdf = where( (low * std + mean) <= given, log_pdf, - as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + float_scalar_like(log_zero, log_pdf)) elif high is not None: 
log_pdf = where( given <= (high * std + mean), log_pdf, - as_tensor_backend(log_zero, dtype=log_pdf.dtype)) + float_scalar_like(log_zero, log_pdf)) else: log_pdf = log_pdf # do nothing, but JIT requires this branch @@ -244,7 +245,7 @@ def discretized_logistic(mean: Tensor, format(mean_dtype, log_scale_dtype)) u = uniform(shape=sample_shape, low=epsilon, high=1. - epsilon, - dtype=mean_dtype) + dtype=mean_dtype, device=get_device(mean)) # inverse CDF of the logistic inverse_logistic_cdf = log(u) - log1p(-u) @@ -318,7 +319,7 @@ def discretized_logistic_log_prob(given: Tensor, # the middle bins cases: # log(sigmoid(x_high) - sigmoid(x_low)) # middle_bins_pdf = tf.log(cdf_delta + self._epsilon) - epsilon_tensor = as_tensor_backend(epsilon, dtype=cdf_delta.dtype) + epsilon_tensor = float_scalar_like(epsilon, cdf_delta) middle_bins_pdf = log(maximum(cdf_delta, epsilon_tensor)) # # but in extreme cases where `sigmoid(x_high) - sigmoid(x_low)` @@ -328,7 +329,7 @@ def discretized_logistic_log_prob(given: Tensor, # cdf_delta > epsilon_tensor, # # to avoid NaNs pollute the select statement, we have to use # # `maximum(cdf_delta, 1e-12)` - # log(maximum(cdf_delta, as_tensor_backend(1e-12, dtype=cdf_delta.dtype))), + # log(maximum(cdf_delta, float_scalar_like(1e-12, cdf_delta))), # # the alternative form. basically it can be derived by using # # the mean value theorem for integration. # x_mid + log_delta - 2. * softplus(x_mid) @@ -345,7 +346,7 @@ def discretized_logistic_log_prob(given: Tensor, # the left-edge bin case # log(sigmoid(x_high) - sigmoid(-infinity)) - left_edge = as_tensor_backend(min_val + half_bin, dtype=broadcast_given.dtype) + left_edge = float_scalar_like(min_val + half_bin, broadcast_given) left_edge_pdf = -softplus(-x_high) if validate_tensors: left_edge_pdf = assert_finite(left_edge_pdf, 'left_edge_pdf') @@ -358,7 +359,7 @@ def discretized_logistic_log_prob(given: Tensor, # the right-edge bin case # log(sigmoid(infinity) - sigmoid(x_low)) - right_edge = as_tensor_backend(max_val - half_bin, dtype=broadcast_given.dtype) + right_edge = float_scalar_like(max_val - half_bin, broadcast_given) right_edge_pdf = -softplus(x_low) if validate_tensors: right_edge_pdf = assert_finite(right_edge_pdf, 'right_edge_pdf') @@ -376,7 +377,7 @@ def discretized_logistic_log_prob(given: Tensor, logical_and(given >= min_val - half_bin, given <= max_val + half_bin), log_prob, - as_tensor_backend(log_zero, dtype=log_prob.dtype)) + float_scalar_like(log_zero, log_prob)) # now reduce the group_ndims if group_ndims > 0: diff --git a/tensorkit/utils/tensor_stream.py b/tensorkit/utils/tensor_stream.py index e154a93..3b0abee 100644 --- a/tensorkit/utils/tensor_stream.py +++ b/tensorkit/utils/tensor_stream.py @@ -14,8 +14,10 @@ class TensorStream(mltk.DataStream): source: mltk.DataStream + device: str - def __init__(self, source: mltk.DataStream): + def __init__(self, source: mltk.DataStream, device: Optional[str] = None): + device = device or T.current_device() super().__init__( batch_size=source.batch_size, array_count=source.array_count, @@ -24,8 +26,10 @@ def __init__(self, source: mltk.DataStream): random_state=source.random_state, ) self.source = source + self.device = device def copy(self, **kwargs): + kwargs.setdefault('device', self.device) return TensorStream(source=self.source, **kwargs) def _minibatch_iterator(self) -> Generator[ArrayTuple, None, None]: @@ -33,16 +37,20 @@ def _minibatch_iterator(self) -> Generator[ArrayTuple, None, None]: try: for batch_data in g: with T.no_grad(): - batch_data = 
tuple(T.from_numpy(arr) for arr in batch_data) + batch_data = tuple( + T.from_numpy(arr, device=self.device) + for arr in batch_data + ) yield batch_data finally: g.close() def as_tensor_stream(source: mltk.DataStream, + device: Optional[str] = None, prefetch: Optional[int] = None ) -> mltk.DataStream: - stream = TensorStream(source) + stream = TensorStream(source, device=device) if prefetch is not None: stream = stream.threaded(prefetch) return stream diff --git a/tests/distributions/test_flow.py b/tests/distributions/test_flow.py index 179154b..a80e12f 100644 --- a/tests/distributions/test_flow.py +++ b/tests/distributions/test_flow.py @@ -10,7 +10,7 @@ from tensorkit.distributions import Categorical, FlowDistribution, UnitNormal from tensorkit.distributions.utils import copy_distribution from tensorkit.flows import ReshapeFlow, ActNorm -from tensorkit.tensor import Tensor, as_tensor_backend, int_range +from tensorkit.tensor import Tensor, float_scalar_like, int_range from tests.helper import * @@ -31,9 +31,9 @@ def _transform(self, if compute_log_det: if inverse: - output_log_det = as_tensor_backend(-math.log(2.)) + output_log_det = float_scalar_like(-math.log(2.), output) else: - output_log_det = as_tensor_backend(math.log(2.)) + output_log_det = float_scalar_like(math.log(2.), output) for axis in int_range(-event_ndims, 0): output_log_det = output_log_det * output.shape[axis] diff --git a/tests/flows/test_act_norm.py b/tests/flows/test_act_norm.py index c3b2cd5..7e02103 100644 --- a/tests/flows/test_act_norm.py +++ b/tests/flows/test_act_norm.py @@ -24,7 +24,7 @@ def do_check(batch_shape, scale_type, initialized, dtype): ctx.assertIn(f'num_features={num_features}', repr(flow)) ctx.assertIn(f'axis={-(spatial_ndims + 1)}', repr(flow)) ctx.assertIn(f'scale_type={scale_type!r}', repr(flow)) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) # check initialize if not initialized: diff --git a/tests/flows/test_core.py b/tests/flows/test_core.py index 8aeb32c..a7e30b3 100644 --- a/tests/flows/test_core.py +++ b/tests/flows/test_core.py @@ -7,7 +7,7 @@ import tensorkit as tk from tensorkit import tensor as T -from tensorkit.tensor import Tensor, reshape_tail, as_tensor_backend, zeros_like, shape +from tensorkit.tensor import Tensor, reshape_tail, float_scalar_like, zeros_like, shape from tensorkit.tensor.random import randn from tensorkit.flows import * from tests.helper import * @@ -33,7 +33,7 @@ def _transform(self, output_log_det = input_log_det if compute_log_det: - log_2 = as_tensor_backend(math.log(2.), dtype=output.dtype) + log_2 = float_scalar_like(math.log(2.), output) if output_log_det is None: if inverse: output_log_det = -log_2 * input.shape[-2] @@ -92,7 +92,7 @@ def test_invert(self): self.assertIsInstance(inv_flow, InverseFlow) def test_call(self): - flow = T.jit_compile(_MyFlow()) + flow = tk.layers.jit_compile(_MyFlow()) self.assertEqual(flow.get_x_event_ndims(), 1) self.assertEqual(flow.get_y_event_ndims(), 2) self.assertEqual(flow.is_explicitly_invertible(), True) @@ -123,7 +123,7 @@ def test_call(self): _ = flow(expected_y, T.random.randn([2, 4]), inverse=True) # test output_log_det shape error - flow = T.jit_compile(_MyBadFlow()) + flow = tk.layers.jit_compile(_MyBadFlow()) with pytest.raises(Exception, match='The shape of `output_log_det` is not expected'): _ = flow(x) @@ -140,7 +140,7 @@ def test_constructor(self): explicitly_invertible=True) self.assertEqual(flow.get_event_ndims(), 2) self.assertEqual(flow.axis, -1) - flow = T.jit_compile(flow) + flow = 
tk.layers.jit_compile(flow) self.assertEqual(flow.get_axis(), -1) self.assertEqual(flow.get_x_event_ndims(), 2) @@ -163,12 +163,12 @@ def test_constructor(self): class InverseFlowTestCase(unittest.TestCase): def test_InverseFlow(self): - original_flow = T.jit_compile(_MyFlow()) + original_flow = tk.layers.jit_compile(_MyFlow()) flow = InverseFlow(original_flow) self.assertIs(flow.original_flow, original_flow) self.assertIs(flow.invert(), original_flow) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) self.assertEqual(flow.get_x_event_ndims(), 2) self.assertEqual(flow.get_y_event_ndims(), 1) self.assertTrue(flow.is_explicitly_invertible()) @@ -189,7 +189,7 @@ def test_InverseFlow(self): base_flow.explicitly_invertible = False with pytest.raises(TypeError, match='`flow` must be an explicitly invertible flow'): - _ = InverseFlow(T.jit_compile(base_flow)) + _ = InverseFlow(tk.layers.jit_compile(base_flow)) class _MyFlow1(Flow): @@ -211,7 +211,7 @@ def _transform(self, output_log_det = input_log_det if compute_log_det: - log_2 = T.as_tensor_backend(math.log(2.), dtype=output.dtype) + log_2 = T.float_scalar_like(math.log(2.), output) if output_log_det is None: if inverse: output_log_det = -log_2 * input.shape[-1] @@ -229,16 +229,16 @@ def _transform(self, class SequentialFlowTestCase(unittest.TestCase): def test_constructor(self): - flows = [T.jit_compile(_MyFlow1()), T.jit_compile(_MyFlow())] - flow = T.jit_compile(SequentialFlow(flows)) + flows = [tk.layers.jit_compile(_MyFlow1()), tk.layers.jit_compile(_MyFlow())] + flow = tk.layers.jit_compile(SequentialFlow(flows)) self.assertEqual(flow.get_x_event_ndims(), 1) self.assertEqual(flow.get_y_event_ndims(), 2) self.assertTrue(flow.is_explicitly_invertible()) flow2 = _MyFlow() flow2.explicitly_invertible = False - flows = [T.jit_compile(_MyFlow1()), T.jit_compile(flow2)] - flow = T.jit_compile(SequentialFlow(flows)) + flows = [tk.layers.jit_compile(_MyFlow1()), tk.layers.jit_compile(flow2)] + flow = tk.layers.jit_compile(SequentialFlow(flows)) self.assertFalse(flow.is_explicitly_invertible()) with pytest.raises(ValueError, @@ -261,8 +261,8 @@ def test_constructor(self): def test_call(self): # test call and inverse call - flows = [_MyFlow1(), T.jit_compile(_MyFlow1())] - flow = T.jit_compile(SequentialFlow(flows)) + flows = [_MyFlow1(), tk.layers.jit_compile(_MyFlow1())] + flow = tk.layers.jit_compile(SequentialFlow(flows)) x = T.random.randn([2, 3, 4]) expected_y = (x * 2. + 1.) * 2. + 1. 
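A minimal sketch of the relocated JIT helpers exercised by these tests: `jit_compile` and `is_jit_layer` are now reached via `tk.layers` instead of the backend namespace `T`; the layer sizes and input shape below are illustrative only.

import tensorkit as tk
from tensorkit import tensor as T

layer = tk.layers.Linear(5, 3)
layer = tk.layers.jit_compile(layer)      # previously: T.jit_compile(layer)
# when JIT is enabled this is a compiled module; otherwise the layer is returned unchanged
assert tk.layers.is_jit_layer(layer) or tk.settings.disable_jit

x = T.random.randn([2, 5])
y = layer(x)                              # compiled layers are called like ordinary modules
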
@@ -275,7 +275,7 @@ def test_call(self): # test no inverse call flows = [_MyFlow1()] flows[0].explicitly_invertible = False - flow = T.jit_compile(SequentialFlow(flows)) + flow = tk.layers.jit_compile(SequentialFlow(flows)) with pytest.raises(Exception, match='Not an explicitly invertible flow'): @@ -313,7 +313,7 @@ def test_invertible_matrices(self): self.assertEqual(repr(m), f'{cls.__qualname__}(size={n})') self.assertEqual(m.size, n) - m = T.jit_compile(m) + m = tk.layers.jit_compile(m) # check the initial value is an orthogonal matrix matrix, _ = m(inverse=False, compute_log_det=False) @@ -354,7 +354,7 @@ def check_invertible_linear(ctx, # construct the layer flow = invertible_linear_factory(num_features, strict=strict) ctx.assertIn(f'num_features={num_features}', repr(flow)) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) # derive the expected answer weight, log_det = flow.invertible_matrix( @@ -518,7 +518,7 @@ def test_ExpScale(self): x = T.random.randn([2, 3, 4]) scale = ExpScale() - scale = T.jit_compile(scale) + scale = tk.layers.jit_compile(scale) for pre_scale in [T.random.randn([4]), T.random.randn([3, 1]), @@ -541,7 +541,7 @@ def test_SigmoidScale(self): if pre_scale_bias is None: pre_scale_bias = 0. self.assertIn(f'pre_scale_bias={pre_scale_bias}', repr(scale)) - scale = T.jit_compile(scale) + scale = tk.layers.jit_compile(scale) for pre_scale in [T.random.randn([4]), T.random.randn([3, 1]), @@ -558,7 +558,7 @@ def test_LinearScale(self): x = T.random.randn([2, 3, 4]) scale = LinearScale(epsilon=T.EPSILON) self.assertIn('epsilon=', repr(scale)) - scale = T.jit_compile(scale) + scale = tk.layers.jit_compile(scale) for pre_scale in [T.random.randn([4]), T.random.randn([3, 1]), diff --git a/tests/flows/test_coupling.py b/tests/flows/test_coupling.py index aa1b21f..eddd9a2 100644 --- a/tests/flows/test_coupling.py +++ b/tests/flows/test_coupling.py @@ -1,5 +1,4 @@ import unittest -from itertools import product import pytest @@ -19,8 +18,8 @@ def check_coupling_layer(ctx, sigmoid_scale_bias = 1.5 n1, n2 = (num_features // 2), (num_features - num_features // 2) - shift_and_pre_scale_1 = T.jit_compile(shift_and_pre_scale_factory(n1, n2)) - shift_and_pre_scale_2 = T.jit_compile(shift_and_pre_scale_factory(n2, n1)) + shift_and_pre_scale_1 = tk.layers.jit_compile(shift_and_pre_scale_factory(n1, n2)) + shift_and_pre_scale_2 = tk.layers.jit_compile(shift_and_pre_scale_factory(n2, n1)) def do_check(secondary, scale_type): x = T.random.randn(make_conv_shape( @@ -35,7 +34,7 @@ def do_check(secondary, scale_type): sigmoid_scale_bias=sigmoid_scale_bias ) ctx.assertIn(f'secondary={secondary}', repr(flow)) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) # obtain the expected output channel_axis = get_channel_axis(spatial_ndims) @@ -51,7 +50,7 @@ def do_check(secondary, scale_type): scale = SigmoidScale(pre_scale_bias=sigmoid_scale_bias) elif scale_type == 'linear' or scale_type is LinearScale: scale = LinearScale() - elif isinstance(scale_type, Scale) or T.is_jit_layer(scale_type): + elif isinstance(scale_type, Scale) or tk.layers.is_jit_layer(scale_type): scale = scale_type else: raise ValueError(f'Invalid value for `scale`: {scale_type}') @@ -72,7 +71,7 @@ def do_check(secondary, scale_type): do_check(secondary, 'exp') for scale_type in ('exp', 'sigmoid', 'linear', - SigmoidScale, T.jit_compile(LinearScale())): + SigmoidScale, tk.layers.jit_compile(LinearScale())): do_check(False, scale_type) # test error constructors diff --git 
a/tests/flows/test_rearrangement.py b/tests/flows/test_rearrangement.py index 1c576d2..451c9af 100644 --- a/tests/flows/test_rearrangement.py +++ b/tests/flows/test_rearrangement.py @@ -23,7 +23,7 @@ def check_shuffling_flow(ctx, inv_permutation = tk.layers.get_parameter(flow, 'inv_permutation') assert_equal(T.argsort(permutation), inv_permutation) assert_equal(T.argsort(inv_permutation), permutation) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) # prepare for the answer x = T.random.randn(shape) diff --git a/tests/flows/test_shape_.py b/tests/flows/test_shape_.py index b972c73..f15ad38 100644 --- a/tests/flows/test_shape_.py +++ b/tests/flows/test_shape_.py @@ -20,7 +20,7 @@ def test_ReshapeFlow(self): self.assertEqual(flow.get_y_event_ndims(), 1) self.assertIn('x_event_shape=[4, -1]', repr(flow)) self.assertIn('y_event_shape=[-1]', repr(flow)) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) x = T.random.randn([2, 3, 4, 5]) expected_y = T.reshape_tail(x, 2, [-1]) diff --git a/tests/flows/test_split_.py b/tests/flows/test_split_.py index 8250a6c..67b707f 100644 --- a/tests/flows/test_split_.py +++ b/tests/flows/test_split_.py @@ -33,7 +33,7 @@ def check_split_flow(ctx, ctx.assertIn(f'y_sections={y_sections}', repr(flow)) ctx.assertIn(f'x_axis={x_axis}', repr(flow)) ctx.assertIn(f'y_axis={y_axis}', repr(flow)) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) x1, x2 = T.split(x, x_sections, axis=x_axis) y1, expected_log_det = left(x1, compute_log_det=True) @@ -45,7 +45,7 @@ def check_split_flow(ctx, # with right flow = cls(x_sections, left, right, **kwargs) - flow = T.jit_compile(flow) + flow = tk.layers.jit_compile(flow) x1, x2 = T.split(x, x_sections, axis=x_axis) y1, expected_log_det = left(x1, compute_log_det=True) @@ -91,8 +91,8 @@ def test_SplitFlow(self): T.random.seed(1234) # x and y with the same event ndims - left = T.jit_compile(InvertibleDense(2)) - right = T.jit_compile(InvertibleDense(3)) + left = tk.layers.jit_compile(InvertibleDense(2)) + right = tk.layers.jit_compile(InvertibleDense(3)) check_split_flow( ctx=self, @@ -159,8 +159,8 @@ def test_SplitFlowNd(self): cls = getattr(tk.flows, f'SplitFlow{spatial_ndims}d') sub_cls = getattr(tk.flows, f'InvertibleConv{spatial_ndims}d') - left = T.jit_compile(sub_cls(2)) - right = T.jit_compile(sub_cls(3)) + left = tk.layers.jit_compile(sub_cls(2)) + right = tk.layers.jit_compile(sub_cls(3)) check_split_flow( ctx=self, diff --git a/tests/init/test_core.py b/tests/init/test_core.py index 0039cdf..74774d0 100644 --- a/tests/init/test_core.py +++ b/tests/init/test_core.py @@ -377,7 +377,7 @@ def test_data_dependent_initializer(self): # also `set_initialized` will affect layers with `set_initialized()` # method, e.g., `ActNorm` x = T.random.randn([2, 3, 5]) - layer = T.jit_compile(tk.layers.ActNorm(5)) + layer = tk.layers.jit_compile(tk.layers.ActNorm(5)) self.assertFalse(layer.flow.initialized) tk.init.set_initialized(layer) diff --git a/tests/init/test_std_data_init.py b/tests/init/test_std_data_init.py index 2802590..349720e 100644 --- a/tests/init/test_std_data_init.py +++ b/tests/init/test_std_data_init.py @@ -50,7 +50,7 @@ def check_x(layer): if not tk.settings.disable_jit: with pytest.raises(TypeError, match='JIT compiled layer is not supported'): - layer = T.jit_compile(tk.layers.Linear(5, 3)) + layer = tk.layers.jit_compile(tk.layers.Linear(5, 3)) tk.init.StdDataInit()(layer, [T.random.randn([3, 5])]) with pytest.raises(TypeError, match='`layer` is not a core linear layer'): 
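A small sketch of the initialization order implied by the test above: `StdDataInit` must see the plain core linear layer, so JIT compilation happens only afterwards (the batch shape is illustrative).

import tensorkit as tk
from tensorkit import tensor as T

layer = tk.layers.Linear(5, 3)
# initialize weight/bias from a data batch while the layer is still uncompiled;
# applying this to a jit-compiled layer raises TypeError, as the test checks
tk.init.StdDataInit()(layer, [T.random.randn([64, 5])])
layer = tk.layers.jit_compile(layer)      # compile only after data-dependent init
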
diff --git a/tests/layers/test_composed.py b/tests/layers/test_composed.py index 750da72..fb4721b 100644 --- a/tests/layers/test_composed.py +++ b/tests/layers/test_composed.py @@ -27,7 +27,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ctx.assertIsInstance(layer[0], linear_cls) ctx.assertEqual(layer[0].use_bias, expected_use_bias) assert_allclose( - T.jit_compile(layer)(input), + tk.layers.jit_compile(layer)(input), linear(input) ) @@ -52,7 +52,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ctx.assertEqual(layer[0].use_bias, expected_use_bias) ctx.assertIsInstance(layer[1], normalizer_cls) assert_allclose( - T.jit_compile(layer)(input), + tk.layers.jit_compile(layer)(input), normalizer(linear(input)), ) @@ -72,7 +72,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ctx.assertIsInstance(layer[0], linear_cls) ctx.assertIsInstance(layer[1], tk.layers.Tanh) assert_allclose( - T.jit_compile(layer)(input), + tk.layers.jit_compile(layer)(input), activation_cls()(linear(input)), ) @@ -90,7 +90,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ctx.assertIsInstance(layer[0], linear_cls) out = linear(input) assert_allclose( - T.jit_compile(layer)(input), + tk.layers.jit_compile(layer)(input), T.nn.sigmoid(out[:, out_features:] + 2.0) * out[:, :out_features], ) @@ -109,7 +109,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ctx.assertIsInstance(layer[0], linear_cls) out = linear(input) assert_allclose( - T.jit_compile(layer)(input), + tk.layers.jit_compile(layer)(input), (T.nn.sigmoid(out[:, out_features:] + 2.0) * activation(out[:, :out_features])), ) @@ -131,7 +131,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ctx.assertIsInstance(layer[0], linear_cls) out = normalizer(linear(input)) assert_allclose( - T.jit_compile(layer)(input), + tk.layers.jit_compile(layer)(input), (T.nn.sigmoid(out[:, out_features:] + 2.0) * activation(out[:, :out_features])), ) diff --git a/tests/layers/test_contextual.py b/tests/layers/test_contextual.py index 7dd093b..7842fc1 100644 --- a/tests/layers/test_contextual.py +++ b/tests/layers/test_contextual.py @@ -11,7 +11,7 @@ def test_IgnoreContext(self): x = T.random.randn([2, 3, 4]) context = [T.random.randn([2, 3, 4]), T.random.randn([2, 3, 4])] - layer = T.jit_compile(tk.layers.IgnoreContext()) + layer = tk.layers.jit_compile(tk.layers.IgnoreContext()) assert_equal(layer(x), x) assert_equal(layer(x, context), x) @@ -19,7 +19,7 @@ def test_AddContext(self): x = T.random.randn([2, 3, 4]) context = [T.random.randn([2, 3, 4]), T.random.randn([2, 3, 4])] - layer = T.jit_compile(tk.layers.AddContext()) + layer = tk.layers.jit_compile(tk.layers.AddContext()) assert_equal(layer(x), x) assert_equal(layer(x, context), x + context[0] + context[1]) @@ -27,6 +27,6 @@ def test_MultiplyContext(self): x = T.random.randn([2, 3, 4]) context = [T.random.randn([2, 3, 4]), T.random.randn([2, 3, 4])] - layer = T.jit_compile(tk.layers.MultiplyContext()) + layer = tk.layers.jit_compile(tk.layers.MultiplyContext()) assert_equal(layer(x), x) assert_equal(layer(x, context), x * context[0] * context[1]) diff --git a/tests/layers/test_core.py b/tests/layers/test_core.py index 99a84c9..3237d8f 100644 --- a/tests/layers/test_core.py +++ b/tests/layers/test_core.py @@ -174,7 +174,7 @@ def test_get_bias_store(self): class IdentityTestCase(unittest.TestCase): def test_identity(self): - layer = 
T.jit_compile(Identity()) + layer = tk.layers.jit_compile(Identity()) x = T.random.randn([2, 3, 4]) assert_equal(x, layer(x)) @@ -234,7 +234,7 @@ class _AutoRepr(BaseLayer): class BaseLayersTestCase(unittest.TestCase): def test_single_variate_layer(self): - layer = T.jit_compile(_MySingleVariateLayer()) + layer = tk.layers.jit_compile(_MySingleVariateLayer()) x = T.random.randn([2, 3, 4]) np_offset = T.from_numpy(np.array([0., 1., 2., 3.])) assert_allclose(layer(x), x * 11. + np_offset) @@ -242,7 +242,7 @@ def test_single_variate_layer(self): assert_allclose(layer(x), x * 11. + 7. + np_offset) def test_multi_variate_layer(self): - layer = T.jit_compile(_MyMultiVariateLayer()) + layer = tk.layers.jit_compile(_MyMultiVariateLayer()) x = T.random.randn([2, 3, 4]) y = T.random.randn([2, 3, 4]) z = T.random.randn([2, 3, 4]) @@ -251,7 +251,7 @@ def test_multi_variate_layer(self): assert_allclose(b, y + z) def test_split_layer(self): - layer = T.jit_compile(_MySplitLayer()) + layer = tk.layers.jit_compile(_MySplitLayer()) x = T.random.randn([2, 3, 4]) a, b, c = layer(x) assert_allclose(a, x) @@ -259,7 +259,7 @@ def test_split_layer(self): assert_allclose(c, x + 2) def test_merge_layer(self): - layer = T.jit_compile(_MyMergeLayer()) + layer = tk.layers.jit_compile(_MyMergeLayer()) x = T.random.randn([2, 3, 4]) y = T.random.randn([2, 3, 4]) z = T.random.randn([2, 3, 4]) @@ -287,7 +287,7 @@ def test_sequential(self): s = Sequential(layers[0], layers[1:2], [layers[2], [layers[3], layers[4]]]) self.assertEqual(list(s), layers) - y = T.jit_compile(s)(x) + y = tk.layers.jit_compile(s)(x) y2 = x for layer in layers: @@ -305,7 +305,7 @@ def check_core_linear(ctx, input, layer_factory, layer_name, numpy_fn): ctx.assertIsInstance(layer.weight_store, SimpleParamStore) weight = T.to_numpy(layer.weight_store()) bias = T.to_numpy(layer.bias_store()) - assert_allclose(T.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), + assert_allclose(tk.layers.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), numpy_fn(input, weight=weight, bias=bias), rtol=1e-4, atol=1e-6) ctx.assertNotIn('use_bias=', repr(layer)) @@ -314,7 +314,7 @@ def check_core_linear(ctx, input, layer_factory, layer_name, numpy_fn): layer = layer_factory(use_bias=False) ctx.assertIsInstance(layer.weight_store, SimpleParamStore) weight = T.to_numpy(layer.weight_store()) - assert_allclose(T.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), + assert_allclose(tk.layers.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), numpy_fn(input, weight=weight, bias=None), rtol=1e-4, atol=1e-6) ctx.assertIn('use_bias=False', repr(layer)) @@ -325,7 +325,7 @@ def check_core_linear(ctx, input, layer_factory, layer_name, numpy_fn): ctx.assertIsInstance(layer.weight_store, NormedAndScaledWeightStore, msg=f'weight_norm={wn}') weight = T.to_numpy(layer.weight_store()) - assert_allclose(T.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), + assert_allclose(tk.layers.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), numpy_fn(input, weight=weight, bias=None), rtol=1e-4, atol=1e-6) @@ -334,7 +334,7 @@ def check_core_linear(ctx, input, layer_factory, layer_name, numpy_fn): ctx.assertIsInstance(layer.weight_store, NormedWeightStore, msg=f'weight_norm={wn}') weight = T.to_numpy(layer.weight_store()) - assert_allclose(T.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), + assert_allclose(tk.layers.jit_compile(layer)(T.as_tensor(input, dtype=T.float_x())), numpy_fn(input, weight=weight, bias=None), rtol=1e-4, atol=1e-6) @@ 
-503,7 +503,7 @@ def test_batch_norm(self): else f'BatchNorm{spatial_ndims}d')) layer = cls(5, momentum=0.1, epsilon=eps) self.assertIn('BatchNorm', repr(layer)) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) # layer output x = T.random.randn(make_conv_shape( @@ -551,7 +551,7 @@ def test_dropout(self): layer = cls(p=0.3) self.assertIn('p=0.3', repr(layer)) self.assertIn('Dropout', repr(layer)) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) x = 1. + T.random.rand( make_conv_shape([1], n_samples, [2, 2, 2][:spatial_ndims]) diff --git a/tests/layers/test_flow_layer.py b/tests/layers/test_flow_layer.py index bb0e654..4d3b06f 100644 --- a/tests/layers/test_flow_layer.py +++ b/tests/layers/test_flow_layer.py @@ -30,9 +30,9 @@ def _transform(self, class FlowLayerTestCase(unittest.TestCase): def test_FlowLayer(self): - flow = T.jit_compile(_MyFlow( + flow = tk.layers.jit_compile(_MyFlow( x_event_ndims=0, y_event_ndims=0, explicitly_invertible=True)) - layer = T.jit_compile(tk.layers.FlowLayer(flow)) + layer = tk.layers.jit_compile(tk.layers.FlowLayer(flow)) x = T.random.randn([3, 4, 5]) assert_allclose(layer(x), x * 2.) @@ -53,7 +53,7 @@ def test_ActNorm(self): _ = layer(T.random.randn([3, 4, 5])) # check call - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) x = T.random.randn([3, 4, 5]) assert_allclose(layer(x), flow(x)[0]) @@ -72,6 +72,6 @@ def test_ActNormNd(self): _ = layer(T.random.randn(shape)) # check call - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) x = T.random.randn(shape) assert_allclose(layer(x), flow(x)[0]) diff --git a/tests/layers/test_gated.py b/tests/layers/test_gated.py index 89ce686..1f91d24 100644 --- a/tests/layers/test_gated.py +++ b/tests/layers/test_gated.py @@ -16,7 +16,7 @@ def test_Gated(self): 'feature_axis=-2, num_features=3, gate_bias=1.5', repr(gated) ) - gated = T.jit_compile(gated) + gated = tk.layers.jit_compile(gated) x = T.random.randn([6, 5]) assert_allclose(gated(x), x[:3, ...] * T.nn.sigmoid(x[3:, ...] 
+ 1.5)) @@ -37,7 +37,7 @@ def test_GatedWithActivation(self): 'feature_axis=-2, num_features=3, gate_bias=1.5', repr(gated) ) - gated = T.jit_compile(gated) + gated = tk.layers.jit_compile(gated) x = T.random.randn([6, 5]) assert_allclose( diff --git a/tests/layers/test_pixelcnn.py b/tests/layers/test_pixelcnn.py index 59ede6f..ac2cc03 100644 --- a/tests/layers/test_pixelcnn.py +++ b/tests/layers/test_pixelcnn.py @@ -129,7 +129,7 @@ def test_causality_and_receptive_field(self): 1, 1, kernel_size=kernel_size, add_ones_channel=False, weight_init=tk.init.ones, ) - input_layer = T.jit_compile(input_layer) + input_layer = tk.layers.jit_compile(input_layer) with pytest.raises(Exception, match='`input` is expected to be .*d'): @@ -157,7 +157,7 @@ def test_causality_and_receptive_field(self): resblock_layer = resblock_layer_cls( 1, 1, kernel_size=kernel_size, weight_init=tk.init.ones ) - resblock_layer = T.jit_compile(resblock_layer) + resblock_layer = tk.layers.jit_compile(resblock_layer) with pytest.raises(Exception): _ = resblock_layer([T.zeros([])] * (spatial_ndims - 1)) @@ -167,7 +167,7 @@ def test_causality_and_receptive_field(self): # the down-sampling and up-sampling layer down_sample_cls = getattr(tk.layers, f'PixelCNNConv{spatial_ndims}d') down_sample_layer = down_sample_cls(1, 1, kernel_size, stride=2) - down_sample_layer = T.jit_compile(down_sample_layer) + down_sample_layer = tk.layers.jit_compile(down_sample_layer) down_sample_output_size = T.shape(down_sample_layer( [T.zeros(make_conv_shape([1], 1, size))] * spatial_ndims)[0]) @@ -183,13 +183,13 @@ def test_causality_and_receptive_field(self): padding='half', # sum of the both sides == (kernel_size - 1) * dilation ) ) - up_sample_layer = T.jit_compile(up_sample_layer) + up_sample_layer = tk.layers.jit_compile(up_sample_layer) # the output layer output_layer_cls = getattr( tk.layers, f'PixelCNNOutput{spatial_ndims}d') output_layer = output_layer_cls() - output_layer = T.jit_compile(output_layer) + output_layer = tk.layers.jit_compile(output_layer) with pytest.raises(Exception, match=r'`len\(inputs\)` is expected to be .*'): @@ -256,7 +256,7 @@ def test_pixelcnn_network(self): tk.layers, f'PixelCNNInput{spatial_ndims}d') input_layer = input_layer_cls( in_channels, out_channels, kernel_size=kernel_size) - input_layer = T.jit_compile(input_layer) + input_layer = tk.layers.jit_compile(input_layer) # the pixelcnn layers resblock_layer_cls = getattr( @@ -294,7 +294,7 @@ def test_pixelcnn_network(self): data_init=tk.init.StdDataInit, ), ] - pixelcnn_layers = [T.jit_compile(l) for l in pixelcnn_layers] + pixelcnn_layers = [tk.layers.jit_compile(l) for l in pixelcnn_layers] # the pixelcnn network network_cls = getattr(tk.layers, f'PixelCNN{spatial_ndims}d') diff --git a/tests/layers/test_pool.py b/tests/layers/test_pool.py index ac29e8d..aa8e499 100644 --- a/tests/layers/test_pool.py +++ b/tests/layers/test_pool.py @@ -83,7 +83,7 @@ def is_valid_padding(padding, kernel_size): f'padding={padding})' ) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), fn(x, kernel_size=kernel_size, stride=stride, @@ -108,7 +108,7 @@ def fn(arr): f'GlobalAvgPool{spatial_ndims}d(keepdims={keepdims})' ) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) x = T.random.randn(make_conv_shape([4, 5], 6, [7, 8, 9][:spatial_ndims])) assert_allclose(layer(x), fn(T.to_numpy(x)), rtol=1e-4, atol=1e-6) diff --git a/tests/layers/test_resnet.py b/tests/layers/test_resnet.py index 79742c2..e579478 100644 --- 
a/tests/layers/test_resnet.py +++ b/tests/layers/test_resnet.py @@ -51,7 +51,7 @@ def check_resblock(ctx, ctx.assertFalse(layer.conv0.use_bias) ctx.assertFalse(layer.conv1.use_bias) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), x + layer.conv1(layer.conv0(x)), @@ -71,7 +71,7 @@ def check_resblock(ctx, ctx.assertIsNotNone(layer.conv0.bias_store) ctx.assertIsNotNone(layer.conv1.bias_store) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), layer.shortcut(x) + layer.conv1(layer.conv0(x)), @@ -122,7 +122,7 @@ def check_resblock(ctx, ctx.assertEqual(layer.conv1.dilation, dilation) ctx.assertEqual(layer.conv1.out_channels, 4) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose(layer(x), layer.shortcut(x) + layer.conv1(layer.conv0(x))) # test resize_at_exit = True @@ -141,7 +141,7 @@ def check_resblock(ctx, ctx.assertEqual(layer.conv1.padding, padding) ctx.assertEqual(layer.conv1.out_channels, 4) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), layer.shortcut(x) + layer.conv1(layer.conv0(x)), @@ -168,7 +168,7 @@ def check_resblock(ctx, ctx.assertIsInstance(layer.pre_conv1[1], tk.layers.LeakyReLU) ctx.assertEqual(len(layer.pre_conv1), 2) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), (layer.shortcut(x) + @@ -200,7 +200,7 @@ def check_resblock(ctx, _ = layer(x) tk.layers.set_train_mode(layer, False) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), (layer.shortcut(x) + @@ -220,7 +220,7 @@ def check_resblock(ctx, ctx.assertIsInstance(layer.post_conv1, tk.layers.Gated) ctx.assertEqual(layer.post_conv1.gate_bias, 1.5) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), (layer.shortcut(x) + layer.post_conv1( @@ -242,7 +242,7 @@ def check_resblock(ctx, ctx_shape = make_conv_shape([3], 5, [1] * spatial_ndims) context = [T.random.randn(ctx_shape), T.random.randn(ctx_shape)] - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x, context), (layer.shortcut(x) + @@ -265,7 +265,7 @@ def check_resblock(ctx, ctx.assertIs(layer.conv0, conv0) ctx.assertIs(layer.conv1, conv1) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_allclose( layer(x), layer.shortcut(x) + layer.conv1(layer.conv0(x)), diff --git a/tests/layers/test_shape_.py b/tests/layers/test_shape_.py index 1636ffa..6f0bbbf 100644 --- a/tests/layers/test_shape_.py +++ b/tests/layers/test_shape_.py @@ -14,7 +14,7 @@ def test_FlattenToNDims(self): x = T.random.randn(make_conv_shape([3, 4], 6, [5])) internal = tk.layers.LinearConv1d(6, 7, kernel_size=1) - layer = T.jit_compile(tk.layers.FlattenToNDims(internal, 3)) + layer = tk.layers.jit_compile(tk.layers.FlattenToNDims(internal, 3)) xx, front_shape = T.flatten_to_ndims(x, 3) assert_equal(layer(x), T.unflatten_from_ndims(internal(xx), front_shape)) @@ -33,7 +33,7 @@ def test_ConstantPad(self): repr(layer), f'ConstantPad(padding=[(1, 1), (2, 3), (0, 5)], value={value})' ) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) x = T.random.randn([3, 4, 5]) assert_equal(layer(x), T.pad(x, [(1, 1), (2, 3), (0, 5)], value=value)) @@ -76,7 +76,7 @@ def fn(v): repr(layer), f'ConstantPad{spatial_ndims}d(padding={padding}, value={value})' ) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) 
assert_equal( layer(x), spatial_pad(x, [pad_arg] * spatial_ndims) @@ -90,7 +90,7 @@ def fn(v): repr(layer), f'ConstantPad{spatial_ndims}d(padding={padding}, value={value})' ) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_equal(layer(x), spatial_pad(x, padding)) # error padding argument @@ -118,7 +118,7 @@ def test_channel_last_to_first(self): fn = getattr(T.nn, f'channel_last_to_first{spatial_ndims}d') x = T.random.randn([3, 4, 5, 6, 7][:spatial_ndims + 2]) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_equal(layer(x), fn(x)) def test_channel_first_to_last(self): @@ -131,5 +131,5 @@ def test_channel_first_to_last(self): fn = getattr(T.nn, f'channel_first_to_last{spatial_ndims}d') x = T.random.randn([3, 4, 5, 6, 7][:spatial_ndims + 2]) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) assert_equal(layer(x), fn(x)) diff --git a/tests/layers/test_split_.py b/tests/layers/test_split_.py index 0c5f095..141cfb3 100644 --- a/tests/layers/test_split_.py +++ b/tests/layers/test_split_.py @@ -17,7 +17,7 @@ def test_branch(self): for k in range(len(branches) + 1): # without shared module layer = tk.layers.Branch(branches[:k]) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) out = layer(x) self.assertIsInstance(out, list) @@ -28,7 +28,7 @@ def test_branch(self): # with shared module layer = tk.layers.Branch(branches[:k], shared=shared) - layer = T.jit_compile(layer) + layer = tk.layers.jit_compile(layer) out = layer(x) self.assertIsInstance(out, list) diff --git a/tests/tensor/test_core.py b/tests/tensor/test_core.py index f5d5124..9fb536f 100644 --- a/tests/tensor/test_core.py +++ b/tests/tensor/test_core.py @@ -23,16 +23,16 @@ def test_backend_info(self): def test_jit_compile(self): # test compile layer layer = tk.layers.Linear(5, 3) - layer2 = T.jit_compile(layer) + layer2 = tk.layers.jit_compile(layer) if not tk.settings.disable_jit: - self.assertTrue(T.is_jit_layer(layer2)) + self.assertTrue(tk.layers.is_jit_layer(layer2)) else: - self.assertFalse(T.is_jit_layer(layer2)) + self.assertFalse(tk.layers.is_jit_layer(layer2)) # not supported object with pytest.raises(TypeError, match='Not supported by `jit_compile`'): - _ = T.jit_compile(object()) + _ = tk.layers.jit_compile(object()) def test_utilities(self): self.assertEqual(T.int_range(0, 10), list(range(10))) @@ -50,13 +50,13 @@ def test_dtypes(self): self.assertIsInstance(dtype, str) self.assertFalse(T.is_floating_point(T.as_tensor(0, dtype=dtype))) self.assertFalse(T.is_floating_point_dtype(dtype)) - self.assertEqual(T.get_dtype(T.cast(T.as_tensor_backend(x), dtype)), dtype) + self.assertEqual(T.get_dtype(T.cast(T.as_tensor(x), dtype)), dtype) for dtype in [T.float16, T.float32, T.float64]: self.assertIsInstance(dtype, str) self.assertTrue(T.is_floating_point(T.as_tensor(0, dtype=dtype))) self.assertTrue(T.is_floating_point_dtype(dtype)) - self.assertEqual(T.get_dtype(T.cast(T.as_tensor_backend(x), dtype)), dtype) + self.assertEqual(T.get_dtype(T.cast(T.as_tensor(x), dtype)), dtype) # floatx self.assertEqual(settings.float_x, 'float32') @@ -68,7 +68,7 @@ def test_dtypes(self): settings.float_x = 'float32' # as_tensor - t = T.as_tensor_backend(x) + t = T.as_tensor(x) self.assertIsInstance(t, T.Tensor) assert_equal(t, x) @@ -80,27 +80,28 @@ def test_dtypes(self): assert_equal(t2, x) # cast_like - for dtype_as in (t, t2): - t3 = T.cast_like(t, dtype_as) + for like in (t, t2): + t3 = T.cast_like(t, like) self.assertIsInstance(t3, T.Tensor) - 
self.assertEqual(T.get_dtype(t3), T.get_dtype(dtype_as)) + self.assertEqual(T.get_dtype(t3), T.get_dtype(like)) + self.assertEqual(T.get_device(t3), T.get_device(like)) assert_equal(t3, x) def test_tensor_constructors(self): np.random.seed(1234) - # as_tensor_backend - for x in [1., 1, [1., 2., 3.], np.array([1., 2., 3.])]: - t = T.as_tensor_backend(x) - self.assertIsInstance(t, T.Tensor) - assert_equal(t, x) - - x = T.as_tensor_backend(np.asarray([1, 2, 3], dtype=np.int32)) - t = T.as_tensor_backend(x) - self.assertIs(t, x) - - with pytest.raises(Exception): - _ = T.as_tensor_backend(object()) # not a tensor, should raise error + # # as_tensor + # for x in [1., 1, [1., 2., 3.], np.array([1., 2., 3.])]: + # t = T.as_tensor(x) + # self.assertIsInstance(t, T.Tensor) + # assert_equal(t, x) + # + # x = T.as_tensor(np.asarray([1, 2, 3], dtype=np.int32)) + # t = T.as_tensor(x) + # self.assertIs(t, x) + # + # with pytest.raises(Exception): + # _ = T.as_tensor(object()) # not a tensor, should raise error # as_tensor def copy_tensor(o): @@ -265,16 +266,16 @@ def copy_tensor(o): I = np.eye(n_classes) x = np.random.randint(0, n_classes, size=shape) - t = T.one_hot(T.as_tensor_backend(x), n_classes) + t = T.one_hot(T.as_tensor(x), n_classes) assert_equal(t, I[x]) for dtype in number_dtypes: - t = T.one_hot(T.as_tensor_backend(x), n_classes, dtype=dtype) + t = T.one_hot(T.as_tensor(x), n_classes, dtype=dtype) self.assertEqual(T.get_dtype(t), dtype) assert_equal(t, I[x]) for axis in range(-(len(shape) + 1), len(shape) + 1): - t = T.one_hot(T.as_tensor_backend(x), n_classes, axis=axis) + t = T.one_hot(T.as_tensor(x), n_classes, axis=axis) expected_t = list(range(0, len(shape))) if axis < 0: expected_t.insert(len(expected_t) + axis + 1, -1) @@ -285,11 +286,11 @@ def copy_tensor(o): for axis in [-(len(shape) + 2), len(shape) + 1]: with pytest.raises(Exception, match='`axis` out of range'): - _ = T.one_hot(T.as_tensor_backend(x), n_classes, axis=axis) + _ = T.one_hot(T.as_tensor(x), n_classes, axis=axis) def test_to_numpy(self): x = np.random.randn(2, 3, 4) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) out = T.to_numpy(t) self.assertIsInstance(out, np.ndarray) assert_equal(out, x) @@ -298,7 +299,7 @@ def test_to_numpy(self): _ = T.to_numpy(object()) x = np.asarray([True, False]) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) out = T.to_numpy(t) self.assertIsInstance(out, np.ndarray) self.assertEqual(out.dtype, np.bool) @@ -419,7 +420,7 @@ def test_assignment(self): def test_shape_utils(self): # test shape x = np.random.randn(2, 3, 4) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) s = T.shape(t) self.assertEqual(s, [2, 3, 4]) @@ -436,7 +437,7 @@ def test_shape_utils(self): # test repeat x = np.random.randn(2, 1, 3) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) t2 = T.repeat(t, []) self.assertEqual(T.shape(t2), [2, 1, 3]) @@ -465,9 +466,9 @@ def test_shape_utils(self): # test squeeze x = np.random.randn(1, 2, 1, 3, 1, 4, 1) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) - t2 = T.squeeze(T.as_tensor_backend(x)) + t2 = T.squeeze(T.as_tensor(x)) s2 = [2, 3, 4] self.assertEqual(T.shape(t2), s2) assert_equal(t2, x.reshape(s2)) @@ -487,7 +488,7 @@ def test_shape_utils(self): # test expand dim x = np.random.randn(2, 3) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) t2 = T.expand_dim(t, -1) s2 = [2, 3, 1] @@ -551,7 +552,7 @@ def test_shape_utils(self): # test broadcast_to x = np.random.randn(1, 2, 1) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) t2 = T.broadcast_to(t, [4, 5, 2, 1]) 
self.assertEqual(T.shape(t2), [4, 5, 2, 1]) @@ -578,7 +579,7 @@ def explicit_broadcast(x, y): def check_explicit_broadcast(shape1, shape2): x = np.asarray(np.random.randn(*shape1)) y = np.asarray(np.random.randn(*shape2)) - out1, out2 = T.explicit_broadcast(T.as_tensor_backend(x), T.as_tensor_backend(y)) + out1, out2 = T.explicit_broadcast(T.as_tensor(x), T.as_tensor(y)) out1 = T.to_numpy(out1) out2 = T.to_numpy(out2) ans1, ans2 = explicit_broadcast(x, y) @@ -623,7 +624,7 @@ def run_check(x, k): with pytest.raises(Exception, match='`ndims` must be at least 1'): - _ = T.flatten_to_ndims(T.as_tensor_backend([0.]), 0) + _ = T.flatten_to_ndims(T.as_tensor([0.]), 0) with pytest.raises(Exception, match=r'rank\(x\) < ndims'): _ = T.flatten_to_ndims(T.zeros([3, 4]), 3) @@ -634,7 +635,7 @@ def run_check(x, k): with pytest.raises(Exception, match=r'Invalid input: rank\(x\) < 1, but ' r'front_shape is not None'): - t = T.as_tensor_backend(123) + t = T.as_tensor(123) _ = T.unflatten_from_ndims(t, [2, 3]) # test reshape_tail @@ -667,46 +668,46 @@ def run_check(x, k): def test_index_select_and_others(self): # index_select x = np.random.randn(3, 4, 5) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) assert_equal( - T.index_select(t, T.as_tensor_backend(1), 0), + T.index_select(t, T.as_tensor(1), 0), x[1, ...] ) assert_equal( - T.index_select(t, T.as_tensor_backend(3), 1), + T.index_select(t, T.as_tensor(3), 1), x[:, 3, ...] ) assert_equal( - T.index_select(t, T.as_tensor_backend(2), -1), + T.index_select(t, T.as_tensor(2), -1), x[..., 2] ) i = np.asarray([0, 2, 1, 1, 0, 2]) assert_equal( - T.index_select(t, T.as_tensor_backend(i), 0), + T.index_select(t, T.as_tensor(i), 0), x[i, ...] ) assert_equal( - T.index_select(t, T.as_tensor_backend(i), 1), + T.index_select(t, T.as_tensor(i), 1), x[:, i, ...] ) assert_equal( - T.index_select(t, T.as_tensor_backend(i), -1), + T.index_select(t, T.as_tensor(i), -1), x[..., i] ) i = np.asarray([[0, 2, 1], [1, 0, 2]]) assert_equal( - T.index_select(t, T.as_tensor_backend(i), 0), + T.index_select(t, T.as_tensor(i), 0), x[i, ...] ) assert_equal( - T.index_select(t, T.as_tensor_backend(i), 1), + T.index_select(t, T.as_tensor(i), 1), x[:, i, ...] ) assert_equal( - T.index_select(t, T.as_tensor_backend(i), -1), + T.index_select(t, T.as_tensor(i), -1), x[..., i] ) @@ -714,27 +715,27 @@ def test_index_select_and_others(self): # TODO: pytorch currently does not support negative index in many # of its functions. enable these test when supported. assert_equal( - T.index_select(t, T.as_tensor_backend(-1), 1), + T.index_select(t, T.as_tensor(-1), 1), x[:, -1] ) i = np.asarray([0, 1, -1, 2, -2, 0]) assert_equal( - T.index_select(t, T.as_tensor_backend(i), 1), + T.index_select(t, T.as_tensor(i), 1), x[:, i, ...] ) i = np.asarray([[0, 1, -1], [2, -2, 0]]) assert_equal( - T.index_select(t, T.as_tensor_backend(i), 1), + T.index_select(t, T.as_tensor(i), 1), x[:, i, ...] 
) with pytest.raises(Exception, match='`axis` out of range'): - _ = T.index_select(t, T.as_tensor_backend(0), 3) + _ = T.index_select(t, T.as_tensor(0), 3) with pytest.raises(Exception, match='`axis` out of range'): - _ = T.index_select(t, T.as_tensor_backend(0), -4) + _ = T.index_select(t, T.as_tensor(0), -4) # concat x = np.random.randn(2, 3, 4) @@ -743,7 +744,7 @@ def test_index_select_and_others(self): for arrays, axis in [([x, x, y], -2), ([x, y, y], 1), ([x, x, z], -1), ([x, z, z], 2)]: - t = T.concat([T.as_tensor_backend(arr) for arr in arrays], axis=axis) + t = T.concat([T.as_tensor(arr) for arr in arrays], axis=axis) expected = np.concatenate(arrays, axis=axis) assert_equal(t, expected) @@ -929,8 +930,8 @@ def test_math_univariate_op(self): x = np.random.randn(2, 3) u = np.random.rand(2, 3) - x_t = T.as_tensor_backend(x) - u_t = T.as_tensor_backend(u) + x_t = T.as_tensor(x) + u_t = T.as_tensor(u) assert_allclose(T.floor(x_t), np.floor(x)) assert_allclose(T.ceil(x_t), np.ceil(x)) @@ -939,9 +940,9 @@ def test_math_univariate_op(self): assert_allclose(T.square(x_t), x ** 2) assert_allclose(T.exp(x_t), np.exp(x)) - assert_allclose(T.log(T.as_tensor_backend(np.abs(x))), + assert_allclose(T.log(T.as_tensor(np.abs(x))), np.log(np.abs(x))) - assert_allclose(T.log1p(T.as_tensor_backend(np.abs(x) - 1. + 1e-7)), + assert_allclose(T.log1p(T.as_tensor(np.abs(x) - 1. + 1e-7)), np.log1p(np.abs(x) - 1. + 1e-7)) assert_allclose(T.sin(x_t), np.sin(x)) @@ -958,15 +959,15 @@ def test_math_bivariate_op(self): np.random.seed(1234) x = np.random.randn(2, 3) y = np.random.randn(3) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) assert_allclose(T.add(t1, t2), x + y) assert_allclose(T.sub(t1, t2), x - y) assert_allclose(T.mul(t1, t2), x * y) - assert_allclose(T.pow(T.as_tensor_backend(np.abs(x)), t2), + assert_allclose(T.pow(T.as_tensor(np.abs(x)), t2), np.abs(x) ** y) - assert_allclose(T.sqrt(T.as_tensor_backend(np.abs(x))), np.sqrt(np.abs(x))) + assert_allclose(T.sqrt(T.as_tensor(np.abs(x))), np.sqrt(np.abs(x))) # for division, of course y should not equal to zero y = np.asarray(y == 0, dtype=y.dtype) + y @@ -977,8 +978,8 @@ def test_math_bivariate_op(self): # to produce identical results with numpy when x > 0 and y > 0 x = np.abs(x) y = np.abs(y) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) assert_allclose(T.floordiv(t1, t2), x // y) assert_allclose(T.mod(t1, t2), x % y) @@ -994,8 +995,8 @@ def test_math_bivariate_op(self): x = np.random.randint(0, 255, size=(2, 3), dtype=np.uint8) y = np.random.randint(0, 255, size=(3,), dtype=np.uint8) y = y + (y == 0).astype(y.dtype) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) out = T.truediv(t1, t2) self.assertEqual(T.get_dtype(out), T.float32) assert_allclose(out, x.astype(np.float32) / y.astype(np.float32)) @@ -1004,8 +1005,8 @@ def test_math_bivariate_op(self): x = np.random.randint(-32768, 32767, size=(2, 3), dtype=np.int16) y = np.random.randint(-32768, 32767, size=(3,), dtype=np.int16) y = y + (y == 0).astype(y.dtype) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) out = T.truediv(t1, t2) self.assertEqual(T.get_dtype(out), T.float32) assert_allclose(out, x.astype(np.float32) / y.astype(np.float32)) @@ -1014,8 +1015,8 @@ def test_math_bivariate_op(self): x = np.random.randint(-100000, 100000, size=(2, 3), dtype=np.int32) y = 
np.random.randint(-100000, 100000, size=(3,), dtype=np.int32) y = y + (y == 0).astype(y.dtype) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) out = T.truediv(t1, t2) self.assertEqual(T.get_dtype(out), T.float64) assert_allclose(out, x.astype(np.float64) / y.astype(np.float64)) @@ -1027,7 +1028,7 @@ def test_math_sequential_op(self): z = np.random.randn(2, 1) assert_allclose( - T.add_n([T.as_tensor_backend(t) for t in (x, y, z)]), + T.add_n([T.as_tensor(t) for t in (x, y, z)]), x + y + z ) @@ -1050,7 +1051,7 @@ def log_f_exp(f, x, axis=None, keepdims=False): # prepare for the data np.random.seed(1234) x = np.random.randn(2, 3, 4) - t = T.as_tensor_backend(x) + t = T.as_tensor(x) # test sum, mean, max, min for name in ['sum', 'mean', 'min', 'max', @@ -1160,8 +1161,8 @@ def with_raise(name, fn): x = np.asarray([[True, True, False, False], [False, False, True, True]]) y = np.asarray([True, False, False, True]) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) # test as_boolean self.assertEqual(T.get_dtype(t1), T.boolean) @@ -1170,25 +1171,25 @@ def with_raise(name, fn): # test logical_not out = T.logical_not(t1) assert_equal(read_bool(out), np.logical_not(x)) - with_raise('x', lambda: T.logical_not(T.as_tensor_backend([1, 2, 3]))) + with_raise('x', lambda: T.logical_not(T.as_tensor([1, 2, 3]))) # test logical_and out = T.logical_and(t1, t2) assert_equal(read_bool(out), np.logical_and(x, y)) - with_raise('x', lambda: T.logical_and(T.as_tensor_backend([1, 2, 3, 4]), t2)) - with_raise('y', lambda: T.logical_and(t1, T.as_tensor_backend([1, 2, 3, 4]))) + with_raise('x', lambda: T.logical_and(T.as_tensor([1, 2, 3, 4]), t2)) + with_raise('y', lambda: T.logical_and(t1, T.as_tensor([1, 2, 3, 4]))) # test logical_or out = T.logical_or(t1, t2) assert_equal(read_bool(out), np.logical_or(x, y)) - with_raise('x', lambda: T.logical_or(T.as_tensor_backend([1, 2, 3, 4]), t2)) - with_raise('y', lambda: T.logical_or(t1, T.as_tensor_backend([1, 2, 3, 4]))) + with_raise('x', lambda: T.logical_or(T.as_tensor([1, 2, 3, 4]), t2)) + with_raise('y', lambda: T.logical_or(t1, T.as_tensor([1, 2, 3, 4]))) # test logical_xor out = T.logical_xor(t1, t2) assert_equal(read_bool(out), np.logical_xor(x, y)) - with_raise('x', lambda: T.logical_xor(T.as_tensor_backend([1, 2, 3, 4]), t2)) - with_raise('y', lambda: T.logical_xor(t1, T.as_tensor_backend([1, 2, 3, 4]))) + with_raise('x', lambda: T.logical_xor(T.as_tensor([1, 2, 3, 4]), t2)) + with_raise('y', lambda: T.logical_xor(t1, T.as_tensor([1, 2, 3, 4]))) # test multiply_mask def test_multiply_mask(x, y, dtype, mask_dtype): @@ -1268,8 +1269,8 @@ def read_bool(t): x = np.random.randn(2, 3, 4) y = np.random.randn(1, 3, 4) x = np.concatenate([y, x], axis=0) - t1 = T.as_tensor_backend(x) - t2 = T.as_tensor_backend(y) + t1 = T.as_tensor(x) + t2 = T.as_tensor(y) # test equal assert_equal(read_bool(T.equal(t1, t2)), (x == y)) @@ -1351,9 +1352,9 @@ def test_gradient(self): y = np.random.randn(2, 3, 4) # requires_grad - yt = T.requires_grad(T.as_tensor_backend(y)) + yt = T.requires_grad(T.as_tensor(y)) - xt = T.as_tensor_backend(x) + xt = T.as_tensor(x) xt_copy = T.requires_grad(xt, copy=False) self.assertIs(xt_copy, xt) l_sum = T.reduce_sum(xt + xt_copy) @@ -1449,7 +1450,7 @@ def test_assertions(self): for x in [np.array([-1, 0, 1]), np.array([1., 2., 3.]), np.array([np.inf, 0.]), np.array([np.nan, 0.]), np.array([np.inf, np.nan])]: - t = T.as_tensor_backend(x) + t = 
T.as_tensor(x) assert_equal(T.is_finite(t), np.isfinite(x)) is_finite = np.all(np.isfinite(x)) diff --git a/tests/tensor/test_nn.py b/tests/tensor/test_nn.py index 78798d5..79ae63b 100644 --- a/tests/tensor/test_nn.py +++ b/tests/tensor/test_nn.py @@ -24,7 +24,7 @@ def test_activation_functions(self): self.assertTrue(np.any(x < 0)) self.assertTrue(np.any(x > 0)) self.assertTrue(np.any(x == 0)) - x_t = T.as_tensor_backend(x) + x_t = T.as_tensor(x) # test relu assert_allclose(T.nn.relu(x_t), x * (x >= 0)) @@ -126,7 +126,7 @@ def binary_cross_entropy(logits, labels, reduction, negative): self.assertEqual(labels.shape, (3, 4)) self.assertEqual(set(labels.flatten().tolist()), {0, 1}) - _f = T.as_tensor_backend + _f = T.as_tensor for reduction in ['none', 'mean', 'sum']: for negative in [False, True]: @@ -193,7 +193,7 @@ def cross_entropy(logits, labels, reduction, negative): self.assertEqual(labels.shape, (3, 4, 5)) self.assertEqual(set(labels.flatten().tolist()), {0, 1, 2, 3, 4, 5}) - _f = T.as_tensor_backend + _f = T.as_tensor for reduction in ['none', 'mean', 'sum']: for negative in [False, True]: diff --git a/tests/tensor/test_random.py b/tests/tensor/test_random.py index 400f9f4..d642337 100644 --- a/tests/tensor/test_random.py +++ b/tests/tensor/test_random.py @@ -32,12 +32,12 @@ class TensorRandomTestCase(unittest.TestCase): def test_seed(self): T.random.seed(1234) - x = T.to_numpy(T.random.normal(T.as_tensor_backend(0.), T.as_tensor_backend(1.))) - y = T.to_numpy(T.random.normal(T.as_tensor_backend(0.), T.as_tensor_backend(1.))) + x = T.to_numpy(T.random.normal(T.as_tensor(0.), T.as_tensor(1.))) + y = T.to_numpy(T.random.normal(T.as_tensor(0.), T.as_tensor(1.))) self.assertFalse(np.allclose(x, y)) T.random.seed(1234) - z = T.to_numpy(T.random.normal(T.as_tensor_backend(0.), T.as_tensor_backend(1.))) + z = T.to_numpy(T.random.normal(T.as_tensor(0.), T.as_tensor(1.))) assert_allclose(x, z) def test_rand(self): @@ -226,9 +226,9 @@ def log_prob(given): # test n_samples by manual expanding the param shape for dtype in float_dtypes: # test sample dtype and shape - mean_t = T.cast(T.expand(T.as_tensor_backend(mean), [n_samples, 2, 3, 4]), dtype) - std_t = T.cast(T.expand(T.as_tensor_backend(std), [n_samples, 1, 3, 4]), dtype) - logstd_t = T.cast(T.expand(T.as_tensor_backend(logstd), [n_samples, 1, 3, 4]), dtype) + mean_t = T.cast(T.expand(T.as_tensor(mean), [n_samples, 2, 3, 4]), dtype) + std_t = T.cast(T.expand(T.as_tensor(std), [n_samples, 1, 3, 4]), dtype) + logstd_t = T.cast(T.expand(T.as_tensor(logstd), [n_samples, 1, 3, 4]), dtype) t = T.random.normal(mean_t, std_t) self.assertEqual(T.get_dtype(t), dtype) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) @@ -600,7 +600,7 @@ def do_test_sample(n_z, sample_shape, float_dtype, dtype): do_test_sample(n_z, sample_shape, T.float64, dtype) with pytest.raises(Exception, match='`n_samples` must be at least 1'): - _ = T.random.bernoulli(probs=T.as_tensor_backend(probs), n_samples=0) + _ = T.random.bernoulli(probs=T.as_tensor(probs), n_samples=0) # given has lower rank than params, broadcasted to match param for float_dtype in float_dtypes: @@ -721,7 +721,7 @@ def do_test_sample(is_one_hot: bool, is_one_hot = Z_sample_fn == T.random.one_hot_categorical this_probs = probs[0, 0] t = Z_sample_fn( - probs=T.as_tensor_backend(this_probs), + probs=T.as_tensor(this_probs), n_samples=100 ) self.assertEqual( @@ -730,7 +730,7 @@ def do_test_sample(is_one_hot: bool, ) x = T.to_numpy(t) - logits_t = T.as_tensor_backend(np.log(this_probs)) + logits_t = 
T.as_tensor(np.log(this_probs)) do_check_log_prob( given=t, batch_ndims=len(t.shape) - int(is_one_hot), @@ -761,11 +761,11 @@ def do_test_sample(is_one_hot: bool, # argument error for Z_sample_fn in (T.random.categorical, T.random.one_hot_categorical): with pytest.raises(Exception, match='`n_samples` must be at least 1'): - _ = Z_sample_fn(probs=T.as_tensor_backend(probs), n_samples=0) + _ = Z_sample_fn(probs=T.as_tensor(probs), n_samples=0) with pytest.raises(Exception, match='The rank of `probs` must be at ' 'least 1'): - _ = Z_sample_fn(probs=T.as_tensor_backend(probs[0, 0, 0, 0])) + _ = Z_sample_fn(probs=T.as_tensor(probs[0, 0, 0, 0])) def test_discretized_logistic(self): np.random.seed(1234) diff --git a/tests/test_arg_check.py b/tests/test_arg_check.py index 9b2a2dc..0b4115e 100644 --- a/tests/test_arg_check.py +++ b/tests/test_arg_check.py @@ -20,7 +20,7 @@ def test_validate_positive_int(self): def test_validate_layer(self): layer = tk.layers.Linear(5, 3) - for v in [layer, T.jit_compile(layer)]: + for v in [layer, tk.layers.jit_compile(layer)]: self.assertIs(validate_layer('v', v), v) with pytest.raises(TypeError, @@ -40,7 +40,7 @@ def test_get_layer_from_layer_or_factory(self): factory = lambda in_features, out_features: \ tk.layers.Linear(in_features, out_features) layer = factory(5, 3) - for v in [layer, T.jit_compile(layer), + for v in [layer, tk.layers.jit_compile(layer), tk.layers.Linear, factory]: out = get_layer_from_layer_or_factory( 'v', v, args=(5,), kwargs=dict(out_features=3)) From 9ca107ea8788036264a5d2bd35c581568f7c5434 Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Mon, 17 Feb 2020 21:23:34 +0800 Subject: [PATCH 3/7] now GPU device can work --- tensorkit/backend/pytorch_/core.py | 6 +-- tensorkit/backend/pytorch_/layers.py | 2 +- tensorkit/backend/pytorch_/nn.py | 4 +- tensorkit/backend/pytorch_/random.py | 9 +++- tensorkit/distributions/bernoulli.py | 7 ++-- tensorkit/distributions/categorical.py | 4 +- tensorkit/distributions/uniform.py | 5 ++- tensorkit/examples/classification/mnist.py | 10 +++++ .../examples/classification/mnist_resnet.py | 10 +++++ tensorkit/examples/utils/prepare_data.py | 7 ++-- tensorkit/utils/tensor_stream.py | 3 ++ tests/distributions/test_base.py | 3 +- tests/distributions/test_bernoulli.py | 7 +--- tests/distributions/test_categorical.py | 7 +--- tests/distributions/test_discretized.py | 9 ++-- tests/distributions/test_flow.py | 4 +- tests/distributions/test_mixture.py | 5 +-- tests/distributions/test_normal.py | 20 +++------ tests/distributions/test_uniform.py | 12 +----- tests/distributions/test_utils.py | 27 ++++++++++-- tests/flows/test_act_norm.py | 4 +- tests/flows/test_core.py | 23 ++++------ tests/flows/test_coupling.py | 2 +- tests/flows/test_rearrangement.py | 2 +- tests/flows/test_shape_.py | 6 +-- tests/flows/test_split_.py | 6 +-- tests/helper.py | 37 ++++++++++++++++ tests/init/test_core.py | 16 ++----- tests/init/test_std_data_init.py | 2 +- tests/layers/test_composed.py | 2 +- tests/layers/test_contextual.py | 2 +- tests/layers/test_core.py | 42 +++++++------------ tests/layers/test_flow_layer.py | 4 +- tests/layers/test_gated.py | 2 +- tests/layers/test_pixelcnn.py | 3 +- tests/layers/test_pool.py | 4 +- tests/layers/test_resnet.py | 4 +- tests/layers/test_shape_.py | 6 +-- tests/layers/test_split_.py | 2 +- tests/layers/test_utils.py | 2 +- tests/tensor/test_core.py | 19 +++------ tests/tensor/test_linalg.py | 6 +-- tests/tensor/test_nn.py | 8 +--- tests/tensor/test_random.py | 36 ++-------------- 
tests/tensor/test_utils.py | 3 +- tests/test_arg_check.py | 3 +- tests/test_bayes.py | 2 +- tests/test_stochastic.py | 2 +- tests/train/test_core.py | 5 +-- tests/variational/test_chain.py | 2 +- tests/variational/test_estimators.py | 7 ++-- tests/variational/test_evaluation.py | 3 +- tests/variational/test_inference.py | 2 +- tests/variational/test_objectives.py | 5 +-- 54 files changed, 205 insertions(+), 230 deletions(-) diff --git a/tensorkit/backend/pytorch_/core.py b/tensorkit/backend/pytorch_/core.py index c1a2aa0..f5a535b 100644 --- a/tensorkit/backend/pytorch_/core.py +++ b/tensorkit/backend/pytorch_/core.py @@ -261,7 +261,7 @@ def as_tensor(data, another tensor, a :class:`~tensorkit.StochasticTensor`, or anything else that the backend supports. dtype: The expected dtype of the constructed tensor. - device: Where to put the new tensor. + device: The device where to place new tensors and variables. force_copy: Force to copy `data` even if it is not necessary. The gradient propagation will not be stopped from the copied tensor to the original tensor. The caller may need to use `T.stop_grad()` @@ -1443,9 +1443,9 @@ def maybe_clip(x: Tensor, if x_min is not None and x_max is not None: return clip(x, x_min, x_max) elif x_min is not None: - return torch.max(x, torch.as_tensor(x_min, dtype=x.dtype)) + return torch.max(x, torch.as_tensor(x_min, dtype=x.dtype, device=x.device)) elif x_max is not None: - return torch.min(x, torch.as_tensor(x_max, dtype=x.dtype)) + return torch.min(x, torch.as_tensor(x_max, dtype=x.dtype, device=x.device)) else: return x diff --git a/tensorkit/backend/pytorch_/layers.py b/tensorkit/backend/pytorch_/layers.py index 4f16167..9e4d22d 100644 --- a/tensorkit/backend/pytorch_/layers.py +++ b/tensorkit/backend/pytorch_/layers.py @@ -304,7 +304,7 @@ def get(self) -> Tensor: def set(self, value: TensorOrData) -> None: with no_grad(): v, g = weight_norm_decompose( - as_tensor(value, dtype=get_dtype(self.v), device=get_dtype(self.v)), + as_tensor(value, dtype=get_dtype(self.v), device=get_device(self.v)), self.norm_axis, self.epsilon, ) diff --git a/tensorkit/backend/pytorch_/nn.py b/tensorkit/backend/pytorch_/nn.py index a083b39..20ef54c 100644 --- a/tensorkit/backend/pytorch_/nn.py +++ b/tensorkit/backend/pytorch_/nn.py @@ -56,8 +56,8 @@ def sigmoid(x: Tensor) -> Tensor: def log_sigmoid(x: Tensor) -> Tensor: # using `neg_x` and `pos_x` separately can avoid having NaN or Infinity # on either of the path. 
- neg_x = torch.min(x, torch.as_tensor(0., dtype=x.dtype)) - pos_x = torch.max(x, torch.as_tensor(0., dtype=x.dtype)) + neg_x = torch.min(x, torch.as_tensor(0., dtype=x.dtype, device=x.device)) + pos_x = torch.max(x, torch.as_tensor(0., dtype=x.dtype, device=x.device)) return torch.where( x < 0., neg_x - log1p(exp(neg_x)), # log(exp(x) / (1 + exp(x))) diff --git a/tensorkit/backend/pytorch_/random.py b/tensorkit/backend/pytorch_/random.py index aec0f2b..97215f2 100644 --- a/tensorkit/backend/pytorch_/random.py +++ b/tensorkit/backend/pytorch_/random.py @@ -9,7 +9,7 @@ from ...settings_ import settings __all__ = [ - 'seed', + 'seed', 'set_deterministic', # uniform 'rand', 'uniform', @@ -41,6 +41,13 @@ def seed(seed: int): torch.cuda.manual_seed_all(seed) +def set_deterministic(deterministic: bool = True): + if hasattr(torch, 'backends') and hasattr(torch.backends, 'cudnn'): + torch.backends.cudnn.enabled = not deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.backends.cudnn.deterministic = deterministic + + # ---- uniform distribution ---- @jit def rand(shape: List[int], diff --git a/tensorkit/distributions/bernoulli.py b/tensorkit/distributions/bernoulli.py index 3000bdf..31014e9 100644 --- a/tensorkit/distributions/bernoulli.py +++ b/tensorkit/distributions/bernoulli.py @@ -65,9 +65,11 @@ def __init__(self, if logits is not None: value_shape = T.shape(logits) mutual_params = {'logits': logits} + device = device or T.get_device(logits) else: value_shape = T.shape(probs) mutual_params = {'probs': probs} + device = device or T.get_device(probs) epsilon = float(epsilon) # construct the object @@ -75,7 +77,7 @@ def __init__(self, dtype=dtype, value_shape=value_shape, event_ndims=event_ndims, - device=device or T.get_device(logits), + device=device, validate_tensors=validate_tensors, ) for k, v in mutual_params.items(): @@ -134,8 +136,7 @@ def copy(self, **overrided_params): return copy_distribution( cls=Bernoulli, base=self, - attrs=('dtype', 'device', 'event_ndims', 'validate_tensors', - 'epsilon'), + attrs=('dtype', 'event_ndims', 'epsilon', 'device', 'validate_tensors'), mutual_attrs=(('logits', 'probs'),), compute_deps={'logits': ('epsilon',)}, original_mutual_params=self._mutual_params, diff --git a/tensorkit/distributions/categorical.py b/tensorkit/distributions/categorical.py index 1140474..1c28f77 100644 --- a/tensorkit/distributions/categorical.py +++ b/tensorkit/distributions/categorical.py @@ -44,9 +44,11 @@ def __init__(self, if logits is not None: param_shape = T.shape(logits) mutual_params = {'logits': logits} + device = device or T.get_device(logits) else: param_shape = T.shape(probs) mutual_params = {'probs': probs} + device = device or T.get_device(probs) epsilon = float(epsilon) if len(param_shape) < 1: @@ -61,7 +63,7 @@ def __init__(self, dtype=dtype, value_shape=value_shape, event_ndims=event_ndims, - device=device or T.get_device(logits), + device=device, validate_tensors=validate_tensors, ) for k, v in mutual_params.items(): diff --git a/tensorkit/distributions/uniform.py b/tensorkit/distributions/uniform.py index 547c0c6..172f66f 100644 --- a/tensorkit/distributions/uniform.py +++ b/tensorkit/distributions/uniform.py @@ -88,13 +88,16 @@ def __init__(self, dtype = T.get_dtype(low) value_shape = (value_shape + T.broadcast_shape(T.shape(low), T.shape(high))) + device = device or T.get_device(low) + else: + device = T.current_device() super().__init__( dtype=dtype, value_shape=value_shape, reparameterized=reparameterized, event_ndims=event_ndims, - 
device=device or T.get_device(low), + device=device, validate_tensors=validate_tensors, ) diff --git a/tensorkit/examples/classification/mnist.py b/tensorkit/examples/classification/mnist.py index b91a8bb..7657f16 100644 --- a/tensorkit/examples/classification/mnist.py +++ b/tensorkit/examples/classification/mnist.py @@ -8,6 +8,7 @@ class Config(mltk.Config): max_epoch: int = 10 batch_size: int = 32 test_batch_size: int = 64 + init_batch_count: int = 32 lr: float = 0.001 lr_anneal_ratio: float = 0.5 lr_anneal_epochs: int = 5 @@ -35,6 +36,15 @@ def main(exp: mltk.Experiment[Config]): log_softmax(). \ build() + # initialize the network with first few batches of train data + init_x, _ = train_stream.get_arrays(max_batch=exp.config.init_batch_count) + _ = net(T.as_tensor(init_x)) + mltk.print_with_time('Network initialized') + + # we have initialized the network, now we can compile the net with JIT engine + net = tk.layers.jit_compile(net) + mltk.print_with_time('Network compiled to JIT module') + # define the train and evaluate functions def train_step(x, y): logits = net(x) diff --git a/tensorkit/examples/classification/mnist_resnet.py b/tensorkit/examples/classification/mnist_resnet.py index 374c6a5..bb9a13e 100644 --- a/tensorkit/examples/classification/mnist_resnet.py +++ b/tensorkit/examples/classification/mnist_resnet.py @@ -8,6 +8,7 @@ class Config(mltk.Config): max_epoch: int = 10 batch_size: int = 32 test_batch_size: int = 64 + init_batch_count: int = 10 lr: float = 0.01 lr_anneal_ratio: float = 0.5 lr_anneal_epochs: int = 2 @@ -41,6 +42,15 @@ def main(exp: mltk.Experiment[Config]): log_softmax(). \ build() + # initialize the network with first few batches of train data + init_x, _ = train_stream.get_arrays(max_batch=exp.config.init_batch_count) + _ = net(T.as_tensor(init_x)) + mltk.print_with_time('Network initialized') + + # we have initialized the network, now we can compile the net with JIT engine + net = tk.layers.jit_compile(net) + mltk.print_with_time('Network compiled to JIT module') + # the train, test and validate functions def train_step(x, y): logits = net(x) diff --git a/tensorkit/examples/utils/prepare_data.py b/tensorkit/examples/utils/prepare_data.py index 2f3950f..7474a6f 100644 --- a/tensorkit/examples/utils/prepare_data.py +++ b/tensorkit/examples/utils/prepare_data.py @@ -4,7 +4,6 @@ import numpy as np import tensorkit as tk -from tensorkit import tensor as T __all__ = [ 'get_mnist_streams' @@ -48,10 +47,10 @@ def get_mnist_streams(batch_size: int, # split train & valid set, and construct the streams def make_stream(arrays, **kwargs): - stream = mltk.DataStream.arrays(arrays, **kwargs) + ret = mltk.DataStream.arrays(arrays, **kwargs) if as_tensor_stream: - stream = tk.utils.as_tensor_stream(stream, prefetch=prefetch) - return stream + ret = tk.utils.as_tensor_stream(ret, prefetch=prefetch) + return ret if val_portion is not None: (train_x, train_y), (val_x, val_y) = \ diff --git a/tensorkit/utils/tensor_stream.py b/tensorkit/utils/tensor_stream.py index 3b0abee..57e4584 100644 --- a/tensorkit/utils/tensor_stream.py +++ b/tensorkit/utils/tensor_stream.py @@ -45,6 +45,9 @@ def _minibatch_iterator(self) -> Generator[ArrayTuple, None, None]: finally: g.close() + def _concat_arrays(self, arrays: Sequence[T.Tensor]) -> T.Tensor: + return T.concat(list(arrays), axis=0) + def as_tensor_stream(source: mltk.DataStream, device: Optional[str] = None, diff --git a/tests/distributions/test_base.py b/tests/distributions/test_base.py index f35a989..343ba88 100644 --- 
a/tests/distributions/test_base.py +++ b/tests/distributions/test_base.py @@ -11,7 +11,7 @@ from tests.helper import * -class BaseDistributionTestCase(unittest.TestCase): +class BaseDistributionTestCase(TestCase): def test_construct(self): def check_all_specified_by_constructor(cls): @@ -239,7 +239,6 @@ def do_check(given, group_ndims, args): (group_ndims, 1 + group_ndims)) def test_prob(self): - np.random.seed(1234) t00 = np.random.randn(2, 3) t0 = T.as_tensor(t00) d = Distribution( diff --git a/tests/distributions/test_bernoulli.py b/tests/distributions/test_bernoulli.py index b71905c..16202ae 100644 --- a/tests/distributions/test_bernoulli.py +++ b/tests/distributions/test_bernoulli.py @@ -17,10 +17,9 @@ def sigmoid(x): np.exp(x) / (1 + np.exp(x))) -class BernoulliTestCase(unittest.TestCase): +class BernoulliTestCase(TestCase): def test_construct(self): - np.random.seed(1234) logits = np.random.randn(2, 3, 4) probs = sigmoid(logits) @@ -68,7 +67,6 @@ def test_construct(self): **{key: T.as_tensor(np.nan, dtype=float_dtype)}) def test_copy(self): - np.random.seed(1234) logits = np.random.randn(2, 3, 4) logits_t = T.as_tensor(logits) bernoulli = Bernoulli(logits=logits_t, event_ndims=1) @@ -82,7 +80,7 @@ def test_copy(self): self.assertEqual(f_copy.call_args, ((), { 'cls': Bernoulli, 'base': bernoulli, - 'attrs': ('dtype', 'event_ndims', 'validate_tensors', 'epsilon'), + 'attrs': ('dtype', 'event_ndims', 'epsilon', 'device', 'validate_tensors'), 'mutual_attrs': (('logits', 'probs'),), 'compute_deps': {'logits': ('epsilon',)}, 'original_mutual_params': {'logits': bernoulli.logits}, @@ -90,7 +88,6 @@ def test_copy(self): })) def test_sample_and_log_prob(self): - np.random.seed(1234) logits = np.random.randn(2, 3, 4) logits_t = T.as_tensor(logits) diff --git a/tests/distributions/test_categorical.py b/tests/distributions/test_categorical.py index b3600db..5c36e34 100644 --- a/tests/distributions/test_categorical.py +++ b/tests/distributions/test_categorical.py @@ -32,10 +32,9 @@ def __init__(self, **kwargs): super().__init__(**kwargs) -class CategoricalTestCase(unittest.TestCase): +class CategoricalTestCase(TestCase): def test_construct_base(self): - np.random.seed(1234) logits = np.random.randn(2, 3, 4) probs = softmax(logits) logits = np.log(probs) @@ -101,7 +100,6 @@ def test_construct_base(self): ) def test_copy(self): - np.random.seed(1234) logits = np.random.randn(2, 3, 4) logits_t = T.as_tensor(logits) cat = _MyBaseCategorical(logits=logits_t, probs=None, event_ndims=1, @@ -116,7 +114,7 @@ def test_copy(self): self.assertEqual(f_copy.call_args, ((), { 'cls': _MyBaseCategorical, 'base': cat, - 'attrs': ('dtype', 'event_ndims', 'validate_tensors', 'epsilon'), + 'attrs': ('dtype', 'event_ndims', 'epsilon', 'device', 'validate_tensors'), 'mutual_attrs': (('logits', 'probs'),), 'compute_deps': {'logits': ('epsilon',)}, 'original_mutual_params': {'logits': cat.logits}, @@ -124,7 +122,6 @@ def test_copy(self): })) def test_Categorical_and_OneHotCategorical(self): - np.random.seed(1234) logits = np.random.randn(2, 3, 4) def do_test(dtype, float_dtype, is_one_hot): diff --git a/tests/distributions/test_discretized.py b/tests/distributions/test_discretized.py index 4467ce1..2fc1f70 100644 --- a/tests/distributions/test_discretized.py +++ b/tests/distributions/test_discretized.py @@ -11,11 +11,9 @@ from tests.helper import * -class DiscretizedLogisticTestCase(unittest.TestCase): +class DiscretizedLogisticTestCase(TestCase): def test_discretized_logsitic(self): - T.random.seed(1234) - mean = 
T.random.randn([3, 1, 4]) log_scale = T.random.randn([2, 1]) @@ -90,8 +88,6 @@ def log_prob_fn(t): _ = DiscretizedLogistic(mean, T.zeros([7]), 1./32) def test_copy(self): - T.random.seed(1234) - mean = T.random.randn([3, 1, 4]) log_scale = T.random.randn([2, 1]) @@ -118,7 +114,8 @@ def test_copy(self): 'attrs': ( 'mean', 'log_scale', 'bin_size', 'min_val', 'max_val', 'biased_edges', 'discretize_given', 'discretize_sample', - 'reparameterized', 'event_ndims', 'epsilon', 'validate_tensors' + 'reparameterized', 'event_ndims', 'epsilon', 'device', + 'validate_tensors' ), 'overrided_params': {'event_ndims': 2, 'discretize_sample': False, diff --git a/tests/distributions/test_flow.py b/tests/distributions/test_flow.py index a80e12f..b932837 100644 --- a/tests/distributions/test_flow.py +++ b/tests/distributions/test_flow.py @@ -123,11 +123,9 @@ def log_prob_fn(t): fn(None, None, validate_tensors) -class FlowDistributionTestCase(unittest.TestCase): +class FlowDistributionTestCase(TestCase): def test_FlowDistribution(self): - T.random.seed(1234) - check_flow_distribution( self, UnitNormal([], event_ndims=0), diff --git a/tests/distributions/test_mixture.py b/tests/distributions/test_mixture.py index 47a597f..8613b90 100644 --- a/tests/distributions/test_mixture.py +++ b/tests/distributions/test_mixture.py @@ -9,6 +9,7 @@ from tensorkit.distributions.utils import copy_distribution from tensorkit.flows import ActNorm from tests.distributions.test_flow import check_distribution_instance +from tests.helper import * def check_mixture(ctx, @@ -96,11 +97,9 @@ def log_prob_fn(t): fn(categorical, components, None, None, validate_tensors) -class MixtureTestCase(unittest.TestCase): +class MixtureTestCase(TestCase): def test_mixture(self): - T.random.seed(1234) - check_mixture( self, Categorical(logits=T.random.randn([4, 5, 1])), diff --git a/tests/distributions/test_normal.py b/tests/distributions/test_normal.py index e23b435..a1b0dab 100644 --- a/tests/distributions/test_normal.py +++ b/tests/distributions/test_normal.py @@ -14,11 +14,9 @@ from tests.helper import * -class UnitNormalTestCase(unittest.TestCase): +class UnitNormalTestCase(TestCase): def test_construct(self): - np.random.seed(1234) - for shape, event_ndims, dtype in \ product(([], [2, 3]), range(0, 3), float_dtypes): if event_ndims > len(shape): @@ -34,7 +32,6 @@ def test_construct(self): assert_equal(normal.logstd, np.zeros(shape)) def test_copy(self): - np.random.seed(1234) shape = [2, 3] normal = UnitNormal(shape=[2, 3], event_ndims=1, dtype=T.float32) @@ -71,8 +68,6 @@ def test_copy(self): self.assertIsNot(getattr(normal2, key), getattr(normal, key)) def test_sample_and_log_prob(self): - np.random.seed(1234) - for dtype in float_dtypes: normal = UnitNormal(shape=[2, 3, 4], event_ndims=1, dtype=dtype) @@ -122,19 +117,19 @@ def __init__(self, logstd: Optional[T.Tensor] = None, reparameterized: bool = True, event_ndims: int = 0, + device: Optional[str] = None, validate_tensors: Optional[bool] = None, xyz: int = 0): super().__init__( - mean=mean, std=std, logstd=logstd, reparameterized=reparameterized, - event_ndims=event_ndims, validate_tensors=validate_tensors + mean=mean, std=std, logstd=logstd, reparameterized=reparameterized, + event_ndims=event_ndims, device=device, validate_tensors=validate_tensors ) self.xyz = xyz -class NormalTestCase(unittest.TestCase): +class NormalTestCase(TestCase): def test_construct(self): - np.random.seed(1234) mean = np.random.randn(3, 4) logstd = np.random.randn(2, 3, 4) std = np.exp(logstd) @@ -202,7 +197,6 
@@ def test_construct(self): _ = normal.logstd def test_copy(self): - np.random.seed(1234) mean = np.random.randn(3, 4) logstd = np.random.randn(2, 3, 4) mean_t = T.as_tensor(mean) @@ -224,14 +218,13 @@ def test_copy(self): 'cls': _MyBaseNormal, 'base': normal, 'attrs': ('mean', 'reparameterized', 'event_ndims', - 'validate_tensors', 'xyz'), + 'device', 'validate_tensors', 'xyz'), 'mutual_attrs': (('std', 'logstd'),), 'original_mutual_params': {'logstd': normal.logstd}, 'overrided_params': {'event_ndims': 2}, })) def test_Normal(self): - np.random.seed(1234) mean = np.random.randn(3, 4) logstd = np.random.randn(2, 3, 4) mean_t = T.as_tensor(mean) @@ -280,7 +273,6 @@ def test_Normal(self): ) def test_TruncatedNormal(self): - np.random.seed(1234) mean = np.random.randn(3, 4) logstd = np.random.randn(2, 3, 4) std = np.exp(logstd) diff --git a/tests/distributions/test_uniform.py b/tests/distributions/test_uniform.py index 9b200ec..d27cb05 100644 --- a/tests/distributions/test_uniform.py +++ b/tests/distributions/test_uniform.py @@ -12,11 +12,9 @@ from tests.helper import * -class UniformTestCase(unittest.TestCase): +class UniformTestCase(TestCase): def test_construct(self): - np.random.seed(1234) - for dtype in float_dtypes: # specify no args uniform = Uniform(dtype=dtype, event_ndims=0) @@ -127,9 +125,6 @@ def test_construct(self): validate_tensors=True) def test_copy(self): - np.random.seed(1234) - T.random.seed(1234) - for dtype in float_dtypes: low_t = T.full([2, 1], -1., dtype=dtype) high_t = T.full([1, 3], 2., dtype=dtype) @@ -159,14 +154,11 @@ def test_copy(self): 'base': uniform, 'attrs': (('shape', '_shape'), 'low', 'high', 'dtype', 'reparameterized', 'event_ndims', 'log_zero', - 'validate_tensors'), + 'device', 'validate_tensors'), 'overrided_params': {'event_ndims': 2}, })) def test_sample_and_log_prob(self): - np.random.seed(1234) - T.random.seed(1234) - array_low = np.random.randn(2, 1) array_high = np.exp(np.random.randn(1, 3)) + 1. 
log_zero = -1e6 diff --git a/tests/distributions/test_utils.py b/tests/distributions/test_utils.py index 88d382c..9813318 100644 --- a/tests/distributions/test_utils.py +++ b/tests/distributions/test_utils.py @@ -47,7 +47,7 @@ class Sink2(BaseSink): pass -class DistributionUtilsTestCase(unittest.TestCase): +class DistributionUtilsTestCase(TestCase): def test_get_overrided_parameterized(self): cls = Mock(__qualname__='xyz') @@ -110,8 +110,6 @@ def test_get_tail_size(self): _ = get_tail_size([], len(shape) + 1) def test_log_pdf_mask(self): - np.random.seed(1234) - T.random.seed(1234) x = np.random.randn(3, 4, 5) for dtype in float_dtypes: @@ -139,6 +137,7 @@ def test_check_tensor_arg_types(self): for t, v in [(a, 1.0), (b, 2.0), (e, e_orig), (f, f_orig.tensor)]: self.assertIsInstance(t, T.Tensor) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) if isinstance(v, float): assert_equal(t, v) else: @@ -172,6 +171,28 @@ def test_check_tensor_arg_types(self): f'{T.float32} vs {dtype}'): _ = check_tensor_arg_types(('a', a_orig), ('b', b_orig)) + # check `device` and `default_device` + if T.current_device() != T.CPU_DEVICE: + [a] = check_tensor_arg_types(('a', [1., 2., 3.]), device=T.CPU_DEVICE) + self.assertEqual(T.get_device(a), T.CPU_DEVICE) + + [a] = check_tensor_arg_types(('a', [1., 2., 3.]), default_device=T.CPU_DEVICE) + self.assertEqual(T.get_device(a), T.CPU_DEVICE) + + [a] = check_tensor_arg_types(('a', [1., 2., 3.]), device=T.CPU_DEVICE, + default_device=T.current_device()) + self.assertEqual(T.get_device(a), T.CPU_DEVICE) + + a = T.as_tensor([1., 2., 3.], device=T.current_device()) + with pytest.raises(ValueError, + match=f'`a.device` != `device`'): + _ = check_tensor_arg_types(('a', a), device=T.CPU_DEVICE) + + b = T.as_tensor([1., 2., 3.], device=T.CPU_DEVICE) + with pytest.raises(ValueError, + match=f'`b.device` != `a.device`'): + _ = check_tensor_arg_types(('a', a), ('b', b)) + # check tensor cannot be None with pytest.raises(ValueError, match='`a` must be specified.'): diff --git a/tests/flows/test_act_norm.py b/tests/flows/test_act_norm.py index 7e02103..85b9248 100644 --- a/tests/flows/test_act_norm.py +++ b/tests/flows/test_act_norm.py @@ -83,16 +83,14 @@ def do_check(batch_shape, scale_type, initialized, dtype): do_check([11], 'exp', False, dtype) -class ActNormTestCase(unittest.TestCase): +class ActNormTestCase(TestCase): @slow_test def test_ActNorm(self): - T.random.seed(1234) check_act_norm(self, 0, ActNorm) @slow_test def test_ActNormNd(self): - T.random.seed(1234) for spatial_ndims in (1, 2, 3): check_act_norm( self, diff --git a/tests/flows/test_core.py b/tests/flows/test_core.py index a7e30b3..27ebfa1 100644 --- a/tests/flows/test_core.py +++ b/tests/flows/test_core.py @@ -69,7 +69,7 @@ def _transform(self, return output, output_log_det -class BaseFlowTestCase(unittest.TestCase): +class BaseFlowTestCase(TestCase): def test_constructor(self): flow = Flow(x_event_ndims=1, @@ -132,7 +132,7 @@ def test_call(self): _ = flow(x, inverse=True) -class FeatureMappingFlowTestCase(unittest.TestCase): +class FeatureMappingFlowTestCase(TestCase): def test_constructor(self): flow = FeatureMappingFlow(axis=-1, @@ -160,7 +160,7 @@ def test_constructor(self): _ = FeatureMappingFlow(axis=0, event_ndims=1, explicitly_invertible=True) -class InverseFlowTestCase(unittest.TestCase): +class InverseFlowTestCase(TestCase): def test_InverseFlow(self): original_flow = tk.layers.jit_compile(_MyFlow()) @@ -226,7 +226,7 @@ def _transform(self, return 
output, output_log_det -class SequentialFlowTestCase(unittest.TestCase): +class SequentialFlowTestCase(TestCase): def test_constructor(self): flows = [tk.layers.jit_compile(_MyFlow1()), tk.layers.jit_compile(_MyFlow())] @@ -304,7 +304,7 @@ def check_invertible_matrix(ctx, m, size): rtol=1e-4, atol=1e-6) -class InvertibleMatrixTestCase(unittest.TestCase): +class InvertibleMatrixTestCase(TestCase): def test_invertible_matrices(self): for cls in (LooseInvertibleMatrix, StrictInvertibleMatrix): @@ -378,10 +378,9 @@ def check_invertible_linear(ctx, T.random.randn(batch_shape)) -class InvertibleLinearTestCase(unittest.TestCase): +class InvertibleLinearTestCase(TestCase): def test_invertible_dense(self): - T.random.seed(1234) for strict in (True, False): check_invertible_linear( self, @@ -392,7 +391,6 @@ def test_invertible_dense(self): ) def test_invertible_conv_nd(self): - T.random.seed(1234) for spatial_ndims in (1, 2, 3): for strict in (True, False): check_invertible_linear( @@ -511,11 +509,9 @@ def _scale_and_log_scale(self, return scale, log_scale -class ScaleTestCase(unittest.TestCase): +class ScaleTestCase(TestCase): def test_ExpScale(self): - T.random.seed(1234) - x = T.random.randn([2, 3, 4]) scale = ExpScale() scale = tk.layers.jit_compile(scale) @@ -529,8 +525,6 @@ def test_ExpScale(self): check_scale(self, scale, x, pre_scale, expected_y, expected_log_det) def test_SigmoidScale(self): - T.random.seed(1234) - x = T.random.randn([2, 3, 4]) for pre_scale_bias in [None, 0., 1.5]: @@ -553,8 +547,6 @@ def test_SigmoidScale(self): check_scale(self, scale, x, pre_scale, expected_y, expected_log_det) def test_LinearScale(self): - T.random.seed(1234) - x = T.random.randn([2, 3, 4]) scale = LinearScale(epsilon=T.EPSILON) self.assertIn('epsilon=', repr(scale)) @@ -570,7 +562,6 @@ def test_LinearScale(self): check_scale(self, scale, x, pre_scale, expected_y, expected_log_det) def test_bad_output(self): - T.random.seed(1234) x = T.random.randn([2, 3, 1]) scale = _BadScale1() diff --git a/tests/flows/test_coupling.py b/tests/flows/test_coupling.py index eddd9a2..a2e544e 100644 --- a/tests/flows/test_coupling.py +++ b/tests/flows/test_coupling.py @@ -85,7 +85,7 @@ def do_check(secondary, scale_type): _ = cls(shift_and_pre_scale, scale=scale) -class CouplingLayerTestCase(unittest.TestCase): +class CouplingLayerTestCase(TestCase): @slow_test def test_CouplingLayer(self): diff --git a/tests/flows/test_rearrangement.py b/tests/flows/test_rearrangement.py index 451c9af..ff54038 100644 --- a/tests/flows/test_rearrangement.py +++ b/tests/flows/test_rearrangement.py @@ -40,7 +40,7 @@ def check_shuffling_flow(ctx, T.random.randn(batch_shape)) -class RearrangementTestCase(unittest.TestCase): +class RearrangementTestCase(TestCase): def test_FeatureShuffleFlow(self): check_shuffling_flow(self, 0, FeatureShufflingFlow) diff --git a/tests/flows/test_shape_.py b/tests/flows/test_shape_.py index f15ad38..786c8a9 100644 --- a/tests/flows/test_shape_.py +++ b/tests/flows/test_shape_.py @@ -10,7 +10,7 @@ from tests.ops import * -class ReshapeFlowTestCase(unittest.TestCase): +class ReshapeFlowTestCase(TestCase): def test_ReshapeFlow(self): flow = ReshapeFlow([4, -1], [-1]) @@ -48,11 +48,9 @@ def test_ReshapeFlow(self): _ = ReshapeFlow([-1], [-1, -2]) -class SpaceDepthTransformFlowTestCase(unittest.TestCase): +class SpaceDepthTransformFlowTestCase(TestCase): def test_space_depth_transform(self): - T.random.seed(1234) - for spatial_ndims, batch_shape, block_size in product( (1, 2, 3), ([2], [2, 3]), diff --git 
a/tests/flows/test_split_.py b/tests/flows/test_split_.py index 67b707f..fe41610 100644 --- a/tests/flows/test_split_.py +++ b/tests/flows/test_split_.py @@ -84,12 +84,10 @@ def check_split_flow(ctx, _ = cls([2, 3], left, tk.layers.Linear(2, 3)) -class SplitFlowTestCase(unittest.TestCase): +class SplitFlowTestCase(TestCase): @slow_test def test_SplitFlow(self): - T.random.seed(1234) - # x and y with the same event ndims left = tk.layers.jit_compile(InvertibleDense(2)) right = tk.layers.jit_compile(InvertibleDense(3)) @@ -153,8 +151,6 @@ def test_SplitFlow(self): @slow_test def test_SplitFlowNd(self): - T.random.seed(1234) - for spatial_ndims in (1, 2, 3): cls = getattr(tk.flows, f'SplitFlow{spatial_ndims}d') sub_cls = getattr(tk.flows, f'InvertibleConv{spatial_ndims}d') diff --git a/tests/helper.py b/tests/helper.py index 520ff8d..bd07d66 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -1,4 +1,7 @@ import os +import random +import unittest +from functools import wraps import numpy as np import pytest @@ -15,6 +18,8 @@ 'slow_test', 'check_distribution_instance', 'flow_standard_check', + + 'TestCase', ] @@ -216,3 +221,35 @@ def flow_standard_check(ctx, flow, x, expected_y, expected_log_det, x, log_det = flow(y, inverse=True, compute_log_det=False) assert_allclose(x, expected_x, rtol=1e-4, atol=1e-6) ctx.assertIsNone(log_det) + + +class TestCaseMeta(type): + + def __new__(cls, name, parents, dct): + def make_wrapper(method): + @wraps(method) + def wrapper(*args, **kwargs): + T.random.set_deterministic(True) + T.random.seed(1234) + np.random.seed(1234) + random.seed(1234) + + try: + with T.use_device(T.first_gpu_device()): + return method(*args, **kwargs) + finally: + T.random.set_deterministic(False) + return wrapper + + keys = list(dct) + for key in keys: + val = dct[key] + if key.startswith('test_'): + val = make_wrapper(val) + dct[key] = val + + return super().__new__(cls, name, parents, dct) + + +class TestCase(unittest.TestCase, metaclass=TestCaseMeta): + pass diff --git a/tests/init/test_core.py b/tests/init/test_core.py index 74774d0..fc8ce3c 100644 --- a/tests/init/test_core.py +++ b/tests/init/test_core.py @@ -13,7 +13,7 @@ from tests.helper import * -class UtilitiesTestCase(unittest.TestCase): +class UtilitiesTestCase(TestCase): def test_calculate_fan_in_and_fan_out(self): for layer, fan_in_and_out in [ @@ -160,7 +160,7 @@ def test_apply_initializer(self): tk.init.apply_initializer(weight, object()) -class TensorInitiailizersTestCase(unittest.TestCase): +class TensorInitiailizersTestCase(TestCase): def test_zeros(self): for dtype in float_dtypes: @@ -185,8 +185,6 @@ def test_fill(self): assert_equal(weight, T.full_like(weight, 123.)) def test_uniform(self): - T.random.seed(1234) - for dtype in float_dtypes: weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.) @@ -208,8 +206,6 @@ def test_uniform(self): ) def test_normal(self): - T.random.seed(1234) - for dtype in float_dtypes: weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.) @@ -227,12 +223,10 @@ def test_normal(self): weight, partial(tk.init.normal, mean=1., std=3.)) self.assertLessEqual( np.abs(T.to_numpy(T.reduce_mean(weight)) - 1.), - 5.0 / 3. / np.sqrt(n_samples) + 5.0 * 3. 
/ np.sqrt(n_samples) ) def test_xavier_initializer(self): - T.random.seed(1234) - for dtype, initializer, mode in product( float_dtypes, (tk.init.xavier_normal, tk.init.xavier_uniform), @@ -268,8 +262,6 @@ def test_xavier_initializer(self): ) def test_kaming_initializer(self): - T.random.seed(1234) - for dtype, initializer, mode in product( float_dtypes, (tk.init.kaming_normal, tk.init.kaming_uniform), @@ -330,7 +322,7 @@ def _init(self, layer: T.Module, inputs: List[T.Tensor]) -> None: self.watcher.append((layer, inputs)) -class DataDependentInitializerTestCase(unittest.TestCase): +class DataDependentInitializerTestCase(TestCase): def test_data_dependent_initializer(self): data_init = _MyDataDependentInitializer([]) diff --git a/tests/init/test_std_data_init.py b/tests/init/test_std_data_init.py index 349720e..205e546 100644 --- a/tests/init/test_std_data_init.py +++ b/tests/init/test_std_data_init.py @@ -9,7 +9,7 @@ from tests.ops import * -class StdDataInitTestCase(unittest.TestCase): +class StdDataInitTestCase(TestCase): def test_repr(self): data_init = tk.init.StdDataInit() diff --git a/tests/layers/test_composed.py b/tests/layers/test_composed.py index fb4721b..0636370 100644 --- a/tests/layers/test_composed.py +++ b/tests/layers/test_composed.py @@ -137,7 +137,7 @@ def check_composed_layer(ctx, input, layer_cls, linear_cls, normalizer_cls, ) -class ComposedTestCase(unittest.TestCase): +class ComposedTestCase(TestCase): def test_dense(self): check_composed_layer( diff --git a/tests/layers/test_contextual.py b/tests/layers/test_contextual.py index 7842fc1..74d1bca 100644 --- a/tests/layers/test_contextual.py +++ b/tests/layers/test_contextual.py @@ -5,7 +5,7 @@ from tests.helper import * -class ContextualTestCase(unittest.TestCase): +class ContextualTestCase(TestCase): def test_IgnoreContext(self): x = T.random.randn([2, 3, 4]) diff --git a/tests/layers/test_core.py b/tests/layers/test_core.py index 3237d8f..a6c1eae 100644 --- a/tests/layers/test_core.py +++ b/tests/layers/test_core.py @@ -28,7 +28,7 @@ def forward(self, input: Tensor) -> Tensor: return self.wrapped(input) -class UtilsAndConstantsTestCase(unittest.TestCase): +class UtilsAndConstantsTestCase(TestCase): def test_constants(self): self.assertEqual(tk.layers.DEFAULT_GATE_BIAS, 2.0) @@ -171,7 +171,7 @@ def test_get_bias_store(self): self.assertIsNone(store) -class IdentityTestCase(unittest.TestCase): +class IdentityTestCase(TestCase): def test_identity(self): layer = tk.layers.jit_compile(Identity()) @@ -231,15 +231,15 @@ class _AutoRepr(BaseLayer): b: float -class BaseLayersTestCase(unittest.TestCase): +class BaseLayersTestCase(TestCase): def test_single_variate_layer(self): layer = tk.layers.jit_compile(_MySingleVariateLayer()) x = T.random.randn([2, 3, 4]) np_offset = T.from_numpy(np.array([0., 1., 2., 3.])) - assert_allclose(layer(x), x * 11. + np_offset) + assert_allclose(layer(x), x * 11. + np_offset, rtol=1e-4, atol=1e-6) layer.set_bias(7.) - assert_allclose(layer(x), x * 11. + 7. + np_offset) + assert_allclose(layer(x), x * 11. + 7. 
+ np_offset, rtol=1e-4, atol=1e-6) def test_multi_variate_layer(self): layer = tk.layers.jit_compile(_MyMultiVariateLayer()) @@ -247,16 +247,16 @@ def test_multi_variate_layer(self): y = T.random.randn([2, 3, 4]) z = T.random.randn([2, 3, 4]) a, b = layer([x, y, z]) - assert_allclose(a, x + y) - assert_allclose(b, y + z) + assert_allclose(a, x + y, rtol=1e-4, atol=1e-6) + assert_allclose(b, y + z, rtol=1e-4, atol=1e-6) def test_split_layer(self): layer = tk.layers.jit_compile(_MySplitLayer()) x = T.random.randn([2, 3, 4]) a, b, c = layer(x) - assert_allclose(a, x) - assert_allclose(b, x + 1) - assert_allclose(c, x + 2) + assert_allclose(a, x, rtol=1e-4, atol=1e-6) + assert_allclose(b, x + 1, rtol=1e-4, atol=1e-6) + assert_allclose(c, x + 2, rtol=1e-4, atol=1e-6) def test_merge_layer(self): layer = tk.layers.jit_compile(_MyMergeLayer()) @@ -264,7 +264,7 @@ def test_merge_layer(self): y = T.random.randn([2, 3, 4]) z = T.random.randn([2, 3, 4]) out = layer([x, y, z]) - assert_allclose(out, x + y + z) + assert_allclose(out, x + y + z, rtol=1e-4, atol=1e-6) def test_auto_repr(self): layer = _AutoRepr() @@ -279,7 +279,7 @@ def test_auto_repr(self): self.assertNotIn('weight=', repr(layer)) -class SequentialTestCase(unittest.TestCase): +class SequentialTestCase(TestCase): def test_sequential(self): x = T.random.randn([4, 5]) @@ -297,8 +297,6 @@ def test_sequential(self): def check_core_linear(ctx, input, layer_factory, layer_name, numpy_fn): - T.random.seed(1234) - # test with bias layer = layer_factory(use_bias=True) ctx.assertIn(layer_name, repr(layer)) @@ -362,11 +360,9 @@ class _MyDataDependentInitializer(init.DataDependentInitializer): _ = layer_factory(data_init=lambda: 'hello') -class CoreLinearTestCase(unittest.TestCase): +class CoreLinearTestCase(TestCase): def test_linear(self): - np.random.seed(1234) - layer = Linear(5, 3) self.assertEqual( repr(layer), @@ -390,8 +386,6 @@ def test_linear(self): @slow_test def test_conv_nd(self): - np.random.seed(1234) - def do_check(spatial_ndims, kernel_size, stride, dilation, padding): cls_name = f'LinearConv{spatial_ndims}d' @@ -430,8 +424,6 @@ def do_check(spatial_ndims, kernel_size, stride, @slow_test def test_conv_transpose_nd(self): - np.random.seed(1234) - def is_valid_output_padding(spatial_ndims, output_padding, stride, dilation): if not hasattr(output_padding, '__iter__'): output_padding = [output_padding] * spatial_ndims @@ -492,11 +484,9 @@ def do_check(spatial_ndims, kernel_size, stride, do_check(3, (3, 2, 1), (3, 2, 1), (3, 2, 1), PaddingMode.HALF, 0) -class BatchNormTestCase(unittest.TestCase): +class BatchNormTestCase(TestCase): def test_batch_norm(self): - T.random.seed(1234) - eps = T.EPSILON for spatial_ndims in (0, 1, 2, 3): cls = getattr(tk.layers, ('BatchNorm' if not spatial_ndims @@ -539,12 +529,10 @@ def test_batch_norm(self): ) -class DropoutTestCase(unittest.TestCase): +class DropoutTestCase(TestCase): def test_dropout(self): n_samples = 10000 - T.random.seed(1234) - for spatial_ndims in (0, 1, 2, 3): cls = getattr(tk.layers, ('Dropout' if not spatial_ndims else f'Dropout{spatial_ndims}d')) diff --git a/tests/layers/test_flow_layer.py b/tests/layers/test_flow_layer.py index 4d3b06f..8a1707d 100644 --- a/tests/layers/test_flow_layer.py +++ b/tests/layers/test_flow_layer.py @@ -27,7 +27,7 @@ def _transform(self, return output, input_log_det -class FlowLayerTestCase(unittest.TestCase): +class FlowLayerTestCase(TestCase): def test_FlowLayer(self): flow = tk.layers.jit_compile(_MyFlow( @@ -41,7 +41,7 @@ def test_FlowLayer(self): _ 
= tk.layers.FlowLayer(object()) -class ActNormLayerTestCase(unittest.TestCase): +class ActNormLayerTestCase(TestCase): def test_ActNorm(self): layer = tk.layers.ActNorm(5) diff --git a/tests/layers/test_gated.py b/tests/layers/test_gated.py index 1f91d24..814a656 100644 --- a/tests/layers/test_gated.py +++ b/tests/layers/test_gated.py @@ -7,7 +7,7 @@ from tests.helper import * -class GatedTestCase(unittest.TestCase): +class GatedTestCase(TestCase): def test_Gated(self): gated = tk.layers.Gated(feature_axis=-2, num_features=3, diff --git a/tests/layers/test_pixelcnn.py b/tests/layers/test_pixelcnn.py index ac2cc03..2d009e0 100644 --- a/tests/layers/test_pixelcnn.py +++ b/tests/layers/test_pixelcnn.py @@ -114,7 +114,7 @@ def forward(self, input: Tensor, context: List[Tensor]) -> Tensor: raise ValueError('Expected context to have 0 or 1 element.') -class PixelCNNTestCase(unittest.TestCase): +class PixelCNNTestCase(TestCase): def test_causality_and_receptive_field(self): for size in [[12], [12, 11], [12, 11, 10]]: @@ -242,7 +242,6 @@ def test_causality_and_receptive_field(self): ) def test_pixelcnn_network(self): - T.random.seed(1234) in_channels = 3 out_channels = 5 diff --git a/tests/layers/test_pool.py b/tests/layers/test_pool.py index aa8e499..cbde3d5 100644 --- a/tests/layers/test_pool.py +++ b/tests/layers/test_pool.py @@ -10,11 +10,9 @@ from tests.ops import * -class PoolTestCase(unittest.TestCase): +class PoolTestCase(TestCase): def test_AvgPool_and_MaxPool(self): - T.random.seed(1234) - def is_valid_padding(padding, kernel_size): for p, k in zip(padding, kernel_size): if isinstance(p, int): diff --git a/tests/layers/test_resnet.py b/tests/layers/test_resnet.py index e579478..2803389 100644 --- a/tests/layers/test_resnet.py +++ b/tests/layers/test_resnet.py @@ -320,10 +320,9 @@ def check_resblock(ctx, ctx.assertIsInstance(layer.conv1.weight_store, tk.layers.NormedAndScaledWeightStore) -class ResBlockTestCase(unittest.TestCase): +class ResBlockTestCase(TestCase): def test_resblock(self): - T.random.seed(1234) for spatial_ndims in (1, 2, 3): resblock_cls = getattr(tk.layers, f'ResBlock{spatial_ndims}d') check_resblock( @@ -340,7 +339,6 @@ def test_resblock(self): output_padding=1) def test_resblock_transpose(self): - T.random.seed(1234) for spatial_ndims, output_padding in product((1, 2, 3), (0, 1)): check_resblock( ctx=self, diff --git a/tests/layers/test_shape_.py b/tests/layers/test_shape_.py index 6f0bbbf..6b7fca2 100644 --- a/tests/layers/test_shape_.py +++ b/tests/layers/test_shape_.py @@ -8,7 +8,7 @@ from tests.ops import make_conv_shape -class FlattenToNDimsTestCase(unittest.TestCase): +class FlattenToNDimsTestCase(TestCase): def test_FlattenToNDims(self): x = T.random.randn(make_conv_shape([3, 4], 6, [5])) @@ -23,7 +23,7 @@ def test_FlattenToNDims(self): _ = layer(T.random.randn([1, 1])) -class ConstantPadTestCase(unittest.TestCase): +class ConstantPadTestCase(TestCase): def test_ConstantPad(self): for value_arg in [{}, {'value': 123.0}]: @@ -106,7 +106,7 @@ def fn(v): _ = layer_factory(0, 1, 2, 3) -class ChannelSwapTestCase(unittest.TestCase): +class ChannelSwapTestCase(TestCase): def test_channel_last_to_first(self): for spatial_ndims in (1, 2, 3): diff --git a/tests/layers/test_split_.py b/tests/layers/test_split_.py index 141cfb3..ee34e0b 100644 --- a/tests/layers/test_split_.py +++ b/tests/layers/test_split_.py @@ -5,7 +5,7 @@ from tests.helper import * -class BranchTestCase(unittest.TestCase): +class BranchTestCase(TestCase): def test_branch(self): shared = 
tk.layers.Linear(5, 5) diff --git a/tests/layers/test_utils.py b/tests/layers/test_utils.py index 9c11feb..07ac35a 100644 --- a/tests/layers/test_utils.py +++ b/tests/layers/test_utils.py @@ -10,7 +10,7 @@ from tests.ops import * -class UtilsTestCase(unittest.TestCase): +class UtilsTestCase(TestCase): def test_flatten_nested_layers(self): layers = [tk.layers.Linear(5, 5) for _ in range(5)] diff --git a/tests/tensor/test_core.py b/tests/tensor/test_core.py index 9fb536f..c621c97 100644 --- a/tests/tensor/test_core.py +++ b/tests/tensor/test_core.py @@ -15,7 +15,7 @@ from tests.ops import * -class TensorCoreTestCase(unittest.TestCase): +class TensorCoreTestCase(TestCase): def test_backend_info(self): self.assertEqual(T.backend_name, settings.backend) @@ -29,10 +29,10 @@ def test_jit_compile(self): else: self.assertFalse(tk.layers.is_jit_layer(layer2)) - # not supported object - with pytest.raises(TypeError, - match='Not supported by `jit_compile`'): - _ = tk.layers.jit_compile(object()) + def test_device(self): + # ensure we're using GPU if GPU is available + if T.gpu_device_list(): + self.assertEqual(T.current_device(), T.gpu_device_list()[0]) def test_utilities(self): self.assertEqual(T.int_range(0, 10), list(range(10))) @@ -88,8 +88,6 @@ def test_dtypes(self): assert_equal(t3, x) def test_tensor_constructors(self): - np.random.seed(1234) - # # as_tensor # for x in [1., 1, [1., 2., 3.], np.array([1., 2., 3.])]: # t = T.as_tensor(x) @@ -926,8 +924,6 @@ def test_index_select_and_others(self): _ = T.shift_axis(x_t, axis, 0) def test_math_univariate_op(self): - np.random.seed(1234) - x = np.random.randn(2, 3) u = np.random.rand(2, 3) x_t = T.as_tensor(x) @@ -956,7 +952,6 @@ def test_math_univariate_op(self): assert_allclose(T.erfinv(u_t), erfinv(u)) def test_math_bivariate_op(self): - np.random.seed(1234) x = np.random.randn(2, 3) y = np.random.randn(3) t1 = T.as_tensor(x) @@ -1049,7 +1044,6 @@ def log_f_exp(f, x, axis=None, keepdims=False): log_mean_exp = partial(log_f_exp, np.mean) # prepare for the data - np.random.seed(1234) x = np.random.randn(2, 3, 4) t = T.as_tensor(x) @@ -1265,7 +1259,6 @@ def read_bool(t): self.assertEqual(T.get_dtype(t), T.boolean) return T.to_numpy(t) - np.random.seed(1234) x = np.random.randn(2, 3, 4) y = np.random.randn(1, 3, 4) x = np.concatenate([y, x], axis=0) @@ -1332,8 +1325,6 @@ def test_sort(self): ) def test_matrix_ops(self): - np.random.seed(1234) - for k in [1, 5]: x = np.random.randn(4, k) y = np.random.randn(k, k) diff --git a/tests/tensor/test_linalg.py b/tests/tensor/test_linalg.py index 3a750e5..b1a16c8 100644 --- a/tests/tensor/test_linalg.py +++ b/tests/tensor/test_linalg.py @@ -3,13 +3,12 @@ import numpy as np from tensorkit import tensor as T -from tests.helper import assert_allclose +from tests.helper import * -class LinalgTestCase(unittest.TestCase): +class LinalgTestCase(TestCase): def test_qr(self): - np.random.seed(1234) for k in [1, 5]: m = np.random.randn(k, k) q, r = T.linalg.qr(T.as_tensor(m)) @@ -18,7 +17,6 @@ def test_qr(self): assert_allclose(r, expected_r) def test_slogdet(self): - np.random.seed(1234) for k in [1, 5]: m = np.random.randn(k, k) sign, logdet = T.linalg.slogdet(T.as_tensor(m)) diff --git a/tests/tensor/test_nn.py b/tests/tensor/test_nn.py index 79ae63b..b97b14c 100644 --- a/tests/tensor/test_nn.py +++ b/tests/tensor/test_nn.py @@ -11,14 +11,13 @@ from tests.ops import * -class TensorNNTestCase(unittest.TestCase): +class TensorNNTestCase(TestCase): def test_constants(self): 
self.assertEqual(T.nn.LEAKY_RELU_DEFAULT_SLOPE, 0.01) self.assertFalse(T.nn.AVG_POOL_DEFAULT_COUNT_PADDED_ZEROS) def test_activation_functions(self): - np.random.seed(1234) x = np.random.randn(2, 3, 4) x = np.concatenate([x, np.zeros([2, 3, 1])], axis=-1) self.assertTrue(np.any(x < 0)) @@ -115,8 +114,6 @@ def binary_cross_entropy(logits, labels, reduction, negative): out = -out return out - np.random.seed(1234) - logits = np.random.randn(2, 3, 4) sparse_labels = sigmoid(np.random.randn(3, 4)) labels = (sparse_labels < 0.5).astype(np.int32) @@ -183,8 +180,6 @@ def cross_entropy(logits, labels, reduction, negative): return sparse_cross_entropy( logits, sparse_labels, reduction, negative) - np.random.seed(1234) - logits = np.random.randn(2, 3, 4, 5, 6) sparse_labels = softmax(np.random.randn(3, 4, 5, 6), axis=-1) labels = np.argmax(sparse_labels, axis=-1) @@ -429,7 +424,6 @@ def do_check(pool_type, spatial_ndims, x, kernel_size, stride, padding, f'count_padded_zeros={count_padded_zeros}' ) - np.random.seed(1234) spatial_shape = [12, 13, 14] for spatial_ndims in (1, 2): x = np.random.uniform( diff --git a/tests/tensor/test_random.py b/tests/tensor/test_random.py index d642337..0502c91 100644 --- a/tests/tensor/test_random.py +++ b/tests/tensor/test_random.py @@ -18,7 +18,7 @@ def do_check_log_prob(given, batch_ndims, Z_log_prob_fn, np_log_prob): assert_allclose( Z_log_prob_fn(given, group_ndims=group_ndims), np.sum(np_log_prob, axis=tuple(range(-group_ndims, 0))), - rtol=1e-2 + rtol=1e-2, atol=1e-5, ) with pytest.raises(Exception, match='`group_ndims` is too large'): _ = Z_log_prob_fn(given, group_ndims=batch_ndims + 1) @@ -28,7 +28,7 @@ def normal_cdf(x): return norm.cdf(x) -class TensorRandomTestCase(unittest.TestCase): +class TensorRandomTestCase(TestCase): def test_seed(self): T.random.seed(1234) @@ -41,9 +41,6 @@ def test_seed(self): assert_allclose(x, z) def test_rand(self): - np.random.seed(1234) - T.random.seed(1234) - for dtype in float_dtypes: # test sample dtype and shape t = T.random.rand([n_samples, 2, 3, 4], dtype=dtype) @@ -60,9 +57,6 @@ def test_rand(self): ) def test_uniform(self): - np.random.seed(1234) - T.random.seed(1234) - for low, high in [(-1., 2.), (3.5, 7.5)]: for dtype in float_dtypes: # test sample dtype and shape @@ -85,7 +79,6 @@ def test_uniform(self): _ = T.random.uniform([2, 3, 4], low=2., high=1.) def test_shuffle_and_random_permutation(self): - T.random.seed(1234) x = np.arange(24).reshape([2, 3, 4]) # shuffle @@ -113,9 +106,6 @@ def test_shuffle_and_random_permutation(self): self.assertLess(equal_count, 100) def test_randn(self): - np.random.seed(1234) - T.random.seed(1234) - for dtype in float_dtypes: # test sample dtype and shape t = T.random.randn([n_samples, 2, 3, 4], dtype=dtype) @@ -138,8 +128,6 @@ def test_randn(self): np_log_prob=np.log(np.exp(-x ** 2 / 2.) 
/ np.sqrt(2 * np.pi))) def test_truncated_randn(self): - np.random.seed(1234) - T.random.seed(1234) log_zero = -1e6 def log_prob(given, low, high): @@ -208,9 +196,6 @@ def log_prob(given, low, high): ) def test_normal(self): - np.random.seed(1234) - T.random.seed(1234) - mean = np.random.randn(2, 3, 4) logstd = np.random.randn(3, 4) std = np.exp(logstd) @@ -238,7 +223,7 @@ def log_prob(given): x_mean = np.mean(x, axis=0) np.testing.assert_array_less( np.abs(x_mean - mean), - np.tile(np.expand_dims(3 * std / np.sqrt(n_samples), axis=0), + np.tile(np.expand_dims(5 * std / np.sqrt(n_samples), axis=0), [2, 1, 1]) ) @@ -265,7 +250,7 @@ def log_prob(given): x_mean = np.mean(x, axis=0) np.testing.assert_array_less( np.abs(x_mean - mean), - np.tile(np.expand_dims(3 * std / np.sqrt(n_samples), axis=0), + np.tile(np.expand_dims(5 * std / np.sqrt(n_samples), axis=0), [2, 1, 1]) ) @@ -349,9 +334,6 @@ def log_prob(given): t, mean_t, logstd_t, validate_tensors=True) def test_truncated_normal(self): - np.random.seed(1234) - T.random.seed(1234) - mean = np.random.randn(2, 3, 4) logstd = np.random.randn(3, 4) std = np.exp(logstd) @@ -542,9 +524,6 @@ def log_prob(given): (1 - given) * log_sigmoid(-logits) ) - np.random.seed(1234) - T.random.seed(1234) - logits = np.random.randn(2, 3, 4) probs = sigmoid(logits) @@ -632,9 +611,6 @@ def log_prob(given, probs, n_classes: int, is_one_hot: bool = False): # return np.log(np.prod(element_pow(probs, one-hot-given), axis=-1)) return np.sum(given * np.log(probs), axis=-1) - np.random.seed(1234) - T.random.seed(1234) - n_classes = 5 logits = np.clip(np.random.randn(2, 3, 4, n_classes) / 10., a_min=-0.3, a_max=0.3) @@ -768,7 +744,6 @@ def do_test_sample(is_one_hot: bool, _ = Z_sample_fn(probs=T.as_tensor(probs[0, 0, 0, 0])) def test_discretized_logistic(self): - np.random.seed(1234) next_seed_val = [1234] def next_seed(): @@ -980,9 +955,6 @@ def do_test_sample(bin_size: float, given_t, mean_t, log_scale_t, 1 / 255., max_val=2.) 
def test_random_init(self): - np.random.seed(1234) - T.random.seed(1234) - for dtype in float_dtypes: t = T.variable([n_samples, 2, 3], dtype=dtype) for fn, mean, std in [ diff --git a/tests/tensor/test_utils.py b/tests/tensor/test_utils.py index 5bd668e..f5aca99 100644 --- a/tests/tensor/test_utils.py +++ b/tests/tensor/test_utils.py @@ -6,9 +6,10 @@ import tensorkit as tk from tensorkit import tensor as T from tests.ops import * +from tests.helper import * -class UtilsTestCase(unittest.TestCase): +class UtilsTestCase(TestCase): def test_split_channel_spatial_shape(self): for spatial_ndims in (1, 2, 3): diff --git a/tests/test_arg_check.py b/tests/test_arg_check.py index 0b4115e..f11ef30 100644 --- a/tests/test_arg_check.py +++ b/tests/test_arg_check.py @@ -5,9 +5,10 @@ import tensorkit as tk from tensorkit import tensor as T from tensorkit.arg_check import * +from tests.helper import * -class ArgCheckTestCase(unittest.TestCase): +class ArgCheckTestCase(TestCase): def test_validate_positive_int(self): for v in [1, 2, 3]: diff --git a/tests/test_bayes.py b/tests/test_bayes.py index f3f67c6..5d96d10 100644 --- a/tests/test_bayes.py +++ b/tests/test_bayes.py @@ -10,7 +10,7 @@ from tests.helper import * -class BayesianNetTestCase(unittest.TestCase): +class BayesianNetTestCase(TestCase): def test_construct(self): # no observation diff --git a/tests/test_stochastic.py b/tests/test_stochastic.py index 3b8b6b4..3cc7c43 100644 --- a/tests/test_stochastic.py +++ b/tests/test_stochastic.py @@ -8,7 +8,7 @@ from tests.helper import * -class StochasticTensorTestCase(unittest.TestCase): +class StochasticTensorTestCase(TestCase): def test_basic_interface(self): normal = UnitNormal(shape=[2, 3]) diff --git a/tests/train/test_core.py b/tests/train/test_core.py index 2b1f793..a36f8ab 100644 --- a/tests/train/test_core.py +++ b/tests/train/test_core.py @@ -1,17 +1,16 @@ import os -import unittest from tempfile import TemporaryDirectory import numpy as np import pytest import torch - from mltk import SimpleStatefulObject import tensorkit as tk +from tests.helper import * -class TorchCheckpointTestCase(unittest.TestCase): +class TorchCheckpointTestCase(TestCase): def test_invalid_type(self): with pytest.raises(TypeError, diff --git a/tests/variational/test_chain.py b/tests/variational/test_chain.py index 02abc84..ae691e2 100644 --- a/tests/variational/test_chain.py +++ b/tests/variational/test_chain.py @@ -7,7 +7,7 @@ from tests.helper import * -class VariationalChainTestCase(unittest.TestCase): +class VariationalChainTestCase(TestCase): def prepare_model(self): def p_log_probs(names): diff --git a/tests/variational/test_estimators.py b/tests/variational/test_estimators.py index a3d9909..daf466c 100644 --- a/tests/variational/test_estimators.py +++ b/tests/variational/test_estimators.py @@ -6,11 +6,10 @@ from tensorkit import tensor as T from tensorkit.variational import * -from tests.helper import assert_allclose +from tests.helper import * def prepare_test_payload(reparameterized): - np.random.seed(1234) x = T.as_tensor(np.random.normal(size=[7, 13])) # input y = T.requires_grad(T.as_tensor(np.random.normal(size=[13]))) # param if reparameterized: @@ -23,7 +22,7 @@ def prepare_test_payload(reparameterized): return x, y, z, f, log_f, log_q -class SGVBEstimatorTestCase(unittest.TestCase): +class SGVBEstimatorTestCase(TestCase): def test_sgvb(self): assert_allclose_ = functools.partial(assert_allclose, rtol=1e-5, atol=1e-6) @@ -59,7 +58,7 @@ def test_sgvb(self): ) -class 
IWAEEstimatorTestCase(unittest.TestCase): +class IWAEEstimatorTestCase(TestCase): def test_error(self): x, y, z, f, log_f, log_q = prepare_test_payload(reparameterized=True) diff --git a/tests/variational/test_evaluation.py b/tests/variational/test_evaluation.py index 4520c0e..fc1c129 100644 --- a/tests/variational/test_evaluation.py +++ b/tests/variational/test_evaluation.py @@ -10,7 +10,6 @@ def prepare_test_payload(): - np.random.seed(1234) log_p = T.as_tensor(np.random.normal(size=[13])) log_q = T.as_tensor(np.random.normal(size=[7, 13])) return log_p, log_q @@ -19,7 +18,7 @@ def prepare_test_payload(): assert_allclose_ = partial(assert_allclose, atol=1e-4) -class ImportanceSamplingLogLikelihoodTestCase(unittest.TestCase): +class ImportanceSamplingLogLikelihoodTestCase(TestCase): def test_error(self): log_p, log_q = prepare_test_payload() diff --git a/tests/variational/test_inference.py b/tests/variational/test_inference.py index 53bf5e8..b195d60 100644 --- a/tests/variational/test_inference.py +++ b/tests/variational/test_inference.py @@ -7,7 +7,7 @@ from tests.helper import * -class VariationalInferenceTestCase(unittest.TestCase): +class VariationalInferenceTestCase(TestCase): def test_construction(self): vi = VariationalInference(T.float_scalar(1.), diff --git a/tests/variational/test_objectives.py b/tests/variational/test_objectives.py index 34c62e0..411b45d 100644 --- a/tests/variational/test_objectives.py +++ b/tests/variational/test_objectives.py @@ -9,13 +9,12 @@ def prepare_test_payload(): - np.random.seed(1234) log_p = T.as_tensor(np.random.normal(size=[13])) log_q = T.as_tensor(np.random.normal(size=[7, 13])) return log_p, log_q -class ELBOObjectiveTestCase(unittest.TestCase): +class ELBOObjectiveTestCase(TestCase): def test_elbo(self): log_p, log_q = prepare_test_payload() @@ -36,7 +35,7 @@ def test_elbo(self): ) -class MonteCarloObjectiveTestCase(unittest.TestCase): +class MonteCarloObjectiveTestCase(TestCase): def test_error(self): log_p, log_q = prepare_test_payload() From e84f34aadc6e157d3a624e2035d4cf4d6b0d2e89 Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Tue, 18 Feb 2020 00:51:35 +0800 Subject: [PATCH 4/7] added device to existing tests --- README.md | 2 +- tensorkit/backend/pytorch_/core.py | 40 +- tensorkit/examples/classification/mnist.py | 2 +- .../examples/classification/mnist_resnet.py | 2 +- tensorkit/layers/utils.py | 1 + tests/layers/test_core.py | 54 +++ tests/layers/test_utils.py | 1 + tests/tensor/test_core.py | 399 +++++++++++------- tests/tensor/test_random.py | 38 +- 9 files changed, 356 insertions(+), 183 deletions(-) diff --git a/README.md b/README.md index 70cf60a..ecdcc01 100644 --- a/README.md +++ b/README.md @@ -2,5 +2,5 @@ ### Requirements -* PyTorch >= 1.4.0 +* PyTorch: 1.3.1 diff --git a/tensorkit/backend/pytorch_/core.py b/tensorkit/backend/pytorch_/core.py index f5a535b..f26faca 100644 --- a/tensorkit/backend/pytorch_/core.py +++ b/tensorkit/backend/pytorch_/core.py @@ -27,9 +27,11 @@ # utilities 'int_range', 'identity', + # cast + 'cast', 'cast_like', + # dtypes - 'cast', 'cast_like', 'get_dtype', 'is_floating_point', - 'is_floating_point_dtype', + 'get_dtype', 'is_floating_point', 'is_floating_point_dtype', # tensor constructors 'as_tensor', 'from_numpy', @@ -191,33 +193,39 @@ def int_range(start: int, end: int, step: int = 1) -> List[int]: return ret -# ---- dtypes ---- +# ---- cast dtype and device ---- @jit -def cast(input: Tensor, dtype: str, device: Optional[str] = None) -> Tensor: - if dtype == 'float32': - target_dtype = 
torch.float32 - elif dtype == 'int32': - target_dtype = torch.int32 +def cast(input: Tensor, + dtype: Optional[str] = None, + device: Optional[str] = None) -> Tensor: + if dtype is None: + target_dtype = input.dtype else: - target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] + if dtype == 'float32': + target_dtype = torch.float32 + elif dtype == 'int32': + target_dtype = torch.int32 + else: + target_dtype = {'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, 'int64': torch.int64, 'float16': torch.float16, 'float64': torch.float64, 'bool': torch.bool}[dtype] if target_dtype != input.dtype and device is not None: - input = input.to(dtype=target_dtype, device=device) + output = input.to(dtype=target_dtype, device=device) elif target_dtype != input.dtype: - input = input.to(dtype=target_dtype) + output = input.to(dtype=target_dtype) elif device is not None: - input = input.to(device=device) + output = input.to(device=device) + else: + output = input - return input + return output @jit def cast_like(input: Tensor, like: Tensor) -> Tensor: - if like.dtype != input.dtype: - input = input.to(dtype=like.dtype, device=like.device) - return input + return input.to(dtype=like.dtype, device=like.device) +# ---- dtypes ---- @jit def get_dtype(input: Tensor) -> str: if input.dtype == torch.float32: diff --git a/tensorkit/examples/classification/mnist.py b/tensorkit/examples/classification/mnist.py index 7657f16..9083898 100644 --- a/tensorkit/examples/classification/mnist.py +++ b/tensorkit/examples/classification/mnist.py @@ -43,7 +43,7 @@ def main(exp: mltk.Experiment[Config]): # we have initialized the network, now we can compile the net with JIT engine net = tk.layers.jit_compile(net) - mltk.print_with_time('Network compiled to JIT module') + mltk.print_with_time('Network compiled with JIT') # define the train and evaluate functions def train_step(x, y): diff --git a/tensorkit/examples/classification/mnist_resnet.py b/tensorkit/examples/classification/mnist_resnet.py index bb9a13e..dccbe78 100644 --- a/tensorkit/examples/classification/mnist_resnet.py +++ b/tensorkit/examples/classification/mnist_resnet.py @@ -49,7 +49,7 @@ def main(exp: mltk.Experiment[Config]): # we have initialized the network, now we can compile the net with JIT engine net = tk.layers.jit_compile(net) - mltk.print_with_time('Network compiled to JIT module') + mltk.print_with_time('Network compiled with JIT') # the train, test and validate functions def train_step(x, y): diff --git a/tensorkit/layers/utils.py b/tensorkit/layers/utils.py index d04b837..19b277e 100644 --- a/tensorkit/layers/utils.py +++ b/tensorkit/layers/utils.py @@ -54,6 +54,7 @@ def do_flatten(target, layer_or_layers): 'leakyrelu': LeakyReLU, 'sigmoid': Sigmoid, 'tanh': Tanh, + 'logsoftmax': LogSoftmax, } diff --git a/tests/layers/test_core.py b/tests/layers/test_core.py index a6c1eae..a520d9e 100644 --- a/tests/layers/test_core.py +++ b/tests/layers/test_core.py @@ -28,6 +28,12 @@ def forward(self, input: Tensor) -> Tensor: return self.wrapped(input) +class _MyGetTraining(BaseLayer): + + def forward(self) -> bool: + return self.training + + class UtilsAndConstantsTestCase(TestCase): def test_constants(self): @@ -82,6 +88,44 @@ def test_param_and_buffer(self): self.assertDictEqual(dict(get_named_buffers(seq)), {'wrapped.c': c, 'wrapped.c2': c2}) self.assertDictEqual(dict(get_named_buffers(seq, recursive=False)), {}) + def 
test_layer_to_device(self): + for device in [None, T.CPU_DEVICE]: + layer = ResBlock2d(3, 4, kernel_size=2, device=device) + for param in tk.layers.get_parameters(layer): + self.assertEqual(T.get_device(param), device or T.current_device()) + + for device2 in [None, T.CPU_DEVICE]: + layer2 = tk.layers.layer_to_device(layer, device=device2) + for param in tk.layers.get_parameters(layer2): + self.assertEqual(T.get_device(param), device2 or T.current_device()) + + def test_set_train_mode(self): + layers = [tk.layers.jit_compile(_MyGetTraining()) + for _ in range(3)] + layer = layers[0] + + # set_train_mode + self.assertIs(tk.layers.set_train_mode(layer, True), layer) + self.assertEqual(layer(), True) + self.assertIs(tk.layers.set_train_mode(layer, False), layer) + self.assertEqual(layer(), False) + + # set_eval_mode + tk.layers.set_train_mode(layer, True) + self.assertEqual(layer(), True) + self.assertIs(tk.layers.set_eval_mode(layer), layer) + self.assertEqual(layer(), False) + + # scoped_eval_mode + for l in layers: + tk.layers.set_train_mode(l, True) + self.assertEqual(l(), True) + with tk.layers.scoped_eval_mode(layers[0], layers[1:]): + for l in layers: + self.assertEqual(l(), False) + for l in layers: + self.assertEqual(l(), True) + def test_SimpleParamStore(self): initial_value = np.random.randn(2, 3, 4) store = SimpleParamStore([2, 3, 4], initializer=initial_value) @@ -504,6 +548,9 @@ def test_batch_norm(self): _ = layer(x) set_train_mode(layer, False) y = layer(x) + set_train_mode(layer, True) + set_eval_mode(layer) + y2 = layer(x) # manually compute the expected output if T.backend_name == 'PyTorch': @@ -519,6 +566,7 @@ def test_batch_norm(self): # check output assert_allclose(y, expected, rtol=1e-4, atol=1e-6) + assert_allclose(y2, expected, rtol=1e-4, atol=1e-6) # check invalid dimensions with pytest.raises(Exception, match='only supports .d input'): @@ -571,3 +619,9 @@ def test_dropout(self): y = layer(x) self.assertTrue(np.all(T.to_numpy(y) != 0)) assert_allclose(y, x, rtol=1e-4, atol=1e-6) + + set_train_mode(layer, True) + set_eval_mode(layer) + y = layer(x) + self.assertTrue(np.all(T.to_numpy(y) != 0)) + assert_allclose(y, x, rtol=1e-4, atol=1e-6) diff --git a/tests/layers/test_utils.py b/tests/layers/test_utils.py index 07ac35a..a3102c5 100644 --- a/tests/layers/test_utils.py +++ b/tests/layers/test_utils.py @@ -45,6 +45,7 @@ def test_get_activation_class(self): ('Leaky_ReLU', tk.layers.LeakyReLU, (), {'negative_slope': 0.2}, T.nn.leaky_relu(x, 0.2)), ('Sigmoid', tk.layers.Sigmoid, (), {}, T.nn.sigmoid(x)), ('Tanh', tk.layers.Tanh, (), {}, T.tanh(x)), + ('Log_Softmax', tk.layers.LogSoftmax, (), {}, T.nn.log_softmax(x)), ]: name_candidates = (None,) if origin_name is None else ( origin_name, diff --git a/tests/tensor/test_core.py b/tests/tensor/test_core.py index c621c97..3d17e22 100644 --- a/tests/tensor/test_core.py +++ b/tests/tensor/test_core.py @@ -32,7 +32,30 @@ def test_jit_compile(self): def test_device(self): # ensure we're using GPU if GPU is available if T.gpu_device_list(): - self.assertEqual(T.current_device(), T.gpu_device_list()[0]) + gpu_list = T.gpu_device_list() + self.assertEqual(T.current_device(), gpu_list[0]) + self.assertEqual(T.first_gpu_device(), gpu_list[0]) + else: + self.assertEqual(T.first_gpu_device(), T.CPU_DEVICE) + with pytest.raises(RuntimeError, match='No GPU is available.'): + _ = T.first_gpu_device(fallback_to_cpu=False) + + # test `get_device` + t = T.random.randn([2, 3, 4], dtype=T.float32) + self.assertEqual(T.get_device(t), 
T.current_device()) + + # test `to_device` + if T.current_device() != T.CPU_DEVICE: + t = T.random.randn([2, 3, 4], dtype=T.float32) + t2 = T.to_device(t, T.CPU_DEVICE) + self.assertEqual(T.get_device(t2), T.CPU_DEVICE) + assert_allclose(t, t2) + + # test `use_device` + with T.use_device(T.CPU_DEVICE): + self.assertEqual(T.current_device(), T.CPU_DEVICE) + t = T.random.randn([2, 3, 4], dtype=T.float32) + self.assertEqual(T.get_device(t), T.CPU_DEVICE) def test_utilities(self): self.assertEqual(T.int_range(0, 10), list(range(10))) @@ -72,35 +95,42 @@ def test_dtypes(self): self.assertIsInstance(t, T.Tensor) assert_equal(t, x) - # cast - for dtype in number_dtypes: - t2 = T.cast(t, dtype) + def test_cast(self): + x = np.asarray([1, 2, 3]) + t = T.as_tensor(x) + + # cast dtype + for dtype, device in itertools.product( + number_dtypes, [None, T.CPU_DEVICE]): + t2 = T.cast(t, dtype=dtype, device=device) self.assertIsInstance(t2, T.Tensor) self.assertEqual(T.get_dtype(t2), dtype) + self.assertEqual(T.get_device(t2), device or T.current_device()) assert_equal(t2, x) - # cast_like - for like in (t, t2): - t3 = T.cast_like(t, like) - self.assertIsInstance(t3, T.Tensor) - self.assertEqual(T.get_dtype(t3), T.get_dtype(like)) - self.assertEqual(T.get_device(t3), T.get_device(like)) - assert_equal(t3, x) + # cast_like + for like in (t, t2): + t3 = T.cast_like(t, like) + self.assertIsInstance(t3, T.Tensor) + self.assertEqual(T.get_dtype(t3), T.get_dtype(like)) + self.assertEqual(T.get_device(t3), T.get_device(like)) + assert_equal(t3, x) + + # only device + for device in [None, T.CPU_DEVICE]: + t2 = T.cast(t, device=device) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), device or T.current_device()) + assert_equal(t2, x) - def test_tensor_constructors(self): - # # as_tensor - # for x in [1., 1, [1., 2., 3.], np.array([1., 2., 3.])]: - # t = T.as_tensor(x) - # self.assertIsInstance(t, T.Tensor) - # assert_equal(t, x) - # - # x = T.as_tensor(np.asarray([1, 2, 3], dtype=np.int32)) - # t = T.as_tensor(x) - # self.assertIs(t, x) - # - # with pytest.raises(Exception): - # _ = T.as_tensor(object()) # not a tensor, should raise error + # null cast + t2 = T.cast(t) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t2, x) + def test_tensor_constructors(self): # as_tensor def copy_tensor(o): if isinstance(o, StochasticTensor): @@ -128,135 +158,187 @@ def copy_tensor(o): x_value = copy.copy(x) for should_copy in [True, False]: - for dtype in (None,) + number_dtypes: + for dtype, device in itertools.product( + (None,) + number_dtypes, [None, T.CPU_DEVICE]): xx = copy_tensor(x) self.assertIsInstance(xx, type(x)) - dtype_kwargs = {'dtype': dtype} if dtype is not None else {} + kwargs = {'dtype': dtype} if dtype is not None else {} + if device is not None: + kwargs['device'] = device - t = T.as_tensor(xx, force_copy=should_copy, **dtype_kwargs) + t = T.as_tensor(xx, force_copy=should_copy, **kwargs) self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_device(t), device or T.current_device()) if should_copy: if hasattr(xx, '__setitem__'): xx[0] = 12345. 
assert_equal(t, x_value, - err_msg=f'{x}, {should_copy}, {dtype}') + err_msg=f'{x}, {should_copy}, {dtype}, {device}') with pytest.raises(Exception): _ = T.as_tensor(object()) # not a tensor, should raise error # from numpy: force copied for x in [np.array([1., 2., 3.])]: - for dtype in (None,) + number_dtypes: + for dtype, device in itertools.product( + (None,) + number_dtypes, [None, T.CPU_DEVICE]): xx = copy.copy(x) self.assertIsInstance(xx, type(x)) - dtype_kwargs = {'dtype': dtype} if dtype is not None else {} - t = T.from_numpy(xx, **dtype_kwargs) + kwargs = {'dtype': dtype} if dtype is not None else {} + if device is not None: + kwargs['device'] = device + t = T.from_numpy(xx, **kwargs) self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_device(t), device or T.current_device()) xx[0] = 12345. - assert_equal(t, x, err_msg=f'{x}, {dtype}') + assert_equal(t, x, err_msg=f'{x}, {dtype}, {device}') with pytest.raises(Exception): _ = T.from_numpy(object()) # not a tensor, should raise error # float_scalar - for value in (1.25, 125): - for dtype in (T.float16, T.float32, T.float64): - t = T.float_scalar(value, dtype=dtype) - self.assertEqual(T.get_dtype(t), dtype) - assert_equal(t, value) - self.assertEqual(T.get_dtype(T.float_scalar(1.25)), T.float_x()) + for value, dtype, device in itertools.product( + (1.25, 125), + (T.float16, T.float32, T.float64), + (None, T.CPU_DEVICE)): + t = T.float_scalar(value, dtype=dtype, device=device) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + assert_equal(t, value) + + # float_scalar_like + t2 = T.float_scalar_like(value, t) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), T.get_device(t)) + + t = T.float_scalar(1.25) + assert_equal(t, 1.25) + self.assertEqual(T.get_dtype(t), T.float_x()) + self.assertEqual(T.get_device(t), T.current_device()) # int_scalar - for value in (2, 125): - for dtype in (T.int8, T.int16, T.int32, T.int64): - t = T.int_scalar(value, dtype=dtype) - self.assertEqual(T.get_dtype(t), dtype) - assert_equal(t, value) - self.assertEqual(T.get_dtype(T.int_scalar(125)), T.int32) + for value, dtype, device in itertools.product( + (2, 125), + (T.int8, T.int16, T.int32, T.int64), + (None, T.CPU_DEVICE)): + t = T.int_scalar(value, dtype=dtype) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + assert_equal(t, value) - # zeros - for shape in ([1, 2, 3], []): - for dtype in number_dtypes: - t = T.zeros(shape, dtype=dtype) - self.assertIsInstance(t, T.Tensor) - self.assertEqual(T.get_dtype(t), dtype) - assert_equal(t, np.zeros(shape)) + # int_scalar_like + t2 = T.float_scalar_like(value, t) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), T.get_device(t)) - # zeros_like - t2 = T.zeros_like(t) - self.assertIsInstance(t2, T.Tensor) - self.assertEqual(T.get_dtype(t2), dtype) - assert_equal(t, np.zeros(shape)) + t = T.int_scalar(125) + self.assertEqual(T.get_dtype(t), T.int32) + self.assertEqual(T.get_device(t), T.current_device()) - for dtype2 in (None,) + number_dtypes: - for shape2 in (None, [7, 8]): - t2 = T.zeros_like(t, dtype=dtype2, shape=shape2) - self.assertIsInstance(t2, T.Tensor) - self.assertEqual(T.get_dtype(t2), dtype2 or dtype) - assert_equal(t2, np.zeros(shape2 or shape)) + # zeros + for shape, dtype, device in itertools.product( + ([1, 2, 3], []), + number_dtypes, + (None, T.CPU_DEVICE)): + t = T.zeros(shape, dtype=dtype, 
device=device) + self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + assert_equal(t, np.zeros(shape)) - # ones - for shape in ([1, 2, 3], []): - for dtype in number_dtypes: - t = T.ones(shape, dtype=dtype) - self.assertIsInstance(t, T.Tensor) - self.assertEqual(T.get_dtype(t), dtype) - assert_equal(t, np.ones(shape)) + # zeros_like + t2 = T.zeros_like(t) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t, np.zeros(shape)) + + for dtype2 in (None,) + number_dtypes: + for shape2 in (None, [7, 8]): + t2 = T.zeros_like(t, dtype=dtype2, shape=shape2) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), dtype2 or dtype) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t2, np.zeros(shape2 or shape)) - # ones_like - t2 = T.ones_like(t) - self.assertIsInstance(t2, T.Tensor) - self.assertEqual(T.get_dtype(t2), dtype) - assert_equal(t, np.ones(shape)) + # ones + for shape, dtype, device in itertools.product( + ([1, 2, 3], []), + number_dtypes, + (None, T.CPU_DEVICE)): + t = T.ones(shape, dtype=dtype) + self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + assert_equal(t, np.ones(shape)) - for dtype2 in (None,) + number_dtypes: - for shape2 in (None, [7, 8]): - t2 = T.ones_like(t, dtype=dtype2, shape=shape2) - self.assertIsInstance(t2, T.Tensor) - self.assertEqual(T.get_dtype(t2), dtype2 or dtype) - assert_equal(t2, np.ones(shape2 or shape)) + # ones_like + t2 = T.ones_like(t) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t, np.ones(shape)) + + for dtype2 in (None,) + number_dtypes: + for shape2 in (None, [7, 8]): + t2 = T.ones_like(t, dtype=dtype2, shape=shape2) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), dtype2 or dtype) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t2, np.ones(shape2 or shape)) # full fill_value = 123 - for shape in ([1, 2, 3], []): - for dtype in number_dtypes: - t = T.full(shape, fill_value, dtype=dtype) - self.assertIsInstance(t, T.Tensor) - self.assertEqual(T.get_dtype(t), dtype) - assert_equal(t, np.full(shape, fill_value)) - - # zeros_like - t2 = T.full_like(t, fill_value) - self.assertIsInstance(t2, T.Tensor) - self.assertEqual(T.get_dtype(t2), dtype) - assert_equal(t, np.full(shape, fill_value)) - - for dtype2 in (None,) + number_dtypes: - for shape2 in (None, [7, 8]): - t2 = T.full_like(t, fill_value, dtype=dtype2, - shape=shape2) - self.assertIsInstance(t2, T.Tensor) - self.assertEqual(T.get_dtype(t2), dtype2 or dtype) - assert_equal(t2, np.full(shape2 or shape, fill_value)) + for shape, dtype, device in itertools.product( + ([1, 2, 3], []), + number_dtypes, + (None, T.CPU_DEVICE)): + t = T.full(shape, fill_value, dtype=dtype, device=device) + self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + assert_equal(t, np.full(shape, fill_value)) + + # full_like + t2 = T.full_like(t, fill_value) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), T.get_dtype(t)) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t, np.full(shape, fill_value)) + + for dtype2 
in (None,) + number_dtypes: + for shape2 in (None, [7, 8]): + t2 = T.full_like(t, fill_value, dtype=dtype2, + shape=shape2) + self.assertIsInstance(t2, T.Tensor) + self.assertEqual(T.get_dtype(t2), dtype2 or dtype) + self.assertEqual(T.get_device(t2), T.get_device(t)) + assert_equal(t2, np.full(shape2 or shape, fill_value)) # arange - for start, end in [(1, 10), (0, 10)]: - t = T.arange(start, end) - self.assertIsInstance(t, T.Tensor) - self.assertEqual(T.get_dtype(t), T.int32) - assert_equal(t, np.arange(start, end)) + for device in [None, T.current_device()]: + expected_device = device or T.current_device() - for start, end, step in [(0, 10, 2), (-2, -15, -3)]: - t = T.arange(start, end, step) - self.assertIsInstance(t, T.Tensor) - self.assertEqual(T.get_dtype(t), T.int32) - assert_equal(t, np.arange(start, end, step)) + for start, end in [(1, 10), (0, 10)]: + t = T.arange(start, end, device=device) + self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_dtype(t), T.int32) + self.assertEqual(T.get_device(t), expected_device) + assert_equal(t, np.arange(start, end)) - for dtype in number_dtypes: - t = T.arange(0, 10, dtype=dtype) - self.assertIsInstance(t, T.Tensor) - self.assertEqual(T.get_dtype(t), dtype) - assert_equal(t, np.arange(10)) + for start, end, step in [(0, 10, 2), (-2, -15, -3)]: + t = T.arange(start, end, step, device=device) + self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_dtype(t), T.int32) + self.assertEqual(T.get_device(t), expected_device) + assert_equal(t, np.arange(start, end, step)) + + for dtype in number_dtypes: + t = T.arange(0, 10, dtype=dtype, device=device) + self.assertIsInstance(t, T.Tensor) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), expected_device) + assert_equal(t, np.arange(10)) # one_hot for n_classes in [1, 5]: @@ -265,6 +347,7 @@ def copy_tensor(o): x = np.random.randint(0, n_classes, size=shape) t = T.one_hot(T.as_tensor(x), n_classes) + self.assertEqual(T.get_device(t), T.current_device()) assert_equal(t, I[x]) for dtype in number_dtypes: @@ -312,51 +395,61 @@ def is_requires_grad(t): except Exception: return False - for dtype in number_dtypes: - t = T.variable([3], dtype=dtype, requires_grad=False) - self.assertIsInstance(t, T.Variable) - self.assertEqual(T.get_dtype(t), dtype) - self.assertEqual(T.shape(t), [3]) + for device in [None, T.CPU_DEVICE]: + for dtype in number_dtypes: + t = T.variable([3], dtype=dtype, device=device, requires_grad=False) + self.assertIsInstance(t, T.Variable) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + self.assertEqual(T.shape(t), [3]) - t = T.variable([2, 3], dtype=t.dtype, requires_grad=False) - self.assertIsInstance(t, T.Variable) - self.assertEqual(T.get_dtype(t), dtype) - self.assertEqual(T.shape(t), [2, 3]) + t = T.variable([2, 3], dtype=t.dtype, device=device, requires_grad=False) + self.assertIsInstance(t, T.Variable) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + self.assertEqual(T.shape(t), [2, 3]) - for dtype in float_dtypes: - # scalar initializer - for v in (123, 123., np.array(123.)): + for dtype in float_dtypes: + # scalar initializer + for v in (123, 123., np.array(123.)): + for requires_grad in (True, False): + t = T.variable( + [3], dtype=dtype, device=device, + requires_grad=requires_grad, initializer=v) + self.assertIsInstance(t, T.Variable) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), 
device or T.current_device()) + self.assertEqual(is_requires_grad(t), requires_grad) + assert_equal(t, np.array([v] * 3)) + + # array initializer for requires_grad in (True, False): - t = T.variable([3], dtype=dtype, requires_grad=requires_grad, - initializer=v) + t = T.variable( + [3], dtype=dtype, device=device, + requires_grad=requires_grad, initializer=np.array([1., 2., 3.])) self.assertIsInstance(t, T.Variable) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) self.assertEqual(is_requires_grad(t), requires_grad) - assert_equal(t, np.array([v] * 3)) + assert_equal(t, [1., 2., 3.]) - # array initializer - for requires_grad in (True, False): - t = T.variable([3], dtype=dtype, requires_grad=requires_grad, - initializer=np.array([1., 2., 3.])) - self.assertIsInstance(t, T.Variable) - self.assertEqual(T.get_dtype(t), dtype) - self.assertEqual(is_requires_grad(t), requires_grad) - assert_equal(t, [1., 2., 3.]) - - with pytest.raises(ValueError, - match=r'`initializer.shape` != `shape`: ' - r'\[3\] vs \[4\]'): - _ = T.variable([4], dtype=dtype, requires_grad=False, - initializer=np.array([1., 2., 3.])) - - # callable initializer - for requires_grad in (True, False): - t = T.variable([3], dtype=dtype, requires_grad=requires_grad, - initializer=partial(T.fill, fill_value=123.)) - self.assertIsInstance(t, T.Variable) - self.assertEqual(T.get_dtype(t), dtype) - self.assertEqual(is_requires_grad(t), requires_grad) - assert_equal(t, [123.] * 3) + with pytest.raises(ValueError, + match=r'`initializer.shape` != `shape`: ' + r'\[3\] vs \[4\]'): + _ = T.variable( + [4], dtype=dtype, device=device, requires_grad=False, + initializer=np.array([1., 2., 3.])) + + # callable initializer + for requires_grad in (True, False): + t = T.variable( + [3], dtype=dtype, device=device, requires_grad=requires_grad, + initializer=partial(T.fill, fill_value=123.)) + self.assertIsInstance(t, T.Variable) + self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) + self.assertEqual(is_requires_grad(t), requires_grad) + assert_equal(t, [123.] 
* 3) # unsupported initializer with pytest.raises(TypeError, match='Unsupported initializer'): diff --git a/tests/tensor/test_random.py b/tests/tensor/test_random.py index 0502c91..8ddd8f6 100644 --- a/tests/tensor/test_random.py +++ b/tests/tensor/test_random.py @@ -15,8 +15,10 @@ def do_check_log_prob(given, batch_ndims, Z_log_prob_fn, np_log_prob): # test log_prob for group_ndims in range(0, batch_ndims + 1): + out = Z_log_prob_fn(given, group_ndims=group_ndims) + assert(T.get_device(out) == T.get_device(given)) assert_allclose( - Z_log_prob_fn(given, group_ndims=group_ndims), + out, np.sum(np_log_prob, axis=tuple(range(-group_ndims, 0))), rtol=1e-2, atol=1e-5, ) @@ -41,10 +43,11 @@ def test_seed(self): assert_allclose(x, z) def test_rand(self): - for dtype in float_dtypes: + for dtype, device in product(float_dtypes, [None, T.CPU_DEVICE]): # test sample dtype and shape - t = T.random.rand([n_samples, 2, 3, 4], dtype=dtype) + t = T.random.rand([n_samples, 2, 3, 4], dtype=dtype, device=device) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) # test sample mean @@ -58,11 +61,14 @@ def test_rand(self): def test_uniform(self): for low, high in [(-1., 2.), (3.5, 7.5)]: - for dtype in float_dtypes: + for dtype, device in product(float_dtypes, [None, T.CPU_DEVICE]): # test sample dtype and shape - t = T.random.uniform([n_samples, 2, 3, 4], low=low, high=high, - dtype=dtype) + t = T.random.uniform( + [n_samples, 2, 3, 4], low=low, high=high, dtype=dtype, + device=device + ) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) # test sample mean @@ -70,7 +76,7 @@ def test_uniform(self): x_mean = np.mean(x, axis=0) np.testing.assert_array_less( np.abs(x_mean - 0.5 * (high + low)), - (3. * np.sqrt((high - low) ** 2 / 12) / np.sqrt(n_samples) * + (5. 
* np.sqrt((high - low) ** 2 / 12) / np.sqrt(n_samples) * np.ones_like(x_mean)) ) @@ -92,13 +98,14 @@ def test_shuffle_and_random_permutation(self): self.assertLess(equal_count, 100) # random_permutation - for dtype in int_dtypes: + for dtype, device in product(int_dtypes, [None, T.CPU_DEVICE]): for n in [0, 1, 5]: x = np.arange(n) equal_count = 0 for k in range(100): - t = T.random.random_permutation(n, dtype=dtype) + t = T.random.random_permutation(n, dtype=dtype, device=device) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) if np.all(np.equal(T.to_numpy(t), x)): equal_count += 1 assert_equal(np.sort(T.to_numpy(t)), x) @@ -106,10 +113,11 @@ def test_shuffle_and_random_permutation(self): self.assertLess(equal_count, 100) def test_randn(self): - for dtype in float_dtypes: + for dtype, device in product(float_dtypes, [None, T.CPU_DEVICE]): # test sample dtype and shape - t = T.random.randn([n_samples, 2, 3, 4], dtype=dtype) + t = T.random.randn([n_samples, 2, 3, 4], dtype=dtype, device=device) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), device or T.current_device()) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) # test sample mean @@ -216,6 +224,7 @@ def log_prob(given): logstd_t = T.cast(T.expand(T.as_tensor(logstd), [n_samples, 1, 3, 4]), dtype) t = T.random.normal(mean_t, std_t) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) # test sample mean @@ -243,6 +252,7 @@ def log_prob(given): logstd_t = T.as_tensor(logstd, dtype) t = T.random.normal(mean_t, std_t, n_samples=n_samples) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) # test sample mean @@ -269,6 +279,7 @@ def log_prob(given): logstd_t = T.as_tensor(logstd, dtype) t = T.random.normal(mean_t, std_t) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) # test log_prob x = T.to_numpy(t) @@ -371,6 +382,7 @@ def do_test(low, high, dtype): t = T.random.truncated_normal( mean_t, std_t, n_samples=n_samples, low=low, high=high) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) self.assertEqual(T.shape(t), [n_samples, 2, 3, 4]) # test sample value range @@ -408,6 +420,7 @@ def do_test(low, high, dtype): logstd_t = T.as_tensor(logstd, dtype) t = T.random.truncated_normal(mean_t, std_t, low=low, high=high) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) # test sample value range x = T.to_numpy(t) @@ -552,6 +565,7 @@ def do_test_sample(n_z, sample_shape, float_dtype, dtype): t = T.random.bernoulli( probs=probs_t, n_samples=n_z, dtype=dtype) self.assertEqual(T.get_dtype(t), dtype) + self.assertEqual(T.get_device(t), T.current_device()) self.assertEqual(T.shape(t), sample_shape + [2, 3, 4]) # all values must be either 0 or 1 @@ -657,6 +671,7 @@ def do_test_sample(is_one_hot: bool, t = (T.random.one_hot_categorical if is_one_hot else T.random.categorical)(probs_t, n_samples=n_z, **kwargs) self.assertEqual(T.get_dtype(t), expected_dtype) + self.assertEqual(T.get_device(t), T.current_device()) self.assertEqual(T.shape(t), sample_shape + [2, 3, 4] + value_shape) # check values @@ -866,6 +881,7 @@ def do_test_sample(bin_size: float, validate_tensors=validate_tensors, ) self.assertEqual(T.get_dtype(t), dtype) + 
self.assertEqual(T.get_device(t), T.current_device()) self.assertEqual(T.shape(t), sample_shape + value_shape) # check values From a5908571ad16b8eb44d47ea6a43a0b8e4baa73a5 Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Tue, 18 Feb 2020 02:16:02 +0800 Subject: [PATCH 5/7] added some tests --- tensorkit/backend/pytorch_/core.py | 5 ++- tensorkit/utils/data_utils.py | 2 -- tensorkit/utils/tensor_stream.py | 29 ++++++++++++++--- tests/tensor/test_core.py | 2 +- tests/utils/__init__.py | 0 tests/utils/test_data_utils.py | 50 +++++++++++++++++++++++++++++ tests/utils/test_tensor_stream.py | 51 ++++++++++++++++++++++++++++++ 7 files changed, 129 insertions(+), 10 deletions(-) create mode 100644 tests/utils/__init__.py create mode 100644 tests/utils/test_data_utils.py create mode 100644 tests/utils/test_tensor_stream.py diff --git a/tensorkit/backend/pytorch_/core.py b/tensorkit/backend/pytorch_/core.py index f26faca..93a28cb 100644 --- a/tensorkit/backend/pytorch_/core.py +++ b/tensorkit/backend/pytorch_/core.py @@ -159,9 +159,8 @@ def use_device(device: str): else: old_device = _current_device[0] try: - with torch.cuda.device(device): - _current_device[0] = device - yield + _current_device[0] = device + yield finally: _current_device[0] = old_device diff --git a/tensorkit/utils/data_utils.py b/tensorkit/utils/data_utils.py index a6a0168..fe18ae4 100644 --- a/tensorkit/utils/data_utils.py +++ b/tensorkit/utils/data_utils.py @@ -1,5 +1,3 @@ -import mltk - import numpy as np from tensorkit import tensor as T diff --git a/tensorkit/utils/tensor_stream.py b/tensorkit/utils/tensor_stream.py index 57e4584..0d1a6c1 100644 --- a/tensorkit/utils/tensor_stream.py +++ b/tensorkit/utils/tensor_stream.py @@ -12,11 +12,22 @@ class TensorStream(mltk.DataStream): + """ + A subclass of :class:`mltk.DataStream` that transforms the underlying + NumPy array data stream into tensor data stream. + """ source: mltk.DataStream device: str def __init__(self, source: mltk.DataStream, device: Optional[str] = None): + """ + Construct a new :class:`TensorStream`. + + Args: + source: The source data stream. + device: The device where to place new tensors. + """ device = device or T.current_device() super().__init__( batch_size=source.batch_size, @@ -45,14 +56,24 @@ def _minibatch_iterator(self) -> Generator[ArrayTuple, None, None]: finally: g.close() - def _concat_arrays(self, arrays: Sequence[T.Tensor]) -> T.Tensor: - return T.concat(list(arrays), axis=0) - def as_tensor_stream(source: mltk.DataStream, device: Optional[str] = None, prefetch: Optional[int] = None - ) -> mltk.DataStream: + ) -> Union[TensorStream, mltk.data.ThreadingDataStream]: + """ + Construct a tensor data stream. + + Args: + source: The source NumPy array stream. + device: The device where to place new tensors. + prefetch: Number of batches to prefetch in background. + If specified, will wrap the constructed :class:`TensorStream` + with a :class:`mltk.data.ThreadingDataStream`. + + Returns: + The tensor data stream. 
+ """ stream = TensorStream(source, device=device) if prefetch is not None: stream = stream.threaded(prefetch) diff --git a/tests/tensor/test_core.py b/tests/tensor/test_core.py index 3d17e22..fb2eb09 100644 --- a/tests/tensor/test_core.py +++ b/tests/tensor/test_core.py @@ -221,7 +221,7 @@ def copy_tensor(o): (2, 125), (T.int8, T.int16, T.int32, T.int64), (None, T.CPU_DEVICE)): - t = T.int_scalar(value, dtype=dtype) + t = T.int_scalar(value, dtype=dtype, device=device) self.assertEqual(T.get_dtype(t), dtype) self.assertEqual(T.get_device(t), device or T.current_device()) assert_equal(t, value) diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/utils/test_data_utils.py b/tests/utils/test_data_utils.py new file mode 100644 index 0000000..db0b95e --- /dev/null +++ b/tests/utils/test_data_utils.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest + +import tensorkit as tk +from tensorkit import tensor as T +from tests.helper import * +from tests.ops import * + + +class DataUtilsTestCase(TestCase): + + def test_channel_from_last_to_first_nd(self): + for spatial_ndims in (1, 2, 3): + bad_input = np.random.randn(*([7, 8, 9, 10][:spatial_ndims + 1])) + last_to_first = getattr( + tk.utils, + f'numpy_channel_from_last_to_first{spatial_ndims}d' + ) + first_to_last = getattr( + tk.utils, + f'numpy_channel_from_first_to_last{spatial_ndims}d' + ) + last_to_default = getattr( + tk.utils, + f'numpy_channel_from_last_to_default{spatial_ndims}d' + ) + default_to_last = getattr( + tk.utils, + f'numpy_channel_from_default_to_last{spatial_ndims}d' + ) + + for op in (first_to_last, last_to_first): + with pytest.raises(ValueError, + match=f'`input` is expected to be at least ' + f'{spatial_ndims + 2}d'): + _ = op(bad_input) + + for batch_shape in ([5], [2, 3]): + x = np.random.randn(*( + batch_shape + [7, 8, 9, 10][:spatial_ndims + 1])) # assume x is channel last + y = last_to_first(x) + assert_allclose(y, channel_to_first_nd(x, spatial_ndims)) + assert_allclose(first_to_last(y), channel_to_last_nd(y, spatial_ndims)) + + if T.IS_CHANNEL_LAST: + assert_allclose(last_to_default(x), x) + assert_allclose(default_to_last(x), x) + else: + assert_allclose(last_to_default(x), y) + assert_allclose(default_to_last(y), x) diff --git a/tests/utils/test_tensor_stream.py b/tests/utils/test_tensor_stream.py new file mode 100644 index 0000000..d54650c --- /dev/null +++ b/tests/utils/test_tensor_stream.py @@ -0,0 +1,51 @@ +from itertools import product + +import mltk +import numpy as np +import tensorkit as tk +from tensorkit import tensor as T +from tests.helper import * + + +class TensorStreamTestCase(TestCase): + + def test_TensorStream(self): + x = np.random.randn(17, 3, 4) + y = np.random.randn(17, 5) + source = mltk.DataStream.arrays( + [x, y], batch_size=3, random_state=np.random.RandomState()) + + # test tensor stream + for device in [None, T.CPU_DEVICE]: + stream = tk.utils.as_tensor_stream(source, device=device) + self.assertIsInstance(stream, tk.utils.TensorStream) + self.assertEqual(stream.device, device or T.current_device()) + + for attr in ('batch_size', 'array_count', 'data_shapes', + 'data_length', 'random_state'): + self.assertEqual(getattr(stream, attr), getattr(source, attr)) + + out_x, out_y = stream.get_arrays() + assert_allclose(out_x, x, rtol=1e-4, atol=1e-6) + assert_allclose(out_y, y, rtol=1e-4, atol=1e-6) + + for batch_x, batch_y in stream: + self.assertIsInstance(batch_x, T.Tensor) + self.assertEqual(T.get_device(batch_x), 
device or T.current_device()) + self.assertIsInstance(batch_y, T.Tensor) + self.assertEqual(T.get_device(batch_y), device or T.current_device()) + + # test copy + for device2 in [None, T.CPU_DEVICE]: + kwargs = {'device': device2} if device2 is not None else {} + stream2 = stream.copy(**kwargs) + self.assertIs(stream2.source, stream.source) + self.assertEqual(stream2.device, device2 or stream.device) + + # test prefetch + stream = tk.utils.as_tensor_stream(source, prefetch=3) + self.assertIsInstance(stream.source, tk.utils.TensorStream) + + out_x, out_y = stream.get_arrays() + assert_allclose(out_x, x, rtol=1e-4, atol=1e-6) + assert_allclose(out_y, y, rtol=1e-4, atol=1e-6) From a1e54a684430d715810a7776aa2f57812e047c1d Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Tue, 18 Feb 2020 11:40:45 +0800 Subject: [PATCH 6/7] add tests for optimizers and lr_scheduler --- tensorkit/backend/pytorch_/core.py | 2 + tensorkit/backend/pytorch_/optim.py | 27 +++++-- tensorkit/optim/lr_scheduler.py | 10 ++- tests/helper.py | 18 ++++- tests/optim/__init__.py | 0 tests/optim/test_lr_scheduler.py | 65 ++++++++++++++++ tests/optim/test_optimizer.py | 115 ++++++++++++++++++++++++++++ 7 files changed, 223 insertions(+), 14 deletions(-) create mode 100644 tests/optim/__init__.py create mode 100644 tests/optim/test_lr_scheduler.py create mode 100644 tests/optim/test_optimizer.py diff --git a/tensorkit/backend/pytorch_/core.py b/tensorkit/backend/pytorch_/core.py index 93a28cb..2d85510 100644 --- a/tensorkit/backend/pytorch_/core.py +++ b/tensorkit/backend/pytorch_/core.py @@ -586,6 +586,8 @@ def variable(shape: List[int], if list(initializer.shape) != shape: raise ValueError(f'`initializer.shape` != `shape`: ' f'{list(initializer.shape)} vs {shape}') + if isinstance(initializer, Tensor): + initializer = to_numpy(initializer) ret = as_tensor(initializer, dtype=target_dtype, device=device, force_copy=force_copy) if requires_grad: diff --git a/tensorkit/backend/pytorch_/optim.py b/tensorkit/backend/pytorch_/optim.py index 70e3ca9..dedb60a 100644 --- a/tensorkit/backend/pytorch_/optim.py +++ b/tensorkit/backend/pytorch_/optim.py @@ -2,8 +2,6 @@ from typing import * import torch -from torch.optim import (adam, adadelta, adagrad, adamax, - rmsprop, sgd) from torch.optim.optimizer import Optimizer as TorchOptimizer from .core import * @@ -22,7 +20,7 @@ def lr(self) -> float: def set_lr(self, lr: float): raise NotImplementedError() - def add_params(self, params: List[Variable]): + def add_param_group(self, params: Iterator[Variable]): raise NotImplementedError() def clear_grad(self): @@ -56,6 +54,7 @@ def __init__(self, self.torch_optimizer = torch_optimizer self.set_lr(lr) + @property def lr(self) -> float: return self._lr @@ -65,7 +64,7 @@ def set_lr(self, lr: float): group['lr'] = lr self._lr = lr - def add_params(self, params: Sequence[Variable]): + def add_param_group(self, params: Iterator[Variable]): self.torch_optimizer.add_param_group({ 'params': list(params), 'lr': self._lr, @@ -91,6 +90,13 @@ def state_dict(self) -> Dict[str, Any]: def load_state_dict(self, state_dict: Dict[str, Any]): self.torch_optimizer.load_state_dict(state_dict) + # ensure that we've got all state on the same device as the parameters. 
+        device = self.torch_optimizer.param_groups[0]['params'][0].device
+        for state in self.torch_optimizer.state.values():
+            for k, v in state.items():
+                if isinstance(v, torch.Tensor):
+                    state[k] = v.to(device=device)
+
 
 
 class SGD(BackendOptimizer):
 
@@ -99,9 +105,18 @@ def __init__(self,
                  lr: float,
                  momentum: float = 0.,
                  nesterov: bool = False):
+        """
+        Construct a new :class:`SGD` optimizer.
+
+        Args:
+            params: The parameters to be optimized.
+            lr: The learning rate.
+            momentum: The momentum. Typically 0.9 for momentum SGD optimization.
+            nesterov: Whether or not to use Nesterov momentum.
+        """
         super().__init__(
             lr=lr,
-            torch_optimizer=torch.optim.sgd.SGD(
+            torch_optimizer=torch.optim.SGD(
                 params=params,
                 lr=lr,
                 momentum=momentum,
@@ -121,7 +136,7 @@ def __init__(self,
                  amsgrad: bool = False):
         super().__init__(
             lr=lr,
-            torch_optimizer=adam.Adam(
+            torch_optimizer=torch.optim.Adam(
                 params=params,
                 lr=lr,
                 betas=(beta_1, beta_2),
diff --git a/tensorkit/optim/lr_scheduler.py b/tensorkit/optim/lr_scheduler.py
index 8dfb597..ad24aee 100644
--- a/tensorkit/optim/lr_scheduler.py
+++ b/tensorkit/optim/lr_scheduler.py
@@ -1,5 +1,3 @@
-from typing import *
-
 import mltk
 
 from .core import *
@@ -30,8 +28,8 @@ def update_lr(self):
         """Update the learning rate of the optimizer according to the loop."""
         raise NotImplementedError()
 
-    def close(self):
-        """Close this scheduler, such that it will no longer affect the optimizer."""
+    def unbind_events(self):
+        """Unregister this scheduler from the loop events."""
         self._unbind_events(self.loop)
 
     def _bind_events(self, loop: mltk.TrainLoop):
@@ -42,6 +40,10 @@ def _unbind_events(self, loop: mltk.TrainLoop):
 
 
 class AnnealingLR(LRScheduler):
+    """
+    Learning rate scheduler that anneals the learning rate after every few
+    `epochs`, by a specified `ratio`.
+ """ initial_lr: float ratio: float diff --git a/tests/helper.py b/tests/helper.py index bd07d66..bd64b3f 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -13,7 +13,7 @@ 'int_dtypes', 'float_dtypes', 'number_dtypes', 'n_samples', - 'assert_allclose', 'assert_not_equal', 'assert_equal', + 'assert_allclose', 'assert_not_allclose', 'assert_equal', 'assert_not_equal', 'slow_test', @@ -47,9 +47,9 @@ def wrapper(x, y, **kwargs): @wrap_numpy_testing_assertion_fn -def assert_not_equal(x, y, err_msg=''): - if np.all(np.equal(x, y)): - msg = f'`x != y` not hold' +def assert_not_allclose(x, y, err_msg='', **kwargs): + if np.all(np.allclose(x, y, **kwargs)): + msg = f'`not allclose(x, y)` not hold' if err_msg: msg += f': {err_msg}' msg += f'\nx = {x}\ny = {y}' @@ -59,6 +59,16 @@ def assert_not_equal(x, y, err_msg=''): assert_equal = wrap_numpy_testing_assertion_fn(np.testing.assert_equal) +@wrap_numpy_testing_assertion_fn +def assert_not_equal(x, y, err_msg=''): + if np.all(np.equal(x, y)): + msg = f'`x != y` not hold' + if err_msg: + msg += f': {err_msg}' + msg += f'\nx = {x}\ny = {y}' + raise AssertionError(msg) + + # decorate a test that is slow def slow_test(fn): fn = pytest.mark.skipif( diff --git a/tests/optim/__init__.py b/tests/optim/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/optim/test_lr_scheduler.py b/tests/optim/test_lr_scheduler.py new file mode 100644 index 0000000..f9c0268 --- /dev/null +++ b/tests/optim/test_lr_scheduler.py @@ -0,0 +1,65 @@ +import mltk +from mock import Mock + +import tensorkit as tk +from tests.helper import * + + +class _MyFakeOptimizer(object): + def __init__(self, lr): + self.lr = lr + + def set_lr(self, lr): + self.lr = lr + + +def standard_lr_scheduler_check(ctx, scheduler_factory, lr_func): + # test start with eopch = 0 + optimizer = _MyFakeOptimizer(0.1) + assert_allclose(optimizer.lr, 0.1) + + ev_hosts = mltk.EventHost() + loop = Mock(epoch=0, on_epoch_end=ev_hosts['on_epoch_end']) + scheduler = scheduler_factory(loop, optimizer) + assert_allclose(optimizer.lr, lr_func(loop, optimizer)) + + for epoch in range(1, 29): + loop.epoch = epoch + if epoch < 15: + scheduler.update_lr() + else: + ev_hosts.fire('on_epoch_end') + assert_allclose(optimizer.lr, lr_func(loop, optimizer)) + + final_lr = optimizer.lr + scheduler.unbind_events() + for epoch in range(29, 39): + loop.epoch = epoch + ev_hosts.fire('on_epoch_end') + assert_allclose(optimizer.lr, final_lr) + + for epoch in range(29, 39): + loop.epoch = epoch + scheduler.update_lr() # still can update the lr if manually called + assert_allclose(optimizer.lr, lr_func(loop, optimizer)) + + # test start with epoch = some value + optimizer = _MyFakeOptimizer(0.1) + assert_allclose(optimizer.lr, 0.1) + + ev_hosts = mltk.EventHost() + loop = Mock(epoch=40, on_epoch_end=ev_hosts['on_epoch_end']) + scheduler = scheduler_factory(loop, optimizer) + assert_allclose(optimizer.lr, lr_func(loop, optimizer)) + + +class LRSchedulerTestCaes(TestCase): + + def test_annealing_lr(self): + standard_lr_scheduler_check( + self, + lambda loop, optimizer: tk.optim.lr_scheduler.AnnealingLR( + loop, optimizer, initial_lr=0.01, ratio=0.5, epochs=2 + ), + lambda loop, optimizer: 0.01 * 0.5 ** int(loop.epoch // 2) + ) diff --git a/tests/optim/test_optimizer.py b/tests/optim/test_optimizer.py new file mode 100644 index 0000000..7aac4ea --- /dev/null +++ b/tests/optim/test_optimizer.py @@ -0,0 +1,115 @@ +import os +from functools import partial +from tempfile import TemporaryDirectory + +import tensorkit as 
tk +from tensorkit import tensor as T +from tests.helper import * + + +def optimizer_standard_check(ctx, optimizer_factory, lr): + a = T.variable([], initializer=123.) + b = T.variable([], initializer=456.) + + def calculate_loss(a, b): + return (a + b) ** 2 + + optimizer = optimizer_factory(iter([a]), lr) + ctx.assertEqual(optimizer.lr, lr) + + # test optimize a + optimizer.clear_grad() + with optimizer.capture_grad(): + loss = calculate_loss(a, b) + optimizer.minimize(loss) + ctx.assertLessEqual(calculate_loss(a, b), loss) + assert_not_equal(a, 123.) + assert_equal(b, 456.) + + # test optimize a and b + optimizer.clear_grad() + optimizer.add_param_group(iter([b])) + with optimizer.capture_grad(): + loss = calculate_loss(a, b) + optimizer.minimize(loss) + ctx.assertLessEqual(calculate_loss(a, b), loss) + assert_not_equal(a, 123.) + assert_not_equal(b, 456.) + + # save checkpoint + with TemporaryDirectory() as temp_dir: + ckpt_path = os.path.join(temp_dir, 'ckpt') + checkpoint = tk.train.Checkpoint(optimizer=optimizer) + checkpoint.save(ckpt_path) + + # test backup and restore the status + a2 = T.variable([], initializer=a) + b2 = T.variable([], initializer=b) + optimizer2 = optimizer_factory([a2], lr) + optimizer2.add_param_group([b2]) + checkpoint2 = tk.train.Checkpoint(optimizer=optimizer2) + checkpoint2.restore(ckpt_path) + + with optimizer2.capture_grad(): + loss = calculate_loss(a2, b2) + optimizer2.minimize(loss) + ctx.assertLessEqual(calculate_loss(a2, b2), loss) + assert_not_equal(a2, a) + assert_not_equal(b2, b) + + # test backup and restore the status, and use maximize instead of minimize + a3 = T.variable([], initializer=a) + b3 = T.variable([], initializer=b) + optimizer3 = optimizer_factory([a3], lr) + optimizer3.add_param_group([b3]) + checkpoint3 = tk.train.Checkpoint(optimizer=optimizer3) + checkpoint3.restore(ckpt_path) + + with optimizer3.capture_grad(): + loss = calculate_loss(a3, b3) + optimizer3.maximize(-loss) + ctx.assertLessEqual(calculate_loss(a3, b3), loss) + assert_allclose(a3, a2) + assert_allclose(b3, b2) + assert_allclose(calculate_loss(a3, b3), calculate_loss(a2, b2)) + + # backup and restore the status, change the learning rate and get + # the third output, and compare to the result with optimizer2 + a4 = T.variable([], initializer=a) + b4 = T.variable([], initializer=b) + optimizer4 = optimizer_factory([a4], lr) + optimizer4.add_param_group([b4]) + checkpoint4 = tk.train.Checkpoint(optimizer=optimizer4) + checkpoint4.restore(ckpt_path) + + optimizer4.set_lr(lr * 0.5) + ctx.assertEqual(optimizer4.lr, lr * 0.5) + with optimizer4.capture_grad(): + loss = calculate_loss(a4, b4) + optimizer4.minimize(loss) + assert_not_allclose(a4, a2) + assert_not_allclose(b4, b2) + assert_not_allclose(calculate_loss(a4, b4), calculate_loss(a2, b2)) + + # now proceed the optimization from the first optimizer, and compare + # the result with optimizer2 + optimizer.clear_grad() + with optimizer.capture_grad(): + loss = calculate_loss(a, b) + optimizer.minimize(loss) + ctx.assertLessEqual(calculate_loss(a, b), loss) + assert_allclose(a, a2) + assert_allclose(b, b2) + assert_allclose(calculate_loss(a, b), calculate_loss(a2, b2)) + + +class OptimizerTestCase(TestCase): + + def test_sgd(self): + optimizer_standard_check(self, partial(tk.optim.SGD), 0.001) + optimizer_standard_check(self, partial(tk.optim.SGD, momentum=0.9), 0.001) + optimizer_standard_check(self, partial(tk.optim.SGD, momentum=0.9, nesterov=True), 0.001) + + def test_adam(self): + optimizer_standard_check(self, 
partial(tk.optim.Adam), 0.1) + optimizer_standard_check(self, partial(tk.optim.Adam, amsgrad=True), 0.1) From 1b8042578215abcb0e9fc46866593650c4b5a33a Mon Sep 17 00:00:00 2001 From: Haowen Xu Date: Thu, 20 Feb 2020 16:57:57 +0800 Subject: [PATCH 7/7] add tests for SequentialBuilder --- tensorkit/layers/builder.py | 150 ++++++--- tensorkit/layers/shape_.py | 27 +- tests/layers/test_builder.py | 575 +++++++++++++++++++++++++++++++++++ tests/tensor/test_core.py | 2 +- 4 files changed, 707 insertions(+), 47 deletions(-) create mode 100644 tests/layers/test_builder.py diff --git a/tensorkit/layers/builder.py b/tensorkit/layers/builder.py index 240d500..9442c17 100644 --- a/tensorkit/layers/builder.py +++ b/tensorkit/layers/builder.py @@ -1,4 +1,3 @@ -import re from contextlib import contextmanager from typing import * @@ -14,7 +13,7 @@ from ..arg_check import * from ..typing_ import * -__all__ = ['SequentialBuilder'] +__all__ = ['LayerArgs', 'SequentialBuilder'] def _get_layer_class(name: str) -> type: @@ -64,7 +63,7 @@ def _calculate_deconv_output_size(in_size, kernel_size, stride, padding, output_ if i is None: out_size.append(None) else: - l = T.utils.calculate_deconv_output_size(d[i], [k], [s], [p], [op], [d])[0] + l = T.utils.calculate_deconv_output_size([i], [k], [s], [p], [op], [d])[0] out_size.append(l) return out_size @@ -157,6 +156,8 @@ def build(self, type_: Union[str, type], *args, **kwargs): Returns: The built layer object. """ + if isinstance(type_, str): + type_ = _get_layer_class(type_) return type_(*args, **self.get_kwargs(type_, **kwargs)) @@ -176,7 +177,7 @@ def __init__(self, *, in_shape: Sequence[Optional[int]] = NOT_SET, in_channels: Optional[int] = NOT_SET, - in_spatial_shape: List[int] = NOT_SET, + in_size: Sequence[Optional[int]] = NOT_SET, in_builder: 'SequentialBuilder' = NOT_SET): """ Construct a new :class:`SequentialBuilder`. @@ -188,7 +189,7 @@ def __init__(self, used as the `in_shape` of this :class:`SequentialBuilder`. in_shape: The input shape. in_channels: The number of input channels. - in_spatial_shape: The input spatial shape. Can be specified + in_size: The input spatial size. Can be specified only if `in_channels` is specified, or `in_spec` is a int. in_builder: Explicitly specify the previous sequential builder. """ @@ -201,28 +202,30 @@ def __init__(self, '`in_builder` should be specified.' ) + layer_args = None if isinstance(in_spec, SequentialBuilder): in_builder = in_spec layer_args = LayerArgs(in_builder.layer_args) elif hasattr(in_spec, '__iter__'): in_shape = in_spec - layer_args = LayerArgs() - else: + elif in_spec is not NOT_SET: in_channels = in_spec + + if layer_args is None: layer_args = LayerArgs() - if in_spatial_shape is not NOT_SET and in_channels is NOT_SET: + if in_size is not NOT_SET and in_channels is NOT_SET: raise ValueError( - '`in_spatial_shape` can be specified only when `in_channels` ' + '`in_size` can be specified only when `in_channels` ' 'is specified, or `in_spec` is None or an integer.' 
) if in_shape is not NOT_SET: in_shape = list(in_shape) elif in_channels is not NOT_SET: - if in_spatial_shape is NOT_SET: - in_spatial_shape = [] - in_shape = _unsplit_channel_spatial(in_channels, in_spatial_shape) + if in_size is NOT_SET: + in_size = [] + in_shape = _unsplit_channel_spatial(in_channels, in_size) else: in_shape = list(in_builder.out_shape) @@ -238,7 +241,7 @@ def _assert_out_shape(self, spatial: Optional[Sequence[bool]] = None, at_least: bool = False) -> List[Optional[int]]: if shape is None: - if channel is None: + if channel is None: # pragma: no cover raise ValueError('`channel` must be specified when `shape` is not.') shape = _unsplit_channel_spatial(channel, spatial or []) @@ -315,7 +318,7 @@ def add(self, out_shape: List[Optional[int]] = NOT_SET, *, out_channels: Optional[int] = NOT_SET, - out_spatial_shape: List[Optional[int]] = NOT_SET + out_size: List[Optional[int]] = NOT_SET ) -> 'SequentialBuilder': """ Manually add a layer to this builder. @@ -325,23 +328,23 @@ def add(self, out_shape: The new output shape. out_channels: The new output channels. Should be specified and only be specified when `out_shape` is not. - out_spatial_shape: The new spatial shape. Should only be specified + out_size: The new spatial shape. Should only be specified when `out_channels` is specified. Returns: This sequential builder object. """ + if out_size is not NOT_SET and out_channels is NOT_SET: + raise ValueError('`out_size` can only be specified when ' + '`out_channels` is specified.') if (out_shape is NOT_SET) == (out_channels is NOT_SET): raise ValueError('Either `out_shape` or `out_channels` should be ' 'specified, but not both.') - if out_spatial_shape is not NOT_SET and out_channels is NOT_SET: - raise ValueError('`out_spatial_shape` can only be specified when ' - '`out_channels` is specified.') if out_channels is not NOT_SET: - if out_spatial_shape is NOT_SET: - out_spatial_shape = [] - out_shape = _unsplit_channel_spatial(out_channels, out_spatial_shape) + if out_size is NOT_SET: + out_size = [] + out_shape = _unsplit_channel_spatial(out_channels, out_size) self.layers.append(layer) self.out_shape = out_shape @@ -359,7 +362,7 @@ def build(self, flatten_to_ndims: bool = True) -> T.Module: The built sequential layer. 
""" if not self.layers: - raise RuntimeError('No layer has been added.') + return Identity() elif len(self.layers) == 1: layer = self.layers[0] else: @@ -369,6 +372,10 @@ def build(self, flatten_to_ndims: bool = True) -> T.Module: layer = FlattenToNDims(layer, ndims=len(self.in_shape) + 1) return layer + # ---- identity layer (add no layer) ---- + def identity(self): + return self + # ---- activation ---- def _make_activation(self, type_): self._assert_out_shape((False,), at_least=True) @@ -404,10 +411,13 @@ def dense(self, out_features: int, **kwargs): # ---- convolution layers ---- def _conv_nd(self, spatial_ndims, conv_cls, out_channels, **kwargs): + kwargs = self.layer_args.get_kwargs(conv_cls, **kwargs) + if 'kernel_size' not in kwargs: + raise ValueError('The `kernel_size` argument is required.') + in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) # validate the arguments - kwargs = self.layer_args.get_kwargs(conv_cls, **kwargs) kernel_size = validate_conv_size('kernel_size', kwargs['kernel_size'], spatial_ndims) stride = validate_conv_size('stride', kwargs.get('stride', 1), spatial_ndims) dilation = validate_conv_size('dilation', kwargs.get('dilation', 1), spatial_ndims) @@ -469,10 +479,17 @@ def res_block3d(self, # ---- deconvolution layers ---- def _deconv_nd(self, spatial_ndims, deconv_cls, out_channels, output_size, **kwargs): + kwargs = self.layer_args.get_kwargs(deconv_cls, **kwargs) + if 'kernel_size' not in kwargs: + raise ValueError('The `kernel_size` argument is required.') + + if output_size is not NOT_SET: + kwargs.pop('output_size', None) + else: + output_size = kwargs.pop('output_size', NOT_SET) in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) # validate the arguments - kwargs = self.layer_args.get_kwargs(deconv_cls, **kwargs) kernel_size = validate_conv_size('kernel_size', kwargs['kernel_size'], spatial_ndims) stride = validate_conv_size('stride', kwargs.get('stride', 1), spatial_ndims) dilation = validate_conv_size('dilation', kwargs.get('dilation', 1), spatial_ndims) @@ -480,7 +497,7 @@ def _deconv_nd(self, spatial_ndims, deconv_cls, out_channels, output_size, **kwa kwargs.get('padding', PaddingMode.DEFAULT), kernel_size, dilation, spatial_ndims) if 'output_padding' in kwargs and output_size is not NOT_SET: - raise ValueError('`output_padding` and `out_shape` cannot be both specified.') + raise ValueError('`output_padding` and `output_size` cannot be both specified.') elif output_size is not NOT_SET: if len(output_size) != spatial_ndims: raise ValueError( @@ -493,12 +510,9 @@ def _deconv_nd(self, spatial_ndims, deconv_cls, out_channels, output_size, **kwa f'is supported only when the previous output shape ' f'is all deterministic.' 
) + output_padding = T.utils.calculate_deconv_output_padding( + in_size, output_size, kernel_size, stride, padding, dilation) out_size = output_size - output_padding = [ - T.utils.calculate_deconv_output_padding(*args) - for args in zip( - in_size, output_size, kernel_size, stride, padding, dilation) - ] elif 'output_padding' in kwargs: output_padding = validate_output_padding( kwargs.get('output_padding', 0), stride, dilation, spatial_ndims) @@ -523,63 +537,63 @@ def linear_conv_transpose1d(self, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 1, LinearConvTranspose1d, out_channels, output_size, **kwargs) + 1, LinearConvTranspose1d, out_channels, output_size=output_size, **kwargs) def linear_conv_transpose2d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 2, LinearConvTranspose2d, out_channels, output_size, **kwargs) + 2, LinearConvTranspose2d, out_channels, output_size=output_size, **kwargs) def linear_conv_transpose3d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 3, LinearConvTranspose3d, out_channels, output_size, **kwargs) + 3, LinearConvTranspose3d, out_channels, output_size=output_size, **kwargs) def conv_transpose1d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 1, ConvTranspose1d, out_channels, output_size, **kwargs) + 1, ConvTranspose1d, out_channels, output_size=output_size, **kwargs) def conv_transpose2d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 2, ConvTranspose2d, out_channels, output_size, **kwargs) + 2, ConvTranspose2d, out_channels, output_size=output_size, **kwargs) def conv_transpose3d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 3, ConvTranspose3d, out_channels, output_size, **kwargs) + 3, ConvTranspose3d, out_channels, output_size=output_size, **kwargs) def res_block_transpose1d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 1, ResBlockTranspose1d, out_channels, output_size, **kwargs) + 1, ResBlockTranspose1d, out_channels, output_size=output_size, **kwargs) def res_block_transpose2d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 2, ResBlockTranspose2d, out_channels, output_size, **kwargs) + 2, ResBlockTranspose2d, out_channels, output_size=output_size, **kwargs) def res_block_transpose3d(self, out_channels: int, output_size: List[int] = NOT_SET, **kwargs) -> 'SequentialBuilder': return self._deconv_nd( - 3, ResBlockTranspose3d, out_channels, output_size, **kwargs) + 3, ResBlockTranspose3d, out_channels, output_size=output_size, **kwargs) # aliases for the deconvolution layers linear_deconv1d = linear_conv_transpose1d @@ -591,10 +605,13 @@ def res_block_transpose3d(self, # ---- pool layers ---- def _pool_nd(self, spatial_ndims, pool_cls, **kwargs): - in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) + kwargs = self.layer_args.get_kwargs(pool_cls, **kwargs) + if 'kernel_size' not in kwargs: + raise ValueError('The `kernel_size` argument is required.') + + in_channels, in_size = self._split_out_shape(False, [False] * spatial_ndims) # validate the arguments - kwargs = 
self.layer_args.get_kwargs(pool_cls, **kwargs) kernel_size = validate_conv_size('kernel_size', kwargs['kernel_size'], spatial_ndims) stride = validate_conv_size('stride', kwargs.get('stride', kernel_size), spatial_ndims) dilation = [1] * spatial_ndims @@ -627,13 +644,15 @@ def max_pool3d(self, **kwargs) -> 'SequentialBuilder': return self._pool_nd(3, MaxPool3d, **kwargs) def _global_avg_pool_nd(self, spatial_ndims, pool_cls, **kwargs): - in_channels, in_size = self._split_out_shape(True, [False] * spatial_ndims) + kwargs = self.layer_args.get_kwargs(pool_cls, **kwargs) keepdims = kwargs.get('keepdims', False) + + in_channels, in_size = self._split_out_shape(False, [False] * spatial_ndims) if keepdims: out_shape = _unsplit_channel_spatial(in_channels, [1] * spatial_ndims) else: out_shape = [in_channels] - layer = pool_cls(**self.layer_args.get_kwargs(pool_cls, **kwargs)) + layer = pool_cls(**kwargs) return self.add(layer, out_shape) def global_avg_pool1d(self, **kwargs) -> 'SequentialBuilder': @@ -644,3 +663,48 @@ def global_avg_pool2d(self, **kwargs) -> 'SequentialBuilder': def global_avg_pool3d(self, **kwargs) -> 'SequentialBuilder': return self._global_avg_pool_nd(3, GlobalAvgPool3d, **kwargs) + + # ---- reshape layers ---- + def _channel_first_to_last_nd(self, spatial_ndims, layer_cls): + in_shape = self._assert_out_shape([False] * (spatial_ndims + 1)) + out_shape = in_shape[1:] + in_shape[:1] + return self.add(layer_cls(), out_shape) + + def channel_first_to_last1d(self): + return self._channel_first_to_last_nd(1, ChannelFirstToLast1d) + + def channel_first_to_last2d(self): + return self._channel_first_to_last_nd(2, ChannelFirstToLast2d) + + def channel_first_to_last3d(self): + return self._channel_first_to_last_nd(3, ChannelFirstToLast3d) + + def _channel_last_to_first_nd(self, spatial_ndims, layer_cls): + in_shape = self._assert_out_shape([False] * (spatial_ndims + 1)) + out_shape = in_shape[-1:] + in_shape[:-1] + return self.add(layer_cls(), out_shape) + + def channel_last_to_first1d(self): + return self._channel_last_to_first_nd(1, ChannelLastToFirst1d) + + def channel_last_to_first2d(self): + return self._channel_last_to_first_nd(2, ChannelLastToFirst2d) + + def channel_last_to_first3d(self): + return self._channel_last_to_first_nd(3, ChannelLastToFirst3d) + + if T.IS_CHANNEL_LAST: + channel_last_to_default1d = \ + channel_last_to_default2d = \ + channel_last_to_default3d = \ + channel_default_to_last1d = \ + channel_default_to_last2d = \ + channel_default_to_last3d = \ + identity + else: + channel_last_to_default1d = channel_last_to_first1d + channel_last_to_default2d = channel_last_to_first2d + channel_last_to_default3d = channel_last_to_first3d + channel_default_to_last1d = channel_first_to_last1d + channel_default_to_last2d = channel_first_to_last2d + channel_default_to_last3d = channel_first_to_last3d diff --git a/tensorkit/layers/shape_.py b/tensorkit/layers/shape_.py index ea7266a..5809f99 100644 --- a/tensorkit/layers/shape_.py +++ b/tensorkit/layers/shape_.py @@ -1,5 +1,6 @@ from typing import * +from .. 
import tensor as T from ..tensor import (Tensor, Module, shape, rank, flatten_to_ndims, unflatten_from_ndims, pad) from ..tensor.nn import * @@ -10,19 +11,22 @@ 'ConstantPad', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'ChannelFirstToLast1d', 'ChannelFirstToLast2d', 'ChannelFirstToLast3d', 'ChannelLastToFirst1d', 'ChannelLastToFirst2d', 'ChannelLastToFirst3d', + 'ChannelDefaultToLast1d', 'ChannelDefaultToLast2d', 'ChannelDefaultToLast3d', + 'ChannelLastToDefault1d', 'ChannelLastToDefault2d', 'ChannelLastToDefault3d', ] # ---- FlattenToNDims ---- class FlattenToNDims(BaseLayer): - __constants__ = ('layer', 'ndims') + __constants__ = ('wrapped', 'ndims') + wrapped: Module ndims: int def __init__(self, layer: Module, ndims: int): super().__init__() - self.layer = layer + self.wrapped = layer self.ndims = ndims def forward(self, input: Tensor) -> Tensor: @@ -38,7 +42,7 @@ def forward(self, input: Tensor) -> Tensor: # flatten, get output from the layer, and then unflatten output, front_shape = flatten_to_ndims(input, expected_rank) - output = self.layer(output) + output = self.wrapped(output) return unflatten_from_ndims(output, front_shape) @@ -163,3 +167,20 @@ class ChannelLastToFirst3d(BaseLayer): def forward(self, input: Tensor) -> Tensor: return channel_last_to_first3d(input) + + +if T.IS_CHANNEL_LAST: + ChannelLastToDefault1d = \ + ChannelLastToDefault2d = \ + ChannelLastToDefault3d = \ + ChannelDefaultToLast1d = \ + ChannelDefaultToLast2d = \ + ChannelDefaultToLast3d = \ + Identity +else: + ChannelLastToDefault1d = ChannelLastToFirst1d + ChannelLastToDefault2d = ChannelLastToFirst2d + ChannelLastToDefault3d = ChannelLastToFirst3d + ChannelDefaultToLast1d = ChannelFirstToLast1d + ChannelDefaultToLast2d = ChannelFirstToLast2d + ChannelDefaultToLast3d = ChannelFirstToLast3d diff --git a/tests/layers/test_builder.py b/tests/layers/test_builder.py new file mode 100644 index 0000000..b9a47f3 --- /dev/null +++ b/tests/layers/test_builder.py @@ -0,0 +1,575 @@ +from itertools import product + +import mltk +import pytest + +import tensorkit as tk +from tensorkit import tensor as T +from tensorkit.layers import * +from tests.helper import * +from tests.ops import * + + +class _RecordInitArgsLayer(BaseLayer): + + def __init__(self, *args, **kwargs): + super().__init__() + self.args = tuple(args) + self.kwargs = dict(kwargs) + + def __repr__(self): + return repr((self.args, self.kwargs)) + + def __eq__(self, other): + if isinstance(other, _RecordInitArgsLayer): + args, kwargs = other.args, other.kwargs + else: + args, kwargs = other + return args == self.args and kwargs == self.kwargs + + +class LayerArgsTestCase(TestCase): + + def test_set_args(self): + # empty default args + args = tk.layers.LayerArgs() + self.assertEqual(args.get_kwargs(_RecordInitArgsLayer), {}) + + o = args.build(_RecordInitArgsLayer) + self.assertIsInstance(o, _RecordInitArgsLayer) + self.assertEqual(o, ((), {})) + + # set default args + args.set_args(_RecordInitArgsLayer, d=4) + self.assertEqual(args.get_kwargs(_RecordInitArgsLayer), {'d': 4}) + self.assertEqual(args.get_kwargs(_RecordInitArgsLayer, c=3, d=5), {'c': 3, 'd': 5}) + + o = args.build(_RecordInitArgsLayer) + self.assertIsInstance(o, _RecordInitArgsLayer) + self.assertEqual(o, ((), {'d': 4})) + + o = args.build(_RecordInitArgsLayer, 1, 2, c=3, d=5) + self.assertIsInstance(o, _RecordInitArgsLayer) + self.assertEqual(o, ((1, 2), {'c': 3, 'd': 5})) + + # inherit default args from previous instance + args2 = tk.layers.LayerArgs(args) + 
args2.set_args([_RecordInitArgsLayer], c=5) + self.assertEqual(args2.get_kwargs(_RecordInitArgsLayer), {'c': 5, 'd': 4}) + self.assertEqual(args.get_kwargs(_RecordInitArgsLayer), {'d': 4}) # should not change + + def test_layer_names_as_types(self): + args = tk.layers.LayerArgs() + args.set_args(['dense', 'conv2d'], activation=tk.layers.LeakyReLU) + args.set_args(['conv2d'], kernel_size=3) + + self.assertEqual(args.get_kwargs('dense'), {'activation': tk.layers.LeakyReLU}) + self.assertEqual(args.get_kwargs('conv2d'), { + 'activation': tk.layers.LeakyReLU, + 'kernel_size': 3, + }) + + l1 = args.build('dense', 4, 4) + self.assertIsInstance(l1[1], tk.layers.LeakyReLU) + l2 = args.build('conv2d', 4, 4) + self.assertIsInstance(l2[1], tk.layers.LeakyReLU) + self.assertEqual(T.shape(l2[0].weight_store()), [4, 4, 3, 3]) + + +def sequential_builder_standard_check(ctx, + fn_name, + layer_cls, + input_shape, + input_mask, + args, + builder_args, + kwargs, + layer_kwargs=None, + builder_kwargs=None, + output_mask=None, + at_least=None): + if output_mask is None: + output_mask = input_mask + x = T.random.randn([3] + input_shape) + + # the expected layer + T.random.seed(1234) + layer_kwargs = dict(layer_kwargs or {}) + layer_kwargs.update(kwargs) + layer0 = layer_cls(*args, **layer_kwargs) + y = layer0(x) + output_shape = T.shape(y)[1:] + + def fn(input_shape, output_shape, kwargs, + builder_set_arg_layer, builder_set_arg_kwargs): + T.random.seed(1234) + builder = SequentialBuilder(input_shape) + ctx.assertEqual(builder.in_shape, input_shape) + if builder_set_arg_kwargs: + ctx.assertIs( + builder.set_args( + builder_set_arg_layer, **builder_set_arg_kwargs), + builder, + ), + ctx.assertIs( + getattr(builder, fn_name)(*builder_args, **kwargs), + builder, + ) + ctx.assertEqual(builder.out_shape, output_shape) + layer = builder.build(False) + ctx.assertIsInstance(layer, layer_cls) + assert_allclose(layer(x), y, rtol=1e-4, atol=1e-6) + + def apply_mask(shape, mask): + return [s if m else None for s, m in zip(shape, mask)] + + # do check various ways to specify the arguments + builder_kwargs = dict(builder_kwargs or {}) + builder_kwargs.update(kwargs) + fn(input_shape, output_shape, builder_kwargs, None, {}) + fn(input_shape, output_shape, {}, layer_cls, builder_kwargs) + fn(input_shape, output_shape, {}, fn_name, builder_kwargs) + + if False in input_mask: + fn(apply_mask(input_shape, input_mask), + apply_mask(output_shape, output_mask), + {}, fn_name, builder_kwargs) + + # check some common error checks + if 'kernel_size' in builder_kwargs: + kwargs2 = dict(builder_kwargs) + kwargs2.pop('kernel_size') + with pytest.raises(ValueError, + match='The `kernel_size` argument is required'): + fn(input_shape, output_shape, kwargs2, None, {}) + + if 'output_size' not in builder_kwargs: + for i, m in enumerate(input_mask): + if not m: + continue + input_shape2 = list(input_shape) + input_shape2[i] = None + with pytest.raises(ValueError, + match=f'Axis {i - len(input_shape)} of the previous ' + f'output shape is expected to be deterministic'): + fn(input_shape2, output_shape, builder_kwargs, None, {}) + + if len(input_shape) >= 1: + input_shape2 = list(input_shape[:-1]) + if len(input_shape) == at_least: + with pytest.raises(ValueError, + match=f'The previous output shape is expected to ' + f'be at least {len(input_shape)}d'): + fn(input_shape2, output_shape, builder_kwargs, None, {}) + elif at_least is None: + with pytest.raises(ValueError, + match=f'The previous output shape is expected to ' + f'be exactly 
{len(input_shape)}d'): + fn(input_shape2, output_shape, builder_kwargs, None, {}) + + +class SequentialBuilderTestCase(TestCase): + + def test_construct(self): + def assert_in_shape(b, s): + self.assertEqual(b.in_shape, s) + self.assertEqual(b.out_shape, s) + + # test the input shape + assert_in_shape(SequentialBuilder(3), [3]) + assert_in_shape(SequentialBuilder(None), [None]) + assert_in_shape(SequentialBuilder(in_channels=3), [3]) + assert_in_shape(SequentialBuilder(in_channels=None), [None]) + + assert_in_shape(SequentialBuilder([3]), [3]) + assert_in_shape(SequentialBuilder([3, 4]), [3, 4]) + assert_in_shape(SequentialBuilder((3, 4)), [3, 4]) + assert_in_shape(SequentialBuilder([None, None]), [None, None]) + assert_in_shape(SequentialBuilder(in_shape=[3, 4]), [3, 4]) + assert_in_shape(SequentialBuilder(in_shape=(3, 4)), [3, 4]) + assert_in_shape(SequentialBuilder(in_shape=(None, None)), [None, None]) + + assert_in_shape( + SequentialBuilder(5, in_size=[3, 4]), + make_conv_shape([], 5, [3, 4]), + ) + assert_in_shape( + SequentialBuilder(in_channels=5, in_size=[3, 4]), + make_conv_shape([], 5, [3, 4]), + ) + assert_in_shape( + SequentialBuilder(in_channels=5, in_size=(3, 4)), + make_conv_shape([], 5, [3, 4]), + ) + assert_in_shape( + SequentialBuilder(in_channels=5, in_size=(None, None)), + [s if s == 5 else None for s in make_conv_shape([], 5, [3, 4])], + ) + + # test in_builder + in_shape0 = make_conv_shape([], 5, [3, 4]) + for in_shape in (in_shape0, [None if i != 5 else i for i in in_shape0]): + builder0 = SequentialBuilder(in_shape) + builder0.set_args(['dense', 'conv2d'], activation=tk.layers.LeakyReLU) + builder0.set_args('conv2d', kernel_size=3) + + builder = SequentialBuilder(builder0) + assert_in_shape(builder, in_shape) + self.assertEqual( + builder.layer_args.get_kwargs(Dense), + {'activation': tk.layers.LeakyReLU} + ) + self.assertEqual( + builder.layer_args.get_kwargs(Conv2d), + {'activation': tk.layers.LeakyReLU, 'kernel_size': 3} + ) + + # test arg errors + with pytest.raises(ValueError, + match='One and only one of `in_spec`, `in_shape`, ' + '`in_channels` and `in_builder` should be ' + 'specified'): + _ = SequentialBuilder() + + arg_values = { + 'in_spec': [3, 4], + 'in_shape': [5, 6], + 'in_channels': 7, + 'in_builder': builder0, + } + for arg1, arg2 in product( + ['in_spec', 'in_shape', 'in_channels', 'in_builder'], + ['in_spec', 'in_shape', 'in_channels', 'in_builder']): + if arg1 == arg2: + continue + with pytest.raises(ValueError, + match='One and only one of `in_spec`, `in_shape`, ' + '`in_channels` and `in_builder` should be ' + 'specified'): + _ = SequentialBuilder(**{arg1: arg_values[arg1], + arg2: arg_values[arg2]}) + for arg in ['in_spec', 'in_shape', 'in_builder']: + with pytest.raises(ValueError, + match='`in_size` can be specified only when ' + '`in_channels` is specified, or `in_spec` ' + 'is None or an integer'): + _ = SequentialBuilder(in_size=[8, 9], **{arg: arg_values[arg]}) + + def test_arg_scope(self): + builder = SequentialBuilder(5) + self.assertEqual(builder.layer_args.get_kwargs(Dense), {}) + self.assertEqual(builder.layer_args.get_kwargs(Conv2d), {}) + with builder.arg_scope(['conv2d', Dense], activation=LeakyReLU): + self.assertEqual(builder.layer_args.get_kwargs(Dense), + {'activation': LeakyReLU}) + self.assertEqual(builder.layer_args.get_kwargs(Conv2d), + {'activation': LeakyReLU}) + with builder.arg_scope('dense', activation=Sigmoid, normalizer=BatchNorm): + self.assertEqual(builder.layer_args.get_kwargs(Dense), + {'activation': Sigmoid, 
'normalizer': BatchNorm}) + with builder.arg_scope(Conv2d, activation=Tanh, normalizer=BatchNorm2d): + self.assertEqual(builder.layer_args.get_kwargs(Conv2d), + {'activation': Tanh, 'normalizer': BatchNorm2d}) + self.assertEqual(builder.layer_args.get_kwargs(Conv2d), + {'activation': LeakyReLU}) + self.assertEqual(builder.layer_args.get_kwargs(Dense), + {'activation': LeakyReLU}) + self.assertEqual(builder.layer_args.get_kwargs(Dense), {}) + self.assertEqual(builder.layer_args.get_kwargs(Conv2d), {}) + + def test_add(self): + def fn(in_shape, layer, out_shape): + # test using `out_shape` + builder = SequentialBuilder(in_shape) + self.assertIs(builder.add(layer, out_shape), builder) + self.assertEqual(builder.out_shape, out_shape) + self.assertIs(builder.build(False), layer) + + with pytest.raises(ValueError, + match='`out_size` can only be specified when ' + '`out_channels` is specified'): + _ = builder.add(layer, out_shape, out_size=[]) + + # test using `out_channels` and `out_size` + def g(out_channels, **out_size_args): + builder = SequentialBuilder(in_shape) + self.assertIs( + builder.add(layer, out_channels=out_channels, **out_size_args), + builder + ) + self.assertEqual(builder.out_shape, out_shape) + self.assertIs(builder.build(False), layer) + + # test error + with pytest.raises(ValueError, + match='Either `out_shape` or `out_channels` ' + 'should be specified, but not both'): + _ = builder.add(layer, out_shape, out_channels=out_channels, + **out_size_args) + with pytest.raises(ValueError, + match='Either `out_shape` or `out_channels` ' + 'should be specified, but not both'): + _ = builder.add(layer) + + if len(out_shape) > 1: + if T.IS_CHANNEL_LAST: + out_channels, out_size = out_shape[-1], out_shape[:-1] + else: + out_channels, out_size = out_shape[0], out_shape[1:] + g(out_channels, out_size=out_size) + else: + g(out_shape[0], out_size=[]) + g(out_shape[0]) + + fn([5], Linear(5, 3), [3]) + fn([None], Linear(5, 3), [None]) + fn(make_conv_shape([], 5, [6, 7]), + Conv2d(5, 3, kernel_size=1), + make_conv_shape([], 3, [6, 7])) + fn(make_conv_shape([], None, [None, None]), + Conv2d(5, 3, kernel_size=1), + make_conv_shape([], None, [None, None])) + + def test_build(self): + builder = SequentialBuilder(5) + self.assertIsInstance(builder.build(), Identity) + self.assertIsInstance(builder.build(False), Identity) + + # build with one layer + builder.dense(4) + l1 = builder.build(False) + self.assertIsInstance(l1, Dense) + l = builder.build(True) + self.assertIsInstance(l, FlattenToNDims) + x = T.random.randn([3, 5]) + assert_allclose(l(x), l1(x), rtol=1e-4, atol=1e-6) + + # build with two layers + builder.linear(3) + l = builder.build(False) + self.assertIsInstance(l, Sequential) + self.assertIs(l[0], l1) + l2 = l[-1] + self.assertIsInstance(l2, Linear) + l = builder.build(True) + self.assertIsInstance(l, FlattenToNDims) + x = T.random.randn([3, 5]) + assert_allclose(l(x), l2(l1(x)), rtol=1e-4, atol=1e-6) + + def test_identity(self): + for in_shape in ([], [5], [3, 4, 5]): + sequential_builder_standard_check( + ctx=self, fn_name='identity', layer_cls=Identity, + input_shape=in_shape, input_mask=[False] * len(in_shape), + args=(), builder_args=(), kwargs={}, at_least=0, + ) + + def test_activation(self): + for name in ['relu', 'leaky_relu', 'sigmoid', 'tanh', 'log_softmax']: + layer_cls = tk.layers.get_activation_class(name) + for in_shape in ([5], [3, 4, 5]): + sequential_builder_standard_check( + ctx=self, fn_name=name, layer_cls=layer_cls, + input_shape=in_shape, input_mask=[False] * 
len(in_shape), + args=(), builder_args=(), kwargs={}, at_least=1, + ) + + def test_linear(self): + sequential_builder_standard_check( + ctx=self, fn_name='linear', layer_cls=Linear, input_shape=[5], + input_mask=[True], args=(5, 4), builder_args=(4,), + kwargs={'weight_norm': True}, + ) + sequential_builder_standard_check( + ctx=self, fn_name='dense', layer_cls=Dense, input_shape=[5], + input_mask=[True], args=(5, 4), builder_args=(4,), + kwargs={'weight_norm': True, 'activation': LeakyReLU}, + ) + + def test_conv_and_deconv(self): + for spatial_ndims in (1, 2, 3): + input_shape = make_conv_shape([], 5, [15, 16, 17][:spatial_ndims]) + input_mask = [i == 5 for i in input_shape] + for fn_name, layer_cls in zip( + [ + f'linear_conv{spatial_ndims}d', + f'conv{spatial_ndims}d', + f'res_block{spatial_ndims}d' + ], + [ + getattr(tk.layers, f'LinearConv{spatial_ndims}d'), + getattr(tk.layers, f'Conv{spatial_ndims}d'), + getattr(tk.layers, f'ResBlock{spatial_ndims}d'), + ]): + kwargs = {'kernel_size': 3, 'stride': 2, 'padding': 'half', + 'weight_norm': True} + if not fn_name.startswith('linear_'): + kwargs['activation'] = LeakyReLU + sequential_builder_standard_check( + ctx=self, fn_name=fn_name, layer_cls=layer_cls, + input_shape=input_shape, input_mask=input_mask, + args=(5, 4), builder_args=(4,), + kwargs=kwargs + ) + + def test_deconv(self): + for spatial_ndims in (1, 2, 3): + output_size = [16, 17, 18][:spatial_ndims] + output_shape = make_conv_shape([], 4, output_size) + layer0 = getattr(tk.layers, f'LinearConv{spatial_ndims}d')( + 4, 5, kernel_size=3, stride=2, padding='half', + weight_init=tk.init.ones + ) + y = layer0(T.zeros([1] + output_shape)) + input_shape = T.shape(y)[1:] + input_channel, input_size = T.utils.split_channel_spatial_shape(input_shape) + + for fn_name, layer_cls in zip( + [ + f'linear_conv_transpose{spatial_ndims}d', + f'linear_deconv{spatial_ndims}d', + f'conv_transpose{spatial_ndims}d', + f'deconv{spatial_ndims}d', + f'res_block_transpose{spatial_ndims}d' + ], + [ + getattr(tk.layers, f'LinearConvTranspose{spatial_ndims}d'), + getattr(tk.layers, f'LinearConvTranspose{spatial_ndims}d'), + getattr(tk.layers, f'ConvTranspose{spatial_ndims}d'), + getattr(tk.layers, f'ConvTranspose{spatial_ndims}d'), + getattr(tk.layers, f'ResBlockTranspose{spatial_ndims}d'), + ]): + # without output_shape + kwargs = {'kernel_size': 3, 'stride': 2, 'padding': 'half', + 'weight_norm': True} + input_mask = [i == 5 for i in input_shape] + if not fn_name.startswith('linear_'): + kwargs['activation'] = LeakyReLU + sequential_builder_standard_check( + ctx=self, fn_name=fn_name, layer_cls=layer_cls, + input_shape=input_shape, input_mask=input_mask, + args=(5, 4), builder_args=(4,), + kwargs=kwargs + ) + kwargs['output_padding'] = 0 + sequential_builder_standard_check( + ctx=self, fn_name=fn_name, layer_cls=layer_cls, + input_shape=input_shape, input_mask=input_mask, + args=(5, 4), builder_args=(4,), + kwargs=kwargs + ) + + # with output_shape + kwargs = {'kernel_size': 3, 'stride': 2, 'padding': 'half', + 'weight_norm': True} + layer_kwargs = { + 'output_padding': T.utils.calculate_deconv_output_padding( + input_size=input_size, + output_size=output_size, + kernel_size=[3] * spatial_ndims, + stride=[2] * spatial_ndims, + padding=[(1, 1)] * spatial_ndims, + dilation=[1] * spatial_ndims, + ) + } + builder_kwargs = {'output_size': output_size} + input_mask = [True] * spatial_ndims + if not fn_name.startswith('linear_'): + kwargs['activation'] = LeakyReLU + sequential_builder_standard_check( + 
ctx=self, fn_name=fn_name, layer_cls=layer_cls, + input_shape=input_shape, input_mask=input_mask, + args=(5, 4), builder_args=(4,), + kwargs=kwargs, layer_kwargs=layer_kwargs, + builder_kwargs=builder_kwargs, + ) + + # test errors + builder = SequentialBuilder(input_shape) + fn = getattr(builder, fn_name) + with pytest.raises(ValueError, + match='`output_padding` and `output_size` ' + 'cannot be both specified'): + fn(5, kernel_size=1, + output_padding=1, + output_size=[2, 3, 4][:spatial_ndims]) + + with pytest.raises(ValueError, + match=f'`output_size` is expected to be ' + f'{spatial_ndims}d'): + fn(5, kernel_size=1, + output_size=[2, 3, 4, 5][:spatial_ndims + 1]) + + builder = SequentialBuilder( + [i if i == 5 else None for i in input_shape]) + fn = getattr(builder, fn_name) + with pytest.raises(ValueError, + match='Specifying `output_size` instead of ' + '`output_padding` is supported only ' + 'when the previous output shape ' + 'is all deterministic.'): + fn(5, kernel_size=1, output_size=[2, 3, 4][:spatial_ndims]) + + def test_pool(self): + for spatial_ndims in (1, 2, 3): + input_shape = make_conv_shape([], 5, [15, 16, 17][:spatial_ndims]) + input_mask = [False] * (spatial_ndims + 1) + for fn_name, layer_cls in zip( + [ + f'avg_pool{spatial_ndims}d', + f'max_pool{spatial_ndims}d', + ], + [ + getattr(tk.layers, f'AvgPool{spatial_ndims}d'), + getattr(tk.layers, f'MaxPool{spatial_ndims}d'), + ]): + kwargs = {'kernel_size': 3, 'stride': 2, 'padding': 'half'} + sequential_builder_standard_check( + ctx=self, fn_name=fn_name, layer_cls=layer_cls, + input_shape=input_shape, input_mask=input_mask, + args=(), builder_args=(), kwargs=kwargs + ) + + def test_global_avg_pool(self): + for spatial_ndims in (1, 2, 3): + input_shape = make_conv_shape([], 5, [15, 16, 17][:spatial_ndims]) + input_mask = [False] * (spatial_ndims + 1) + + for keepdims in [True, False, None]: + if keepdims: + output_mask = [i != 5 for i in input_shape] + else: + output_mask = [False] + + kwargs = {'keepdims': keepdims} if keepdims is not None else {} + sequential_builder_standard_check( + ctx=self, + fn_name=f'global_avg_pool{spatial_ndims}d', + layer_cls=getattr(tk.layers, f'GlobalAvgPool{spatial_ndims}d'), + input_shape=input_shape, input_mask=input_mask, + args=(), builder_args=(), kwargs=kwargs, + output_mask=output_mask + ) + + def test_channel_transpose_layers(self): + for spatial_ndims in (1, 2, 3): + input_shape = [15, 16, 17, 18][:spatial_ndims + 1] + input_mask = [False] * (spatial_ndims + 1) + for fn_name, layer_cls in zip( + [ + f'channel_first_to_last{spatial_ndims}d', + f'channel_last_to_first{spatial_ndims}d', + f'channel_default_to_last{spatial_ndims}d', + f'channel_last_to_default{spatial_ndims}d', + ], + [ + getattr(tk.layers, f'ChannelFirstToLast{spatial_ndims}d'), + getattr(tk.layers, f'ChannelLastToFirst{spatial_ndims}d'), + getattr(tk.layers, f'ChannelDefaultToLast{spatial_ndims}d'), + getattr(tk.layers, f'ChannelLastToDefault{spatial_ndims}d'), + ]): + sequential_builder_standard_check( + ctx=self, fn_name=fn_name, layer_cls=layer_cls, + input_shape=input_shape, input_mask=input_mask, + args=(), builder_args=(), kwargs={} + ) diff --git a/tests/tensor/test_core.py b/tests/tensor/test_core.py index fb2eb09..7a2902b 100644 --- a/tests/tensor/test_core.py +++ b/tests/tensor/test_core.py @@ -266,7 +266,7 @@ def copy_tensor(o): ([1, 2, 3], []), number_dtypes, (None, T.CPU_DEVICE)): - t = T.ones(shape, dtype=dtype) + t = T.ones(shape, dtype=dtype, device=device) self.assertIsInstance(t, T.Tensor) 
             self.assertEqual(T.get_dtype(t), dtype)
             self.assertEqual(T.get_device(t), device or T.current_device())
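
A minimal usage sketch of the SequentialBuilder API introduced by this patch (illustrative only, not part of the diff). The calls mirror tensorkit/layers/builder.py and tests/layers/test_builder.py above; the 1x28x28 input and the channel-first layout are assumptions tied to the PyTorch backend, and `build(flatten_to_ndims=False)` is used to keep the sketch simple.

    from tensorkit import tensor as T
    from tensorkit.layers import SequentialBuilder, LeakyReLU

    # in_channels/in_size replace the old in_spatial_shape argument
    builder = SequentialBuilder(in_channels=1, in_size=[28, 28])

    # arg_scope sets default kwargs for the named layer types;
    # `kernel_size` is now required for conv/deconv/pool builder methods,
    # so it is supplied here once for both conv2d calls.
    with builder.arg_scope(['conv2d'], kernel_size=3, activation=LeakyReLU):
        builder.conv2d(16, stride=2)
        builder.conv2d(32, stride=2)

    builder.global_avg_pool2d()   # out_shape becomes [32]
    builder.linear(10)            # plain linear head

    classifier = builder.build(flatten_to_ndims=False)
    logits = classifier(T.random.randn([64, 1, 28, 28]))  # channel-first input assumed
    # builder.out_shape == [10]

    # Transposed-convolution methods now accept `output_size` in place of
    # `output_padding`, e.g. (hypothetical sizes):
    #   builder.conv_transpose2d(16, kernel_size=3, stride=2, output_size=[28, 28])
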