Support padding strategy #5

Open
wants to merge 10 commits into base: master
155 changes: 155 additions & 0 deletions docs/docs/ProgrammingGuide/pytorch.md
@@ -0,0 +1,155 @@
Analytics-Zoo supports distributed PyTorch training and inference on Apache Spark. Users can
define their model and loss function with the PyTorch API and run them in a distributed environment
with the wrapper layers provided by Analytics Zoo.

# System Requirements
PyTorch version: 1.1.0
torchvision version: 2.2.0

Tested OS versions (all 64-bit): __Ubuntu 16.04 or later__. We expect it to work on a wide range of
operating systems, but other systems have not been fully tested. Please create an issue on the
[issue page](https://github.com/intel-analytics/analytics-zoo/issues) if you find any errors.


# PyTorch API

Analytics Zoo defines two wrappers for PyTorch:

1. TorchNet: a wrapper class for a PyTorch model.
You can create a TorchNet by providing a PyTorch model and an example input or expected input size, e.g.
```python
import torchvision
from zoo.pipeline.api.net.torch_net import TorchNet

TorchNet.from_pytorch(torchvision.models.resnet18(pretrained=True).eval(), [1, 3, 224, 224])
```
The above line creates a TorchNet wrapping a ResNet model, which can then be used for training or
inference with Analytics Zoo. Internally, we create a sample input from the provided input shape and
use a Torch script module to trace the tensor operations performed on the sample. The resulting
TorchNet extends BigDL's Module and can be used with local or distributed data (RDD or DataFrame)
just like other layers. For multi-input models, pass a tuple of tensors or a tuple of expected
tensor sizes as the example input (see the sketch after this list).

2. TorchCriterion: a wrapper for loss functions defined in PyTorch.
You can create a TorchCriterion from a PyTorch criterion,
```python
from torch import nn
from zoo.pipeline.api.net.torch_criterion import TorchCriterion

az_criterion = TorchCriterion.from_pytorch(loss=nn.MSELoss(),
                                           input=[1, 1],
                                           label=[1, 1])
```
or from a custom loss function that takes input and label as parameters:

```python
import torch
from torch import nn
from zoo.pipeline.api.net.torch_criterion import TorchCriterion

criterion = nn.MSELoss()


# this loss function computes the loss for a multi-output model
def lossFunc(input, label):
    loss1 = criterion(input[0], label[0])
    loss2 = criterion(input[1], label[1])
    loss = loss1 + 0.4 * loss2
    return loss


az_criterion = TorchCriterion.from_pytorch(loss=lossFunc,
                                           input=(torch.ones(2, 2), torch.ones(2, 1)),
                                           label=(torch.ones(2, 2), torch.ones(2, 1)))
```
Similar to TorchNet, users need to provide an example input shape or example input data so that the
operations in the loss function can be traced. The created TorchCriterion extends BigDL's Criterion
and can be used like any other criterion.
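The sketch below summarizes the constructor variants for multi-input models and for passing example
data instead of shapes. It mirrors the constructor tests added in this PR (`test_torchnet_constructor`
and `test_torchcriterion_constructor`) and assumes the positional `from_pytorch(module, sample_input)`
and `from_pytorch(loss, sample_input, sample_label)` signatures used there.

```python
import torch
from torch import nn
from zoo.pipeline.api.net.torch_net import TorchNet
from zoo.pipeline.api.net.torch_criterion import TorchCriterion


# a two-input model, mirroring the constructor tests in this PR
class TwoInputModel(nn.Module):
    def __init__(self):
        super(TwoInputModel, self).__init__()
        self.dense1 = nn.Linear(2, 2)
        self.dense2 = nn.Linear(3, 1)

    def forward(self, x1, x2):
        return self.dense1(x1), self.dense2(x2)


# the example input may be a tuple (or list) of tensors, or of expected sizes
TorchNet.from_pytorch(TwoInputModel(), (torch.ones(2, 2), torch.ones(2, 3)))
TorchNet.from_pytorch(TwoInputModel(), ([2, 2], [2, 3]))

# the same applies to TorchCriterion: example input/label may be shapes or tensors
criterion = nn.MSELoss()
TorchCriterion.from_pytorch(criterion, [2, 1], [2, 1])
TorchCriterion.from_pytorch(criterion, torch.ones(2, 2), torch.ones(2, 2))
```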

# Examples
Here we provide a simple end-to-end example that uses TorchNet and TorchCriterion to
train a simple model with a Spark DataFrame.
```python
#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
import torch.nn as nn
from bigdl.optim.optimizer import Adam
from zoo.common.nncontext import *
from zoo.pipeline.api.net.torch_net import TorchNet
from zoo.pipeline.api.net.torch_criterion import TorchCriterion
from zoo.pipeline.nnframes import *

from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession


# define the model with PyTorch
class SimpleTorchModel(nn.Module):
    def __init__(self):
        super(SimpleTorchModel, self).__init__()
        self.dense1 = nn.Linear(2, 4)
        self.dense2 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.dense1(x)
        x = torch.sigmoid(self.dense2(x))
        return x


if __name__ == '__main__':
    sparkConf = init_spark_conf().setAppName("example_pytorch").setMaster('local[1]')
    sc = init_nncontext(sparkConf)
    spark = SparkSession \
        .builder \
        .getOrCreate()

    df = spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0]), 1.0),
         (Vectors.dense([1.0, 2.0]), 0.0),
         (Vectors.dense([2.0, 1.0]), 1.0),
         (Vectors.dense([1.0, 2.0]), 0.0)],
        ["features", "label"])

    torch_model = SimpleTorchModel()
    torch_criterion = nn.MSELoss()

    az_model = TorchNet.from_pytorch(torch_model, [1, 2])
    az_criterion = TorchCriterion.from_pytorch(torch_criterion, [1, 1], [1, 1])

    classifier = NNClassifier(az_model, az_criterion) \
        .setBatchSize(4) \
        .setOptimMethod(Adam()) \
        .setLearningRate(0.01) \
        .setMaxEpoch(10)

    nnClassifierModel = classifier.fit(df)

    print("After training: ")
    res = nnClassifierModel.transform(df)
    res.show(10, False)

```

and we expect to see output like:
```python
+---------+-----+----------+
|features |label|prediction|
+---------+-----+----------+
|[2.0,1.0]|1.0 |1.0 |
|[1.0,2.0]|0.0 |0.0 |
|[2.0,1.0]|1.0 |1.0 |
|[1.0,2.0]|0.0 |0.0 |
+---------+-----+----------+
```
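
As noted in the TorchNet section, the wrapped model behaves like other BigDL layers, so it can also
be used directly for inference. The snippet below is a minimal sketch under the assumption that
TorchNet inherits BigDL's `forward` (and `predict`) methods; the pretrained ResNet and the random
input are only for illustration.

```python
import numpy as np
import torchvision
from zoo.pipeline.api.net.torch_net import TorchNet

# wrap a pretrained ResNet-18 for inference
net = TorchNet.from_pytorch(torchvision.models.resnet18(pretrained=True).eval(),
                            [1, 3, 224, 224])

# local inference on a numpy array (batch of one 224x224 RGB image)
dummy_batch = np.random.rand(1, 3, 224, 224)
local_out = net.forward(dummy_batch)
print(local_out.shape)

# distributed inference on an RDD (assuming BigDL's predict API is available)
# rdd = sc.parallelize([np.random.rand(3, 224, 224) for _ in range(4)])
# predictions = net.predict(rdd).collect()
```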

More PyTorch examples (ResNet, LeNet, etc.) are available [here](../../../pyzoo/zoo/examples/pytorch).

2 changes: 0 additions & 2 deletions pyzoo/dev/run-pytests
@@ -22,15 +22,13 @@ cd "`dirname $0`"

export PYSPARK_PYTHON=python
export PYSPARK_DRIVER_PYTHON=python

py_version="$(python -V 2>&1)"

python -m pytest -v --doctest-modules ../zoo \
--ignore=../zoo/pipeline/api/keras2 \
--ignore=../zoo/tfpark/text \
--ignore=../zoo/examples \
--ignore=../zoo/ray/

exit_status_1=$?
if [ $exit_status_1 -ne 0 ];
then
5 changes: 3 additions & 2 deletions pyzoo/test/zoo/pipeline/api/keras/test_simple_integration.py
@@ -63,7 +63,8 @@ def test_training_with_tensorboard_checkpoint_gradientclipping(self):
y_train = np.random.randint(4, size=(200, ))
X_test = np.random.random([40, 32, 32])
y_test = np.random.randint(4, size=(40, ))
model.compile(optimizer="adam",
from zoo.pipeline.api.keras.optimizers import Adam, EpochStep
model.compile(optimizer=Adam(lr=0.003, schedule=EpochStep(1, 0.75)),
loss="sparse_categorical_crossentropy",
metrics=['accuracy'])
tmp_log_dir = create_tmp_path()
@@ -72,7 +73,7 @@ def test_training_with_tensorboard_checkpoint_gradientclipping(self):
model.set_tensorboard(tmp_log_dir, "training_test")
model.set_checkpoint(tmp_checkpoint_path)
model.set_constant_gradient_clipping(0.01, 0.03)
model.fit(X_train, y_train, batch_size=112, nb_epoch=2, validation_data=(X_test, y_test))
model.fit(X_train, y_train, batch_size=32, nb_epoch=20, validation_data=(X_test, y_test))
model.clear_gradient_clipping()
model.fit(X_train, y_train, batch_size=112, nb_epoch=2, validation_data=(X_test, y_test))
model.set_gradient_clipping_by_l2_norm(0.2)
109 changes: 61 additions & 48 deletions pyzoo/test/zoo/pipeline/api/test_torch_net.py
@@ -27,6 +27,55 @@


class TestTF(ZooTestCase):

def test_torchnet_constructor(self):
# two inputs test
class TwoInputModel(nn.Module):
def __init__(self):
super(TwoInputModel, self).__init__()
self.dense1 = nn.Linear(2, 2)
self.dense2 = nn.Linear(3, 1)

def forward(self, x1, x2):
x1 = self.dense1(x1)
x2 = self.dense2(x2)
return x1, x2

TorchNet.from_pytorch(TwoInputModel(), (torch.ones(2, 2), torch.ones(2, 3)))
TorchNet.from_pytorch(TwoInputModel(), ([2, 2], [2, 3]))
TorchNet.from_pytorch(TwoInputModel(), [torch.ones(2, 2), torch.ones(2, 3)])
TorchNet.from_pytorch(TwoInputModel(), [[2, 2], [2, 3]])

# one input
input = [[0.5, 1.], [-0.3, 1.2]]
torch_input = torch.tensor(input)
model = nn.Linear(2, 1)
TorchNet.from_pytorch(model, torch_input)
TorchNet.from_pytorch(model, [1, 2])

def test_torchcriterion_constructor(self):
# two inputs test
criterion = nn.MSELoss()

def lossFunc(input, label):
loss1 = criterion(input[0], label[0])
loss2 = criterion(input[1], label[1])
loss = loss1 + 0.4 * loss2
return loss

TorchCriterion.from_pytorch(lossFunc,
(torch.ones(2, 2), torch.ones(2, 3)),
(torch.ones(2, 2), torch.ones(2, 3)))
TorchCriterion.from_pytorch(lossFunc, ([2, 2], [2, 3]), ([2, 2], [2, 3]))
TorchCriterion.from_pytorch(lossFunc,
[torch.ones(2, 2), torch.ones(2, 3)],
[torch.ones(2, 2), torch.ones(2, 3)])
TorchCriterion.from_pytorch(lossFunc, [[2, 2], [2, 3]], [[2, 2], [2, 3]])

# one inputs test
TorchCriterion.from_pytorch(criterion, [2, 1], [2, 1])
TorchCriterion.from_pytorch(criterion, torch.ones(2, 2), torch.ones(2, 2))

def test_torch_net_predict_resnet(self):
model = torchvision.models.resnet18(pretrained=True).eval()
net = TorchNet.from_pytorch(model, [1, 3, 224, 224])
@@ -54,8 +103,7 @@ def test_linear_gradient_match(self):

# AZ part
az_net = TorchNet.from_pytorch(model, [1, 2])
az_criterion = TorchCriterion.from_pytorch(loss=criterion, input_shape=[1, 1],
label_shape=[1, 1])
az_criterion = TorchCriterion.from_pytorch(criterion, [1, 1], [1, 1])

az_input = np.array(input)
az_label = np.array(label)
@@ -107,9 +155,7 @@ def forward(self, x):

# AZ part
az_net = TorchNet.from_pytorch(torch_model, [1, 2])
az_criterion = TorchCriterion.from_pytorch(loss=torch_criterion.forward,
input_shape=[1, 1],
label_shape=[1, 1])
az_criterion = TorchCriterion.from_pytorch(torch_criterion.forward, [1, 1], [1, 1])

az_input = np.array(input)
az_label = np.array(label)
@@ -142,8 +188,7 @@ def lossFunc(input, target):

# AZ part
az_net = TorchNet.from_pytorch(model, [1, 2])
az_criterion = TorchCriterion.from_pytorch(loss=lossFunc, input_shape=[1, 10],
label_shape=[1, 1])
az_criterion = TorchCriterion.from_pytorch(lossFunc, [1, 10], [1, 1])

az_input = np.array(input)
az_label = np.array(label)
@@ -198,13 +243,12 @@ def forward(self, x):
torch_model.fc2.bias.grad.flatten().tolist()

# AZ part
az_net = TorchNet.from_pytorch(torch_model, input_shape=[1, 1, 28, 28])
az_net = TorchNet.from_pytorch(torch_model, [1, 1, 28, 28])

def lossFunc(input, target):
return torch_criterion.forward(input, target.flatten().long())

az_criterion = TorchCriterion.from_pytorch(loss=lossFunc, input_shape=[1, 10],
label_shape=[1, 1])
az_criterion = TorchCriterion.from_pytorch(lossFunc, [1, 10], [1, 1])

az_input = np.array(input)
az_label = np.array(label)
@@ -267,9 +311,9 @@ def lossFunc(input, label):

az_net = TorchNet.from_pytorch(model, [1, 2])
az_criterion = TorchCriterion.from_pytorch(
loss=lossFunc,
sample_input=(torch.ones(2, 2), torch.ones(2, 1)),
sample_label=(torch.ones(2, 2), torch.ones(2, 1)))
lossFunc,
(torch.ones(2, 2), torch.ones(2, 1)),
(torch.ones(2, 2), torch.ones(2, 1)))

az_input = np.array(input)
az_label = [np.ones([2, 2]), np.ones([2, 1])]
@@ -283,37 +327,6 @@ def lossFunc(input, label):
assert np.allclose(torch_loss.tolist(), az_loss_output)
assert np.allclose(torch_grad, az_grad.tolist())

def test_torchnet_constructor(self):
class TwoInputModel(nn.Module):
def __init__(self):
super(TwoInputModel, self).__init__()
self.dense1 = nn.Linear(2, 2)
self.dense2 = nn.Linear(3, 1)

def forward(self, x1, x2):
x1 = self.dense1(x1)
x2 = self.dense2(x2)
return x1, x2

az_net = TorchNet.from_pytorch(
TwoInputModel(), sample_input=(torch.ones(2, 2), torch.ones(2, 3)))
az_net = TorchNet.from_pytorch(TwoInputModel(), ([2, 2], [2, 3]))

def test_torchcriterion_constructor(self):
criterion = nn.MSELoss()

def lossFunc(input, label):
loss1 = criterion(input[0], label[0])
loss2 = criterion(input[1], label[1])
loss = loss1 + 0.4 * loss2
return loss

az_criterion = TorchCriterion.from_pytorch(
lossFunc,
sample_input=(torch.ones(2, 2), torch.ones(2, 3)),
sample_label=(torch.ones(2, 2), torch.ones(2, 3)))
az_criterion = TorchCriterion.from_pytorch(lossFunc, ([2, 2], [2, 3]), ([2, 2], [2, 3]))

def test_model_train_with_multiple_input(self):
class TwoInputModel(nn.Module):
def __init__(self):
@@ -349,11 +362,11 @@ def lossFunc(input, label):
model.dense2.weight.grad.tolist()[0] + \
model.dense2.bias.grad.tolist()

az_net = TorchNet.from_pytorch(model, sample_input=(torch.ones(2, 2), torch.ones(2, 2)))
az_net = TorchNet.from_pytorch(model, (torch.ones(2, 2), torch.ones(2, 2)))
az_criterion = TorchCriterion.from_pytorch(
loss=lossFunc,
sample_input=(torch.ones(2, 2), torch.ones(2, 1)),
sample_label=(torch.ones(2, 2), torch.ones(2, 1)))
lossFunc,
(torch.ones(2, 2), torch.ones(2, 1)),
(torch.ones(2, 2), torch.ones(2, 1)))

az_input = [np.array(input), np.array(input)]
az_label = [np.ones([2, 2]), np.ones([2, 1])]
2 changes: 2 additions & 0 deletions pyzoo/test/zoo/ray/test_ray_on_local.py
@@ -19,6 +19,7 @@
import psutil
import pytest
import ray
import time

from zoo import init_spark_on_local
from zoo.ray.util.raycontext import RayContext
@@ -44,6 +45,7 @@ def test_local(self):
print([ray.get(actor.hostname.remote()) for actor in actors])
ray_ctx.stop()
sc.stop()
time.sleep(1)
for process_info in ray_ctx.ray_processesMonitor.process_infos:
for pid in process_info.pids:
assert not psutil.pid_exists(pid)
16 changes: 8 additions & 8 deletions pyzoo/test/zoo/ray/test_util.py
@@ -25,14 +25,14 @@

class TestUtil(TestCase):

def test_split(self):
vector = np.ones([10])
result = rutils.split(vector, 4)
assert len(result) == 4
assert len(result[0]) == 3
assert len(result[1]) == 3
assert len(result[2]) == 2
assert len(result[3]) == 2
# def test_split(self):
# vector = np.ones([10])
# result = rutils.split(vector, 4)
# assert len(result) == 4
# assert len(result[0]) == 3
# assert len(result[1]) == 3
# assert len(result[2]) == 2
# assert len(result[3]) == 2

def test_resource_to_bytes(self):
assert 10 == rutils.resourceToBytes("10b")