Calculate nonzero count inside nonzero op #260

Merged 3 commits on Jan 27, 2023

Changes from 2 commits
60 changes: 40 additions & 20 deletions aten/src/ATen/native/mps/operations/Indexing.mm
@@ -140,7 +140,7 @@ bool dispatchIndexKernel(TensorIteratorBase& iter,
threadsPerThreadgroup: threadGroupSize];

[computeEncoder endEncoding];
mpsStream->commit(true);
mpsStream->synchronize(SyncType::COMMIT);
}
});

@@ -252,31 +252,43 @@ Tensor nonzero_fallback(const Tensor& self) {
MPSGraphTensor* inputTensor_ = nil;
MPSGraphTensor* outputTensor_ = nil;
MPSGraphTensor* scatterDataTensor_ = nil;
MPSGraphTensor* countNonzeroTensor_ = nil;
};

int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
at::native::resize_output(out_, {total_nonzero, nDim});
if (out_.numel() == 0) {
return out_;
}

bool contiguous_output = (out_.is_contiguous() && !out_.is_view());
Tensor out = out_;
if (!contiguous_output) {
out = at::native::empty_mps(
out_.sizes(),
// int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();

Review comment:

Do we want to upstream this block of commented code? If not, I suggest you remove it and keep it locally.

Collaborator Author:

Thanks - yes, this needs to be removed (part of old code). I'll update.

// auto self_cpu = self.detach().clone().cpu();
// int64_t total_nonzero_cpu = at::count_nonzero(self_cpu).item<int64_t>();

// std::string s;
// if (total_nonzero != total_nonzero_cpu) {
// s = std::to_string(total_nonzero) + string(" ") + std::to_string(total_nonzero_cpu) + " self(";
// for (int i = 0; i < self.sizes().size(); i++) {
// s += std::to_string(self.sizes()[i]) + ",";
// }
// s += ") ";
// s += self.is_view() ? "view " : "not_view ";
// s += self.is_contiguous() ? "contg" : "not_contg ";
// std::cout << s << std::endl;
// TORCH_CHECK(total_nonzero == total_nonzero_cpu, s.c_str());
// }



stream->synchronize(SyncType::COMMIT_AND_WAIT);
Tensor count_nonzero = at::empty({1}, self.options().dtype(kInt));
Tensor out = at::native::empty_mps(
{self.numel(), nDim == 0 ? 1 : nDim},
out_.scalar_type(),
c10::nullopt,
kMPS,
c10::nullopt,
c10::nullopt);
}

int64_t _apparentInputShape = 1;
for (auto dim : self.sizes()) {
_apparentInputShape *= dim;
}
MPSShape *apparentOutputShape = @[@(total_nonzero * nDim)];
MPSShape *apparentOutputShape = @[@(self.numel() * nDim)];
MPSShape *apparentInputShape = @[@(_apparentInputShape)];

// Pseudocode:
@@ -310,6 +322,11 @@ Tensor nonzero_fallback(const Tensor& self) {
MPSGraphTensor *inputNotEqualToZeroTensor = [mpsGraph notEqualWithPrimaryTensor:inputTensor
secondaryTensor:zeroTensor
name:nil];

MPSGraphTensor *countNonzero = [mpsGraph reductionSumWithTensor:inputNotEqualToZeroTensor
axis:0
name:nil];

MPSGraphTensor *maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor
toType:MPSDataTypeInt32
name:@"castToInt32"];
@@ -358,15 +375,17 @@ Tensor nonzero_fallback(const Tensor& self) {
newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->scatterDataTensor_ = scatterDataTensor;
newCachedGraph->outputTensor_ = outputTensor;
newCachedGraph->countNonzeroTensor_ = countNonzero;
}
return newCachedGraph;
});
cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
}

Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparentInputShape);
Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, contiguous_output ? out_ : out, apparentOutputShape);
Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, contiguous_output ? out_ : out, apparentOutputShape);
Placeholder countNonzeroPlaceholder = Placeholder(cachedGraph->countNonzeroTensor_, count_nonzero);
Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out, apparentOutputShape);
Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, out, apparentOutputShape);

// Create dictionary of inputs and outputs
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -375,15 +394,16 @@ Tensor nonzero_fallback(const Tensor& self) {
};

NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(),
countNonzeroPlaceholder.getMPSGraphTensor() : countNonzeroPlaceholder.getMPSGraphTensorData()
};

runMPSGraph(stream, cachedGraph->graph(), feeds, results);
if (!contiguous_output) {
out_.copy_(out);
}
}

int32_t total_nonzero = count_nonzero.item<int32_t>();
@razarmehr (Jan 27, 2023):

This will incur two hard-syncs: one from item(), and one from the following out_.copy_().
Later (not now), I suggest we create a dedicated MPSGraph for this part: we pre-allocate out_ with the same size as self (so we don't overflow the buffer when resizing), and do the zero-counting and resizing of the output in a single MPSGraph op. We can discuss that later.

Collaborator Author:

"We pre-allocate out_ with the same size as self (so we don't overflow the buffer when resizing)"

Doing that might allocate a new buffer and change the pointer of the out buffer (causing a failure in the test). E.g., if the user has a pre-allocated buffer from a previous nonzero op (and they know the exact number of nonzeros), calling nonzero(input, out=preallocated_out) again and resizing the output at the beginning of the function to match the input could allocate new memory when the input has more elements than the output (resize_). Previously the op called into count_nonzero at the beginning of the function to get the number of elements; this change gets the count from the same graph as nonzero (which seemed a bit faster in testing than the previous method).

"and do the zero-counting and resizing of output in a single MPSGraph op. We can discuss that later."

We can do it in the graph (both nonzero and count_nonzero happen in the same graph now), but the shape of the returned Tensor we preallocated at the beginning (not the MPSGraphTensor*) would still be wrong, and we'd still need to sync at the end (the .item() part) to get the number of nonzeros and resize it correctly. And if we preallocated the output at the beginning, it would hit the issue above: preallocating the output up front would work for 99% of the tests, but would fail where a preallocated output is passed to us.

at::native::resize_output(out_, {total_nonzero, nDim});
out_.copy_(out.resize_({total_nonzero, nDim}));
return out_;
}
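
For illustration, here is a minimal Python sketch of the preallocated-out scenario from the review thread above. It assumes an MPS-enabled PyTorch build, and the tensor values are made up for the example; it shows why resizing out_ to self.numel() rows up front could reallocate the caller's buffer, which the final resize_output/copy_ approach avoids:

    import torch

    # Hypothetical repro: reuse a preallocated `out` whose shape already
    # matches the result. If the op resized it up front to input.numel()
    # rows, resize_ could allocate new storage and the caller's buffer
    # would no longer receive the result.
    x = torch.tensor([0.0, 1.0, 0.0, 2.0], device="mps")

    out = torch.nonzero(x)        # first call: out has shape [2, 1]
    ptr = out.data_ptr()

    torch.nonzero(x, out=out)     # reuse; the nonzero count is unchanged,
                                  # so no resize (and no realloc) happens
    assert out.data_ptr() == ptr  # the storage must stay in place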
