```python
def reduce_max(input_tensor,
               axis=None,
               keepdims=None,
               name=None,
               reduction_indices=None,
               keep_dims=None):
  """Returns the maximum of a tensor's elements over the given dimensions.

  `input_tensor` is reduced along every dimension listed in `axis`. Each
  reduced dimension is dropped from the result (lowering its rank by 1 per
  entry) unless `keepdims` is true, in which case the reduced dimensions are
  kept with length 1.

  If `axis` is `None`, all dimensions are reduced, and a tensor with a single
  element is returned.

  Args:
    input_tensor: The tensor to reduce. Should have numeric type.
    axis: The dimensions to reduce. If `None` (the default),
      reduces all dimensions. Must be in the range
      `[-rank(input_tensor), rank(input_tensor))`.
    keepdims: If true, retains reduced dimensions with length 1.
    name: A name for the operation (optional).
    reduction_indices: The old (deprecated) name for axis.
    keep_dims: Deprecated alias for `keepdims`.

  Returns:
    The reduced tensor.

  @compatibility(numpy)
  Equivalent to np.max
  @end_compatibility
  """
  # Resolve the new-style argument against its deprecated alias; when neither
  # was supplied, fall back to dropping the reduced dimensions.
  keepdims = deprecation.deprecated_argument_lookup(
      "keepdims", keepdims, "keep_dims", keep_dims)
  keepdims = False if keepdims is None else keepdims

  reduce_dims = _ReductionDims(input_tensor, axis, reduction_indices)
  max_op = gen_math_ops._max(input_tensor, reduce_dims, keepdims, name=name)
  return _may_reduce_to_scalar(keepdims, axis, reduction_indices, max_op)
```

C++ code for the op definition:

```cpp
// Registers the "ArgMax" op interface:
//   input      - tensor of element type `T` (constrained to `numbertype`).
//   dimension  - tensor of type `Tidx` (int32 or int64; defaults to int32).
//   output     - tensor of type `output_type` (int32 or int64; defaults to
//                int64).
// Shape inference is delegated to `ArgOpShape`.
REGISTER_OP("ArgMax")
    .Input("input: T")
    .Input("dimension: Tidx")
    .Output("output: output_type")
    .Attr("T: numbertype")
    .Attr("Tidx: {int32, int64} = DT_INT32")
    .Attr("output_type: {int32, int64} = DT_INT64")
    .SetShapeFn(ArgOpShape);
```

In tensorflow/tensorflow/core/kernels/argmax_op_gpu.cu.cc, we have:

```cpp


#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/argmax_op.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// Explicitly instantiates the ArgMax and ArgMin functors on GPUDevice for
// element type T, once for each supported output index type (int64, int32).
#define DEFINE_GPU_SPEC(T)                              \
  template struct functor::ArgMax<GPUDevice, T, int64>; \
  template struct functor::ArgMin<GPUDevice, T, int64>; \
  template struct functor::ArgMax<GPUDevice, T, int32>; \
  template struct functor::ArgMin<GPUDevice, T, int32>;

// Applies DEFINE_GPU_SPEC to every type that TF_CALL_GPU_NUMBER_TYPES
// enumerates (the list itself is defined in register_types.h, not shown here).
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA
```

## tensorflow/tensorflow/core/kernels/argmax_op.h

```cpp

#ifndef TENSORFLOW_KERNELS_ARGMAX_OP_H_
#define TENSORFLOW_KERNELS_ARGMAX_OP_H_
// Generator definition for ArgMaxOp, must be compilable by nvcc.

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

namespace functor {

// Functor computing, along one dimension of a tensor, the index of the
// maximum element.
//
// Template parameters:
//   Device - Eigen device the expression is evaluated on.
//   T      - element type of the input tensor.
//   Tout   - integer type the resulting indices are cast to.
//
// One static method `Reduce<N>` is generated per supported input rank N
// (1 through 5). Each takes:
//   d         - device to evaluate on.
//   input     - rank-N input tensor.
//   dimension - index of the dimension to reduce.
//   output    - rank-(N-1) tensor receiving the indices.
template <typename Device, typename T, typename Tout>
struct ArgMax {
#define DECLARE_COMPUTE_SPEC(Dims)                                             \
  EIGEN_ALWAYS_INLINE static void Reduce##Dims(                                \
      const Device& d, typename TTypes<T, Dims>::ConstTensor input,            \
      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output) { \
    output.device(d) = input.argmax(dimension).template cast<Tout>();          \
  }

  DECLARE_COMPUTE_SPEC(1);
  DECLARE_COMPUTE_SPEC(2);
  DECLARE_COMPUTE_SPEC(3);
  DECLARE_COMPUTE_SPEC(4);
  DECLARE_COMPUTE_SPEC(5);

#undef DECLARE_COMPUTE_SPEC
};

// Functor computing, along one dimension of a tensor, the index of the
// minimum element. Mirrors `ArgMax` exactly (same template parameters and
// the same `Reduce1` .. `Reduce5` signatures), using Eigen's `argmin` instead
// of `argmax`.
template <typename Device, typename T, typename Tout>
struct ArgMin {
#define DECLARE_COMPUTE_SPEC(Dims)                                             \
  EIGEN_ALWAYS_INLINE static void Reduce##Dims(                                \
      const Device& d, typename TTypes<T, Dims>::ConstTensor input,            \
      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output) { \
    output.device(d) = input.argmin(dimension).template cast<Tout>();          \
  }

  DECLARE_COMPUTE_SPEC(1);
  DECLARE_COMPUTE_SPEC(2);
  DECLARE_COMPUTE_SPEC(3);
  DECLARE_COMPUTE_SPEC(4);
  DECLARE_COMPUTE_SPEC(5);

#undef DECLARE_COMPUTE_SPEC
};

}  // namespace functor

}  // namespace tensorflow

#endif  // TENSORFLOW_KERNELS_ARGMAX_OP_H_
```

## tensorflow/tensorflow/core/kernels/argmax_op.cc
```cpp
#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include "tensorflow/core/kernels/argmax_op.h"

#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T, typename Tout, typename ArgFunctor>
class ArgOp : public OpKernel {
 public:
  explicit ArgOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& dimension = context->input(1);

    OP_REQUIRES(context, TensorShapeUtils::IsScalar(dimension.shape()),
                errors::InvalidArgument(
                    "dim must be a scalar, but received tensor of shape: ",
                    dimension.shape().DebugString()));

    const int32 dim = internal::SubtleMustCopy(dimension.scalar<int32>()());
    const int input_dims = input.dims();

    int axis = dim < 0 ? dim + input_dims : dim;

    OP_REQUIRES(context, axis >= 0 && axis < input_dims,
                errors::InvalidArgument("Expected dimension in the range [",
                                        -input_dims, ", ", input_dims,
                                        "), but got ", dim));
    OP_REQUIRES(
        context, input.dim_size(axis) > 0,
        errors::InvalidArgument("Reduction axis ", dim, " is empty in shape ",
                                input.shape().DebugString()));

    TensorShape output_shape;
    const TensorShape& input_shape = input.shape();
    for (int d = 0; d < input_dims - 1; ++d) {
      output_shape.AddDim(input_shape.dim_size((d < axis) ? d : d + 1));
    }
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

#define HANDLE_DIM(NDIM)                                        \
  case NDIM:                                                    \
    ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),   \
                             input.tensor<T, NDIM>(), axis,     \
                             output->tensor<Tout, NDIM - 1>()); \
    break;

    switch (input_dims) {
      HANDLE_DIM(1);
      HANDLE_DIM(2);
      HANDLE_DIM(3);
      HANDLE_DIM(4);
      HANDLE_DIM(5);

      default:
        OP_REQUIRES(context, false,
                    errors::InvalidArgument(
                        "ArgOp : Unhandled input dimensions: ", input_dims));
    }
  }
#undef HANDLE_DIM

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
};

// Kernel class registered for the "ArgMax" op: ArgOp bound to the ArgMax
// functor for the given device, element type and output index type.
template <typename Device, typename T, typename Tout>
class ArgMaxOp
    : public ArgOp<Device, T, Tout, functor::ArgMax<Device, T, Tout> > {
  // Shorthand for the (verbose) base class type.
  typedef ArgOp<Device, T, Tout, functor::ArgMax<Device, T, Tout> > Base;

 public:
  explicit ArgMaxOp(OpKernelConstruction* context) : Base(context) {}
};

// Kernel class registered for the "ArgMin" op: ArgOp bound to the ArgMin
// functor for the given device, element type and output index type.
template <typename Device, typename T, typename Tout>
class ArgMinOp
    : public ArgOp<Device, T, Tout, functor::ArgMin<Device, T, Tout> > {
  // Shorthand for the (verbose) base class type.
  typedef ArgOp<Device, T, Tout, functor::ArgMin<Device, T, Tout> > Base;

 public:
  explicit ArgMinOp(OpKernelConstruction* context) : Base(context) {}
};

// Registers the CPU kernels for element type `type`: ArgMax and ArgMin, each
// once per output index type (int64 and int32). The "dimension" input is
// declared as living in host memory. These registrations place no constraint
// on the "Tidx" attribute.
#define REGISTER_ARGMAX(type)                                       \
  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
                              .Device(DEVICE_CPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int64>("output_type") \
                              .HostMemory("dimension"),             \
                          ArgMaxOp<CPUDevice, type, int64>);        \
  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
                              .Device(DEVICE_CPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int64>("output_type") \
                              .HostMemory("dimension"),             \
                          ArgMinOp<CPUDevice, type, int64>);        \
  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
                              .Device(DEVICE_CPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int32>("output_type") \
                              .HostMemory("dimension"),             \
                          ArgMaxOp<CPUDevice, type, int32>);        \
  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
                              .Device(DEVICE_CPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int32>("output_type") \
                              .HostMemory("dimension"),             \
                          ArgMinOp<CPUDevice, type, int32>);

TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX);

// The helper macro is only needed for the expansion above; drop it so it does
// not leak into the rest of the translation unit.
#undef REGISTER_ARGMAX

#if GOOGLE_CUDA

// Forward declarations of the functor specializations for GPU.
//
// The kernels in this file only reference the GPU functors; these
// declarations tell the compiler that the specializations and class
// instantiations exist in another translation unit instead of instantiating
// them here.
namespace functor {

// Declares the ArgMax and ArgMin `Reduce<Dims>` specializations on GPUDevice
// for one (element type, output index type, input rank) combination.
#define DECLARE_GPU_SPEC(T, Tout, Dims)                                       \
  template <>                                                                 \
  void ArgMax<GPUDevice, T, Tout>::Reduce##Dims(                              \
      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,        \
      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output); \
  template <>                                                                 \
  void ArgMin<GPUDevice, T, Tout>::Reduce##Dims(                              \
      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,        \
      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output);

// Expands DECLARE_GPU_SPEC for both output index types and ranks 1 through 5.
#define DECLARE_GPU_SPECS(T)     \
  DECLARE_GPU_SPEC(T, int64, 1); \
  DECLARE_GPU_SPEC(T, int64, 2); \
  DECLARE_GPU_SPEC(T, int64, 3); \
  DECLARE_GPU_SPEC(T, int64, 4); \
  DECLARE_GPU_SPEC(T, int64, 5); \
  DECLARE_GPU_SPEC(T, int32, 1); \
  DECLARE_GPU_SPEC(T, int32, 2); \
  DECLARE_GPU_SPEC(T, int32, 3); \
  DECLARE_GPU_SPEC(T, int32, 4); \
  DECLARE_GPU_SPEC(T, int32, 5);

// Declares the GPU functor classes as externally instantiated templates.
#define DECLARE_GPU_CLASS(T)                          \
  extern template struct ArgMax<GPUDevice, T, int64>; \
  extern template struct ArgMin<GPUDevice, T, int64>; \
  extern template struct ArgMax<GPUDevice, T, int32>; \
  extern template struct ArgMin<GPUDevice, T, int32>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_CLASS);

// All three helper macros are fully expanded by this point; undefine every
// one of them (including the inner DECLARE_GPU_SPEC) so none leaks out.
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_CLASS

}  // namespace functor

// Registration of the GPU implementations.
//
// For element type `type`, registers ArgMax and ArgMin on DEVICE_GPU, each
// once per output index type (int64 and int32). Unlike the CPU registrations,
// these additionally constrain "Tidx" to int32. The "dimension" input is
// declared as living in host memory.
#define REGISTER_ARGMAX_GPU(type)                                   \
  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
                              .Device(DEVICE_GPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int64>("output_type") \
                              .TypeConstraint<int32>("Tidx")        \
                              .HostMemory("dimension"),             \
                          ArgMaxOp<GPUDevice, type, int64>);        \
  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
                              .Device(DEVICE_GPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int64>("output_type") \
                              .TypeConstraint<int32>("Tidx")        \
                              .HostMemory("dimension"),             \
                          ArgMinOp<GPUDevice, type, int64>);        \
  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
                              .Device(DEVICE_GPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int32>("output_type") \
                              .TypeConstraint<int32>("Tidx")        \
                              .HostMemory("dimension"),             \
                          ArgMaxOp<GPUDevice, type, int32>);        \
  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
                              .Device(DEVICE_GPU)                   \
                              .TypeConstraint<type>("T")            \
                              .TypeConstraint<int32>("output_type") \
                              .TypeConstraint<int32>("Tidx")        \
                              .HostMemory("dimension"),             \
                          ArgMinOp<GPUDevice, type, int32>);

// Expand the registrations for every type TF_CALL_GPU_NUMBER_TYPES enumerates.
TF_CALL_GPU_NUMBER_TYPES(REGISTER_ARGMAX_GPU);

#undef REGISTER_ARGMAX_GPU

#endif  // GOOGLE_CUDA

}  // namespace tensorflow
```