From 93d7c6c0025fc8553d4d1298473350facbc87c2c Mon Sep 17 00:00:00 2001
From: Jrelvas <55360900+jrelvas-ipc@users.noreply.github.com>
Date: Tue, 30 Apr 2024 21:17:52 +0100
Subject: [PATCH] hwdec_cuda: avoid gpu wakeup by deferring cuInit

`cuInit` wakes up the NVIDIA dGPU on NVIDIA laptops. This is bad news because the wake-up process
is blocking and takes a few seconds. It also needlessly increases power consumption.

Some VOs (like `dmabuf_wayland`) load several hwdecs. When the `cuda` hwdec is loaded, it calls
`cuInit` before running all interop inits. However, the first checks in the interops do not
require CUDA initialization, so we only need to call `cuInit` after those checks.

This commit splits the interop `init` function into `check` and `init`. `check` can be called without
initializing the CUDA backend, so `cuInit` is only called *after* the first interop check that succeeds.

With these changes, CUDA is not initialized if no OpenGL/Vulkan backend is available. This prevents
`dmabuf_wayland` and other VOs which automatically load the `cuda` hwdec from waking up the NVIDIA dGPU
unnecessarily, making them start faster and decreasing power consumption on laptops.

Fixes: https://github.com/mpv-player/mpv/issues/13668
---
 video/out/hwdec/hwdec_cuda.c    | 47 ++++++++++++++++++++-------------
 video/out/hwdec/hwdec_cuda.h    |  9 ++++---
 video/out/hwdec/hwdec_cuda_gl.c | 19 +++++++++----
 video/out/hwdec/hwdec_cuda_vk.c | 21 ++++++++++-----
 4 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/video/out/hwdec/hwdec_cuda.c b/video/out/hwdec/hwdec_cuda.c
index 57e4fb40e357..8987cf3407ec 100644
--- a/video/out/hwdec/hwdec_cuda.c
+++ b/video/out/hwdec/hwdec_cuda.c
@@ -57,12 +57,12 @@ int check_cu(const struct ra_hwdec *hw, CUresult err, const char *func)
 
 #define CHECK_CU(x) check_cu(hw, (x), #x)
 
-static const cuda_interop_init interop_inits[] = {
+static const struct cuda_interop_fn *interop_fns[] = {
 #if HAVE_GL
-    cuda_gl_init,
+    &cuda_gl_fn,
 #endif
 #if HAVE_VULKAN
-    cuda_vk_init,
+    &cuda_vk_fn,
 #endif
     NULL
 };
@@ -73,25 +73,36 @@ static int cuda_init(struct ra_hwdec *hw)
     CUcontext dummy;
     int ret = 0;
     struct cuda_hw_priv *p = hw->priv;
-    CudaFunctions *cu;
+    CudaFunctions *cu = NULL;
     int level = hw->probing ? MSGL_V : MSGL_ERR;
-
-    ret = cuda_load_functions(&p->cu, NULL);
-    if (ret != 0) {
-        MP_MSG(hw, level, "Failed to load CUDA symbols\n");
-        return -1;
-    }
-    cu = p->cu;
-
-    ret = CHECK_CU(cu->cuInit(0));
-    if (ret < 0)
-        return -1;
+    bool initialized = false;
 
     // Initialise CUDA context from backend.
-    for (int i = 0; interop_inits[i]; i++) {
-        if (interop_inits[i](hw)) {
-            break;
+    // Note that the interop check doesn't require the CUDA backend to be initialized.
+    // This is important because cuInit wakes up the dgpu (even if the cuda hwdec won't be used!)
+    // Doing this allows us to check if CUDA should be used without waking up the dgpu, avoiding
+    // a few seconds of delay and improving battery life for laptops!
+    for (int i = 0; interop_fns[i]; i++) {
+        if (!interop_fns[i]->check(hw))
+            continue;
+
+        if (!initialized) {
+            ret = cuda_load_functions(&p->cu, NULL);
+            if (ret != 0) {
+                MP_MSG(hw, level, "Failed to load CUDA symbols\n");
+                return -1;
+            }
+
+            cu = p->cu;
+            ret = CHECK_CU(cu->cuInit(0));
+            if (ret < 0)
+                return -1;
+
+            initialized = true;
         }
+
+        if (interop_fns[i]->init(hw))
+            break;
     }
 
     if (!p->ext_init || !p->ext_uninit) {
diff --git a/video/out/hwdec/hwdec_cuda.h b/video/out/hwdec/hwdec_cuda.h
index 9c55053d5933..6e671b364e3e 100644
--- a/video/out/hwdec/hwdec_cuda.h
+++ b/video/out/hwdec/hwdec_cuda.h
@@ -50,10 +50,13 @@ struct cuda_mapper_priv {
     void *ext[4];
 };
 
-typedef bool (*cuda_interop_init)(const struct ra_hwdec *hw);
+struct cuda_interop_fn { // per-backend interop hooks; cuda_init() walks a NULL-terminated table of these
+    bool (*check)(const struct ra_hwdec *hw); // run before CUDA symbols are loaded/cuInit; false => backend skipped
+    bool (*init)(const struct ra_hwdec *hw); // run only after check() passed and cuInit succeeded; true => stop probing
+};
 
-bool cuda_gl_init(const struct ra_hwdec *hw);
+extern struct cuda_interop_fn cuda_gl_fn;
 
-bool cuda_vk_init(const struct ra_hwdec *hw);
+extern struct cuda_interop_fn cuda_vk_fn;
 
 int check_cu(const struct ra_hwdec *hw, CUresult err, const char *func);
diff --git a/video/out/hwdec/hwdec_cuda_gl.c b/video/out/hwdec/hwdec_cuda_gl.c
index f20540ed4d3c..d4c0104c8daa 100644
--- a/video/out/hwdec/hwdec_cuda_gl.c
+++ b/video/out/hwdec/hwdec_cuda_gl.c
@@ -106,11 +106,7 @@ static void cuda_ext_gl_uninit(const struct ra_hwdec_mapper *mapper, int n)
 #undef CHECK_CU
 #define CHECK_CU(x) check_cu(hw, (x), #x)
 
-bool cuda_gl_init(const struct ra_hwdec *hw) {
-    int ret = 0;
-    struct cuda_hw_priv *p = hw->priv;
-    CudaFunctions *cu = p->cu;
-
+static bool cuda_gl_check(const struct ra_hwdec *hw) {
     if (ra_is_gl(hw->ra_ctx->ra)) {
         GL *gl = ra_gl_get(hw->ra_ctx->ra);
         if (gl->version < 210 && gl->es < 300) {
@@ -122,6 +118,14 @@ bool cuda_gl_init(const struct ra_hwdec *hw) {
         return false;
     }
 
+    return true;
+}
+
+static bool cuda_gl_init(const struct ra_hwdec *hw) {
+    int ret = 0;
+    struct cuda_hw_priv *p = hw->priv;
+    CudaFunctions *cu = p->cu;
+
     CUdevice display_dev;
     unsigned int device_count;
     ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1,
@@ -172,3 +176,8 @@ bool cuda_gl_init(const struct ra_hwdec *hw) {
 
     return true;
 }
+
+struct cuda_interop_fn cuda_gl_fn = {
+    .check = cuda_gl_check,
+    .init = cuda_gl_init
+};
diff --git a/video/out/hwdec/hwdec_cuda_vk.c b/video/out/hwdec/hwdec_cuda_vk.c
index b9f8caa8150d..e1aca5be1693 100644
--- a/video/out/hwdec/hwdec_cuda_vk.c
+++ b/video/out/hwdec/hwdec_cuda_vk.c
@@ -272,12 +272,7 @@ static bool cuda_ext_vk_signal(const struct ra_hwdec_mapper *mapper, int n)
 #undef CHECK_CU
 #define CHECK_CU(x) check_cu(hw, (x), #x)
 
-bool cuda_vk_init(const struct ra_hwdec *hw) {
-    int ret = 0;
-    int level = hw->probing ? MSGL_V : MSGL_ERR;
-    struct cuda_hw_priv *p = hw->priv;
-    CudaFunctions *cu = p->cu;
-
+static bool cuda_vk_check(const struct ra_hwdec *hw) {
     pl_gpu gpu = ra_pl_get(hw->ra_ctx->ra);
     if (gpu != NULL) {
         if (!(gpu->export_caps.tex & HANDLE_TYPE)) {
@@ -294,6 +289,16 @@ bool cuda_vk_init(const struct ra_hwdec *hw) {
         return false;
     }
 
+    return true;
+}
+
+static bool cuda_vk_init(const struct ra_hwdec *hw) {
+    int ret = 0;
+    int level = hw->probing ? MSGL_V : MSGL_ERR;
+    struct cuda_hw_priv *p = hw->priv;
+    CudaFunctions *cu = p->cu;
+    pl_gpu gpu = ra_pl_get(hw->ra_ctx->ra);
+
     if (!cu->cuImportExternalMemory) {
         MP_MSG(hw, level, "CUDA hwdec with Vulkan requires driver version 410.48 or newer.\n");
         return false;
@@ -342,3 +347,7 @@ bool cuda_vk_init(const struct ra_hwdec *hw) {
     return true;
 }
 
+struct cuda_interop_fn cuda_vk_fn = {
+    .check = cuda_vk_check,
+    .init = cuda_vk_init
+};