# GPU/CUDA Diagnostic Test

Run each cell in order to diagnose the GPU detection issue.


In [None]:
# Cell 1: Check Environment Variables
import os


def diagnose_gpu_env(env_vars):
    """Return the diagnostic lines for a snapshot of GPU-related variables.

    Args:
        env_vars: Mapping of variable name to its value, where the literal
            string 'NOT SET' stands for "absent from the environment".

    Returns:
        A list of human-readable lines, each starting with "✓" for a healthy
        setting or "❌" (plus indented follow-up lines) for a problem.
    """
    lines = []

    if env_vars.get('LD_LIBRARY_PATH', 'NOT SET') == 'NOT SET':
        lines.append("❌ PROBLEM: LD_LIBRARY_PATH not set in kernel!")
        lines.append("   The kernel config is not being applied.")
    else:
        lines.append("✓ LD_LIBRARY_PATH is set")

    nvidia = env_vars.get('NVIDIA_VISIBLE_DEVICES', 'NOT SET')
    if nvidia == 'NOT SET':
        lines.append("❌ PROBLEM: NVIDIA_VISIBLE_DEVICES not set!")
    elif nvidia in ('void', 'none', ''):
        # The NVIDIA container runtime exposes no GPU for any of these values,
        # not just for 'void'.
        shown = nvidia if nvidia else '<empty>'
        lines.append(f"❌ PROBLEM: NVIDIA_VISIBLE_DEVICES={shown} (hides GPU!)")
    else:
        lines.append(f"✓ NVIDIA_VISIBLE_DEVICES: {nvidia}")

    # CUDA stops enumerating at the first invalid entry, so an empty value or
    # a leading -1 leaves no device visible even when the driver works.
    cuda = env_vars.get('CUDA_VISIBLE_DEVICES', 'NOT SET')
    if cuda != 'NOT SET' and cuda.split(',')[0].strip() in ('', '-1'):
        lines.append(f"❌ PROBLEM: CUDA_VISIBLE_DEVICES={cuda!r} (hides GPU from CUDA!)")

    return lines


print("=" * 60)
print("KERNEL ENVIRONMENT VARIABLES")
print("=" * 60)

# Snapshot the variables once so the listing and the diagnosis agree.
env_vars = {
    name: os.environ.get(name, 'NOT SET')
    for name in (
        'LD_LIBRARY_PATH',
        'CUDA_HOME',
        'CUDA_ROOT',
        'CUDA_PATH',
        'NVIDIA_VISIBLE_DEVICES',
        'CUDA_VISIBLE_DEVICES',
    )
}

for key, value in env_vars.items():
    # Truncate very long values (typically LD_LIBRARY_PATH) to keep output readable.
    shown = f"{value[:100]}..." if len(value) > 100 else value
    print(f"{key}: {shown}")

print("\n" + "=" * 60)
for line in diagnose_gpu_env(env_vars):
    print(line)


In [None]:
# Cell 2: Check nvidia-smi
import subprocess

print("=" * 60)
print("NVIDIA-SMI CHECK")
print("=" * 60)

# Run `nvidia-smi -L` with a short timeout so a hung driver cannot block the
# notebook; a missing binary or a timeout is reported instead of raised.
try:
    result = subprocess.run(['nvidia-smi', '-L'], capture_output=True, text=True, timeout=5)
except (OSError, subprocess.SubprocessError) as e:
    print(f"❌ Error running nvidia-smi: {e}")
else:
    print(result.stdout)
    if result.returncode == 0 and 'GPU' in result.stdout:
        print("✓ nvidia-smi can see GPU")
    else:
        print("❌ nvidia-smi failed or no GPU found")
        # Surface the reason: the exit code and whatever the tool wrote to
        # stderr are the only clues to why it failed.
        print(f"   exit code: {result.returncode}")
        stderr_text = result.stderr.strip()
        if stderr_text:
            print(f"   stderr: {stderr_text}")


In [None]:
# Cell 3: PyTorch CUDA Check (BEFORE init)
import torch

rule = "=" * 60
print(rule, "PYTORCH CUDA (BEFORE INIT)", rule, sep="\n")

# Query availability once and reuse it for both the report and the verdict.
cuda_ok = torch.cuda.is_available()

summary = {
    "PyTorch version": torch.__version__,
    "CUDA compiled version": torch.version.cuda,
    "CUDA available": cuda_ok,
    "Device count": torch.cuda.device_count(),
}
for label, shown in summary.items():
    print(f"{label}: {shown}")

if not cuda_ok:
    print("\n❌ PyTorch cannot see CUDA\n   Will try forced initialization next...")


In [None]:
# Cell 4: Force CUDA Initialization
import torch  # no-op if Cell 3 already ran; keeps this cell runnable on its own

print("=" * 60)
print("FORCING CUDA INITIALIZATION")
print("=" * 60)

try:
    torch.cuda.init()
    print("✓ torch.cuda.init() succeeded")
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Device count: {torch.cuda.device_count()}")

    if torch.cuda.is_available():
        print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
        print(f"✓ Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        print("\n🎉 SUCCESS! GPU is working!")
    else:
        print("\n❌ Init succeeded but CUDA still not available")

except (RuntimeError, AssertionError) as e:
    # RuntimeError covers driver/device failures; AssertionError is what a
    # CPU-only PyTorch build raises ("Torch not compiled with CUDA enabled").
    # Both must end in a diagnosis, not a traceback.
    print(f"❌ torch.cuda.init() FAILED: {e}")
    print("\nThis means PyTorch cannot initialize CUDA.")


In [None]:
# Cell 5: Low-level CUDA Runtime Check
import ctypes

print("=" * 60)
print("LOW-LEVEL CUDA RUNTIME CHECK")
print("=" * 60)

# Ask the dynamic loader for the soname first (this honours LD_LIBRARY_PATH
# and the ld.so cache), then fall back to the Debian/Ubuntu multi-arch path
# so the check still works on images where only that path is populated.
_LIBCUDA_CANDIDATES = (
    "libcuda.so.1",
    "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
)

libcuda = None
_load_errors = []
for _candidate in _LIBCUDA_CANDIDATES:
    try:
        libcuda = ctypes.CDLL(_candidate)
    except OSError as e:
        _load_errors.append(f"{_candidate}: {e}")
    else:
        print("✓ libcuda.so.1 loaded")
        print(f"   (resolved as: {_candidate})")
        break

if libcuda is None:
    print("❌ Error: could not load libcuda.so.1")
    for _msg in _load_errors:
        print(f"   {_msg}")
else:
    try:
        # Try cuInit; it must succeed before any other driver call is valid.
        result = libcuda.cuInit(0)
        print(f"cuInit result: {result} ", end="")

        if result == 0:
            print("(✓ SUCCESS)")

            # Try getting device count; the driver writes it through the pointer.
            count = ctypes.c_int()
            result = libcuda.cuDeviceGetCount(ctypes.byref(count))
            print(f"cuDeviceGetCount result: {result} ", end="")

            if result == 0:
                print("(✓ SUCCESS)")
                print(f"GPU count from CUDA runtime: {count.value}")
            else:
                print("(❌ FAILED)")

        elif result == 100:
            print("(❌ CUDA_ERROR_NO_DEVICE)")
            print("\nDiagnosis: CUDA driver can't see devices")
            print("Likely cause: NVIDIA_VISIBLE_DEVICES=void or permissions issue")
        elif result == 3:
            print("(❌ CUDA_ERROR_NOT_INITIALIZED)")
        else:
            print(f"(❌ Unknown error code: {result})")

    except Exception as e:
        # Diagnostic boundary: report (e.g. a missing symbol) rather than
        # abort the notebook with a traceback.
        print(f"❌ Error: {e}")


In [None]:
# Cell 6: Test Unsloth Import
rule = "=" * 60
print(rule, "UNSLOTH IMPORT TEST", rule, sep="\n")

# Attempt the import and report the outcome; any failure is shown as a
# message rather than a traceback.
try:
    from unsloth import FastLanguageModel
except Exception as err:
    print(f"❌ Unsloth import FAILED: {err}")
    print("\nThis is expected if GPU detection failed in previous cells.")
else:
    print("✅ SUCCESS! Unsloth imported successfully!")
    print("GPU is working and Unsloth can use it.")


## Interpretation

**If Cell 1 shows `LD_LIBRARY_PATH: NOT SET`:**
- The kernel configuration is not being applied
- Restart the container with the home volume removed: `docker-compose down && docker volume rm unsloth-launch_unsloth-home && docker-compose up -d`

**If Cell 1 shows `NVIDIA_VISIBLE_DEVICES: void`:**
- This explicitly hides GPUs from the process
- The docker-compose.yml must not set this variable
- Docker's NVIDIA runtime should handle it automatically

**If Cell 5 shows `cuInit result: 100` (CUDA_ERROR_NO_DEVICE):**
- The CUDA driver cannot see any devices
- Usually caused by NVIDIA_VISIBLE_DEVICES=void
- Or container not started with proper GPU access

**If Cell 4 succeeds:**
- GPU is working! Unsloth should work in Cell 6.
