In [None]:
import torch
import gc
import os

def check_gpu_memory(gpu_id):
    """Print and return the memory usage of a single CUDA device.

    Args:
        gpu_id: Zero-based CUDA device index.

    Returns:
        A tuple ``(allocated, cached, free)`` in GiB. ``allocated`` and
        ``cached`` are the figures PyTorch's allocator reports for this
        process. ``free`` is the device-wide free memory reported by the
        driver when that query succeeds; otherwise it falls back to total
        memory minus this process's reserved memory. ``(0, 0, 0)`` is
        returned when the device is unavailable or the check fails.
    """
    bytes_per_gib = 1024**3

    # Guard clause: also reject negative indices, which would otherwise pass
    # the upper-bound check.
    if not (torch.cuda.is_available() and 0 <= gpu_id < torch.cuda.device_count()):
        print(f"GPU {gpu_id} 不可用")
        return 0, 0, 0

    try:
        total_memory = torch.cuda.get_device_properties(gpu_id).total_memory / bytes_per_gib
        allocated = torch.cuda.memory_allocated(gpu_id) / bytes_per_gib
        cached = torch.cuda.memory_reserved(gpu_id) / bytes_per_gib

        # `total - cached` only knows about this process's allocations, so it
        # over-reports free memory whenever other processes use the device.
        # Prefer the driver-level query, which is device-wide.
        # NOTE(review): this query likely initialises a CUDA context on the
        # device being inspected — confirm that is acceptable for devices
        # that are only being observed.
        try:
            free = torch.cuda.mem_get_info(gpu_id)[0] / bytes_per_gib
        except (AttributeError, RuntimeError):
            # Older PyTorch without mem_get_info, or the driver query failed:
            # fall back to the per-process estimate.
            free = total_memory - cached

        print(f"GPU {gpu_id}:")
        print(f"  总显存: {total_memory:.1f}GB")
        print(f"  已分配: {allocated:.1f}GB")
        print(f"  已缓存: {cached:.1f}GB")
        print(f"  可用显存: {free:.1f}GB")

        return allocated, cached, free
    except Exception as e:
        # Best-effort diagnostic: report the failure instead of aborting the
        # caller's loop over devices.
        print(f"GPU {gpu_id} 检查失败: {e}")
        return 0, 0, 0

def clear_specific_gpu_memory(gpu_ids):
    """Release cached CUDA memory for the given devices and report the result.

    Prints the memory status of every listed device before and after the
    cleanup (via ``check_gpu_memory``).

    Args:
        gpu_ids: Iterable of zero-based CUDA device indices to clean.

    Returns:
        The sum, in GiB, of the free memory that ``check_gpu_memory`` reports
        for the listed devices after the cleanup.
    """
    # The ids are walked three times; materialise them so a one-shot iterable
    # is not exhausted after the first pass.
    devices = list(gpu_ids)

    print("=== 清理前的显存状态 ===")
    for gpu_id in devices:
        check_gpu_memory(gpu_id)

    print(f"\n开始清理GPU {gpu_ids}的显存...")

    # Collect garbage BEFORE emptying the cache: tensors that are only kept
    # alive by reference cycles must be freed first, otherwise their blocks
    # are still in use when the cache is emptied and stay reserved.
    gc.collect()

    for gpu_id in devices:
        try:
            # Use the context manager rather than set_device so the process's
            # current device is restored afterwards.
            with torch.cuda.device(gpu_id):
                torch.cuda.empty_cache()

            print(f"✅ GPU {gpu_id} 显存清理完成")

        except Exception as e:
            # Best-effort: a failure on one device must not stop the others.
            print(f"❌ GPU {gpu_id} 清理失败: {e}")

    print("\n=== 清理后的显存状态 ===")
    # check_gpu_memory returns (allocated, cached, free); only free is summed.
    total_free = sum(check_gpu_memory(gpu_id)[2] for gpu_id in devices)

    print(f"\nGPU {gpu_ids} 总可用显存: {total_free:.1f}GB")
    return total_free

def clear_pytorch_cache():
    """Run a garbage collection pass and release PyTorch's cached CUDA memory.

    Prints a start and a completion message. Returns ``None``.
    """
    print("\n清理PyTorch全局缓存...")
    # Collect first so that memory owned by unreachable tensors is handed back
    # to the caching allocator before the cache is emptied; in the opposite
    # order those blocks would remain reserved.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("✅ PyTorch缓存清理完成")

# Script entry point
if __name__ == "__main__":
    print("GPU显存清理工具")
    print("目标: 只清理GPU 0和GPU 2，保持GPU 1和GPU 3不变")

    # Without CUDA there is nothing to inspect or release. Raise SystemExit
    # directly with a non-zero status: the `exit()` helper is injected by the
    # `site` module for interactive sessions and is not meant for programs.
    if not torch.cuda.is_available():
        print("❌ CUDA不可用，无法清理GPU显存")
        raise SystemExit(1)

    print(f"检测到 {torch.cuda.device_count()} 个GPU")

    # Devices to clean
    target_gpus = [0, 2]

    # Free memory (GB) above which the INT4 loading suggestion is printed
    min_free_gb_for_int4 = 17

    # Show the state of every GPU for comparison
    print("\n=== 所有GPU当前状态 ===")
    for i in range(torch.cuda.device_count()):
        check_gpu_memory(i)

    # Clean the selected GPUs
    print(f"\n{'='*50}")
    available_memory = clear_specific_gpu_memory(target_gpus)

    # Additional process-wide cache cleanup
    clear_pytorch_cache()

    # Final check: re-query after the extra cleanup above
    print(f"\n{'='*50}")
    print("=== 最终状态检查 ===")

    # check_gpu_memory returns (allocated, cached, free); only free is summed.
    final_free = sum(check_gpu_memory(gpu_id)[2] for gpu_id in target_gpus)

    print(f"\n🎯 GPU 0和2总可用显存: {final_free:.1f}GB")

    # Assess whether there is enough memory to load the model
    print(f"\n{'='*50}")
    print("=== 模型加载建议 ===")

    if final_free >= min_free_gb_for_int4:
        print("✅ 显存充足，可以尝试INT4量化加载Qwen2.5-14B")
        print("建议代码:")
        print("""
from modelscope import AutoModelForCausalLM, AutoTokenizer
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"

model = AutoModelForCausalLM.from_pretrained(
    "/home/dataset-assist-0/user/wangshuo/HYR/0628-LLM任务/1.Qwen2.5-14B-Instruct",
    local_files_only=True,
    trust_remote_code=True,
    load_in_4bit=True,  # 4位量化
    device_map="auto"
)
        """)

    print(f"\n清理完成！GPU 0和2现在有 {final_free:.1f}GB 可用显存。")