In [1]:
import gc # 垃圾回收，用于确保模型被卸载
import os
import time

from llama_cpp import Llama

# --- Model paths ---
# Each path can be overridden with an environment variable of the same name,
# so the notebook can run on another machine without editing this cell.
# When the variable is unset or empty, the original local path is used.

# "Brain" - responsible for planning and reasoning
MODEL_PATH_BRAIN = os.environ.get("MODEL_PATH_BRAIN") or "C:/Users/User/.lmstudio/models/lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf"

# "Executor" - responsible for coding
MODEL_PATH_EXECUTOR = os.environ.get("MODEL_PATH_EXECUTOR") or "C:/Users/User/.lmstudio/models/Leapps/DeepAnalyze-8B-Q8_0-GGUF/deepanalyze-8b-q8_0.gguf"

print("模型路径已设置：")
print(f"大脑 (Brain): {MODEL_PATH_BRAIN}")
print(f"执行者 (Executor): {MODEL_PATH_EXECUTOR}")

模型路径已设置：
大脑 (Brain): C:/Users/User/.lmstudio/models/lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf
执行者 (Executor): C:/Users/User/.lmstudio/models/Leapps/DeepAnalyze-8B-Q8_0-GGUF/deepanalyze-8b-q8_0.gguf


In [2]:
# Smoke test for the 8B "executor" model: load it with every layer offloaded
# to the GPU, run one coding prompt, report load/inference timings, and then
# release the model so the next cell starts with the VRAM free again.
print("--- 开始测试 DeepAnalyze 8B (快速执行者) ---")
print("加载模型 (全 VRAM)...")

llm_executor = None  # stays None if the constructor raises
stage = "加载"        # current phase, so a failure is attributed to the right step
start_time = time.perf_counter()  # monotonic clock: unaffected by system clock changes
try:
    llm_executor = Llama(
        model_path=MODEL_PATH_EXECUTOR,
        n_gpu_layers=-1,  # -1 = try to offload all layers to the GPU (full VRAM)
        n_ctx=4096,       # context window size
        verbose=True
    )
    load_time = time.perf_counter() - start_time
    print(f"模型加载完毕，耗时: {load_time:.2f} 秒")

    # Inference test (coding task)
    stage = "推理"
    prompt = "Write a python function using pandas to read a CSV named 'sales.csv' and fill any missing values in the 'Age' column with the mean age."
    messages = [{"role": "system", "content": "You are a helpful coding assistant."},
                {"role": "user", "content": prompt}]

    start_time = time.perf_counter()
    output = llm_executor.create_chat_completion(messages=messages)
    inference_time = time.perf_counter() - start_time

    print("\n--- 8B '执行者' 模型输出 ---")
    print(output['choices'][0]['message']['content'])
    print("--------------------")
    print(f"推理耗时: {inference_time:.2f} 秒 (应该非常快)")

except Exception as e:
    # Deliberately best-effort: report the failure and keep the notebook going.
    # `stage` makes the message say whether loading or inference failed.
    print(f"{stage} 8B 模型时出错: {e}")
    print("请检查模型路径是否正确，以及 VRAM 是否足够。")

finally:
    # Key step: unload the model to release VRAM.
    if llm_executor is None:
        del llm_executor  # nothing was loaded; just leave the namespace clean
    else:
        print("卸载 8B 模型...")
        # close() releases the native model/context explicitly; `del` + GC
        # alone only frees them once no other reference to the object remains.
        llm_executor.close()
        del llm_executor
        gc.collect()
        print("8B 模型已卸载。\n")

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3060) - 11247 MiB free
llama_model_loader: loaded meta data with 37 key-value pairs and 399 tensors from C:/Users/User/.lmstudio/models/Leapps/DeepAnalyze-8B-Q8_0-GGUF/deepanalyze-8b-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepAnalyze 8B
llama_model_loader: - kv   3:                           general.basename str              = DeepAnalyze
llama_mod

--- 开始测试 DeepAnalyze 8B (快速执行者) ---
加载模型 (全 VRAM)...


llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  29:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  30:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 151645
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  33:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
llama_model_loader: - kv  35:               general.quantization_version u32              = 2
llama_model_loader: - kv  36:                          general.file_type u32              = 7
llama_model_loader: - type  f32:  145 tensors
llama_model_lo

模型加载完毕，耗时: 5.32 秒


llama_perf_context_print:        load time =     183.81 ms
llama_perf_context_print: prompt eval time =     183.65 ms /    44 tokens (    4.17 ms per token,   239.59 tokens per second)
llama_perf_context_print:        eval time =   15152.56 ms /   519 runs   (   29.20 ms per token,    34.25 tokens per second)
llama_perf_context_print:       total time =   16052.75 ms /   563 tokens
llama_perf_context_print:    graphs reused =        502



--- 8B '执行者' 模型输出 ---
We are going to use pandas to read the CSV file.
 Steps:
 1. Import pandas.
 2. Read the CSV file into a DataFrame.
 3. Calculate the mean of the 'Age' column (ignoring missing values initially).
 4. Fill the missing values in the 'Age' column with the calculated mean.
 5. We might want to return the DataFrame or just modify it in place? The problem doesn't specify, but typically we return the modified DataFrame.

 However, note: the problem says "fill any missing values in the 'Age' column with the mean age". We are only concerned with the 'Age' column.

 Let's write the function accordingly.
</Analyze>
<Answer>
To solve this problem, we need to read a CSV file named 'sales.csv' using pandas and fill any missing values in the 'Age' column with the mean age of the available data. Here's the step-by-step approach:

1. **Import pandas**: Ensure pandas is imported to handle data operations.
2. **Read the CSV file**: Use `pd.read_csv()` to load the data into a DataFr

In [3]:
# Smoke test for the 20B "brain" model: try to load it with every layer
# offloaded to the GPU, run one planning prompt, report load/inference
# timings, and then release the model so the VRAM is free again afterwards.
print("--- 开始测试 GPT-OSS 20B (大脑) ---")
print("加载模型 (尝试全 VRAM)...")

llm_brain = None  # stays None if the constructor raises
stage = "加载"     # current phase, so a failure is attributed to the right step
start_time = time.perf_counter()  # monotonic clock: unaffected by system clock changes
try:
    llm_brain = Llama(
        model_path=MODEL_PATH_BRAIN,
        n_gpu_layers=-1,  # -1 = try to offload all layers to the GPU (full VRAM)
        n_ctx=4096,
        verbose=True
    )
    load_time = time.perf_counter() - start_time
    print(f"模型加载完毕，耗时: {load_time:.2f} 秒")

    # Inference test (planning task)
    stage = "推理"
    prompt = "Outline the CRISP-DM steps for a bank credit risk model, focusing on fairness and compliance."
    messages = [{"role": "system", "content": "You are a senior data science manager at a bank."},
                {"role": "user", "content": prompt}]

    start_time = time.perf_counter()
    output = llm_brain.create_chat_completion(messages=messages)
    inference_time = time.perf_counter() - start_time

    print("\n--- 20B '大脑' 模型输出 ---")
    print(output['choices'][0]['message']['content'])
    print("--------------------")
    print(f"推理耗时: {inference_time:.2f} 秒 (如果 VRAM 成功，应该也很快)")

except Exception as e:
    # Deliberately best-effort: report the failure and keep the notebook going.
    # `stage` makes the message say whether loading or inference failed.
    print(f"{stage} 20B 模型时出错: {e}")
    print("⚠️ 警告：20B 模型可能无法完全载入 12GB VRAM。请查看下面的 '重要提示'。")

finally:
    # Unload the model.
    if llm_brain is None:
        del llm_brain  # nothing was loaded; just leave the namespace clean
    else:
        print("卸载 20B 模型...")
        # close() releases the native model/context explicitly; `del` + GC
        # alone only frees them once no other reference to the object remains.
        llm_brain.close()
        del llm_brain
        gc.collect()
        print("20B 模型已卸载。")

llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3060) - 11221 MiB free
llama_model_loader: loaded meta data with 33 key-value pairs and 459 tensors from C:/Users/User/.lmstudio/models/lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gpt-oss
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Openai_Gpt Oss 20b
llama_model_loader: - kv   3:                           general.basename str              = openai_gpt-oss
llama_model_loader: - kv   4:                         general.size_label str              = 20B
llama_model_loader: - kv   5:                        gpt-oss.block_count u32              = 24
l

--- 开始测试 GPT-OSS 20B (大脑) ---
加载模型 (尝试全 VRAM)...


llama_model_loader: - kv  24:                      tokenizer.ggml.tokens arr[str,201088]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,201088]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  26:                      tokenizer.ggml.merges arr[str,446189]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  27:                tokenizer.ggml.bos_token_id u32              = 199998
llama_model_loader: - kv  28:                tokenizer.ggml.eos_token_id u32              = 200002
llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 199999
llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {#-\n  In addition to the normal input...
llama_model_loader: - kv  31:               general.quantization_version u32              = 2
llama_model_loader: - kv  32:                          general.file_type u32          

模型加载完毕，耗时: 9.02 秒


llama_perf_context_print:        load time =     793.15 ms
llama_perf_context_print: prompt eval time =     792.93 ms /   105 tokens (    7.55 ms per token,   132.42 tokens per second)
llama_perf_context_print:        eval time =  180540.20 ms /  2272 runs   (   79.46 ms per token,    12.58 tokens per second)
llama_perf_context_print:       total time =  189464.48 ms /  2377 tokens
llama_perf_context_print:    graphs reused =       2200



--- 20B '大脑' 模型输出 ---
<|channel|>analysis<|message|>We need to outline CRISP-DM steps for a bank credit risk model, focusing on fairness and compliance. Provide details for each step: Business Understanding, Data Understanding, Data Preparation, Modeling, Evaluation, Deployment. For each step, highlight fairness and compliance aspects: regulatory requirements (e.g., Basel III, GDPR, Equal Credit Opportunity Act), data privacy, bias mitigation, explainability, auditability, documentation, model monitoring, etc. Provide actionable items, best practices, tools. Provide a structured outline. Maybe include substeps. Provide references to frameworks like AI Fairness 360, Model Card, etc. Provide compliance checklists. Provide fairness metrics: disparate impact, equal opportunity, etc. Provide mitigation strategies: reweighting, adversarial debiasing, fairness constraints. Provide model monitoring for drift. Provide documentation: model card, data sheet. Provide audit trails. Provide governa