intel-analytics · Zhangky11 · Apr 18, 2024 · Apr 18, 2024
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
@@ -125,6 +125,7 @@ def __init__(self, settings: Settings) -> None:
                     "top_p": ollama_settings.top_p,  # ollama and llama-cpp
                     "repeat_last_n": ollama_settings.repeat_last_n,  # ollama
                     "repeat_penalty": ollama_settings.repeat_penalty,  # ollama llama-cpp
+                    "num_gpu": ollama_settings.num_gpu,
                 }
 
                 self.llm = Ollama(

diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
@@ -253,6 +253,10 @@ class OllamaSettings(BaseModel):
         120.0,
         description="Time elapsed until ollama times out the request. Default is 120s. Format is float. ",
     )
+    num_gpu: int = Field(
+        999,
+        description="Number of Layers running on GPU. (Default: 999)",
+    )
 
 
 class AzureOpenAISettings(BaseModel):

diff --git a/settings-ollama.yaml b/settings-ollama.yaml
@@ -22,6 +22,8 @@ ollama:
   repeat_last_n: 64       # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
   repeat_penalty: 1.2     # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
   request_timeout: 120.0  # Time elapsed until ollama times out the request. Default is 120s. Format is float.
+  num_predict: 64         # Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)
+  num_gpu: 999            # Number of model layers to offload to the GPU. (Default: 999)
 
 vectorstore:
   database: qdrant