Upgrade to latest GPT4All. Use Mistral as default offline chat model
GPT4All now supports GGUF llama.cpp chat models. The latest
GPT4All (with Mistral) is at least 3x faster.

On a MacBook Pro, responses now start in ~10s vs 30s-120s earlier.
Mistral is also a better chat model, although it hallucinates more
than llama-2.
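For context, a minimal sketch of loading a GGUF chat model with the gpt4all Python bindings (>= 2.0.0); the model name matches the new default in this commit, while the prompt and settings are illustrative only:

```python
from gpt4all import GPT4All

# Download (if missing) and load the GGUF model with default settings.
model = GPT4All(model_name="mistral-7b-instruct-v0.1.Q4_0.gguf")

with model.chat_session():
    reply = model.generate("What does the GGUF format change?", max_tokens=128)
print(reply)
```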
debanjum committed Oct 23, 2023
1 parent 6dc0df3 commit 0f1ebca
Showing 10 changed files with 84 additions and 11 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -60,8 +60,8 @@ dependencies = [
    "bs4 >= 0.0.1",
    "anyio == 3.7.1",
    "pymupdf >= 1.23.3",
-    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
]
dynamic = ["version"]

69 changes: 69 additions & 0 deletions src/khoj/migrations/migrate_offline_chat_default_model.py
@@ -0,0 +1,69 @@
"""
Current format of khoj.yml
---
app:
    ...
content-type:
    ...
processor:
  conversation:
    offline-chat:
      enable-offline-chat: false
      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
  ...
search-type:
    ...

New format of khoj.yml
---
app:
    ...
content-type:
    ...
processor:
  conversation:
    offline-chat:
      enable-offline-chat: false
      chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
  ...
search-type:
    ...
"""
import logging
from packaging import version

from khoj.utils.yaml import load_config_from_file, save_config_to_file


logger = logging.getLogger(__name__)


def migrate_offline_chat_default_model(args):
    schema_version = "0.12.4"
    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    if "processor" not in raw_config:
        return args
    if raw_config["processor"] is None:
        return args
    if "conversation" not in raw_config["processor"]:
        return args
    if "offline-chat" not in raw_config["processor"]["conversation"]:
        return args
    if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"):
        logger.info(
            f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF"
        )
        raw_config["version"] = schema_version

        # Update offline chat model to mistral in GGUF format to use latest GPT4All
        offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"]
        if offline_chat_model.endswith(".bin"):
            raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"

        save_config_to_file(raw_config, args.config_file)
    return args
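A minimal sketch of invoking this migration directly; the real khoj CLI builds `args` via argparse, so the Namespace and config path below are illustrative assumptions:

```python
from argparse import Namespace
from pathlib import Path

from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model

# Point the migration at an existing khoj.yml; the path here is an assumption.
args = Namespace(config_file=Path("~/.khoj/khoj.yml").expanduser())
args = migrate_offline_chat_default_model(args)
```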
4 changes: 2 additions & 2 deletions src/khoj/processor/conversation/gpt4all/chat_model.py
@@ -16,7 +16,7 @@

def extract_questions_offline(
    text: str,
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
    loaded_model: Union[Any, None] = None,
    conversation_log={},
    use_history: bool = True,
@@ -123,7 +123,7 @@ def converse_offline(
    references,
    user_query,
    conversation_log={},
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
    loaded_model: Union[Any, None] = None,
    completion_func=None,
    conversation_command=ConversationCommand.Default,
4 changes: 2 additions & 2 deletions src/khoj/processor/conversation/gpt4all/utils.py
@@ -14,9 +14,9 @@ def download_model(model_name: str):
    # Use GPU for Chat Model, if available
    try:
        model = GPT4All(model_name=model_name, device="gpu")
-        logger.debug("Loaded chat model to GPU.")
+        logger.debug(f"Loaded {model_name} chat model to GPU.")
    except ValueError:
        model = GPT4All(model_name=model_name)
-        logger.debug("Loaded chat model to CPU.")
+        logger.debug(f"Loaded {model_name} chat model to CPU.")

    return model
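For reference, an illustrative call to this helper; it prefers the GPU and falls back to CPU when GPT4All raises a ValueError for the requested device. The prompt and token budget are arbitrary:

```python
from khoj.processor.conversation.gpt4all.utils import download_model

# Downloads the model on first use, then loads it onto GPU or CPU.
chat_model = download_model("mistral-7b-instruct-v0.1.Q4_0.gguf")
print(chat_model.generate("Say hello", max_tokens=32))
```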
2 changes: 2 additions & 0 deletions src/khoj/processor/conversation/utils.py
@@ -20,9 +20,11 @@
    "gpt-4": 8192,
    "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
    "gpt-3.5-turbo-16k": 15000,
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548,
}
model_to_tokenizer = {
    "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1",
}


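A hedged sketch of how these lookup tables could be used to keep prompts within a model's context budget; the first map's name is cut off above, so `model_to_prompt_size` is assumed here, and khoj's actual truncation logic may differ:

```python
from transformers import AutoTokenizer

from khoj.processor.conversation.utils import model_to_prompt_size, model_to_tokenizer

chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
max_prompt_size = model_to_prompt_size[chat_model]  # 1548 tokens for this model
tokenizer = AutoTokenizer.from_pretrained(model_to_tokenizer[chat_model])

def fits_in_prompt(text: str) -> bool:
    # Count tokens with the model's tokenizer and compare against its budget.
    return len(tokenizer.encode(text)) <= max_prompt_size
```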
2 changes: 2 additions & 0 deletions src/khoj/utils/cli.py
@@ -10,6 +10,7 @@
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
from khoj.migrations.migrate_offline_model import migrate_offline_model
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
+from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model


def cli(args=None):
@@ -61,6 +62,7 @@ def run_migrations(args):
        migrate_processor_conversation_schema,
        migrate_offline_model,
        migrate_offline_chat_schema,
+        migrate_offline_chat_default_model,
    ]
    for migration in migrations:
        args = migration(args)
4 changes: 2 additions & 2 deletions src/khoj/utils/constants.py
@@ -55,7 +55,7 @@
            },
            "offline-chat": {
                "enable-offline-chat": False,
-                "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
+                "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
            },
            "tokenizer": None,
            "max-prompt-size": None,
@@ -132,7 +132,7 @@
            },
            "offline-chat": {
                "enable-offline-chat": False,
-                "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
+                "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
            },
            "tokenizer": None,
            "max-prompt-size": None,
2 changes: 1 addition & 1 deletion src/khoj/utils/rawconfig.py
@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):

class OfflineChatProcessorConfig(ConfigBase):
    enable_offline_chat: Optional[bool] = False
-    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


class ConversationProcessorConfig(ConfigBase):
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -206,7 +206,7 @@ def processor_config_offline_chat(tmp_path_factory):

    # Setup conversation processor
    processor_config = ProcessorConfig()
-    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
+    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True, chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf")
    processor_config.conversation = ConversationProcessorConfig(
        offline_chat=offline_chat,
        conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
2 changes: 1 addition & 1 deletion tests/test_gpt4all_chat_actors.py
@@ -24,7 +24,7 @@

from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"


@pytest.fixture(scope="session")
