From c417ba31cb05ec255569d9ce3b41257087515c10 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Fri, 19 Apr 2024 16:56:48 +0300 Subject: [PATCH 01/10] llama3 ollama experimental test --- app/context/context_manager.py | 8 ++++++++ app/openai_helpers/chatgpt.py | 20 +++++++++++++------- app/openai_helpers/utils.py | 14 ++++++++++++-- main.py | 2 +- settings.py | 4 ++++ 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/app/context/context_manager.py b/app/context/context_manager.py index 493d82c..7c279b2 100644 --- a/app/context/context_manager.py +++ b/app/context/context_manager.py @@ -75,6 +75,14 @@ def get_config(model: str): summary_length=2048, hard_max_context_size=13 * 1024, ) + elif model == 'llama3': + return ContextConfiguration( + model_name=model, + long_term_memory_tokens=512, + short_term_memory_tokens=2048, + summary_length=512, + hard_max_context_size=13 * 1024, + ) else: raise ValueError(f'Unknown model name: {model}') diff --git a/app/openai_helpers/chatgpt.py b/app/openai_helpers/chatgpt.py index 47f8ce5..c428bdf 100644 --- a/app/openai_helpers/chatgpt.py +++ b/app/openai_helpers/chatgpt.py @@ -2,6 +2,8 @@ from contextlib import suppress from typing import List, Any, Optional, Callable, Union +from openai import BadRequestError + import settings from app.bot.utils import merge_dicts from app.openai_helpers.count_tokens import count_messages_tokens, count_tokens_from_functions, count_string_tokens @@ -19,6 +21,7 @@ class GptModel: GPT_4_TURBO = 'gpt-4-turbo' GPT_4_TURBO_PREVIEW = 'gpt-4-turbo-preview' GPT_4_VISION_PREVIEW = 'gpt-4-vision-preview' + LLAMA3 = 'llama3' GPT_MODELS = {GptModel.GPT_35_TURBO, GptModel.GPT_35_TURBO_16K, GptModel.GPT_4, GptModel.GPT_4_TURBO, @@ -148,13 +151,16 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i del additional_fields['functions'] messages = self.create_context(messages_to_send, self.system_prompt) - resp_generator = await OpenAIAsync.instance().chat.completions.create( - model=self.model, - messages=messages, - temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, - stream=True, - **additional_fields, - ) + try: + resp_generator = await OpenAIAsync.instance().chat.completions.create( + model=self.model, + messages=messages, + temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, + stream=True, + **additional_fields, + ) + except BadRequestError as e: + print(e) prompt_tokens += count_messages_tokens(messages, self.model) result_dict = {} diff --git a/app/openai_helpers/utils.py b/app/openai_helpers/utils.py index 91e9334..415b0ba 100644 --- a/app/openai_helpers/utils.py +++ b/app/openai_helpers/utils.py @@ -10,6 +10,7 @@ 'gpt-4-vision-preview': (Decimal('0.01'), Decimal('0.03')), 'gpt-4-turbo-preview': (Decimal('0.01'), Decimal('0.03')), 'gpt-4-turbo': (Decimal('0.01'), Decimal('0.03')), + 'llama3': (Decimal('0'), Decimal('0')), } WHISPER_PRICE = Decimal('0.006') @@ -54,14 +55,23 @@ def calculate_image_generation_usage_price(model, resolution, num_images): class OpenAIAsync: _key = None + _base_url = None _instance = None @classmethod - def init(cls, api_key): + def init(cls, api_key, base_url=None): cls._key = api_key + cls._base_url = base_url @classmethod def instance(cls): + params = {} + if cls._base_url: + params['base_url'] = cls._base_url + + if cls._key is None: + raise ValueError("OpenAIAsync is not initialized") + if cls._instance is None: - cls._instance = openai.AsyncOpenAI(api_key=cls._key) + cls._instance = openai.AsyncOpenAI(**params) return cls._instance diff 
--git a/main.py b/main.py index b731c5c..7314119 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,6 @@ if __name__ == '__main__': - OpenAIAsync.init(settings.OPENAI_TOKEN) + OpenAIAsync.init(settings.OLLAMA_API_KEY, settings.OLLAMA_BASE_URL) telegram_bot = TelegramBot(bot, dp) telegram_bot.run() diff --git a/settings.py b/settings.py index 17a8e0c..8d00fc9 100644 --- a/settings.py +++ b/settings.py @@ -69,6 +69,10 @@ IMAGE_PROXY_BIND_HOST = '0.0.0.0' IMAGE_PROXY_BIND_PORT = 8321 +OLLAMA_BASE_URL = '' +OLLAMA_MODEL = 'llama3' +OLLAMA_API_KEY = 'ollama' + # Vectara RAG settings # this feature is highly experimental and not recommended to be used in it's current state # currently it even doesn't have instructions on how to setup, use it only if you feel experimenalish From 93faa146dcd0acd8d00bd1e6f7739c3c1dd405f0 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Tue, 23 Apr 2024 23:05:06 +0300 Subject: [PATCH 02/10] Fix ollama api --- app/bot/message_processor.py | 12 ++++++++---- app/openai_helpers/chatgpt.py | 2 +- app/openai_helpers/utils.py | 2 ++ settings.py | 5 ++++- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/app/bot/message_processor.py b/app/bot/message_processor.py index 702d7d1..3275e1c 100644 --- a/app/bot/message_processor.py +++ b/app/bot/message_processor.py @@ -48,11 +48,13 @@ async def add_message_as_context(self, message_id: int = None, message: Message @staticmethod async def prepare_user_message(message: Message): - content = [] - if message.text: - content.append(DialogUtils.construct_message_content_part(DialogUtils.CONTENT_TEXT, message.text)) if message.photo: + content = [] + + if message.text: + content.append(DialogUtils.construct_message_content_part(DialogUtils.CONTENT_TEXT, message.text)) + # largest photo photo = message.photo[-1] file_id = photo.file_id @@ -63,7 +65,9 @@ async def prepare_user_message(message: Message): file_url = urljoin(f'{settings.IMAGE_PROXY_URL}:{settings.IMAGE_PROXY_PORT}', f'{file_id}_{tokens}.jpg') content.append(DialogUtils.construct_message_content_part(DialogUtils.CONTENT_IMAGE_URL, file_url)) - return DialogUtils.prepare_user_message(content) + return DialogUtils.prepare_user_message(content) + else: + return DialogUtils.prepare_user_message(message.text) async def process(self, is_cancelled): context_manager = await self.context_manager() diff --git a/app/openai_helpers/chatgpt.py b/app/openai_helpers/chatgpt.py index c428bdf..e580279 100644 --- a/app/openai_helpers/chatgpt.py +++ b/app/openai_helpers/chatgpt.py @@ -25,7 +25,7 @@ class GptModel: GPT_MODELS = {GptModel.GPT_35_TURBO, GptModel.GPT_35_TURBO_16K, GptModel.GPT_4, GptModel.GPT_4_TURBO, - GptModel.GPT_4_TURBO_PREVIEW, GptModel.GPT_4_VISION_PREVIEW} + GptModel.GPT_4_TURBO_PREVIEW, GptModel.GPT_4_VISION_PREVIEW, GptModel.LLAMA3} class FunctionCall(pydantic.BaseModel): diff --git a/app/openai_helpers/utils.py b/app/openai_helpers/utils.py index 415b0ba..34d04e3 100644 --- a/app/openai_helpers/utils.py +++ b/app/openai_helpers/utils.py @@ -72,6 +72,8 @@ def instance(cls): if cls._key is None: raise ValueError("OpenAIAsync is not initialized") + params['api_key'] = cls._key + if cls._instance is None: cls._instance = openai.AsyncOpenAI(**params) return cls._instance diff --git a/settings.py b/settings.py index 8d00fc9..5887656 100644 --- a/settings.py +++ b/settings.py @@ -17,7 +17,10 @@ 'and assist users to the best of your abilities. Listen carefully to what they say, ask questions, ' 'and help in any way you can. 
Avoid giving advices, your ultimate goal is to help the user to find the right solution by himself. ' 'Ask only one question a time.', - } + }, + 'ai dungeon': { # free to be deleted, also you can add new ones + 'system': 'You are the AI Dungeon game. Your task is to entertain user with role play. User creates a setup and you play role of the world and characters in it.', + }, } # Mandatory settings From b713fe07052c6746b592c4555e634bfb77f9f8be Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Wed, 24 Apr 2024 02:01:31 +0300 Subject: [PATCH 03/10] Refactoring of adding new models with their own base_urls Streaming answers setting comeback --- app/bot/settings_menu.py | 7 +- app/bot/telegram_bot.py | 1 - app/context/context_manager.py | 84 ++--------------- app/llm_models.py | 154 +++++++++++++++++++++++++++++++ app/openai_helpers/chatgpt.py | 48 +++------- app/openai_helpers/llm_client.py | 21 +++++ app/openai_helpers/utils.py | 18 ++-- main.py | 3 +- settings.py | 3 +- 9 files changed, 210 insertions(+), 129 deletions(-) create mode 100644 app/llm_models.py create mode 100644 app/openai_helpers/llm_client.py diff --git a/app/bot/settings_menu.py b/app/bot/settings_menu.py index 8ebfa05..e2be2ae 100644 --- a/app/bot/settings_menu.py +++ b/app/bot/settings_menu.py @@ -2,6 +2,7 @@ from aiogram import Bot, types, Dispatcher +from app.llm_models import get_models from app.storage.db import User, DB from app.storage.user_role import check_access_conditions, UserRole @@ -16,6 +17,8 @@ 'gpt-4': 'GPT-4' } +ALL_MODELS_OPTIONS = list(get_models().keys()) + TTS_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'] SETTINGS_PREFIX = 'settings' @@ -99,6 +102,7 @@ def __init__(self, bot: Bot, dispatcher: Dispatcher, db: DB): self.settings = { 'current_model': VisibleOptionsSetting('current_model', GPT_MODELS_OPTIONS), 'current_model_preview': VisibleOptionsSetting('current_model', OLD_MODELS_OPTIONS), + 'all_models': ChoiceSetting('Model', 'current_model', ALL_MODELS_OPTIONS), 'gpt_mode': ChoiceSetting('GPT mode', 'gpt_mode', list(settings.gpt_mode.keys())), 'use_functions': OnOffSetting('Use functions', 'use_functions'), 'image_generation': OnOffSetting('Image generation', 'image_generation'), @@ -106,13 +110,14 @@ def __init__(self, bot: Bot, dispatcher: Dispatcher, db: DB): 'tts-voice': ChoiceSetting('TTS voice', 'tts_voice', TTS_VOICES), 'voice_as_prompt': OnOffSetting('Voice as prompt', 'voice_as_prompt'), 'function_call_verbose': OnOffSetting('Verbose function calls', 'function_call_verbose'), - # 'streaming_answers': OnOffSetting('Streaming answers', 'streaming_answers'), + 'streaming_answers': OnOffSetting('Streaming answers', 'streaming_answers'), # 'auto_summarize': OnOffSetting('Auto summarize', 'auto_summarize'), # 'forward_as_prompt': OnOffSetting('Forward as prompt', 'forward_as_prompt'), } self.minimum_required_roles = { 'current_model': settings.USER_ROLE_CHOOSE_MODEL, 'current_model_preview': UserRole.ADMIN, + 'all_models': UserRole.ADMIN, 'image_generation': settings.USER_ROLE_IMAGE_GENERATION, 'tts-voice': settings.USER_ROLE_TTS, 'streaming_answers': settings.USER_ROLE_STREAMING_ANSWERS, diff --git a/app/bot/telegram_bot.py b/app/bot/telegram_bot.py index a9f4932..edcce9e 100644 --- a/app/bot/telegram_bot.py +++ b/app/bot/telegram_bot.py @@ -18,7 +18,6 @@ calculate_image_generation_usage_price, calculate_tts_usage_price) from app.storage.db import DBFactory, User from app.storage.user_role import check_access_conditions, UserRole -from app.openai_helpers.chatgpt import 
GptModel from aiogram import types, Bot, Dispatcher from aiogram.utils import executor diff --git a/app/context/context_manager.py b/app/context/context_manager.py index 7c279b2..ed39768 100644 --- a/app/context/context_manager.py +++ b/app/context/context_manager.py @@ -1,4 +1,3 @@ -import dataclasses from typing import List, Optional from aiogram import types @@ -6,87 +5,12 @@ import settings from app.context.dialog_manager import DialogManager from app.context.function_manager import FunctionManager +from app.llm_models import get_models from app.openai_helpers.chatgpt import DialogMessage from app.openai_helpers.function_storage import FunctionStorage from app.storage.db import DB, User, MessageType -@dataclasses.dataclass -class ContextConfiguration: - model_name: str - - # long term memory is based on embedding context search - long_term_memory_tokens: int - # short term memory is used for storing last messages - short_term_memory_tokens: int - # length of summary to be generated when context is too long - summary_length: int - # hard limit for context size, when this limit is reached, processing is being stopped, - # summarization also cannot be done - hard_max_context_size: int - - @staticmethod - def get_config(model: str): - if model == 'gpt-3.5-turbo': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=512, - short_term_memory_tokens=2560, - summary_length=512, - hard_max_context_size=5*1024, - ) - elif model == 'gpt-3.5-turbo-16k': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=1024, - short_term_memory_tokens=4096, - summary_length=1024, - hard_max_context_size=17*1024, - ) - elif model == 'gpt-4': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=512, - short_term_memory_tokens=2048, - summary_length=1024, - hard_max_context_size=9*1024, - ) - elif model == 'gpt-4-turbo-preview': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=512, - short_term_memory_tokens=5120, - summary_length=2048, - hard_max_context_size=13*1024, - ) - elif model == 'gpt-4-vision-preview': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=512, - short_term_memory_tokens=5120, - summary_length=2048, - hard_max_context_size=13*1024, - ) - elif model == 'gpt-4-turbo': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=512, - short_term_memory_tokens=5120, - summary_length=2048, - hard_max_context_size=13 * 1024, - ) - elif model == 'llama3': - return ContextConfiguration( - model_name=model, - long_term_memory_tokens=512, - short_term_memory_tokens=2048, - summary_length=512, - hard_max_context_size=13 * 1024, - ) - else: - raise ValueError(f'Unknown model name: {model}') - - class ContextManager: def __init__(self, db: DB, user: User, message: types.Message): self.db = db @@ -96,7 +20,11 @@ def __init__(self, db: DB, user: User, message: types.Message): self.function_manager = None async def process_dialog(self): - context_configuration = ContextConfiguration.get_config(self.user.current_model) + models = get_models() + llm_model = models.get(self.user.current_model) + if not llm_model: + raise ValueError(f"Unknown model: {self.user.current_model}") + context_configuration = llm_model.context_configuration self.dialog_manager = DialogManager(self.db, self.user, context_configuration) await self.dialog_manager.process_dialog(self.message) diff --git a/app/llm_models.py b/app/llm_models.py new file mode 100644 index 0000000..c2be6e4 --- 
/dev/null +++ b/app/llm_models.py @@ -0,0 +1,154 @@ +import dataclasses +from decimal import Decimal + +import settings + + +@dataclasses.dataclass +class LLMModelPrice: + # price per 1000 tokens + input_tokens_price: Decimal + output_tokens_price: Decimal + + +@dataclasses.dataclass +class LLMModelContextConfiguration: + # long term memory is based on embedding context search + long_term_memory_tokens: int + # short term memory is used for storing last messages + short_term_memory_tokens: int + # length of summary to be generated when context is too long + summary_length: int + # hard limit for context size, when this limit is reached, processing is being stopped, + # summarization also cannot be done + hard_max_context_size: int + + +class LLMModel: + def __init__(self, model_name: str, api_key, context_configuration, model_price=None, base_url=None): + if model_price is None: + model_price = LLMModelPrice(input_tokens_price=Decimal('0'), output_tokens_price=Decimal('0')) + + self.model_name = model_name + self.api_key = api_key + self.context_configuration = context_configuration + self.model_price = model_price + self.base_url = base_url + + +class LLMModels: + GPT_35_TURBO = 'gpt-3.5-turbo' + GPT_35_TURBO_16K = 'gpt-3.5-turbo-16k' + GPT_4 = 'gpt-4' + GPT_4_TURBO = 'gpt-4-turbo' + GPT_4_TURBO_PREVIEW = 'gpt-4-turbo-preview' + GPT_4_VISION_PREVIEW = 'gpt-4-vision-preview' + LLAMA3 = 'llama3' + + +def get_models(): + models = {} + openai_models = { + LLMModels.GPT_35_TURBO: LLMModel( + model_name=LLMModels.GPT_35_TURBO, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=2560, + summary_length=512, + hard_max_context_size=5*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.0005'), + output_tokens_price=Decimal('0.0015'), + ), + ), + LLMModels.GPT_35_TURBO_16K: LLMModel( + model_name=LLMModels.GPT_35_TURBO_16K, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=1024, + short_term_memory_tokens=4096, + summary_length=1024, + hard_max_context_size=17*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.003'), + output_tokens_price=Decimal('0.004'), + ), + ), + LLMModels.GPT_4: LLMModel( + model_name=LLMModels.GPT_4, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=2048, + summary_length=1024, + hard_max_context_size=9*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.03'), + output_tokens_price=Decimal('0.06'), + ), + ), + LLMModels.GPT_4_TURBO: LLMModel( + model_name=LLMModels.GPT_4_TURBO, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=5120, + summary_length=2048, + hard_max_context_size=13*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.01'), + output_tokens_price=Decimal('0.03'), + ), + ), + LLMModels.GPT_4_TURBO_PREVIEW: LLMModel( + model_name=LLMModels.GPT_4_TURBO_PREVIEW, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=5120, + summary_length=2048, + hard_max_context_size=13*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.01'), + output_tokens_price=Decimal('0.03'), + ), + ), + LLMModels.GPT_4_VISION_PREVIEW: LLMModel( + 
model_name=LLMModels.GPT_4_VISION_PREVIEW, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=5120, + summary_length=2048, + hard_max_context_size=13*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.01'), + output_tokens_price=Decimal('0.03'), + ), + ), + } + + if settings.OPENAI_TOKEN: + models.update(openai_models) + + if settings.OLLAMA_BASE_URL: + models[LLMModels.LLAMA3] = LLMModel( + model_name=LLMModels.LLAMA3, + api_key=settings.OLLAMA_API_KEY, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=2048, + summary_length=512, + hard_max_context_size=13*1024, + ), + base_url=settings.OLLAMA_BASE_URL, + ) + + return models diff --git a/app/openai_helpers/chatgpt.py b/app/openai_helpers/chatgpt.py index e580279..b26f4b8 100644 --- a/app/openai_helpers/chatgpt.py +++ b/app/openai_helpers/chatgpt.py @@ -2,30 +2,15 @@ from contextlib import suppress from typing import List, Any, Optional, Callable, Union -from openai import BadRequestError - import settings from app.bot.utils import merge_dicts +from app.llm_models import get_models, LLMModels from app.openai_helpers.count_tokens import count_messages_tokens, count_tokens_from_functions, count_string_tokens from app.openai_helpers.function_storage import FunctionStorage import pydantic -from app.openai_helpers.utils import OpenAIAsync - - -class GptModel: - GPT_35_TURBO = 'gpt-3.5-turbo' - GPT_35_TURBO_16K = 'gpt-3.5-turbo-16k' - GPT_4 = 'gpt-4' - GPT_4_TURBO = 'gpt-4-turbo' - GPT_4_TURBO_PREVIEW = 'gpt-4-turbo-preview' - GPT_4_VISION_PREVIEW = 'gpt-4-vision-preview' - LLAMA3 = 'llama3' - - -GPT_MODELS = {GptModel.GPT_35_TURBO, GptModel.GPT_35_TURBO_16K, GptModel.GPT_4, GptModel.GPT_4_TURBO, - GptModel.GPT_4_TURBO_PREVIEW, GptModel.GPT_4_VISION_PREVIEW, GptModel.LLAMA3} +from app.openai_helpers.llm_client import OpenAILLMClient class FunctionCall(pydantic.BaseModel): @@ -91,7 +76,7 @@ def openai_message(self): class ChatGPT: def __init__(self, model, system_prompt: str, function_storage: FunctionStorage = None): self.function_storage = function_storage - if model not in GPT_MODELS: + if model not in get_models(): raise ValueError(f"Unknown model: {model}") self.model = model self.system_prompt = system_prompt @@ -104,7 +89,7 @@ async def send_messages(self, messages_to_send: List[DialogMessage]) -> (DialogM 'function_call': 'auto', }) - if self.model == GptModel.GPT_4_VISION_PREVIEW: + if self.model == LLMModels.GPT_4_VISION_PREVIEW: # TODO: somewhy by default it's 16 tokens for this model additional_fields['max_tokens'] = 4096 @@ -115,7 +100,7 @@ async def send_messages(self, messages_to_send: List[DialogMessage]) -> (DialogM del additional_fields['functions'] messages = self.create_context(messages_to_send, self.system_prompt) - resp = await OpenAIAsync.instance().chat.completions.create( + resp = await OpenAILLMClient.get_client(self.model).chat.completions.create( model=self.model, messages=messages, temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, @@ -130,9 +115,7 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i prompt_tokens = 0 additional_fields = {} - system_prompt_addition = None if self.function_storage is not None: - system_prompt_addition = self.function_storage.get_system_prompt_addition() functions = self.function_storage.get_openai_prompt() prompt_tokens += count_tokens_from_functions(functions, self.model) 
additional_fields.update({ @@ -140,7 +123,7 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i 'function_call': 'auto', }) - if self.model == GptModel.GPT_4_VISION_PREVIEW: + if self.model == LLMModels.GPT_4_VISION_PREVIEW: # TODO: somewhy by default it's 16 tokens for this model additional_fields['max_tokens'] = 4096 @@ -151,16 +134,13 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i del additional_fields['functions'] messages = self.create_context(messages_to_send, self.system_prompt) - try: - resp_generator = await OpenAIAsync.instance().chat.completions.create( - model=self.model, - messages=messages, - temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, - stream=True, - **additional_fields, - ) - except BadRequestError as e: - print(e) + resp_generator = await OpenAILLMClient.get_client(self.model).chat.completions.create( + model=self.model, + messages=messages, + temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, + stream=True, + **additional_fields, + ) prompt_tokens += count_messages_tokens(messages, self.model) result_dict = {} @@ -213,7 +193,7 @@ async def summarize_messages(messages: List[DialogMessage], model: str, summary_ "role": "user", "content": f"Summarize this conversation in {summary_max_length} characters or less. Divide different themes explicitly with new lines. Return only text of summary, nothing else.", }] - resp = await OpenAIAsync.instance().chat.completions.create( + resp = await OpenAILLMClient.get_client(model).chat.completions.create( model=model, messages=prompt_messages, temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, diff --git a/app/openai_helpers/llm_client.py b/app/openai_helpers/llm_client.py new file mode 100644 index 0000000..de16ccb --- /dev/null +++ b/app/openai_helpers/llm_client.py @@ -0,0 +1,21 @@ +import openai + +from app.llm_models import get_models + + +class OpenAILLMClient: + _model_clients = {} + + @classmethod + def get_client(cls, model_name: str): + if model_name not in cls._model_clients: + llm_model = get_models().get(model_name) + if not llm_model: + raise ValueError(f"Unknown model: {model_name}") + params = { + 'api_key': llm_model.api_key, + } + if llm_model.base_url: + params['base_url'] = llm_model.base_url + cls._model_clients[model_name] = openai.AsyncOpenAI(**params) + return cls._model_clients[model_name] diff --git a/app/openai_helpers/utils.py b/app/openai_helpers/utils.py index 34d04e3..f49eb1c 100644 --- a/app/openai_helpers/utils.py +++ b/app/openai_helpers/utils.py @@ -1,17 +1,7 @@ from decimal import Decimal import openai - -COMPLETION_PRICE = { - 'gpt-3.5-turbo': (Decimal('0.0005'), Decimal('0.0015')), - 'gpt-3.5-turbo-16k': (Decimal('0.003'), Decimal('0.004')), - 'gpt-4': (Decimal('0.03'), Decimal('0.06')), - 'gpt-4-1106-preview': (Decimal('0.01'), Decimal('0.03')), - 'gpt-4-vision-preview': (Decimal('0.01'), Decimal('0.03')), - 'gpt-4-turbo-preview': (Decimal('0.01'), Decimal('0.03')), - 'gpt-4-turbo': (Decimal('0.01'), Decimal('0.03')), - 'llama3': (Decimal('0'), Decimal('0')), -} +from app.llm_models import get_models WHISPER_PRICE = Decimal('0.006') @@ -30,7 +20,11 @@ def calculate_completion_usage_price(prompt_tokens: int, completion_tokens: int, model: str) -> Decimal: - price = COMPLETION_PRICE.get(model) + llm_model = get_models().get(model) + if not llm_model: + raise ValueError(f"Unknown model: {model}") + + price = llm_model.model_price if not price: raise ValueError(f"Unknown model: {model}") prompt_price, 
completion_price = price diff --git a/main.py b/main.py index 7314119..0bddc3f 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,7 @@ if __name__ == '__main__': - OpenAIAsync.init(settings.OLLAMA_API_KEY, settings.OLLAMA_BASE_URL) + # needed for whisper and tts capabilities + OpenAIAsync.init(settings.OPENAI_TOKEN) telegram_bot = TelegramBot(bot, dp) telegram_bot.run() diff --git a/settings.py b/settings.py index 5887656..be9a88d 100644 --- a/settings.py +++ b/settings.py @@ -24,7 +24,7 @@ } # Mandatory settings -OPENAI_TOKEN = 'YOUR_TOKEN' +OPENAI_TOKEN = '' TELEGRAM_BOT_TOKEN = 'YOUR_TOKEN' # Image proxy settings # This proxy is used to send images to openai for GPT-4-Vision @@ -73,7 +73,6 @@ IMAGE_PROXY_BIND_PORT = 8321 OLLAMA_BASE_URL = '' -OLLAMA_MODEL = 'llama3' OLLAMA_API_KEY = 'ollama' # Vectara RAG settings From c53f0ab8c4521ae4eb9541a44b0e90db15d19130 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Thu, 25 Apr 2024 12:29:35 +0300 Subject: [PATCH 04/10] Fix count_tokens for custom models --- app/openai_helpers/count_tokens.py | 12 +++++++++--- app/openai_helpers/utils.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/app/openai_helpers/count_tokens.py b/app/openai_helpers/count_tokens.py index 40cebac..3e495e0 100644 --- a/app/openai_helpers/count_tokens.py +++ b/app/openai_helpers/count_tokens.py @@ -21,7 +21,8 @@ def count_string_tokens(string: str, model="gpt-3.5-turbo") -> int: elif "gpt-4" in model: model = "gpt-4" else: - raise ValueError(f"Unknown model: {model}") + # TODO: add method to calculate tokens for different models + model = "gpt-4" encoding = tiktoken.encoding_for_model(model) return len(encoding.encode(str(string))) @@ -43,7 +44,11 @@ def count_messages_tokens(messages: List[dict], model="gpt-3.5-turbo") -> int: tokens_per_message = 3 tokens_per_name = 1 - encoding = tiktoken.encoding_for_model(model) + try: + encoding = tiktoken.encoding_for_model(model) + except: + # TODO: add method to calculate tokens for different models + encoding = tiktoken.encoding_for_model("gpt-4") num_tokens = 0 for message in messages: @@ -84,7 +89,8 @@ def count_tokens_from_functions(functions, model="gpt-3.5-turbo"): elif "gpt-4" in model: model = "gpt-4" else: - raise ValueError(f"Unknown model: {model}") + # TODO: add method to calculate tokens for different models + model = "gpt-4" encoding = tiktoken.encoding_for_model(model) num_tokens = 0 diff --git a/app/openai_helpers/utils.py b/app/openai_helpers/utils.py index f49eb1c..507bdc0 100644 --- a/app/openai_helpers/utils.py +++ b/app/openai_helpers/utils.py @@ -27,7 +27,7 @@ def calculate_completion_usage_price(prompt_tokens: int, completion_tokens: int, price = llm_model.model_price if not price: raise ValueError(f"Unknown model: {model}") - prompt_price, completion_price = price + prompt_price, completion_price = price.input_tokens_price, price.output_tokens_price return prompt_price * prompt_tokens / 1000 + completion_price * completion_tokens / 1000 From 936b138b407882f486330de01c36de3b6240c185 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Sun, 28 Apr 2024 21:54:32 +0300 Subject: [PATCH 05/10] Fix stange bugs --- app/bot/batched_input_handler.py | 4 ++-- app/bot/message_processor.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/app/bot/batched_input_handler.py b/app/bot/batched_input_handler.py index 1625cf3..73d4deb 100644 --- a/app/bot/batched_input_handler.py +++ b/app/bot/batched_input_handler.py @@ -112,9 +112,9 @@ async def process_batch(self, messages_batch: 
List[types.Message], user: User): for message in messages_batch: if message.audio: await self.handle_voice(message, user, message_processor) - if message.voice: + elif message.voice: await self.handle_voice(message, user, message_processor) - if message.document: + elif message.document: await self.handle_document(message, user, message_processor) else: await self.handle_message(message, user, message_processor) diff --git a/app/bot/message_processor.py b/app/bot/message_processor.py index 3275e1c..4ed5b4e 100644 --- a/app/bot/message_processor.py +++ b/app/bot/message_processor.py @@ -48,7 +48,6 @@ async def add_message_as_context(self, message_id: int = None, message: Message @staticmethod async def prepare_user_message(message: Message): - if message.photo: content = [] @@ -66,8 +65,10 @@ async def prepare_user_message(message: Message): content.append(DialogUtils.construct_message_content_part(DialogUtils.CONTENT_IMAGE_URL, file_url)) return DialogUtils.prepare_user_message(content) - else: + elif message.text: return DialogUtils.prepare_user_message(message.text) + else: + raise ValueError("prepare_user_message called with empty message") async def process(self, is_cancelled): context_manager = await self.context_manager() From 8c3eb4ae1f075f423ea128b1147c1b4eaad4b703 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Mon, 29 Apr 2024 00:30:37 +0300 Subject: [PATCH 06/10] Refactor llmmodel usage Start refactoring FunctionStorage for tools api support Add LLMCapabilities class, getting ready to integrate --- app/context/context_manager.py | 7 ++--- app/llm_models.py | 23 +++++++++++++++- app/openai_helpers/chatgpt.py | 36 ++++++++++++-------------- app/openai_helpers/function_storage.py | 13 +++++++++- app/openai_helpers/llm_client.py | 6 ++--- app/openai_helpers/utils.py | 7 ++--- 6 files changed, 57 insertions(+), 35 deletions(-) diff --git a/app/context/context_manager.py b/app/context/context_manager.py index ed39768..281ba27 100644 --- a/app/context/context_manager.py +++ b/app/context/context_manager.py @@ -5,7 +5,7 @@ import settings from app.context.dialog_manager import DialogManager from app.context.function_manager import FunctionManager -from app.llm_models import get_models +from app.llm_models import get_model_by_name from app.openai_helpers.chatgpt import DialogMessage from app.openai_helpers.function_storage import FunctionStorage from app.storage.db import DB, User, MessageType @@ -20,10 +20,7 @@ def __init__(self, db: DB, user: User, message: types.Message): self.function_manager = None async def process_dialog(self): - models = get_models() - llm_model = models.get(self.user.current_model) - if not llm_model: - raise ValueError(f"Unknown model: {self.user.current_model}") + llm_model = get_model_by_name(self.user.current_model) context_configuration = llm_model.context_configuration self.dialog_manager = DialogManager(self.db, self.user, context_configuration) await self.dialog_manager.process_dialog(self.message) diff --git a/app/llm_models.py b/app/llm_models.py index c2be6e4..57fb03a 100644 --- a/app/llm_models.py +++ b/app/llm_models.py @@ -1,5 +1,6 @@ import dataclasses from decimal import Decimal +from functools import lru_cache import settings @@ -24,16 +25,28 @@ class LLMModelContextConfiguration: hard_max_context_size: int +@dataclasses.dataclass +class LLMCapabilities: + function_calling: bool = False + tool_calling: bool = False + image_processing: bool = False + + class LLMModel: - def __init__(self, model_name: str, api_key, context_configuration, 
model_price=None, base_url=None): + def __init__(self, *, model_name: str, api_key, context_configuration, model_price=None, base_url=None, + capabilities=None): if model_price is None: model_price = LLMModelPrice(input_tokens_price=Decimal('0'), output_tokens_price=Decimal('0')) + if capabilities is None: + capabilities = LLMCapabilities() + self.model_name = model_name self.api_key = api_key self.context_configuration = context_configuration self.model_price = model_price self.base_url = base_url + self.capabilities = capabilities class LLMModels: @@ -46,6 +59,7 @@ class LLMModels: LLAMA3 = 'llama3' +@lru_cache def get_models(): models = {} openai_models = { @@ -152,3 +166,10 @@ def get_models(): ) return models + + +def get_model_by_name(model_name: str): + model = get_models().get(model_name) + if not model: + raise ValueError(f"Unknown model: {model_name}") + return model diff --git a/app/openai_helpers/chatgpt.py b/app/openai_helpers/chatgpt.py index b26f4b8..dc8e81f 100644 --- a/app/openai_helpers/chatgpt.py +++ b/app/openai_helpers/chatgpt.py @@ -4,7 +4,7 @@ import settings from app.bot.utils import merge_dicts -from app.llm_models import get_models, LLMModels +from app.llm_models import LLMModels, get_model_by_name from app.openai_helpers.count_tokens import count_messages_tokens, count_tokens_from_functions, count_string_tokens from app.openai_helpers.function_storage import FunctionStorage @@ -76,20 +76,18 @@ def openai_message(self): class ChatGPT: def __init__(self, model, system_prompt: str, function_storage: FunctionStorage = None): self.function_storage = function_storage - if model not in get_models(): - raise ValueError(f"Unknown model: {model}") - self.model = model + self.llm_model = get_model_by_name(model) self.system_prompt = system_prompt async def send_messages(self, messages_to_send: List[DialogMessage]) -> (DialogMessage, CompletionUsage): additional_fields = {} if self.function_storage is not None: additional_fields.update({ - 'functions': self.function_storage.get_openai_prompt(), + 'functions': self.function_storage.get_functions_info(), 'function_call': 'auto', }) - if self.model == LLMModels.GPT_4_VISION_PREVIEW: + if self.llm_model.model_name == LLMModels.GPT_4_VISION_PREVIEW: # TODO: somewhy by default it's 16 tokens for this model additional_fields['max_tokens'] = 4096 @@ -100,13 +98,13 @@ async def send_messages(self, messages_to_send: List[DialogMessage]) -> (DialogM del additional_fields['functions'] messages = self.create_context(messages_to_send, self.system_prompt) - resp = await OpenAILLMClient.get_client(self.model).chat.completions.create( - model=self.model, + resp = await OpenAILLMClient.get_client(self.llm_model.model_name).chat.completions.create( + model=self.llm_model.model_name, messages=messages, temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, **additional_fields, ) - completion_usage = CompletionUsage(model=self.model, **dict(resp.usage)) + completion_usage = CompletionUsage(model=self.llm_model.model_name, **dict(resp.usage)) message = resp.choices[0].message response = DialogMessage(**dict(message)) return response, completion_usage @@ -116,14 +114,14 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i additional_fields = {} if self.function_storage is not None: - functions = self.function_storage.get_openai_prompt() - prompt_tokens += count_tokens_from_functions(functions, self.model) + functions = self.function_storage.get_functions_info() + prompt_tokens += 
count_tokens_from_functions(functions, self.llm_model.model_name) additional_fields.update({ - 'functions': self.function_storage.get_openai_prompt(), + 'functions': self.function_storage.get_functions_info(), 'function_call': 'auto', }) - if self.model == LLMModels.GPT_4_VISION_PREVIEW: + if self.llm_model.model_name == LLMModels.GPT_4_VISION_PREVIEW: # TODO: somewhy by default it's 16 tokens for this model additional_fields['max_tokens'] = 4096 @@ -134,15 +132,15 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i del additional_fields['functions'] messages = self.create_context(messages_to_send, self.system_prompt) - resp_generator = await OpenAILLMClient.get_client(self.model).chat.completions.create( - model=self.model, + resp_generator = await OpenAILLMClient.get_client(self.llm_model.model_name).chat.completions.create( + model=self.llm_model.model_name, messages=messages, temperature=settings.OPENAI_CHAT_COMPLETION_TEMPERATURE, stream=True, **additional_fields, ) - prompt_tokens += count_messages_tokens(messages, self.model) + prompt_tokens += count_messages_tokens(messages, self.llm_model.model_name) result_dict = {} async for resp_part in resp_generator: delta = resp_part.choices[0].delta @@ -154,19 +152,19 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i continue result_dict = merge_dicts(result_dict, dict(delta)) dialog_message = DialogMessage(**result_dict) - completion_tokens = count_messages_tokens([result_dict], model=self.model) + completion_tokens = count_messages_tokens([result_dict], model=self.llm_model.model_name) elif delta.function_call is not None: result_dict = merge_dicts(result_dict, dict(delta.function_call)) dialog_message = DialogMessage(function_call=result_dict) # TODO: find mode accurate way to calculate completion length for function calls - completion_tokens = count_string_tokens(json.dumps(result_dict), model=self.model) + completion_tokens = count_string_tokens(json.dumps(result_dict), model=self.llm_model.model_name) else: continue # openai doesn't return this field in streaming mode somewhy dialog_message.role = 'assistant' completion_usage = CompletionUsage( - model=self.model, + model=self.llm_model.model_name, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, diff --git a/app/openai_helpers/function_storage.py b/app/openai_helpers/function_storage.py index 9d12a3b..4380c12 100644 --- a/app/openai_helpers/function_storage.py +++ b/app/openai_helpers/function_storage.py @@ -21,7 +21,7 @@ def extract_function_info(function) -> Dict[str, Any]: "parameters": function.get_params_schema(), } - def get_openai_prompt(self): + def get_functions_info(self): functions = [] for function in self.functions.values(): function_info = function['info'] @@ -29,6 +29,17 @@ def get_openai_prompt(self): return functions + def get_tools_info(self): + tools = [] + for function in self.functions.values(): + function_info = { + "type": "function", + "function": function['info'], + } + tools.append(function_info) + + return tools + def get_system_prompt_addition(self) -> str: result = [] for function in self.functions.values(): diff --git a/app/openai_helpers/llm_client.py b/app/openai_helpers/llm_client.py index de16ccb..c528871 100644 --- a/app/openai_helpers/llm_client.py +++ b/app/openai_helpers/llm_client.py @@ -1,6 +1,6 @@ import openai -from app.llm_models import get_models +from app.llm_models import get_model_by_name class 
OpenAILLMClient: @@ -9,9 +9,7 @@ class OpenAILLMClient: @classmethod def get_client(cls, model_name: str): if model_name not in cls._model_clients: - llm_model = get_models().get(model_name) - if not llm_model: - raise ValueError(f"Unknown model: {model_name}") + llm_model = get_model_by_name(model_name) params = { 'api_key': llm_model.api_key, } diff --git a/app/openai_helpers/utils.py b/app/openai_helpers/utils.py index 507bdc0..f822451 100644 --- a/app/openai_helpers/utils.py +++ b/app/openai_helpers/utils.py @@ -1,7 +1,7 @@ from decimal import Decimal import openai -from app.llm_models import get_models +from app.llm_models import get_model_by_name WHISPER_PRICE = Decimal('0.006') @@ -20,10 +20,7 @@ def calculate_completion_usage_price(prompt_tokens: int, completion_tokens: int, model: str) -> Decimal: - llm_model = get_models().get(model) - if not llm_model: - raise ValueError(f"Unknown model: {model}") - + llm_model = get_model_by_name(model) price = llm_model.model_price if not price: raise ValueError(f"Unknown model: {model}") From ede71ea9fccbd0d69d77a27d9c780f263915a50d Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Mon, 29 Apr 2024 00:53:54 +0300 Subject: [PATCH 07/10] Add handling image_processing capability llm_models refactoring --- app/bot/batched_input_handler.py | 9 ++ app/llm_models.py | 191 +++++++++++++++++-------------- app/openai_helpers/chatgpt.py | 23 ++-- 3 files changed, 127 insertions(+), 96 deletions(-) diff --git a/app/bot/batched_input_handler.py b/app/bot/batched_input_handler.py index 73d4deb..5fd042d 100644 --- a/app/bot/batched_input_handler.py +++ b/app/bot/batched_input_handler.py @@ -11,6 +11,7 @@ import settings from app.bot.message_processor import MessageProcessor from app.bot.utils import TypingWorker, message_is_forward, get_username, Timer, generate_document_id +from app.llm_models import get_model_by_name from app.openai_helpers.whisper import get_audio_speech_to_text from app.storage.db import User, MessageType from app.storage.user_role import check_access_conditions @@ -116,6 +117,14 @@ async def process_batch(self, messages_batch: List[types.Message], user: User): await self.handle_voice(message, user, message_processor) elif message.document: await self.handle_document(message, user, message_processor) + elif message.photo: + # handling image just like message but with some additional checks + llm_model = get_model_by_name(user.current_model) + if llm_model.capabilities.image_processing: + await self.handle_message(message, user, message_processor) + else: + # TODO: exception is a bad way to handle this, need to find a better way + raise ValueError(f'Image processing is not supported by {llm_model.model_name} model.') else: await self.handle_message(message, user, message_processor) diff --git a/app/llm_models.py b/app/llm_models.py index 57fb03a..7c02c26 100644 --- a/app/llm_models.py +++ b/app/llm_models.py @@ -49,7 +49,7 @@ def __init__(self, *, model_name: str, api_key, context_configuration, model_pri self.capabilities = capabilities -class LLMModels: +class LLModel: GPT_35_TURBO = 'gpt-3.5-turbo' GPT_35_TURBO_16K = 'gpt-3.5-turbo-16k' GPT_4 = 'gpt-4' @@ -62,99 +62,118 @@ class LLMModels: @lru_cache def get_models(): models = {} - openai_models = { - LLMModels.GPT_35_TURBO: LLMModel( - model_name=LLMModels.GPT_35_TURBO, - api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( - long_term_memory_tokens=512, - short_term_memory_tokens=2560, - summary_length=512, - hard_max_context_size=5*1024, - ), - 
model_price=LLMModelPrice( - input_tokens_price=Decimal('0.0005'), - output_tokens_price=Decimal('0.0015'), - ), - ), - LLMModels.GPT_35_TURBO_16K: LLMModel( - model_name=LLMModels.GPT_35_TURBO_16K, - api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( - long_term_memory_tokens=1024, - short_term_memory_tokens=4096, - summary_length=1024, - hard_max_context_size=17*1024, - ), - model_price=LLMModelPrice( - input_tokens_price=Decimal('0.003'), - output_tokens_price=Decimal('0.004'), - ), - ), - LLMModels.GPT_4: LLMModel( - model_name=LLMModels.GPT_4, - api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( - long_term_memory_tokens=512, - short_term_memory_tokens=2048, - summary_length=1024, - hard_max_context_size=9*1024, - ), - model_price=LLMModelPrice( - input_tokens_price=Decimal('0.03'), - output_tokens_price=Decimal('0.06'), - ), - ), - LLMModels.GPT_4_TURBO: LLMModel( - model_name=LLMModels.GPT_4_TURBO, - api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( - long_term_memory_tokens=512, - short_term_memory_tokens=5120, - summary_length=2048, - hard_max_context_size=13*1024, + + if settings.OPENAI_TOKEN: + models.update({ + LLModel.GPT_35_TURBO: LLMModel( + model_name=LLModel.GPT_35_TURBO, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=2560, + summary_length=512, + hard_max_context_size=5*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.0005'), + output_tokens_price=Decimal('0.0015'), + ), + capabilities=LLMCapabilities( + function_calling=True, + ), ), - model_price=LLMModelPrice( - input_tokens_price=Decimal('0.01'), - output_tokens_price=Decimal('0.03'), + LLModel.GPT_35_TURBO_16K: LLMModel( + model_name=LLModel.GPT_35_TURBO_16K, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=1024, + short_term_memory_tokens=4096, + summary_length=1024, + hard_max_context_size=17*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.003'), + output_tokens_price=Decimal('0.004'), + ), + capabilities=LLMCapabilities( + function_calling=True, + ), ), - ), - LLMModels.GPT_4_TURBO_PREVIEW: LLMModel( - model_name=LLMModels.GPT_4_TURBO_PREVIEW, - api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( - long_term_memory_tokens=512, - short_term_memory_tokens=5120, - summary_length=2048, - hard_max_context_size=13*1024, + LLModel.GPT_4: LLMModel( + model_name=LLModel.GPT_4, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=2048, + summary_length=1024, + hard_max_context_size=9*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.03'), + output_tokens_price=Decimal('0.06'), + ), + capabilities=LLMCapabilities( + function_calling=True, + ), ), - model_price=LLMModelPrice( - input_tokens_price=Decimal('0.01'), - output_tokens_price=Decimal('0.03'), + LLModel.GPT_4_TURBO: LLMModel( + model_name=LLModel.GPT_4_TURBO, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=5120, + summary_length=2048, + hard_max_context_size=13*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.01'), + output_tokens_price=Decimal('0.03'), + ), + capabilities=LLMCapabilities( + 
function_calling=True, + image_processing=True, + ), ), - ), - LLMModels.GPT_4_VISION_PREVIEW: LLMModel( - model_name=LLMModels.GPT_4_VISION_PREVIEW, - api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( - long_term_memory_tokens=512, - short_term_memory_tokens=5120, - summary_length=2048, - hard_max_context_size=13*1024, + LLModel.GPT_4_TURBO_PREVIEW: LLMModel( + model_name=LLModel.GPT_4_TURBO_PREVIEW, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=5120, + summary_length=2048, + hard_max_context_size=13*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.01'), + output_tokens_price=Decimal('0.03'), + ), + capabilities=LLMCapabilities( + function_calling=True, + ), ), - model_price=LLMModelPrice( - input_tokens_price=Decimal('0.01'), - output_tokens_price=Decimal('0.03'), + LLModel.GPT_4_VISION_PREVIEW: LLMModel( + model_name=LLModel.GPT_4_VISION_PREVIEW, + api_key=settings.OPENAI_TOKEN, + context_configuration=LLMModelContextConfiguration( + long_term_memory_tokens=512, + short_term_memory_tokens=5120, + summary_length=2048, + hard_max_context_size=13*1024, + ), + model_price=LLMModelPrice( + input_tokens_price=Decimal('0.01'), + output_tokens_price=Decimal('0.03'), + ), + capabilities=LLMCapabilities( + image_processing=True, + ), ), - ), - } - - if settings.OPENAI_TOKEN: - models.update(openai_models) + }) + # example of using llama3 model in ollama if settings.OLLAMA_BASE_URL: - models[LLMModels.LLAMA3] = LLMModel( - model_name=LLMModels.LLAMA3, + models[LLModel.LLAMA3] = LLMModel( + model_name=LLModel.LLAMA3, api_key=settings.OLLAMA_API_KEY, context_configuration=LLMModelContextConfiguration( long_term_memory_tokens=512, diff --git a/app/openai_helpers/chatgpt.py b/app/openai_helpers/chatgpt.py index dc8e81f..e412434 100644 --- a/app/openai_helpers/chatgpt.py +++ b/app/openai_helpers/chatgpt.py @@ -4,7 +4,7 @@ import settings from app.bot.utils import merge_dicts -from app.llm_models import LLMModels, get_model_by_name +from app.llm_models import LLModel, get_model_by_name from app.openai_helpers.count_tokens import count_messages_tokens, count_tokens_from_functions, count_string_tokens from app.openai_helpers.function_storage import FunctionStorage @@ -87,7 +87,7 @@ async def send_messages(self, messages_to_send: List[DialogMessage]) -> (DialogM 'function_call': 'auto', }) - if self.llm_model.model_name == LLMModels.GPT_4_VISION_PREVIEW: + if self.llm_model.model_name == LLModel.GPT_4_VISION_PREVIEW: # TODO: somewhy by default it's 16 tokens for this model additional_fields['max_tokens'] = 4096 @@ -114,14 +114,17 @@ async def send_messages_streaming(self, messages_to_send: List[DialogMessage], i additional_fields = {} if self.function_storage is not None: - functions = self.function_storage.get_functions_info() - prompt_tokens += count_tokens_from_functions(functions, self.llm_model.model_name) - additional_fields.update({ - 'functions': self.function_storage.get_functions_info(), - 'function_call': 'auto', - }) - - if self.llm_model.model_name == LLMModels.GPT_4_VISION_PREVIEW: + if self.llm_model.capabilities.function_calling: + functions = self.function_storage.get_functions_info() + prompt_tokens += count_tokens_from_functions(functions, self.llm_model.model_name) + additional_fields.update({ + 'functions': self.function_storage.get_functions_info(), + 'function_call': 'auto', + }) + elif self.llm_model.capabilities.tool_calling: + 
raise NotImplementedError('Tool calling support is not implemented yet') + + if self.llm_model.model_name == LLModel.GPT_4_VISION_PREVIEW: # TODO: somewhy by default it's 16 tokens for this model additional_fields['max_tokens'] = 4096 From e08b9e6e23558e02b7c4978169d47191e9d35dd7 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Mon, 29 Apr 2024 00:59:36 +0300 Subject: [PATCH 08/10] Update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fc09a18..8e407df 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,12 @@ This GitHub repository contains the implementation of a telegram bot, designed to facilitate seamless interaction with GPT-3.5 and GPT-4, state-of-the-art language models by OpenAI. 🔥 **GPT-4 Turbo support (with vision)** +🔥 **Custom OpenAI API compatible endpoints support** 🔥 **DALL-E 3 Image generation support** 🔑 **Key Features** -1. **Model Support**: gpt-3.5-turbo, gpt-4-turbo, gpt-4, gpt-4-turbo-preview, gpt-4-vision-preview. +1. **Model Support**: all OpenAI models are supported out of the box. You can also add OpenAI API compatible endpoints by adding them to `app/llm_models.py`. 2. **Image Generation**: You can ask bot to generate images using DALL-E 3 model, use bot just like official chatgpt app. 3. **Dynamic Dialog Management**: The bot automatically manages the context of the conversation, eliminating the need for the user to manually reset the context using the /reset command. You still can reset dialog manually if needed. 4. **Automatic Context Summarization**: In case the context size exceeds the model's maximum limit, the bot automatically summarizes the context to ensure the continuity of the conversation. From a0a089a4edef85933576305d97d1eacf2581a844 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Mon, 29 Apr 2024 01:02:55 +0300 Subject: [PATCH 09/10] Refactor LLM configuration naming --- app/llm_models.py | 66 +++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/app/llm_models.py b/app/llm_models.py index 7c02c26..be1978c 100644 --- a/app/llm_models.py +++ b/app/llm_models.py @@ -6,14 +6,14 @@ @dataclasses.dataclass -class LLMModelPrice: +class LLMPrice: # price per 1000 tokens input_tokens_price: Decimal output_tokens_price: Decimal @dataclasses.dataclass -class LLMModelContextConfiguration: +class LLMContextConfiguration: # long term memory is based on embedding context search long_term_memory_tokens: int # short term memory is used for storing last messages @@ -32,11 +32,19 @@ class LLMCapabilities: image_processing: bool = False -class LLMModel: +class LLModel: + GPT_35_TURBO = 'gpt-3.5-turbo' + GPT_35_TURBO_16K = 'gpt-3.5-turbo-16k' + GPT_4 = 'gpt-4' + GPT_4_TURBO = 
'gpt-4-turbo' - GPT_4_TURBO_PREVIEW = 'gpt-4-turbo-preview' - GPT_4_VISION_PREVIEW = 'gpt-4-vision-preview' - LLAMA3 = 'llama3' - - @lru_cache def get_models(): models = {} if settings.OPENAI_TOKEN: models.update({ - LLModel.GPT_35_TURBO: LLMModel( + LLModel.GPT_35_TURBO: LLModel( model_name=LLModel.GPT_35_TURBO, api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=512, short_term_memory_tokens=2560, summary_length=512, hard_max_context_size=5*1024, ), - model_price=LLMModelPrice( + model_price=LLMPrice( input_tokens_price=Decimal('0.0005'), output_tokens_price=Decimal('0.0015'), ), @@ -82,16 +80,16 @@ def get_models(): function_calling=True, ), ), - LLModel.GPT_35_TURBO_16K: LLMModel( + LLModel.GPT_35_TURBO_16K: LLModel( model_name=LLModel.GPT_35_TURBO_16K, api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=1024, short_term_memory_tokens=4096, summary_length=1024, hard_max_context_size=17*1024, ), - model_price=LLMModelPrice( + model_price=LLMPrice( input_tokens_price=Decimal('0.003'), output_tokens_price=Decimal('0.004'), ), @@ -99,16 +97,16 @@ def get_models(): function_calling=True, ), ), - LLModel.GPT_4: LLMModel( + LLModel.GPT_4: LLModel( model_name=LLModel.GPT_4, api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=512, short_term_memory_tokens=2048, summary_length=1024, hard_max_context_size=9*1024, ), - model_price=LLMModelPrice( + model_price=LLMPrice( input_tokens_price=Decimal('0.03'), output_tokens_price=Decimal('0.06'), ), @@ -116,16 +114,16 @@ def get_models(): function_calling=True, ), ), - LLModel.GPT_4_TURBO: LLMModel( + LLModel.GPT_4_TURBO: LLModel( model_name=LLModel.GPT_4_TURBO, api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=512, short_term_memory_tokens=5120, summary_length=2048, hard_max_context_size=13*1024, ), - model_price=LLMModelPrice( + model_price=LLMPrice( input_tokens_price=Decimal('0.01'), output_tokens_price=Decimal('0.03'), ), @@ -134,16 +132,16 @@ def get_models(): image_processing=True, ), ), - LLModel.GPT_4_TURBO_PREVIEW: LLMModel( + LLModel.GPT_4_TURBO_PREVIEW: LLModel( model_name=LLModel.GPT_4_TURBO_PREVIEW, api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=512, short_term_memory_tokens=5120, summary_length=2048, hard_max_context_size=13*1024, ), - model_price=LLMModelPrice( + model_price=LLMPrice( input_tokens_price=Decimal('0.01'), output_tokens_price=Decimal('0.03'), ), @@ -151,16 +149,16 @@ def get_models(): function_calling=True, ), ), - LLModel.GPT_4_VISION_PREVIEW: LLMModel( + LLModel.GPT_4_VISION_PREVIEW: LLModel( model_name=LLModel.GPT_4_VISION_PREVIEW, api_key=settings.OPENAI_TOKEN, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=512, short_term_memory_tokens=5120, summary_length=2048, hard_max_context_size=13*1024, ), - model_price=LLMModelPrice( + model_price=LLMPrice( input_tokens_price=Decimal('0.01'), output_tokens_price=Decimal('0.03'), ), @@ -172,10 +170,10 @@ def get_models(): # example of using llama3 model in ollama if 
settings.OLLAMA_BASE_URL: - models[LLModel.LLAMA3] = LLMModel( + models[LLModel.LLAMA3] = LLModel( model_name=LLModel.LLAMA3, api_key=settings.OLLAMA_API_KEY, - context_configuration=LLMModelContextConfiguration( + context_configuration=LLMContextConfiguration( long_term_memory_tokens=512, short_term_memory_tokens=2048, summary_length=512, From 63c9643c56a22c65b586ccd165fd1601de8ecc10 Mon Sep 17 00:00:00 2001 From: Maxim Nechaev Date: Mon, 29 Apr 2024 01:39:51 +0300 Subject: [PATCH 10/10] Remove function_storage from processing pipeline earlier if model doesn't support tool calls or function calls --- app/bot/message_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/bot/message_processor.py b/app/bot/message_processor.py index 4ed5b4e..7f02ccd 100644 --- a/app/bot/message_processor.py +++ b/app/bot/message_processor.py @@ -10,6 +10,7 @@ from app.bot.utils import send_telegram_message, detect_and_extract_code, edit_telegram_message from app.context.context_manager import build_context_manager from app.context.dialog_manager import DialogUtils +from app.llm_models import get_model_by_name from app.openai_helpers.chatgpt import ChatGPT from app.openai_helpers.count_tokens import calculate_image_tokens from app.storage.db import DB, User, MessageType @@ -73,7 +74,10 @@ async def prepare_user_message(message: Message): async def process(self, is_cancelled): context_manager = await self.context_manager() - function_storage = await context_manager.get_function_storage() + llm_model = get_model_by_name(self.user.current_model) + function_storage = None + if llm_model.capabilities.tool_calling or llm_model.capabilities.function_calling: + function_storage = await context_manager.get_function_storage() system_prompt = await context_manager.get_system_prompt() chat_gpt_manager = ChatGptManager(ChatGPT(self.user.current_model, system_prompt, function_storage), self.db)
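The README change in PATCH 08 points at app/llm_models.py as the place to register additional OpenAI API compatible endpoints, and PATCH 09 shows the final LLModel constructor together with the llama3/Ollama example. The sketch below illustrates what such a registration could look like; the model name 'my-model' and the settings constants MY_MODEL_BASE_URL and MY_MODEL_API_KEY are hypothetical placeholders that do not exist in the repository, and the context sizes simply mirror the llama3 example.

    # Sketch only: a hypothetical extra entry inside get_models() in app/llm_models.py,
    # placed next to the existing llama3 block. The MY_MODEL_* settings are assumed,
    # not part of settings.py.
    if settings.MY_MODEL_BASE_URL:
        models['my-model'] = LLModel(
            model_name='my-model',
            api_key=settings.MY_MODEL_API_KEY,
            context_configuration=LLMContextConfiguration(
                long_term_memory_tokens=512,
                short_term_memory_tokens=2048,
                summary_length=512,
                hard_max_context_size=13*1024,
            ),
            base_url=settings.MY_MODEL_BASE_URL,
            # no function calling or image processing assumed for this endpoint
            capabilities=LLMCapabilities(),
        )

A model registered this way is picked up automatically by the admin-only 'all_models' setting added in PATCH 03, since that menu is built from list(get_models().keys()).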