From efba13c2b92bb810cc1e2fb68881a6ddc5667536 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Sun, 26 Nov 2023 23:07:45 +0000
Subject: [PATCH 1/6] Extend OpenAIEmbeddings class to support non-`tiktoken`
 based embeddings, supporting the new `text-generation-webui` API for
 Non-OpenAI-based Embeddings Support

---
 libs/langchain/langchain/embeddings/openai.py | 206 +++++++++++++-----
 1 file changed, 150 insertions(+), 56 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index d10da4b4e97132..6a960e9edeb503 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`"""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -383,41 +386,85 @@ def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
         embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        batched_embeddings: List[List[float]] = []
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
-
-        batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
+                    "Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to for OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
+
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
@@ -428,6 +475,7 @@ def _get_len_safe_embeddings(
         else:
             _iter = range(0, len(tokens), _chunk_size)
 
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
@@ -469,37 +517,83 @@ async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
         embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation, respecting
+        the set embedding context length and chunk size. It supports both tiktoken
+        and Hugging Face tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
+                    "Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to for OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
 
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
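For context on what this first patch enables: with `tiktoken_enabled=False`, the class can sit in front of any OpenAI-compatible embeddings endpoint. A minimal usage sketch follows; the server URL, API key, and model name are illustrative placeholders, not values taken from the patch:

    # Hypothetical usage of the new flag against a local OpenAI-compatible
    # server, e.g. text-generation-webui started with `--extensions openai`.
    from langchain.embeddings import OpenAIEmbeddings

    embedder = OpenAIEmbeddings(
        tiktoken_enabled=False,  # chunk with a HuggingFace tokenizer instead
        model="sentence-transformers/all-mpnet-base-v2",  # must resolve on the HF Hub
        openai_api_base="http://localhost:5001/v1",  # placeholder local endpoint
        openai_api_key="dummy",  # local servers typically ignore the key
    )
    vectors = embedder.embed_documents(["first document", "second document"])
    print(len(vectors), len(vectors[0]))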
From 1780814effac34343870ffc004606516e265f5c2 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Mon, 27 Nov 2023 06:33:31 +0000
Subject: [PATCH 2/6] correct lint errors with modified code

---
 libs/langchain/langchain/embeddings/openai.py | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 6a960e9edeb503..861b7f7c37e38b 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -415,11 +415,13 @@ def _get_len_safe_embeddings(
             except ImportError:
                 raise ValueError(
                     "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
-                    "Please install it with `pip install transformers`."
+                    "This is needed in order to for OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`. "
                 )
 
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
             for i, text in enumerate(texts):
                 # Tokenize the text using HuggingFace transformers
                 tokenized = tokenizer.encode(text, add_special_tokens=False)
@@ -520,9 +522,9 @@ async def _aget_len_safe_embeddings(
         """
         Asynchronously generate length-safe embeddings for a list of texts.
 
-        This method handles tokenization and asynchronous embedding generation, respecting
-        the set embedding context length and chunk size. It supports both tiktoken
-        and Hugging Face tokenizer based on the tiktoken_enabled flag.
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports both
+        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.
 
         Args:
             texts (List[str]): A list of texts to embed.
@@ -545,11 +547,13 @@ async def _aget_len_safe_embeddings(
             except ImportError:
                 raise ValueError(
                     "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
-                    "Please install it with `pip install transformers`."
+                    "This is needed in order to for OpenAIEmbeddings without "
+                    " `tiktoken`. Please install it with `pip install transformers`."
                )
 
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
             for i, text in enumerate(texts):
                 # Tokenize the text using HuggingFace transformers
                 tokenized = tokenizer.encode(text, add_special_tokens=False)
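The transformers code path being line-wrapped above reduces to encode, slice, decode: token ids are cut to the context length and each slice is turned back into a string, since a generic OpenAI-compatible server may not accept token-id arrays the way the official API does. A self-contained sketch of that chunking strategy, with an assumed tokenizer name:

    # Minimal sketch of the non-tiktoken chunking strategy; the model name is
    # an assumption for illustration, any Hub tokenizer would do.
    from transformers import AutoTokenizer

    embedding_ctx_length = 8191
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def chunk_text(text: str) -> list[str]:
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        return [
            tokenizer.decode(token_ids[j : j + embedding_ctx_length])
            for j in range(0, len(token_ids), embedding_ctx_length)
        ]

    print(chunk_text("some long document " * 10))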
From b349267f2aa8f3f15c099b63116e5275ea923733 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Mon, 27 Nov 2023 16:37:18 +0000
Subject: [PATCH 3/6] more linter corrections

---
 libs/langchain/langchain/embeddings/openai.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 861b7f7c37e38b..a0255ad99d2f2a 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -429,7 +429,7 @@ def _get_len_safe_embeddings(
                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(tokenized), self.embedding_ctx_length):
                     token_chunk = tokenized[j : j + self.embedding_ctx_length]
-
+
                     # Convert token IDs back to a string
                     chunk_text = tokenizer.decode(token_chunk)
                     tokens.append(chunk_text)
@@ -452,7 +452,8 @@ def _get_len_safe_embeddings(
                 encoding = tiktoken.get_encoding(model)
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
@@ -584,7 +585,8 @@ async def _aget_len_safe_embeddings(
                 encoding = tiktoken.get_encoding(model)
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
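For comparison, the default tiktoken branch touched above batches raw token-id slices rather than decoded strings. Reduced to a standalone sketch:

    # The tiktoken branch in miniature: unknown model names fall back to the
    # cl100k_base encoding, mirroring the logger.warning path in the patch.
    import tiktoken

    embedding_ctx_length = 8191

    try:
        encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    def chunk_tokens(text: str) -> list[list[int]]:
        token_ids = encoding.encode(text)
        return [
            token_ids[j : j + embedding_ctx_length]
            for j in range(0, len(token_ids), embedding_ctx_length)
        ]

    print(len(chunk_tokens("some long document " * 10)))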
" ) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name) + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name + ) for i, text in enumerate(texts): # Tokenize the text using HuggingFace transformers tokenized = tokenizer.encode(text, add_special_tokens=False) @@ -520,9 +522,9 @@ async def _aget_len_safe_embeddings( """ Asynchronously generate length-safe embeddings for a list of texts. - This method handles tokenization and asynchronous embedding generation, respecting - the set embedding context length and chunk size. It supports both tiktoken - and Hugging Face tokenizer based on the tiktoken_enabled flag. + This method handles tokenization and asynchronous embedding generation, + respecting the set embedding context length and chunk size. It supports both + `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag. Args: texts (List[str]): A list of texts to embed. @@ -545,11 +547,13 @@ async def _aget_len_safe_embeddings( except ImportError: raise ValueError( "Could not import transformers python package. " - "This is needed in order to for OpenAIEmbeddings without `tiktoken`. " - "Please install it with `pip install transformers`." + "This is needed in order to for OpenAIEmbeddings without " + " `tiktoken`. Please install it with `pip install transformers`." ) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name) + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name + ) for i, text in enumerate(texts): # Tokenize the text using HuggingFace transformers tokenized = tokenizer.encode(text, add_special_tokens=False) From b349267f2aa8f3f15c099b63116e5275ea923733 Mon Sep 17 00:00:00 2001 From: "Chelsea E. Manning" Date: Mon, 27 Nov 2023 16:37:18 +0000 Subject: [PATCH 3/6] more linter corrections --- libs/langchain/langchain/embeddings/openai.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index 861b7f7c37e38b..a0255ad99d2f2a 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -429,7 +429,7 @@ def _get_len_safe_embeddings( # Split tokens into chunks respecting the embedding_ctx_length for j in range(0, len(tokenized), self.embedding_ctx_length): token_chunk = tokenized[j : j + self.embedding_ctx_length] - + # Convert token IDs back to a string chunk_text = tokenizer.decode(token_chunk) tokens.append(chunk_text) @@ -452,7 +452,8 @@ def _get_len_safe_embeddings( encoding = tiktoken.get_encoding(model) for i, text in enumerate(texts): if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # See: https://github.com/openai/openai-python/ + # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") @@ -584,7 +585,8 @@ async def _aget_len_safe_embeddings( encoding = tiktoken.get_encoding(model) for i, text in enumerate(texts): if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # See: https://github.com/openai/openai-python/ + # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") From b78c7584af47853e6a07d07afc61d90c8cc125dc Mon Sep 17 00:00:00 2001 From: "Chelsea E. 
Manning" Date: Mon, 27 Nov 2023 16:39:18 +0000 Subject: [PATCH 4/6] final linter error corrected --- libs/langchain/langchain/embeddings/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index a0255ad99d2f2a..04ea75d92ec530 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -562,7 +562,7 @@ async def _aget_len_safe_embeddings( # Split tokens into chunks respecting the embedding_ctx_length for j in range(0, len(tokenized), self.embedding_ctx_length): token_chunk = tokenized[j : j + self.embedding_ctx_length] - + # Convert token IDs back to a string chunk_text = tokenizer.decode(token_chunk) tokens.append(chunk_text) From 1111a4a69a2ba78cceb26cf91b0f4c29da43dab8 Mon Sep 17 00:00:00 2001 From: "Chelsea E. Manning" Date: Mon, 27 Nov 2023 18:33:59 +0000 Subject: [PATCH 5/6] remove two characters of whitespace --- libs/langchain/langchain/embeddings/openai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index 04ea75d92ec530..8f9fae63fa845f 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -456,7 +456,7 @@ def _get_len_safe_embeddings( # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") - + token = encoding.encode( text=text, allowed_special=self.allowed_special, @@ -589,7 +589,7 @@ async def _aget_len_safe_embeddings( # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") - + token = encoding.encode( text=text, allowed_special=self.allowed_special, From 03750ba6377bab8d918e2cd8ff20581320dfa311 Mon Sep 17 00:00:00 2001 From: "Chelsea E. Manning" Date: Mon, 27 Nov 2023 23:34:43 +0000 Subject: [PATCH 6/6] minor adjustments for the linter --- libs/langchain/langchain/embeddings/openai.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index 8f9fae63fa845f..5abb8a7ec6497b 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -385,8 +385,6 @@ def _invocation_params(self) -> Dict[str, Any]: def _get_len_safe_embeddings( self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - batched_embeddings: List[List[float]] = [] """ Generate length-safe embeddings for a list of texts. @@ -497,6 +495,7 @@ def _get_len_safe_embeddings( results[indices[i]].append(batched_embeddings[i]) num_tokens_in_batch[indices[i]].append(len(tokens[i])) + embeddings: List[List[float]] = [[] for _ in range(len(texts))] for i in range(len(texts)): _result = results[i] if len(_result) == 0: @@ -519,11 +518,10 @@ def _get_len_safe_embeddings( async def _aget_len_safe_embeddings( self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] """ Asynchronously generate length-safe embeddings for a list of texts. 
From 03750ba6377bab8d918e2cd8ff20581320dfa311 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Mon, 27 Nov 2023 23:34:43 +0000
Subject: [PATCH 6/6] minor adjustments for the linter

---
 libs/langchain/langchain/embeddings/openai.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 8f9fae63fa845f..5abb8a7ec6497b 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -385,8 +385,6 @@ def _invocation_params(self) -> Dict[str, Any]:
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        batched_embeddings: List[List[float]] = []
         """
         Generate length-safe embeddings for a list of texts.
 
@@ -497,6 +495,7 @@ def _get_len_safe_embeddings(
                 results[indices[i]].append(batched_embeddings[i])
                 num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
@@ -519,11 +518,10 @@ def _get_len_safe_embeddings(
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         """
         Asynchronously generate length-safe embeddings for a list of texts.
 
-        This method handles tokenization and asynchronous embedding generation, 
+        This method handles tokenization and asynchronous embedding generation,
         respecting the set embedding context length and chunk size. It supports both
         `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.
 
@@ -620,6 +618,7 @@ async def _aget_len_safe_embeddings(
                 results[indices[i]].append(batched_embeddings[i])
                 num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
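With the series complete, the async path mirrors the sync one end to end. A closing sketch of asynchronous use; the endpoint, key, and model below are again placeholders rather than values from the patches:

    import asyncio

    from langchain.embeddings import OpenAIEmbeddings

    async def main() -> None:
        embedder = OpenAIEmbeddings(
            tiktoken_enabled=False,
            model="sentence-transformers/all-mpnet-base-v2",
            openai_api_base="http://localhost:5001/v1",
            openai_api_key="dummy",
        )
        vectors = await embedder.aembed_documents(["alpha", "beta"])
        query_vector = await embedder.aembed_query("alpha")
        print(len(vectors), len(query_vector))

    asyncio.run(main())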