From efba13c2b92bb810cc1e2fb68881a6ddc5667536 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Sun, 26 Nov 2023 23:07:45 +0000
Subject: [PATCH 1/6] Extend OpenAIEmbeddings class to support non-`tiktoken`
 based embeddings, supporting the new `text-generation-webui` API for
 Non-OpenAI-based Embeddings Support

---
 libs/langchain/langchain/embeddings/openai.py | 206 +++++++++++++-----
 1 file changed, 150 insertions(+), 56 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index d10da4b4e97132..6a960e9edeb503 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`"""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -383,41 +386,85 @@ def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
         embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        batched_embeddings: List[List[float]] = []
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
-
-        batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
+                    "Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to for OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
+
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
@@ -428,6 +475,7 @@ def _get_len_safe_embeddings(
         else:
             _iter = range(0, len(tokens), _chunk_size)
 
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
@@ -469,37 +517,83 @@ async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
         embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation, respecting
+        the set embedding context length and chunk size. It supports both tiktoken
+        and Hugging Face tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
+                    "Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to for OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
 
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
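For context on what this first patch enables: with `tiktoken_enabled=False`, the class can sit in front of any OpenAI-compatible embeddings endpoint. A minimal usage sketch follows; the server URL, API key, and model name are illustrative placeholders, not values taken from the patch:

    # Hypothetical usage of the new flag against a local OpenAI-compatible
    # server, e.g. text-generation-webui started with `--extensions openai`.
    from langchain.embeddings import OpenAIEmbeddings

    embedder = OpenAIEmbeddings(
        tiktoken_enabled=False,  # chunk with a HuggingFace tokenizer instead
        model="sentence-transformers/all-mpnet-base-v2",  # must resolve on the HF Hub
        openai_api_base="http://localhost:5001/v1",  # placeholder local endpoint
        openai_api_key="dummy",  # local servers typically ignore the key
    )
    vectors = embedder.embed_documents(["first document", "second document"])
    print(len(vectors), len(vectors[0]))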
From 1780814effac34343870ffc004606516e265f5c2 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Mon, 27 Nov 2023 06:33:31 +0000
Subject: [PATCH 2/6] correct lint errors with modified code

---
 libs/langchain/langchain/embeddings/openai.py | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 6a960e9edeb503..861b7f7c37e38b 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -415,11 +415,13 @@ def _get_len_safe_embeddings(
             except ImportError:
                 raise ValueError(
                     "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
-                    "Please install it with `pip install transformers`."
+                    "This is needed in order to for OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`. "
                 )
 
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
             for i, text in enumerate(texts):
                 # Tokenize the text using HuggingFace transformers
                 tokenized = tokenizer.encode(text, add_special_tokens=False)
@@ -520,9 +522,9 @@ async def _aget_len_safe_embeddings(
         """
         Asynchronously generate length-safe embeddings for a list of texts.
 
-        This method handles tokenization and asynchronous embedding generation, respecting
-        the set embedding context length and chunk size. It supports both tiktoken
-        and Hugging Face tokenizer based on the tiktoken_enabled flag.
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports both
+        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.
 
         Args:
             texts (List[str]): A list of texts to embed.
@@ -545,11 +547,13 @@ async def _aget_len_safe_embeddings(
             except ImportError:
                 raise ValueError(
                     "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without `tiktoken`. "
-                    "Please install it with `pip install transformers`."
+                    "This is needed in order to for OpenAIEmbeddings without "
+                    " `tiktoken`. Please install it with `pip install transformers`."
                )
 
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
             for i, text in enumerate(texts):
                 # Tokenize the text using HuggingFace transformers
                 tokenized = tokenizer.encode(text, add_special_tokens=False)
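The transformers code path being line-wrapped above reduces to encode, slice, decode: token ids are cut to the context length and each slice is turned back into a string, since a generic OpenAI-compatible server may not accept token-id arrays the way the official API does. A self-contained sketch of that chunking strategy, with an assumed tokenizer name:

    # Minimal sketch of the non-tiktoken chunking strategy; the model name is
    # an assumption for illustration, any Hub tokenizer would do.
    from transformers import AutoTokenizer

    embedding_ctx_length = 8191
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def chunk_text(text: str) -> list[str]:
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        return [
            tokenizer.decode(token_ids[j : j + embedding_ctx_length])
            for j in range(0, len(token_ids), embedding_ctx_length)
        ]

    print(chunk_text("some long document " * 10))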
From b349267f2aa8f3f15c099b63116e5275ea923733 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Mon, 27 Nov 2023 16:37:18 +0000
Subject: [PATCH 3/6] more linter corrections

---
 libs/langchain/langchain/embeddings/openai.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 861b7f7c37e38b..a0255ad99d2f2a 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -429,7 +429,7 @@ def _get_len_safe_embeddings(
                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(tokenized), self.embedding_ctx_length):
                     token_chunk = tokenized[j : j + self.embedding_ctx_length]
-
+
                     # Convert token IDs back to a string
                     chunk_text = tokenizer.decode(token_chunk)
                     tokens.append(chunk_text)
@@ -452,7 +452,8 @@ def _get_len_safe_embeddings(
                 encoding = tiktoken.get_encoding(model)
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
@@ -584,7 +585,8 @@ async def _aget_len_safe_embeddings(
                 encoding = tiktoken.get_encoding(model)
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
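For comparison, the default tiktoken branch touched above batches raw token-id slices rather than decoded strings. Reduced to a standalone sketch:

    # The tiktoken branch in miniature: unknown model names fall back to the
    # cl100k_base encoding, mirroring the logger.warning path in the patch.
    import tiktoken

    embedding_ctx_length = 8191

    try:
        encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    def chunk_tokens(text: str) -> list[list[int]]:
        token_ids = encoding.encode(text)
        return [
            token_ids[j : j + embedding_ctx_length]
            for j in range(0, len(token_ids), embedding_ctx_length)
        ]

    print(len(chunk_tokens("some long document " * 10)))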
" ) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name) + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name + ) for i, text in enumerate(texts): # Tokenize the text using HuggingFace transformers tokenized = tokenizer.encode(text, add_special_tokens=False) @@ -520,9 +522,9 @@ async def _aget_len_safe_embeddings( """ Asynchronously generate length-safe embeddings for a list of texts. - This method handles tokenization and asynchronous embedding generation, respecting - the set embedding context length and chunk size. It supports both tiktoken - and Hugging Face tokenizer based on the tiktoken_enabled flag. + This method handles tokenization and asynchronous embedding generation, + respecting the set embedding context length and chunk size. It supports both + `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag. Args: texts (List[str]): A list of texts to embed. @@ -545,11 +547,13 @@ async def _aget_len_safe_embeddings( except ImportError: raise ValueError( "Could not import transformers python package. " - "This is needed in order to for OpenAIEmbeddings without `tiktoken`. " - "Please install it with `pip install transformers`." + "This is needed in order to for OpenAIEmbeddings without " + " `tiktoken`. Please install it with `pip install transformers`." ) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name) + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name + ) for i, text in enumerate(texts): # Tokenize the text using HuggingFace transformers tokenized = tokenizer.encode(text, add_special_tokens=False) From b349267f2aa8f3f15c099b63116e5275ea923733 Mon Sep 17 00:00:00 2001 From: "Chelsea E. Manning" Date: Mon, 27 Nov 2023 16:37:18 +0000 Subject: [PATCH 3/6] more linter corrections --- libs/langchain/langchain/embeddings/openai.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index 861b7f7c37e38b..a0255ad99d2f2a 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -429,7 +429,7 @@ def _get_len_safe_embeddings( # Split tokens into chunks respecting the embedding_ctx_length for j in range(0, len(tokenized), self.embedding_ctx_length): token_chunk = tokenized[j : j + self.embedding_ctx_length] - + # Convert token IDs back to a string chunk_text = tokenizer.decode(token_chunk) tokens.append(chunk_text) @@ -452,7 +452,8 @@ def _get_len_safe_embeddings( encoding = tiktoken.get_encoding(model) for i, text in enumerate(texts): if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # See: https://github.com/openai/openai-python/ + # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") @@ -584,7 +585,8 @@ async def _aget_len_safe_embeddings( encoding = tiktoken.get_encoding(model) for i, text in enumerate(texts): if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # See: https://github.com/openai/openai-python/ + # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") From b78c7584af47853e6a07d07afc61d90c8cc125dc Mon Sep 17 00:00:00 2001 From: "Chelsea E. 
Manning" Date: Mon, 27 Nov 2023 16:39:18 +0000 Subject: [PATCH 4/6] final linter error corrected --- libs/langchain/langchain/embeddings/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index a0255ad99d2f2a..04ea75d92ec530 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -562,7 +562,7 @@ async def _aget_len_safe_embeddings( # Split tokens into chunks respecting the embedding_ctx_length for j in range(0, len(tokenized), self.embedding_ctx_length): token_chunk = tokenized[j : j + self.embedding_ctx_length] - + # Convert token IDs back to a string chunk_text = tokenizer.decode(token_chunk) tokens.append(chunk_text) From 1111a4a69a2ba78cceb26cf91b0f4c29da43dab8 Mon Sep 17 00:00:00 2001 From: "Chelsea E. Manning" Date: Mon, 27 Nov 2023 18:33:59 +0000 Subject: [PATCH 5/6] remove two characters of whitespace --- libs/langchain/langchain/embeddings/openai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index 04ea75d92ec530..8f9fae63fa845f 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -456,7 +456,7 @@ def _get_len_safe_embeddings( # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") - + token = encoding.encode( text=text, allowed_special=self.allowed_special, @@ -589,7 +589,7 @@ async def _aget_len_safe_embeddings( # issues/418#issuecomment-1525939500 # replace newlines, which can negatively affect performance. text = text.replace("\n", " ") - + token = encoding.encode( text=text, allowed_special=self.allowed_special, From 03750ba6377bab8d918e2cd8ff20581320dfa311 Mon Sep 17 00:00:00 2001 From: "Chelsea E. Manning" Date: Mon, 27 Nov 2023 23:34:43 +0000 Subject: [PATCH 6/6] minor adjustments for the linter --- libs/langchain/langchain/embeddings/openai.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py index 8f9fae63fa845f..5abb8a7ec6497b 100644 --- a/libs/langchain/langchain/embeddings/openai.py +++ b/libs/langchain/langchain/embeddings/openai.py @@ -385,8 +385,6 @@ def _invocation_params(self) -> Dict[str, Any]: def _get_len_safe_embeddings( self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - batched_embeddings: List[List[float]] = [] """ Generate length-safe embeddings for a list of texts. @@ -497,6 +495,7 @@ def _get_len_safe_embeddings( results[indices[i]].append(batched_embeddings[i]) num_tokens_in_batch[indices[i]].append(len(tokens[i])) + embeddings: List[List[float]] = [[] for _ in range(len(texts))] for i in range(len(texts)): _result = results[i] if len(_result) == 0: @@ -519,11 +518,10 @@ def _get_len_safe_embeddings( async def _aget_len_safe_embeddings( self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] """ Asynchronously generate length-safe embeddings for a list of texts. 
From 03750ba6377bab8d918e2cd8ff20581320dfa311 Mon Sep 17 00:00:00 2001
From: "Chelsea E. Manning"
Date: Mon, 27 Nov 2023 23:34:43 +0000
Subject: [PATCH 6/6] minor adjustments for the linter

---
 libs/langchain/langchain/embeddings/openai.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 8f9fae63fa845f..5abb8a7ec6497b 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -385,8 +385,6 @@ def _invocation_params(self) -> Dict[str, Any]:
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        batched_embeddings: List[List[float]] = []
         """
         Generate length-safe embeddings for a list of texts.
 
@@ -497,6 +495,7 @@ def _get_len_safe_embeddings(
                 results[indices[i]].append(batched_embeddings[i])
                 num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
@@ -519,11 +518,10 @@ def _get_len_safe_embeddings(
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         """
         Asynchronously generate length-safe embeddings for a list of texts.
 
-        This method handles tokenization and asynchronous embedding generation, 
+        This method handles tokenization and asynchronous embedding generation,
         respecting the set embedding context length and chunk size. It supports both
         `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.
 
@@ -620,6 +618,7 @@ async def _aget_len_safe_embeddings(
                 results[indices[i]].append(batched_embeddings[i])
                 num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
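With the series complete, the async path mirrors the sync one end to end. A closing sketch of asynchronous use; the endpoint, key, and model below are again placeholders rather than values from the patches:

    import asyncio

    from langchain.embeddings import OpenAIEmbeddings

    async def main() -> None:
        embedder = OpenAIEmbeddings(
            tiktoken_enabled=False,
            model="sentence-transformers/all-mpnet-base-v2",
            openai_api_base="http://localhost:5001/v1",
            openai_api_key="dummy",
        )
        vectors = await embedder.aembed_documents(["alpha", "beta"])
        query_vector = await embedder.aembed_query("alpha")
        print(len(vectors), len(query_vector))

    asyncio.run(main())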