diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index d10da4b4e97..5abb8a7ec64 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`."""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -382,41 +385,86 @@ def _invocation_params(self) -> Dict[str, Any]:
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both `tiktoken`
+        and HuggingFace tokenizers, selected by the `tiktoken_enabled` flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
 
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, use a HuggingFace tokenizer instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ImportError(
+                    "Could not import transformers python package. "
+                    "This is needed for OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
" + ) + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name ) - for j in range(0, len(token), self.embedding_ctx_length): - tokens.append(token[j : j + self.embedding_ctx_length]) - indices.append(i) + for i, text in enumerate(texts): + # Tokenize the text using HuggingFace transformers + tokenized = tokenizer.encode(text, add_special_tokens=False) + + # Split tokens into chunks respecting the embedding_ctx_length + for j in range(0, len(tokenized), self.embedding_ctx_length): + token_chunk = tokenized[j : j + self.embedding_ctx_length] + + # Convert token IDs back to a string + chunk_text = tokenizer.decode(token_chunk) + tokens.append(chunk_text) + indices.append(i) + else: + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to for OpenAIEmbeddings. " + "Please install it with `pip install tiktoken`." + ) - batched_embeddings: List[List[float]] = [] - _chunk_size = chunk_size or self.chunk_size + try: + encoding = tiktoken.encoding_for_model(model_name) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + encoding = tiktoken.get_encoding(model) + for i, text in enumerate(texts): + if self.model.endswith("001"): + # See: https://github.com/openai/openai-python/ + # issues/418#issuecomment-1525939500 + # replace newlines, which can negatively affect performance. + text = text.replace("\n", " ") + + token = encoding.encode( + text=text, + allowed_special=self.allowed_special, + disallowed_special=self.disallowed_special, + ) + + # Split tokens into chunks respecting the embedding_ctx_length + for j in range(0, len(token), self.embedding_ctx_length): + tokens.append(token[j : j + self.embedding_ctx_length]) + indices.append(i) if self.show_progress_bar: try: @@ -428,6 +476,7 @@ def _get_len_safe_embeddings( else: _iter = range(0, len(tokens), _chunk_size) + batched_embeddings: List[List[float]] = [] for i in _iter: response = embed_with_retry( self, @@ -446,6 +495,7 @@ def _get_len_safe_embeddings( results[indices[i]].append(batched_embeddings[i]) num_tokens_in_batch[indices[i]].append(len(tokens[i])) + embeddings: List[List[float]] = [[] for _ in range(len(texts))] for i in range(len(texts)): _result = results[i] if len(_result) == 0: @@ -468,38 +518,86 @@ def _get_len_safe_embeddings( async def _aget_len_safe_embeddings( self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - try: - import tiktoken - except ImportError: - raise ImportError( - "Could not import tiktoken python package. " - "This is needed in order to for OpenAIEmbeddings. " - "Please install it with `pip install tiktoken`." - ) + """ + Asynchronously generate length-safe embeddings for a list of texts. + + This method handles tokenization and asynchronous embedding generation, + respecting the set embedding context length and chunk size. It supports both + `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag. + + Args: + texts (List[str]): A list of texts to embed. + engine (str): The engine or model to use for embeddings. + chunk_size (Optional[int]): The size of chunks for processing embeddings. + + Returns: + List[List[float]]: A list of embeddings for each input text. 
+ """ tokens = [] indices = [] model_name = self.tiktoken_model_name or self.model - try: - encoding = tiktoken.encoding_for_model(model_name) - except KeyError: - logger.warning("Warning: model not found. Using cl100k_base encoding.") - model = "cl100k_base" - encoding = tiktoken.get_encoding(model) - for i, text in enumerate(texts): - if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 - # replace newlines, which can negatively affect performance. - text = text.replace("\n", " ") - token = encoding.encode( - text, - allowed_special=self.allowed_special, - disallowed_special=self.disallowed_special, + _chunk_size = chunk_size or self.chunk_size + + # If tiktoken flag set to False + if not self.tiktoken_enabled: + try: + from transformers import AutoTokenizer + except ImportError: + raise ValueError( + "Could not import transformers python package. " + "This is needed in order to for OpenAIEmbeddings without " + " `tiktoken`. Please install it with `pip install transformers`." + ) + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name ) - for j in range(0, len(token), self.embedding_ctx_length): - tokens.append(token[j : j + self.embedding_ctx_length]) - indices.append(i) + for i, text in enumerate(texts): + # Tokenize the text using HuggingFace transformers + tokenized = tokenizer.encode(text, add_special_tokens=False) + + # Split tokens into chunks respecting the embedding_ctx_length + for j in range(0, len(tokenized), self.embedding_ctx_length): + token_chunk = tokenized[j : j + self.embedding_ctx_length] + + # Convert token IDs back to a string + chunk_text = tokenizer.decode(token_chunk) + tokens.append(chunk_text) + indices.append(i) + else: + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to for OpenAIEmbeddings. " + "Please install it with `pip install tiktoken`." + ) + + try: + encoding = tiktoken.encoding_for_model(model_name) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + encoding = tiktoken.get_encoding(model) + for i, text in enumerate(texts): + if self.model.endswith("001"): + # See: https://github.com/openai/openai-python/ + # issues/418#issuecomment-1525939500 + # replace newlines, which can negatively affect performance. + text = text.replace("\n", " ") + + token = encoding.encode( + text=text, + allowed_special=self.allowed_special, + disallowed_special=self.disallowed_special, + ) + + # Split tokens into chunks respecting the embedding_ctx_length + for j in range(0, len(token), self.embedding_ctx_length): + tokens.append(token[j : j + self.embedding_ctx_length]) + indices.append(i) batched_embeddings: List[List[float]] = [] _chunk_size = chunk_size or self.chunk_size @@ -520,6 +618,7 @@ async def _aget_len_safe_embeddings( results[indices[i]].append(batched_embeddings[i]) num_tokens_in_batch[indices[i]].append(len(tokens[i])) + embeddings: List[List[float]] = [[] for _ in range(len(texts))] for i in range(len(texts)): _result = results[i] if len(_result) == 0: