-
Notifications
You must be signed in to change notification settings - Fork 13.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Chroma multimodal cookbook (#12952)
Pending: * chroma-core/chroma#1294 * chroma-core/chroma#1293 --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <baskaryan@gmail.com>
- Loading branch information
1 parent
5591286
commit d2e50b3
Showing
8 changed files
with
838 additions
and
121 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
3 changes: 3 additions & 0 deletions
3
libs/experimental/langchain_experimental/open_clip/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .open_clip import OpenCLIPEmbeddings | ||
|
||
__all__ = ["OpenCLIPEmbeddings"] |
87 changes: 87 additions & 0 deletions
87
libs/experimental/langchain_experimental/open_clip/open_clip.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
from typing import Any, Dict, List | ||
|
||
from langchain.pydantic_v1 import BaseModel, root_validator | ||
from langchain.schema.embeddings import Embeddings | ||
|
||
|
||
class OpenCLIPEmbeddings(BaseModel, Embeddings):
    """OpenCLIP multimodal embeddings for text and images.

    Loads an OpenCLIP model at construction time and exposes
    ``embed_documents`` / ``embed_query`` for text and ``embed_image``
    for image files.  All returned vectors are L2-normalized.
    """

    # Populated by the root validator; opaque handles into open_clip.
    model: Any
    preprocess: Any
    tokenizer: Any
    # Model/checkpoint pair to load.  Defaults preserve the previously
    # hard-coded values (larger, more performant).  A smaller, less
    # performant alternative: model_name="ViT-B-32",
    # checkpoint="laion2b_s34b_b79k".
    model_name: str = "ViT-g-14"
    checkpoint: str = "laion2b_s34b_b88k"

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that open_clip and torch libraries are installed.

        On success, populates ``model``, ``preprocess`` and ``tokenizer``
        in ``values`` from the configured ``model_name``/``checkpoint``.

        Raises:
            ImportError: If ``open_clip`` (or its ``torch`` dependency)
                is not installed.
        """
        try:
            import open_clip

            # Fall back to the field defaults if the keys are absent
            # (e.g. when construction bypassed normal field handling).
            model_name = values.get("model_name", "ViT-g-14")
            checkpoint = values.get("checkpoint", "laion2b_s34b_b88k")
            model, _, preprocess = open_clip.create_model_and_transforms(
                model_name=model_name, pretrained=checkpoint
            )
            tokenizer = open_clip.get_tokenizer(model_name)
            values["model"] = model
            values["preprocess"] = preprocess
            values["tokenizer"] = tokenizer
        except ImportError:
            raise ImportError(
                "Please ensure both open_clip and torch libraries are installed. "
                "pip install open_clip_torch torch"
            )
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts.

        Args:
            texts: Strings to embed.

        Returns:
            One L2-normalized embedding (list of floats) per input text.
        """
        text_features = []
        for text in texts:
            # Tokenize the text
            tokenized_text = self.tokenizer(text)

            # Encode the text to get the embeddings
            embeddings_tensor = self.model.encode_text(tokenized_text)

            # Normalize the embeddings (L2 norm along the feature dim)
            norm = embeddings_tensor.norm(p=2, dim=1, keepdim=True)
            normalized_embeddings_tensor = embeddings_tensor.div(norm)

            # Convert normalized tensor to list and add to the text_features list
            embeddings_list = normalized_embeddings_tensor.squeeze(0).tolist()
            text_features.append(embeddings_list)

        return text_features

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text; equivalent to embedding a one-item batch."""
        return self.embed_documents([text])[0]

    def embed_image(self, uris: List[str]) -> List[List[float]]:
        """Embed images given their file paths.

        Args:
            uris: Local file paths of images to embed.

        Returns:
            One L2-normalized embedding (list of floats) per input image.

        Raises:
            ImportError: If Pillow is not installed.
        """
        try:
            from PIL import Image as _PILImage
        except ImportError:
            raise ImportError("Please install the PIL library: pip install pillow")

        image_features = []
        for uri in uris:
            # Open the image lazily and ensure the file handle is closed
            # (the original opened all images up front and leaked handles).
            with _PILImage.open(uri) as pil_image:
                # Preprocess the image for the model (adds batch dim)
                preprocessed_image = self.preprocess(pil_image).unsqueeze(0)

            # Encode the image to get the embeddings
            embeddings_tensor = self.model.encode_image(preprocessed_image)

            # Normalize the embeddings tensor
            norm = embeddings_tensor.norm(p=2, dim=1, keepdim=True)
            normalized_embeddings_tensor = embeddings_tensor.div(norm)

            # Convert tensor to list and add to the image_features list
            embeddings_list = normalized_embeddings_tensor.squeeze(0).tolist()

            image_features.append(embeddings_list)

        return image_features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,6 @@ | |
"QianfanEmbeddingsEndpoint", | ||
"JohnSnowLabsEmbeddings", | ||
"VoyageEmbeddings", | ||
"OpenCLIPEmbeddings", | ||
] | ||
|
||
|
||
|