In [12]:
from docarray import Document
import torchvision

# Rather than assigning an embedding by hand, let a DNN compute it via .embed().
# Prepare the query image step by step: load the file into an image tensor,
# normalize it, then move the channel axis from the last position to the first.
q = Document(uri="../images/pear.png")
q = q.load_uri_to_image_tensor()
q = q.set_image_tensor_normalization()
q = q.set_image_tensor_channel_axis(-1, 0)

# Run the prepared image through a pretrained ResNet-50 to get its embedding vector.
model = torchvision.models.resnet50(pretrained=True)
d = q.embed(model)

In [18]:
# Show the rank of the embedding, then its full shape, one per line.
embedding = d.embedding
print(embedding.ndim, embedding.shape, sep="\n")

1
torch.Size([1000])


In [13]:
# Documents with an .embedding can be "matched" against each other.
from docarray import Document
import torchvision


def load_image_document(uri):
    """Build a Document from an image URI, prepared for a vision model.

    The image at ``uri`` is loaded into an image tensor, normalized, and its
    channel axis is moved from the last position to the first.

    Args:
        uri: Path or URI of the image file.

    Returns:
        The prepared Document.
    """
    return (Document(uri=uri)
            .load_uri_to_image_tensor()
            .set_image_tensor_normalization()
            .set_image_tensor_channel_axis(-1, 0))


q1 = load_image_document("../images/green-apple.png")
q2 = load_image_document("../images/pear.png")
q3 = load_image_document("../images/red-apple.jpeg")

# summary() renders its report itself and returns None, so call it directly;
# wrapping it in print() only appends a stray "None" line to the output.
for doc in (q1, q2, q3):
    doc.summary()

# Embed all three images with the same pretrained ResNet-50.
model = torchvision.models.resnet50(pretrained=True)
d1 = q1.embed(model)
d2 = q2.embed(model)
d3 = q3.embed(model)

None


None


None




In [24]:
from docarray import DocumentArray

# Gather the three embedded Documents into one DocumentArray and inspect
# the stacked embeddings (shape first, then the values).
da = DocumentArray([d1, d2, d3])
stacked_embeddings = da.embeddings
print(stacked_embeddings.shape)
print(stacked_embeddings)

# Match every Document against the whole array (each one is also a candidate).
for doc in (d1, d2, d3):
    doc.match(da)

# Last expression of the cell: how many times d1 occurs in the array.
da.count(d1)



torch.Size([3, 1000])
tensor([[-0.3996,  2.3971, -0.5317,  ...,  0.1066,  0.7234,  0.8281],
        [-0.4882,  1.5700, -0.9340,  ..., -0.3381,  1.5007,  1.4241],
        [-1.3994,  0.4810, -0.7436,  ...,  1.3343,  1.2314,  1.7705]])
<Document ('id', 'tensor', 'mime_type', 'uri', 'embedding', 'matches') at 92bcbbac91a424e6f70ca0e8fb10135a>


1

In [27]:
# DocumentArray is imported here as well so the cell does not depend on an
# earlier cell having been run first.
from docarray import Document, DocumentArray
import numpy as np

# Create ten Documents in a DocumentArray, then use another Document to
# search against them.
rng = np.random.default_rng(0)  # fixed seed so the random embeddings are reproducible

da = DocumentArray.empty(10)
da.embeddings = rng.random((10, 256))
q = Document(embedding=rng.random(256))
da.summary()

# enumerate() yields each position directly; looking it up with da.index()
# would rescan the array on every iteration.
for position, doc in enumerate(da):
    print(doc, position)

q.match(da)
q.summary()

<Document ('id', 'embedding') at 982b083677af62ce5efe53cdcba167a8> 0
<Document ('id', 'embedding') at b24cd27d7f5bd37c8a13bab3f60103e8> 1
<Document ('id', 'embedding') at 50f2f550456ba58a502f46e75f60d8c2> 2
<Document ('id', 'embedding') at 76906687e7fad1cf365c9fcf5744bd7f> 3
<Document ('id', 'embedding') at b54ce1a542a728fb4586820dcdbb7d62> 4
<Document ('id', 'embedding') at ef3c1b86dd3fdda66f1ed8fb88ece135> 5
<Document ('id', 'embedding') at 34eaa990a624d7b41936c9683e77a3f4> 6
<Document ('id', 'embedding') at 7084d959eed509fabef62b9500f7c7f4> 7
<Document ('id', 'embedding') at 831ab6468907d9efd61ecb43985b38e4> 8
<Document ('id', 'embedding') at 483975ea465dfca1c6dc5cd4da5ec505> 9


In [None]:
################
# Embed sentences with feature hashing and compare them using the Jaccard distance.
#   - https://en.wikipedia.org/wiki/Feature_hashing
#   - https://en.wikipedia.org/wiki/Jaccard_index
# Goal: find the top 5 sentences most similar to a query phrase.
from docarray import Document, DocumentArray

# Fetch the text and turn every non-blank line into its own Document.
d = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text()
stripped_lines = (line.strip() for line in d.text.split('\n'))
da = DocumentArray(Document(text=line) for line in stripped_lines if line)

# Compute a feature-hashing embedding for each line using the 'process' backend.
# The return value is discarded, so this relies on apply() updating `da` itself.
da.apply(Document.embed_feature_hashing, backend='process')

# Embed the query the same way, then match it against all lines.
q = Document(text='such manners').embed_feature_hashing()
q = q.match(da, metric='jaccard', use_scipy=True)

# Show the text and Jaccard score of the first five matches.
print(q.matches[:5, ('text', 'scores__jaccard__value')])