# Text search engine with DocArray

In this notebook, we use [DocArray](https://docarray.jina.ai/) to show how you can set up a simple text search.

First, let’s create a DocumentArray instance backed by Elasticsearch:

In [None]:
from docarray import Document, DocumentArray

# DocumentArray using the Elasticsearch storage backend, configured with the
# host URL, the index name and the vector dimension.
# NOTE(review): n_dim presumably has to match the output size of the embedding
# model used later in the notebook — TODO confirm.
da = DocumentArray(
    storage='elasticsearch',
    config=dict(
        hosts='http://es01:9200',
        index_name='text_search',
        n_dim=768,
    ),
)

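Alternatively, create an in-memory DocumentArray without an external datastore (running this cell rebinds `da`, replacing the instance created above):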

In [None]:
from docarray import Document, DocumentArray

# In-memory DocumentArray: no external datastore is involved.
# 'memory' is the default storage backend, spelled out here to contrast with
# the Elasticsearch variant. Assigning to `da` replaces any earlier instance.
da = DocumentArray(storage='memory')

Then, we can index some Documents:

In [None]:
from docarray import Document

# Example sentences (German and English) to index, one Document per sentence.
sentences = [
    'Das rote Auto steht neben dem Baum.',
    'Das Haus neben der Strasse ist blau angemalt.',
    'Nicht weit vom Baum gibt es einen See voller Fische.',
    'Der Barsch schwimmt im Bodensee.',
    'Das Segelschiff auf dem Bodensee hat einen gelben Segel.',
    'Im Bodensee gibt es keine Korallen.',
    'Im Meer gibt es sehr viele Korallen.',
    'She is lying on the bed and watching Youtube.',
    'The cat is playing with a mouse.',
]

# Append the Documents inside the DocumentArray's context manager
# (presumably so the storage backend is synced on exit — TODO confirm).
with da:
    da.extend([Document(text=sentence) for sentence in sentences])


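Alternatively, index single words instead (an experiment; if you also ran the previous cell, these words are added to the same `da`):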

In [None]:
from docarray import Document

# Single-word texts to index, one Document per word.
words = [
    'Schokolade',
    'Banane',
    'Apfel',
    'Zitrone',
    'Mandarine',
    'T-Shirt',
    'Autospielzeug',
]

# Append the Documents inside the DocumentArray's context manager
# (presumably so the storage backend is synced on exit — TODO confirm).
with da:
    da.extend([Document(text=word) for word in words])


Now, we can generate embeddings for the indexed Documents using a BERT-based sentence-transformers model:

In [None]:
from transformers import AutoModel, AutoTokenizer

# Name of the pretrained checkpoint to load. It is defined once so that the
# tokenizer and the model are always loaded from the same checkpoint.
#
# Using BERT based models for best semantic search
# https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
#
# Other checkpoints previously listed here (swap the name to experiment):
#   'bert-base-multilingual-uncased'
#   'sentence-transformers/all-mpnet-base-v2'
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'

# Both objects are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def collate_fn(da):
    """Tokenize the texts of a batch so they can be fed to the model.

    Args:
        da: Object exposing a ``texts`` attribute (presumably the batch of
            Documents handed over by ``DocumentArray.embed`` — TODO confirm).
            Note that this parameter shadows the notebook-level ``da``.

    Returns:
        The output of the notebook-level ``tokenizer`` for ``da.texts``, with
        truncation and padding enabled, returned as PyTorch tensors.
    """
    batch_texts = da.texts
    encoded = tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoded

# Run the model over `da`, using collate_fn to build the model inputs.
# NOTE(review): the resulting vectors are presumably stored on each Document's
# `.embedding` — confirm against the DocArray `embed` documentation.
da.embed(model, collate_fn=collate_fn)


Finally, we can query the database and print the results:

In [None]:
# Wrap the query text in a one-element DocumentArray and embed it with the
# same model and collate function that are passed to `da.embed`.
query = DocumentArray([Document(text='Wer spielt mit der Maus?')]).embed(
    model, collate_fn=collate_fn
)

# Search `da` for the query, limited to 3 results.
results = da.find(query, limit=3)

# results[0] is taken as the matches for the first (and only) query Document.
# NOTE(review): the 'cosine' score key may depend on the storage backend and
# metric in use — TODO confirm it is also populated under Elasticsearch.
for match in results[0]:
    print(match.scores['cosine'].value, match.text)
