In [None]:
!pip3 install pyvespa vespacli

In [21]:
import os

os.environ["TENANT_NAME"] = "article"
application = "hybridsearch"
vespa_cli_command = (
    f'vespa config set application {os.environ["TENANT_NAME"]}.{application}'
)

!vespa config set target cloud
!{vespa_cli_command}
!vespa auth cert -N

[32mSuccess:[0m Certificate written to [36m'/home/gabriel/.vespa/article.hybridsearch.default/data-plane-public-cert.pem'[0m
[32mSuccess:[0m Private key written to [36m'/home/gabriel/.vespa/article.hybridsearch.default/data-plane-private-key.pem'[0m


In [22]:
import os
from os.path import exists
from pathlib import Path

cert_path = (
    Path.home()
    / ".vespa"
    / f"{os.environ['TENANT_NAME']}.{application}.default/data-plane-public-cert.pem"
)
key_path = (
    Path.home()
    / ".vespa"
    / f"{os.environ['TENANT_NAME']}.{application}.default/data-plane-private-key.pem"
)

if not exists(cert_path) or not exists(key_path):
    print(
        "ERROR: set the correct paths to security credentials. Correct paths above and rerun until you do not see this error"
    )


In [23]:
!vespa auth api-key -f

from pathlib import Path

api_key_path = Path.home() / ".vespa" / f"{os.environ['TENANT_NAME']}.api-key.pem"

[32mSuccess:[0m Developer private key for tenant [36marticle[0m written to '/home/gabriel/.vespa/article.api-key.pem'

This is your public key:
[32m-----BEGIN PUBLIC KEY-----
MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEsnlw2nhmDCeOKtqdvsIDsE2uKYTY
K2SJLhswamJG9DxvH/vinr4eyj4wJuAAEjsRNwwGMEw8lsghcZGGNO2epw==
-----END PUBLIC KEY-----
[0m
Its fingerprint is:
[36maf:98:4f:0b:78:3d:ca:05:e3:51:f9:51:bb:fb:98:8a[0m

To use this key in Vespa Cloud click 'Add custom key' at
[36mhttps://console.vespa-cloud.com/tenant/article/account/keys[0m
and paste the entire public key including the BEGIN and END lines.


In [24]:
from vespa.package import ApplicationPackage, Field, Schema, Document, RankProfile, HNSW, RankProfile, Component, Parameter, FieldSet, GlobalPhaseRanking, Function
from vespa.deployment import VespaDocker
from datasets import load_dataset
from vespa.io import VespaResponse, VespaQueryResponse


package = ApplicationPackage(
        name="hybridsearch",
        schema=[Schema(
            name="doc",
            document=Document(
                fields=[
                    Field(name="id", type="string", indexing=["summary"]),
                    Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
                    Field(name="body", type="string", indexing=["index", "summary"], index="enable-bm25", bolding=True),
                    Field(name="embedding", type="tensor<float>(x[384])",
                        indexing=["input title . \" \" . input body", "embed", "index", "attribute"],
                        ann=HNSW(distance_metric="angular"),
                        is_document_field=False
                    )
                ]
            ),
            fieldsets=[
                FieldSet(name = "default", fields = ["title", "body"])
            ],
            rank_profiles=[
                RankProfile(
                    name="bm25",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    functions=[Function(
                        name="bm25sum", expression="bm25(title) + bm25(body)"
                    )],
                    first_phase="bm25sum"
                ),
                RankProfile(
                    name="semantic",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    first_phase="closeness(field, embedding)"
                ),
                RankProfile(
                    name="fusion",
                    inherits="bm25",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    first_phase="closeness(embedding)",
                    global_phase=GlobalPhaseRanking(
                        expression="bm25sum + closeness(embedding)",
                        rerank_count=1000
                    )
                )
            ]
        )
        ],
        components=[Component(id="e5", type="hugging-face-embedder",
            parameters=[
                Parameter("transformer-model", {"url": "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"}),
                Parameter("tokenizer-model", {"url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"})
            ]
        )]
    )

In [25]:
from vespa.deployment import VespaCloud
from datasets import load_dataset
from vespa.io import VespaResponse, VespaQueryResponse


def read_secret():
    """Read the API key from the environment variable. This is
    only used for CI/CD purposes."""
    t = os.getenv("VESPA_TEAM_API_KEY")
    if t:
        return t.replace(r"\n", "\n")
    else:
        return t


vespa_cloud = VespaCloud(
    tenant=os.environ["TENANT_NAME"],
    application=application,
    key_content=read_secret() if read_secret() else None,
    key_location=api_key_path,
    application_package=package,
)

In [26]:
app = vespa_cloud.deploy()

Deployment started in run 4 of dev-aws-us-east-1c for article.hybridsearch. This may take a few minutes the first time.
INFO    [23:55:22]  Deploying platform version 8.349.46 and application dev build 3 for dev-aws-us-east-1c of default ...
INFO    [23:55:22]  Using CA signed certificate version 1
INFO    [23:55:23]  Using 1 nodes in container cluster 'hybridsearch_container'
INFO    [23:55:26]  Session 25 for tenant 'article' prepared and activated.
INFO    [23:55:30]  ######## Details for all nodes ########
INFO    [23:55:35]  h93183a.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [23:55:35]  --- platform vespa/cloud-tenant-rhel8:8.349.46
INFO    [23:55:35]  --- storagenode on port 19102 has not started 
INFO    [23:55:35]  --- searchnode on port 19107 has not started 
INFO    [23:55:35]  --- distributor on port 19111 has not started 
INFO    [23:55:35]  --- metricsproxy-container on port 19092 has not started 
INFO    [23:55:35]  h93284b.dev.aws-us-east

In [27]:
endpoint = "https://eb38f856.cc1e1530.z.vespa-app.cloud/"

In [30]:

DATA_DIR = "./data/"
DATA_FILES = ["arxiv-metadata-oai-snapshot.json"]
SPLIT_SIZE_LIMIT = 10000
dataset = load_dataset(
    "json",
    data_dir=DATA_DIR,
    data_files=DATA_FILES,
    split=f"train[0:{SPLIT_SIZE_LIMIT}]",
)
vespa_feed = dataset.map(lambda x: {"id": x["id"], "fields": { "title": x["title"], "body": x["abstract"], "id": x["id"]}})
app.feed_iterable(vespa_feed, schema="doc", namespace="article")

Map: 100%|██████████| 10000/10000 [00:02<00:00, 4658.67 examples/s]


In [32]:
from vespa.application import Vespa

the_app = Vespa(endpoint, cert=cert_path, key=key_path)

res = the_app.query(
                yql=f"select * from sources * where userQuery() limit {5}",
                query="related to boson",
                ranking="fusion",
            )
res.hits[0]


{'id': 'id:article:doc::0704.0619',
 'relevance': 9.233171111430481,
 'source': 'hybridsearch_content',
 'fields': {'sddocname': 'doc',
  'body': "  The search for MSSM Higgs <hi>bosons</hi> will be an important goal at the LHC. We\nanalyze the search reach of the CMS experiment for the heavy neutral MSSM Higgs\n<hi>bosons</hi> with an integrated luminosity of 30 or 60 fb^-1. This is done by\ncombining the latest results for the CMS experimental sensitivities based on\nfull simulation studies with state-of-the-art theoretical predictions of MSSM\nHiggs-<hi>boson</hi> properties. The results are interpreted in MSSM benchmark scenarios\nin terms of the parameters tan_beta and the Higgs-<hi>boson</hi> mass scale, M_A. We\nstudy the dependence of the 5 sigma discovery contours in the M_A-tan_beta\nplane on variations of the other supersymmetric parameters. The largest effects\narise from a change in the higgsino mass parameter mu, which enters both via\nhigher-order radiative corrections a