### Imports

In [1]:
import pandas as pd
import itertools
from vespa.package import ApplicationPackage, Field, Schema, Document, RankProfile, HNSW, RankProfile, Component, Parameter, FieldSet, GlobalPhaseRanking, Function
from vespa.deployment import VespaDocker
from datasets import load_dataset
from vespa.io import VespaResponse, VespaQueryResponse

  from .autonotebook import tqdm as notebook_tqdm


### Settings

In [2]:
DATA_DIR = "../../"
DATA_FILES = ["arxiv-metadata-oai-snapshot.json"]
SPLIT_SIZE_LIMIT = 100

In [3]:
class ArticLE:

    def __init__(self, a: 'ArticLE' = None):
        self.startup(a)
    
    def startup(self, a: 'ArticLE' = None):
        if a is not None:
            self.package = a.package
            self.docker = a.docker
            self.app = a.app
        else:
            self.set_package()
            self.set_docker()
            self.set_app()
    
    def set_package(self):
        self.package = ApplicationPackage(
        name="hybridsearch",
        schema=[Schema(
            name="doc",
            document=Document(
                fields=[
                    Field(name="id", type="string", indexing=["summary"]),
                    Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
                    Field(name="body", type="string", indexing=["index", "summary"], index="enable-bm25", bolding=True),
                    Field(name="embedding", type="tensor<float>(x[384])",
                        indexing=["input title . \" \" . input body", "embed", "index", "attribute"],
                        ann=HNSW(distance_metric="angular"),
                        is_document_field=False
                    )
                ]
            ),
            fieldsets=[
                FieldSet(name = "default", fields = ["title", "body"])
            ],
            rank_profiles=[
                RankProfile(
                    name="bm25",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    functions=[Function(
                        name="bm25sum", expression="bm25(title) + bm25(body)"
                    )],
                    first_phase="bm25sum"
                ),
                RankProfile(
                    name="semantic",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    first_phase="closeness(field, embedding)"
                ),
                RankProfile(
                    name="fusion",
                    inherits="bm25",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    first_phase="closeness(field, embedding)",
                    global_phase=GlobalPhaseRanking(
                        expression="reciprocal_rank_fusion(bm25sum, closeness(field, embedding))",
                        rerank_count=1000
                    )
                )
            ]
        )
        ],
        components=[Component(id="e5", type="hugging-face-embedder",
            parameters=[
                Parameter("transformer-model", {"url": "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"}),
                Parameter("tokenizer-model", {"url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"})
            ]
        )]
    )

    def set_docker(self):
        self.docker = VespaDocker()
    
    def set_app(self):
        self.app = self.docker.deploy(application_package=self.package)
    
    def callback(self, response):
        if not response.is_successful():
            print(f"Error when feeding document {id}: {response.get_json()}")
    
    def feed_json(self, data_files, **kwargs):
        dataset = load_dataset(
            "json",
            data_dir=DATA_DIR,
            data_files=data_files,
            split=f"train[0:{SPLIT_SIZE_LIMIT}]",
        )
        vespa_feed = dataset.map(lambda x: {"id": x["id"], "fields": { "title": x["title"], "body": x["abstract"], "id": x["id"]}})
        self.app.feed_iterable(vespa_feed, schema="doc", namespace="article", callback=self.callback)
        
    def hits_to_df(self, response:VespaQueryResponse) -> pd.DataFrame:
        records = []
        fields = ["id", "title", "body"]
        for hit in response.hits:
            record = {}
            for field in fields:
                record[field] = hit['fields'][field]
            record["relevance"] = hit["relevance"]
            records.append(record)
        return pd.DataFrame(records)
    
    def query(self, query, n_hits: int = 5):
        with self.app.syncio(connections=1) as session:
            response:VespaQueryResponse = session.query(
                yql=f"select * from sources * where userQuery() limit {n_hits}",
                query=query,
                ranking="bm25",
            )
        assert(response.is_successful())
        return self.hits_to_df(response)


In [4]:
a = ArticLE()

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Waiting for configuration server, 10/300 seconds...
Waiting for configuration server, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8

In [5]:
a.feed_json(DATA_FILES)

Exception in user callback for id 0704.0001
Traceback (most recent call last):
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\vespa\application.py", line 426, in _handle_result_callback
    callback(response,id=id)
TypeError: callback() got an unexpected keyword argument 'id'
Exception in user callback for id 0704.0002
Traceback (most recent call last):
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\vespa\application.py", line 426, in _handle_result_callback
    callback(response,id=id)
TypeError: callback() got an unexpected keyword argument 'id'
Exception in user callback for id 0704.0003
Traceback (most recent call last):
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\vespa\application.py", line 426, in _handle_result_callback
    callback(response,id=id)
TypeError: callback() got an unexpected keyword argument 'id'
Exception in user callback for id 0704.0004
Traceback (most recent 

In [13]:
df = a.query("Machine learning and data science and stock market", n_hits=10)
df

Unnamed: 0,id,title,body,relevance
0,704.0773,Collective behavior of stock price movements i...,To investigate the universality of the struc...,33.164795
1,705.0076,Deterministic Factors of Stock Networks based ...,The <hi>stock</hi> <hi>market</hi> has been ...,31.788707
2,704.0664,Stock market return distributions: from past t...,We show that recent <hi>stock</hi> <hi>marke...,30.39402
3,704.2115,Uncovering the Internal Structure of the India...,The cross-correlations between price fluctua...,23.351139
4,704.3905,Ensemble Learning for Free with Evolutionary A...,Evolutionary <hi>Learning</hi> proceeds by e...,21.473587
5,704.3453,An Adaptive Strategy for the Classification of...,One of the major problems in computational b...,21.080254
6,705.0666,Validating module network learning algorithms ...,"In recent years, several authors have used p...",20.383808
7,705.4023,The limit order book on different time scales,Financial <hi>markets</hi> can be described ...,19.814361
8,704.2139,Why only few are so successful ?,In many professons employees are rewarded ac...,18.524608
9,704.1099,The Epps effect revisited,We analyse the dependence of <hi>stock</hi> ...,18.062398


In [16]:
print(df.loc[0, "body"])

  To investigate the universality of the structure of interactions in different
<hi>markets</hi>, we analyze the cross-correlation matrix C of <hi>stock</hi> price fluctuations
in the National <hi>Stock</hi> Exchange (NSE) of India. We find that this emerging
<hi>market</hi> exhibits strong correlations in the movement of <hi>stock</hi> prices compared to
developed <hi>markets</hi>, such as the New York <hi>Stock</hi> Exchange (NYSE). This is shown to
be due to the dominant influence of a common <hi>market</hi> mode on the <hi>stock</hi> prices.
By comparison, interactions between related stocks, e.g., those belonging to
the same business sector, are much weaker. This lack of distinct sector
identity in emerging <hi>markets</hi> is explicitly shown by reconstructing the network
of mutually interacting stocks. Spectral analysis of C for NSE reveals that,
the few largest eigenvalues deviate from the bulk of the spectrum predicted by
random matrix theory, but they are far fewer in number 