In [43]:
import tempfile
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.vespa_retriever import VespaRetriever
from vespa.application import Vespa
from vespa.deployment import VespaDocker
from vespa.package import ApplicationPackage, Field, RankProfile

# Working directory for the exported application files.
# mkdtemp() creates the directory (owner-only permissions) and returns its
# path; the deprecated mktemp() only returned a name, so the directory did not
# exist yet and the name was open to a race with other processes.
temp_dir = tempfile.mkdtemp(suffix="vespa")
print(f"{temp_dir = }")


temp_dir = '/tmp/tmpjzpl384dvespa'


In [2]:
# Application package named "crazyfrogger"; the fields below are added to its
# schema.
vespa_app_package = ApplicationPackage(name="crazyfrogger")

vespa_app_package.schema.add_fields(
    # Identifier of the track, kept in the summary only.
    Field(
        name="track_id",
        type="string",
        indexing=["summary"]
    ),
    # Text fields: indexed with BM25 enabled and kept in the summary.
    Field(
        name="track_name",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    ),
    Field(
        name="lyrics",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25"
    ),
    # 384-dimensional float tensors stored as attributes with the angular
    # distance metric.
    Field(
        name="track_name_embedding",
        type="tensor<float>(x[384])",
        indexing=["attribute", "summary"],
        attribute=["distance-metric: angular"],
    ),
    # The feed documents built later in this notebook carry a
    # `lyrics_embedding` value produced by the same embedding model, so the
    # schema has to declare a matching field for those documents to be valid.
    Field(
        name="lyrics_embedding",
        type="tensor<float>(x[384])",
        indexing=["attribute", "summary"],
        attribute=["distance-metric: angular"],
    ),
)

# Rank profile that takes a 384-dimensional query tensor as input and uses
# closeness to the `track_name_embedding` field as its first-phase expression.
track_name_semantic_profile = RankProfile(
    name="track_name_semantic",
    inputs=[("query(query_embedding)", "tensor<float>(x[384])")],
    first_phase="closeness(field, track_name_embedding)",
)
vespa_app_package.schema.add_rank_profile(track_name_semantic_profile)

# Write the application package out under temp_dir; the services.xml found
# there is patched below.
vespa_app_package.to_files(temp_dir)

# XML fragment spliced into services.xml: a <tuning>/<resource-limits> element
# carrying a <disk> value of 0.95.
vespa_resource_limit_config = """
        <tuning>
            <resource-limits>
                <disk>0.95</disk>
            </resource-limits>
        </tuning>
"""

services_xml_path = f"{temp_dir}/services.xml"

with open(services_xml_path, 'r') as services_in:
    lines = services_in.readlines()

# The fragment goes in at list index 15 of the file's lines.
# NOTE(review): this position assumes a particular layout of the generated
# services.xml — confirm it lands inside the intended element.
lines.insert(15, vespa_resource_limit_config)

with open(services_xml_path, 'w') as services_out:
    services_out.write("".join(lines))

# Deploy the application files stored under temp_dir through a VespaDocker
# handle; the returned object is kept as `vespa_app` for the cells below.
vespa_docker = VespaDocker()
deploy_kwargs = dict(
    application_root=temp_dir,
    application_name="crazyfrogger",
)
vespa_app = vespa_docker.deploy_from_disk(**deploy_kwargs)


Waiting for configuration server, 0/300 seconds...


KeyboardInterrupt: 

In [3]:
# Only the first 1000 rows are used in this notebook. `nrows` makes the parser
# stop after those rows instead of loading the whole CSV and slicing it.
df = pd.read_csv("../data/spotify-songs.csv", nrows=1000)
df.head(n=2)


Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
0,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,...,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,tl
1,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,en


In [20]:
# Run the model on the GPU when CUDA is available and fall back to the CPU
# otherwise, so this cell also works on machines without a GPU.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model used for both documents and queries; embeddings are
# requested in normalized form.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": embedding_device},
    encode_kwargs={"normalize_embeddings": True},
)


In [5]:
def _to_feed_document(record):
    """Turn one row record (a dict) into a feed entry.

    The entry uses the row's ``track_id`` as the document id and copies the
    track id, track name, lyrics and both embedding columns into ``fields``.
    """
    return {
        "id": record["track_id"],
        "fields": {
            "track_id": record["track_id"],
            "track_name": record["track_name"],
            "lyrics": record["lyrics"],
            "track_name_embedding": record["track_name_embedding"],
            "lyrics_embedding": record["lyrics_embedding"],
        },
    }


# Embed the track names and the lyrics, storing each result as a new column.
track_names = df["track_name"].tolist()
lyrics_texts = df["lyrics"].tolist()
df["track_name_embedding"] = embeddings.embed_documents(track_names)
df["lyrics_embedding"] = embeddings.embed_documents(lyrics_texts)

# One feed entry per row of the frame.
iter_data = [_to_feed_document(record) for record in df.to_dict("records")]

In [13]:
from vespa.io import VespaResponse

def callback(response: VespaResponse, id: str):
    """Report a feed operation that did not succeed.

    Args:
        response: Response object for a single feed operation.
        id: Id of the document the operation was for.

    Prints the document id together with the response's JSON when
    ``response.is_successful()`` is false; does nothing otherwise.
    """
    if response.is_successful():
        return
    print(f"Error when feeding document {id}: {response.get_json()}")

# Feed every prepared document; `callback` reports the ones that fail.
# The target schema is named explicitly so the call does not depend on the
# `vespa_app` handle being able to work out the schema name by itself
# (NOTE(review): whether it can depends on how the handle was created).
vespa_app.feed_iterable(
    iter=iter_data,
    schema=vespa_app_package.schema.name,
    callback=callback,
)


In [49]:
# Connect to the Vespa endpoint at http://localhost:8080, attaching the
# application package defined earlier in this notebook.
# The port is given as an integer rather than the string "8080".
vespa_app = Vespa(
    url="http://localhost",
    port=8080,
    application_package=vespa_app_package,
)

# Embed the query text with the same model used for the documents.
query = "love"
query_embedding = embeddings.embed_query(query)

# Nearest-neighbor search over `track_name_embedding` with targetHits set to
# 10, ranked with the `track_name_semantic` profile.
nearest_neighbor_yql = "select * from sources * where ({targetHits:10}nearestNeighbor(track_name_embedding, query_embedding))"
query_body = {
    "yql": nearest_neighbor_yql,
    "ranking.profile": "track_name_semantic",
    "input.query(query_embedding)": query_embedding,
}

response = vespa_app.query(body=query_body)

# Stop here with a clear message if the query did not succeed.
assert response.is_successful(), "Vespa query was not successful"

# Label fixed: the original printed "hist" instead of "hits".
print("Number of hits:", len(response.hits))

# For each hit, show its relevance score followed by the track name.
for hit in response.hits:
    pprint(hit["relevance"])
    print(hit["fields"]["track_name"])


Number of hist: 10
1.0
Love
0.602923495700834
My Love
0.5777291141640409
True Love
0.5534076622949895
This Is Love
0.5480982007882358
What Do You Love
0.5306103891680277
First Love
0.5244149972489062
Love No More
0.521183115288168
Without Love
0.520412493430962
Love Is a Bitch
0.516824839624279
Love Myself
