In [6]:
from vespa.package import ApplicationPackage, Field, RankProfile
from vespa.deployment import VespaDocker
from langchain_community.embeddings import HuggingFaceEmbeddings

import numpy as np
import pandas as pd
import tempfile

from pprint import pprint

temp_dir = tempfile.mktemp("vespa")
print(f"{temp_dir = }")


temp_dir = '/tmp/tmphtxyo8dtvespa'


In [2]:
vespa_app_package = ApplicationPackage(name="crazyfrogger")

vespa_app_package.schema.add_fields(
    Field(
        name="track_id",
        type="string",
        indexing=["summary"]
    ),
    Field(
        name="track_name",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    ),
    Field(
        name="lyrics",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25"
    ),
    Field(
        name="track_name_embedding",
        type="tensor<float>(x[384])",
        indexing=["attribute", "summary"],
        attribute=["distance-metric: angular"],
    ),
)

vespa_app_package.schema.add_rank_profile(
    RankProfile(
        name="track_name_semantic",
        inputs=[("query(query_embedding)", "tensor<float>(x[384])")],
        first_phase="closeness(field, track_name_embedding)",
    )
)

vespa_app_package.to_files(temp_dir)

vespa_resource_limit_config= """
        <tuning>
            <resource-limits>
                <disk>0.95</disk>
            </resource-limits>
        </tuning>
"""

with open(f"{temp_dir}/services.xml", 'r') as file:
    lines = file.readlines()
    lines.insert(15, vespa_resource_limit_config)

with open(f"{temp_dir}/services.xml", 'w') as file:
    file.writelines(lines)

vespa_docker = VespaDocker()
vespa_app = vespa_docker.deploy_from_disk(
    application_name="crazyfrogger",
    application_root=temp_dir
)


In [3]:
df = pd.read_csv("../data/spotify-songs.csv")
df.head()


Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
0,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,...,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,tl
1,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,en
2,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",0,6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,...,-7.504,0,0.216,0.00432,0.00723,0.489,0.65,111.904,262467,en
3,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,41,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,...,-5.819,0,0.0341,0.689,0.0,0.0664,0.405,118.593,243067,en
4,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,65,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,...,-1.993,1,0.0409,0.037,0.0,0.138,0.24,130.018,193160,en


In [7]:
embeddings = HuggingFaceEmbeddings(
    model_name=f"sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)


In [5]:
vespa_iter_data = [
    dict(
        id=row["track_id"],
        fields=dict(
            track_id=row["track_id"],
            track_name=row["track_name"],
            lyrics=row["lyrics"],
            track_name_embedding=embeddings.embed_query(row["track_name"])))
    # track_name_embedding
    # lyrics_embedding)
    for row in df.to_dict("records")[:50]]


In [13]:
from vespa.io import VespaResponse

def callback(response: VespaResponse, id: str):
    if not response.is_successful():
        print(f"Error when feeding document {id}: {response.get_json()}")

vespa_app.feed_iterable(
    iter=vespa_iter_data,
    callback=callback,
)



In [14]:
with vespa_app.syncio(connections=1) as session:
    query = "Fast"
    query_embedding = embeddings.embed_query(query)

    response = session.query(
        body={
            "yql": "select * from sources * where ({targetHits:10}nearestNeighbor(track_name_embedding, query_embedding))",
            "ranking.profile": "track_name_semantic",
            "input.query(query_embedding)": query_embedding,
        },
    )

assert response.is_successful()

# print(response.get_json())
print("Number of hist:", len(response.hits))

for hit in response.hits:
    print(hit["fields"]["track_name"])


Number of hist: 10
Hot
Laps
I Feel Alive
Try
Changes
Ooh
Poison
Phenomenal
Secrets
Been A While
