In [13]:
import pandas as pd
df = pd.read_csv('../10000_songs.csv')

In [14]:
df = df.dropna() # remove any NaN values as it blows up serialization
data = df.sample(700).to_dict('records') # Get only 700 records. More records will make it slower to index
len(data)
df.head(15)

Unnamed: 0,artistId,artistName,collectionCensoredName,collectionId,collectionName,collectionPrice,contentAdvisoryRating,country,currency,discCount,...,primaryGenreName,releaseDate,trackCensoredName,trackCount,trackExplicitness,trackId,trackName,trackNumber,trackPrice,trackTimeMillis
0,46087,Erick Sermon,Music,298321651,Music,9.99,Explicit,USA,USD,1,...,Hip-Hop/Rap,2001-08-27T12:00:00Z,Music (feat. Marvin Gaye),16,explicit,298321904,Music,4,1.29,223133
4,46087,Erick Sermon,Music,298429528,Music,9.99,Clean,USA,USD,1,...,Hip-Hop/Rap,2001-05-29T07:00:00Z,Music (feat. Marvin Gaye),16,cleaned,298429596,Music,4,1.29,223133
8,138226712,Fun.,Some Nights,486040153,Some Nights,9.99,Explicit,USA,USD,1,...,Alternative,2012-02-21T08:00:00Z,Some Nights,11,explicit,486040194,Some Nights,2,1.29,277040
43,1419227,Beyoncé,BEYONCÉ,780330041,BEYONCÉ,15.99,Explicit,USA,USD,1,...,Pop,2013-12-13T08:00:00Z,Drunk in Love (feat. Jay Z),14,explicit,780330308,Drunk in Love (feat. Jay Z),3,1.29,323486
61,260388828,3OH!3,Want,281750061,Want,9.99,Explicit,USA,USD,1,...,Alternative,2008-06-01T07:00:00Z,Don't Trust Me,12,explicit,281750089,Don't Trust Me,3,1.29,192573
62,348580754,Meghan Trainor,Title (Deluxe Edition),929825574,Title (Deluxe Edition),12.99,Explicit,USA,USD,1,...,Pop,2014-06-30T07:00:00Z,All About That Bass,15,explicit,929825615,All About That Bass,2,1.29,191489
67,342826213,Waka Flocka Flame,No Hands (feat. Roscoe Dash & Wale) - Single,389062036,No Hands (feat. Roscoe Dash & Wale) - Single,1.29,Explicit,USA,USD,1,...,Hip-Hop/Rap,2010-08-17T07:00:00Z,No Hands (feat. Roscoe Dash & Wale),1,explicit,389062076,No Hands (feat. Roscoe Dash & Wale),1,1.29,262533
72,278873078,Bruno Mars,Unorthodox Jukebox,573962245,Unorthodox Jukebox,10.99,Explicit,USA,USD,1,...,Pop,2012-12-07T08:00:00Z,Treasure,10,explicit,573962553,Treasure,4,1.29,178572
73,3444975,Buckcherry,15,140862914,15,10.99,Explicit,USA,USD,1,...,Hard Rock,2005-10-17T07:00:00Z,Crazy Bitch,11,explicit,140862717,Crazy Bitch,7,1.29,202720
81,174680978,CeeLo Green,F**k You - Deluxe Single,390849614,Fuck You - Deluxe Single,2.99,Explicit,USA,USD,1,...,R&B/Soul,2010-08-19T07:00:00Z,F**k You,2,explicit,390849615,Fuck You,1,1.29,222933


In [15]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [16]:
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings

In [17]:
# create the vector database client
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

In [18]:
# Create collection to store wines
qdrant.recreate_collection(
    collection_name="top_songs",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

True

In [19]:
# vectorize!
qdrant.upload_points(
    collection_name="top_songs",
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["primaryGenreName"]).tolist(),
            payload=doc,
        ) for idx, doc in enumerate(data) # data is the variable holding all the wines
    ]
)

In [20]:
user_prompt = "Suggest me an amazing pop Song!"

In [21]:
# Search time for great tunes!

hits = qdrant.search(
    collection_name="top_songs",
    query_vector=encoder.encode(user_prompt).tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'artistId': 14400049, 'artistName': 'Mark Ronson', 'collectionCensoredName': 'Uptown Special', 'collectionId': 943946661, 'collectionName': 'Uptown Special', 'collectionPrice': 9.99, 'contentAdvisoryRating': 'Explicit', 'country': 'USA', 'currency': 'USD', 'discCount': 1, 'discNumber': 1, 'isStreamable': True, 'kind': 'song', 'previewUrl': 'https://audio-ssl.itunes.apple.com/itunes-assets/AudioPreview125/v4/d3/c9/c2/d3c9c235-d905-1f5c-ef17-ee2e1c484102/mzaf_8910497199692577423.plus.aac.p.m4a', 'primaryGenreName': 'Pop', 'releaseDate': '2015-01-12T12:00:00Z', 'trackCensoredName': 'Feel Right (feat. Mystikal)', 'trackCount': 11, 'trackExplicitness': 'explicit', 'trackId': 943946669, 'trackName': 'Feel Right (feat. Mystikal)', 'trackNumber': 3, 'trackPrice': 1.29, 'trackTimeMillis': 222550} score: 0.47350481386095533
{'artistId': 980795202, 'artistName': 'Marshmello & Anne-Marie', 'collectionCensoredName': 'Speak Your Mind', 'collectionId': 1351246855, 'collectionName': 'Speak Your Mind'

In [22]:
# define a variable to hold the search results
search_results = [hit.payload for hit in hits]

In [23]:
# avoid parallelism
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [24]:
# Now time to connect to the local large language model
from openai import OpenAI
client = OpenAI(
    base_url= "http://127.0.0.1:8080/v1", # This is the location for the local LLM model. Note that v1 is appended to what appears in browser
    api_key = "sk-no-key-required"
)

completion = client.chat.completions.create(
    model="LLaMA_CPP",
    messages=[
        {"role": "system", "content": "You are chatbot, a music lover. Your top priority is to help users find songs and guide them with their requests."},
        {"role": "user", "content": "Suggest for me a great 1990s pop song."},
        {"role": "assistant", "content": str(search_results)}
    ]
)
print(completion.choices[0].message)

ChatCompletionMessage(content='I suggest you to listen to "Blinding Lights" by The Weeknd. It\'s a great 1990s pop song with a catchy beat and lyrics that will make you want to dance.', role='assistant', function_call=None, tool_calls=None)
