In [1]:
#!pip install pandas
import os
import time
import numpy as np
import pandas as pd 
from qdrant_client import QdrantClient
from qdrant_client.http import models
from getpass import getpass
from openai import OpenAI

### Spotify songs dataset from Kaggle

The dataset is available for download at the URL below.

https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

The dataset contains information on ~114,000 songs from various popular artists.

In [2]:
## Load the Spotify tracks CSV straight from the zip archive and drop every
## column whose name starts with "Unnamed"
songs_dataset = (
    pd.read_csv('spotify.zip', compression='zip', header=0)
      .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)
songs_dataset.shape

(114000, 20)

In [3]:
## Build the Linkin Park subset in a single chain:
##   1. keep rows whose 'artists' value contains "Linkin Park" (missing values
##      count as no match) and only the album name, track name and popularity columns
##   2. keep one row per (album_name, track_name) pair
##   3. order from most to least popular
## To experiment on a smaller sample, slice the result, e.g. lp_dataset[:10].

lp_dataset = (
    songs_dataset
    .loc[songs_dataset['artists'].str.contains("Linkin Park", na=False),
         ['album_name', 'track_name', 'popularity']]
    .drop_duplicates(subset=['album_name', 'track_name'])
    .sort_values('popularity', ascending=False)
)
print(lp_dataset.head())
print(lp_dataset.shape)

                         album_name       track_name  popularity
3006  Hybrid Theory (Bonus Edition)       In the End          85
3007                        Meteora             Numb          83
3262  Hybrid Theory (Bonus Edition)  One Step Closer          78
3263                        Meteora            Faint          77
3311  Hybrid Theory (Bonus Edition)         Crawling          76
(154, 3)


### Configurations

Before we proceed, we need an OpenAI account and the secret API key for that account. Instructions for finding the secret API key are available at the URL below.

https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key

OpenAI limits the free tier to 200 API requests per day, so be careful with the number of API calls. The COUNT variable can be used to limit the API calls.

We will use the text-embedding-ada-002 model from OpenAI to generate the embeddings.

In [4]:
# Configs

# Name of the collection in Qdrant
COLLECTION_NAME = 'linkin_park'

# Size of each embedding vector; depends on the embedding model
DIMENSION = 1536

# Maximum number of titles to embed and insert
COUNT = 1_000

# Embedding model to be used
OPENAI_MODEL = 'text-embedding-ada-002'

# The secret key is prompted for interactively rather than hard-coded
OPEN_AI_API_KEY = getpass('OpenAI API Key: ')

OpenAI API Key:  ········


In [5]:
# Connect to the Qdrant instance on localhost:6333, with a timeout of 60

qdrant_client = QdrantClient(
    "localhost",
    port=6333,
    timeout=60,
)


In [6]:
# Start from a clean slate: delete the collection named COLLECTION_NAME,
# then create it again with cosine-distance vectors of size DIMENSION
qdrant_client.delete_collection(collection_name=COLLECTION_NAME)

qdrant_client.create_collection(
    vectors_config=models.VectorParams(
        distance=models.Distance.COSINE,
        size=DIMENSION,
    ),
    collection_name=COLLECTION_NAME,
)

True

In [7]:
# OpenAI client used to request embeddings, authenticated with the key entered above
openai_client = OpenAI(api_key=OPEN_AI_API_KEY)

def embed(text):
    """Return the OpenAI embedding for ``text``.

    Issues one embeddings request through the module-level ``openai_client``
    using the ``OPENAI_MODEL`` model, and returns the ``embedding`` attribute
    of the first item in the response's ``data`` list.
    """
    response = openai_client.embeddings.create(model=OPENAI_MODEL, input=text)
    first_item = response.data[0]
    return first_item.embedding

In [8]:
# Generate embeddings and payload
#
# Walks the Linkin Park songs in their current order, requests one embedding
# per track name, and collects three index-aligned lists for the batch insert:
#   payloads - dict with track name, album name and popularity for each song
#   embds    - embedding of the track name for each song
#   ids      - consecutive integer point ids 0..len(payloads)-1
# At most COUNT songs are processed.

count = 0
payloads = []
embds = []

for index, row in lp_dataset.iterrows():
    # Check the cap before spending an API request, so that COUNT <= 0
    # embeds nothing instead of silently embedding every title.
    if count >= COUNT:
        break

    # Request the embedding first: if the call raises, neither list has been
    # extended yet, so payloads and embds can never get out of step.
    vector = embed(row['track_name'])
    embds.append(vector)
    payloads.append({'track_name': row['track_name'],
                     'album_name': row['album_name'],
                     'popularity': row['popularity']})

    count = len(payloads)
    if count % 10 == 0:
        print(f'Embedding generated for {count} song titles')

# generate IDs

ids = list(range(len(payloads)))

Embedding generated for 10 song titles
Embedding generated for 20 song titles
Embedding generated for 30 song titles
Embedding generated for 40 song titles
Embedding generated for 50 song titles
Embedding generated for 60 song titles
Embedding generated for 70 song titles
Embedding generated for 80 song titles
Embedding generated for 90 song titles
Embedding generated for 100 song titles
Embedding generated for 110 song titles
Embedding generated for 120 song titles
Embedding generated for 130 song titles
Embedding generated for 140 song titles
Embedding generated for 150 song titles


In [9]:
# Batch insert: a single upsert call carrying every id, vector and payload

qdrant_client.upsert(
    points=models.Batch(vectors=embds, payloads=payloads, ids=ids),
    collection_name=COLLECTION_NAME,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [10]:
# Search the database based on input text
search_term = 'catalyst'

# Embed the query text with the same embed() helper used for the song titles
search_vec = embed(search_term)

# Return the 3 best matches among songs whose popularity is at least 10
qdrant_client.search(
    collection_name=COLLECTION_NAME,
    query_vector=search_vec,
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="popularity",
                range=models.Range(gte=10),
            ),
        ],
    ),
    limit=3,
)

[ScoredPoint(id=140, version=0, score=0.93740886, payload={'album_name': 'A Thousand Suns', 'popularity': 36, 'track_name': 'The Catalyst'}, vector=None, shard_key=None),
 ScoredPoint(id=129, version=0, score=0.78908646, payload={'album_name': 'A Thousand Suns', 'popularity': 42, 'track_name': 'Iridescent'}, vector=None, shard_key=None),
 ScoredPoint(id=147, version=0, score=0.77680504, payload={'album_name': 'A Thousand Suns', 'popularity': 30, 'track_name': 'Burning in the Skies'}, vector=None, shard_key=None)]