In [None]:
! pip install pymilvus openai==0.28 protobuf

import os
import time
from getpass import getpass

import numpy as np
import openai
import pandas as pd
from openai import OpenAI
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, connections, utility

[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python3 -m pip install --upgrade pip


### Spotify songs dataset from Kaggle

The dataset is available for download in the below URL.

https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

The dataset contains information on ~114,000 songs from various popular artists.

In [4]:
## Load the CSV straight from the zip archive, then discard every column
## whose name starts with "Unnamed"
raw_songs = pd.read_csv('spotify.zip', compression='zip', header=0)
is_unnamed = raw_songs.columns.str.match('Unnamed')
songs_dataset = raw_songs.loc[:, ~is_unnamed]
songs_dataset.shape

(114000, 20)

In [11]:
## Filter the data for Linkin Park and keep only the album name, track name and
## popularity columns for further processing.
##
## The rows are de-duplicated on track_name alone, because track_name is used as
## the primary key of the Milvus collection and only the track name is embedded.
## De-duplicating on (album_name, track_name) would leave the same key in the
## collection several times (one row per album the track appears on).
## Sorting by popularity first means the most popular entry of each track is kept.

lp_rows = songs_dataset[songs_dataset['artists'].str.contains("Linkin Park", na=False)]
lp_dataset = (
    lp_rows[['album_name', 'track_name', 'popularity']]
    .sort_values('popularity', ascending=False)
    .drop_duplicates(subset=['track_name'], keep='first')
)

# Guard the invariant the Milvus primary key relies on.
assert lp_dataset['track_name'].is_unique, "track_name must be unique to serve as primary key"

print(lp_dataset.head())
print(lp_dataset.shape)

                         album_name       track_name  popularity
3006  Hybrid Theory (Bonus Edition)       In the End          85
3007                        Meteora             Numb          83
3262  Hybrid Theory (Bonus Edition)  One Step Closer          78
3263                        Meteora            Faint          77
3311  Hybrid Theory (Bonus Edition)         Crawling          76
(154, 3)


### Configurations

Before we proceed, we need an OpenAI account and the secret API key for that account. Instructions for finding the secret API key are available at the URL below.

https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key

OpenAI limits the free tier to up to 200 API requests per day, so be careful with the number of API calls. The COUNT variable can be used to limit the API calls.

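We will use an OpenAI embedding model to generate the embeddings. Note that the `embed` helper defined below defaults to `text-embedding-3-small`, while the `OPENAI_ENGINE` config value names `text-embedding-ada-002`.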

In [12]:
# Configs
COLLECTION_NAME = 'linkin_park'  # Collection name in Milvus
DIMENSION = 1536  # Embeddings size, depends on the embedding model.
COUNT = 1000  # Upper bound on the number of titles to embed and insert (the embedding loop stops at this count).
# NOTE(review): the embed() helper defined later defaults to "text-embedding-3-small"
# and does not read OPENAI_ENGINE — confirm which model is intended.
OPENAI_ENGINE = 'text-embedding-ada-002'  # Embedding model to be used
# The key is prompted for interactively so it is never stored in the notebook.
openai.api_key = getpass('OpenAI API Key: ')  # Use your own Open AI API Key here

OpenAI API Key:  ········


In [50]:
# Create the connection to Milvus.
# Host, port and user can be overridden with the MILVUS_HOST, MILVUS_PORT and
# MILVUS_USER environment variables; when they are unset the original values
# are used. The password is always prompted for so it is never stored here.

connections.connect(
  alias="default",
  host=os.environ.get('MILVUS_HOST', '192.168.0.157'),
  port=os.environ.get('MILVUS_PORT', '19530'),
  user=os.environ.get('MILVUS_USER', 'root'),
  password=getpass('Milvus Password: ')
)

Milvus Password:  ········


In [51]:
# Start from a clean slate: drop any earlier copy of the collection
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

# Collection layout: track name (primary key), album name, popularity and the embedding vector.
fields = [
    FieldSchema(name='track_name', dtype=DataType.VARCHAR, max_length=1024, is_primary=True, auto_id=False),
    FieldSchema(name='album_name', dtype=DataType.VARCHAR, max_length=1024),
    FieldSchema(name='popularity', dtype=DataType.INT64),
    FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='Embedding vectors', dim=DIMENSION),
]
schema = CollectionSchema(fields=fields, description='Linkin Park Songs collection')
collection = Collection(name=COLLECTION_NAME, schema=schema)

# Build an IVF_FLAT / L2 index on the embedding field.
index_params = dict(index_type='IVF_FLAT', metric_type='L2', params=dict(nlist=64))
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [55]:
# Get the embeddings for the text using OpenAI.
# One shared client is reused for every request; `OpenAI` is imported in the
# first cell together with the other dependencies.
client = OpenAI(api_key=openai.api_key)


def embed(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding vector for a single piece of text.

    Args:
        text: String to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Name of the OpenAI embedding model to call.

    Returns:
        The embedding of ``text``, taken from ``data[0].embedding`` of the
        API response.

    NOTE(review): the vectors are stored in a Milvus field created with
    ``dim=DIMENSION``, so ``model`` presumably has to produce vectors of that
    size — confirm when changing the model.
    """
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding



In [56]:
# Embed every song title and collect the values column by column.
# At most COUNT titles are processed, with a one-second pause after each request.
count = 0
tracks, albums, popularity, embds = [], [], [], []

for count, (_, song) in enumerate(lp_dataset.iterrows(), start=1):
    title = song['track_name']

    tracks.append(title)
    albums.append(song['album_name'])
    popularity.append(song['popularity'])
    embds.append(embed(title))

    time.sleep(1)
    if count % 10 == 0:
        print(f'Embedding generated for {count} song titles')
    if count == COUNT:
        break

# One list per collection field: track name, album name, popularity, embedding.
data_to_insert = [tracks, albums, popularity, embds]


Embedding generated for 10 song titles
Embedding generated for 20 song titles
Embedding generated for 30 song titles
Embedding generated for 40 song titles
Embedding generated for 50 song titles
Embedding generated for 60 song titles
Embedding generated for 70 song titles
Embedding generated for 80 song titles
Embedding generated for 90 song titles
Embedding generated for 100 song titles
Embedding generated for 110 song titles
Embedding generated for 120 song titles
Embedding generated for 130 song titles
Embedding generated for 140 song titles
Embedding generated for 150 song titles


In [57]:
# Insert the data to Milvus collection.
# data_to_insert is column-oriented: one list per field, in the same order as
# the schema's field list (track_name, album_name, popularity, embedding).
collection.insert(data_to_insert)

(insert count: 154, delete count: 0, upsert count: 0, timestamp: 447901976899092482, success count: 154, err count: 0)

In [77]:
# Load the collection into memory for searching
collection.load()


def search(text, limit=5):
    """Search the collection for the songs whose title embedding is closest to ``text``.

    Args:
        text: Free-text query; it is embedded with ``embed`` before searching.
        limit: Maximum number of hits to return for the query (default 5).

    Returns:
        The result object of ``collection.search``. Index 0 holds the hits for
        ``text``; each hit exposes the primary key (the track name) as ``id``
        and carries the ``album_name`` and ``popularity`` fields.
    """
    # Same metric as the index created on the embedding field.
    search_params = {
        "metric_type": "L2"
    }

    results = collection.search(
        data=[embed(text)],  # Embedded query vector
        anns_field="embedding",  # Search across embeddings
        param=search_params,
        limit=limit,  # Cap on the number of hits per query
        output_fields=['album_name', 'popularity']  # Extra fields returned with each hit
    )

    return results


search_terms = "habit"
test = search(search_terms)
print(test)


['["id: Heavy, distance: 1.1296672821044922, entity: {\'popularity\': 18, \'album_name\': \'Heavy\'}", "id: Breaking the Habit, distance: 1.2977659702301025, entity: {\'popularity\': 66, \'album_name\': \'Meteora\'}", "id: Figure.09, distance: 1.390805959701538, entity: {\'popularity\': 51, \'album_name\': \'Meteora\'}"]']


In [76]:
# Sanity check: data_to_insert holds one list per field, so this prints the
# number of columns rather than the number of songs.
print(len(data_to_insert))

4


In [78]:
# Look up the songs closest to a keyword and print one record per hit
search_terms = "bleed"
results = search(search_terms)

separator = '=' * 20
for hit in results[0]:
    print(separator)
    print('Song name: ', hit.id)
    print('Album name: ', hit.entity.get('album_name'))
    print('Popularity: ', hit.entity.get('popularity'))

Song name:  Bleed It Out
Album name:  Minutes to Midnight (Deluxe Edition)
Popularity:  43
Song name:  Faint
Album name:  Meteora
Popularity:  77
Song name:  Blackout
Album name:  A Thousand Suns
Popularity:  23
