# Ingesting the data into the VectorDB

This notebook is just to get things started - don't want to use the Api latency when I can do it without it

In [2]:
import numpy as np
import pandas as pd
from pymilvus import (
    connections,
    utility,
    FieldSchema, 
    CollectionSchema, 
    DataType,
    Collection,
)
from sentence_transformers import SentenceTransformer 

  from tqdm.autonotebook import tqdm, trange


In [18]:
# get df
df = pd.read_csv("../song_data.csv")
df

Unnamed: 0.1,Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_genre,duration_ms,lyrics,track_id
0,0,Dance Monkey,Tones and I,100,Dance Monkey (Stripped Back) / Dance Monkey,2019-10-17,latin,209438,"[Verse 1]\nThey say, ""Oh my god, I see the way...",7188d3af-ebc1-431e-b181-c5b27bd81f6a
1,1,ROXANNE,Arizona Zervas,99,ROXANNE,2019-10-10,r&b,163636,[Intro]\nAll for the 'Gram\nBitches love the '...,ba6cf733-0155-4005-a347-07bff6498c58
2,2,Blinding Lights,The Weeknd,98,Blinding Lights,2019-11-29,latin,201573,[Intro]\nYeah\n[Verse 1]\nI've been tryna call...,9ea810d2-de74-4e35-918a-e64816148a86
3,3,Circles,Post Malone,98,Hollywood's Bleeding,2019-09-06,pop,215280,"[Intro]\nOh, oh, oh\nOh, oh, oh\nOh, oh, oh,...",0a2cc686-4fef-4f14-9e22-36cbafdf0a12
4,4,Tusa,KAROL G,98,Tusa,2019-11-07,rap,200960,"11/1\nAlan Walker - \n""Avem (The Aviation Them...",4639949c-65d1-45f8-8071-445fe5a9a227
...,...,...,...,...,...,...,...,...,...,...
3834,3834,Quem me Colocou pra Beber,Os Barões Da Pisadinha,65,Quem me Colocou pra Beber,2019-11-08,edm,138837,Prepara o olho que lá vem o choro\nPrepara o b...,581f0699-2ed5-45d3-a791-135fbedcdb40
3835,3835,Bark at the Moon,Ozzy Osbourne,65,Bark At The Moon (Expanded Edition),1983-12-10,rock,257120,[Verse 1]\nScreams break the silence\nWaking f...,797be00a-6009-4f6f-b460-60113ae0a8b4
3836,3836,Ain't Talkin' 'Bout Love - 2015 Remaster,Van Halen,65,Van Halen (Remastered),1978-02-10,rock,227800,"Scarface\nBy: Oliver Stone\n""Enjoy yourself --...",65ba932d-9786-462d-b255-3327eba55b86
3837,3837,MMMBop,Hanson,65,Middle Of Nowhere,1997-01-01,rock,268653,24. November 1996 – 18. Januar 1997\nBackstree...,a2fa30ed-88aa-4c1a-89f3-ce9603056849


In [19]:
# setup milvus components

def addCollection(collection_name):
    version_name = collection_name
    
    dim = 384

    print("---start connecting to Milvus---")
    connections.connect("default", host="localhost", port="19530")

    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=64),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]

    schema = CollectionSchema(fields, description="a Collection to Store the Songs")
    print(f"---created schema---")

    print(f"---Create collection {version_name}---")
    collection_milvus = Collection(version_name, schema, consistency_level="Strong")
    
    return collection_milvus

addCollection("song_collection")

---start connecting to Milvus---
---created schema---
---Create collection song_collection---


<Collection>:
-------------
<name>: song_collection
<description>: a Collection to Store the Songs
<schema>: {
  auto_id: False
  description: a Collection to Store the Songs
  fields: [{
    name: id
    description: 
    type: 21
    params: {'max_length': 64}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 384}
  }]
}

In [21]:

def ingestData():
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name="song_collection") 

    model = SentenceTransformer('intfloat/multilingual-e5-small')

    for index, row in df.iterrows():
        lyrics = row['lyrics']
        id = row['track_id']

        embedding = model.encode(lyrics)

        entities = [
            [id], 
            [embedding]
        ]   

        collection_milvus.insert(entities)

        print(f"index: {index} added: {row['track_name']}")

    print("---Start Creating index IVF_FLAT---")
    index = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }

    collection_milvus.create_index("embeddings", index)

    print("---ingested data---")


ingestData()


index: 0 added: Dance Monkey
index: 1 added: ROXANNE
index: 2 added: Blinding Lights
index: 3 added: Circles
index: 4 added: Tusa
index: 5 added: The Box
index: 6 added: Memories
index: 7 added: Don't Start Now
index: 8 added: Falling
index: 9 added: everything i wanted
index: 10 added: RITMO (Bad Boys For Life)
index: 11 added: Yummy
index: 12 added: bad guy
index: 13 added: hot girl bummer
index: 14 added: Ride It
index: 15 added: Someone You Loved
index: 16 added: My Oh My (feat. DaBaby)
index: 17 added: HIGHEST IN THE ROOM
index: 18 added: Heartless
index: 19 added: Vete
index: 20 added: Life Is Good (feat. Drake)
index: 21 added: China
index: 22 added: Señorita
index: 23 added: BOP
index: 24 added: Lose You To Love Me
index: 25 added: Lucid Dreams
index: 26 added: Bandit (with YoungBoy Never Broke Again)
index: 27 added: Fantasias
index: 28 added: No Idea
index: 29 added: Futsal Shuffle 2020
index: 30 added: How Do You Sleep?
index: 31 added: Adore You
index: 32 added: Hola - Remi

In [1]:
def test_index():
    print("---testing collection search---")
    print("---loading in collection---")
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name="song_collection") 
    # asking milvus

    collection_milvus.load()
    
    # model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    model = SentenceTransformer('intfloat/multilingual-e5-small')
    
    question_embedding = model.encode(["""Ah!
Ya Jenni' llegó
Presente, jajaja (hey...)
Here we go!
Let's get loud, let's get loud
Turn the music up, let's do it
C'mon, people, let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do
If you wanna live your life, live it all the way and don't you waste it
Every feeling, every beat
Can be so very sweet, you gotta taste it, mmm-hmm!
You gotta do it (you gotta do it), you gotta do it your way
You gotta prove it (you gotta prove it), you gotta mean what you say (c'mon, you know what we're here for!)
You gotta do it (do it), you gotta do it your way
You gotta prove it (prove it), you gotta mean what you say
Life's a party, make it hot
Dance don't ever stop, whatever rhythm
Every minute, every day
Take them all the way, you gotta live 'em ('cause I'ma to live my life)
You gotta do it (you gotta do it), you gotta do it your way
You gotta prove it (you gotta prove it), you gotta mean what you say
You gotta do it (do it), you gotta do it your way
You gotta prove it (prove it), you gotta mean what you say
Let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do, oh no
Let's get down, let's get down
Let's get down, let's get down (it's just a party, baby, c'mon!)
Let's get loud, let's get loud
Let's get loud (pump it up!)
Ah, hey, hey, hey, hey, hey, hey, hey, hey
Hey, hey, hey, hey, hey, hey, ah!
Hey, hey, hey, hey, hey, hey, hey, hey
Hey, hey, hey, hey
Life is meant to be big fun, you're not hurting anyone
Nobody loses
Let the music make you free, be what you wanna be
Make no excuses
You gotta do it (do what you want), you gotta do it your way
You gotta prove it (say what), you gotta mean what you say (you wanna say)
You gotta do it (go where you wanna go), you gotta do it your way
You gotta prove it, you gotta mean what you say (just do it, oh!)
Let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do (¡eso!)
Así, así es que me gusta, ya tú sabe
¡Echale candela!
Let's get loud
Let's get loud
Let's get loud!
Let's get loud!
Let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do
You didn't know I could do that, did ya?"""])

    print("question embedding----------")#
    print(question_embedding)

    print("---Start searching based on vector similarity---")

    search_params = {
        "metric_type": "L2",
        "params": {"nprobe": 10},
    }

    result = collection_milvus.search(question_embedding, "embeddings", search_params, limit=6, output_fields=["id"])

    df = pd.read_csv("../song_data.csv")

    for hits in result:
        for hit in hits:
            print(df.loc[df['track_id'] == hit.id, ['track_name', 'track_artist']])
            print(f"id: {hit.id}")
            print("------------")

test_index()

---testing collection search---
---loading in collection---


NameError: name 'connections' is not defined

In [15]:
def del_collection():
    collection_name = "song_collection"

    print("---loading in collection---")
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name=collection_name) 
    
    # Check if the collection exists
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)
        print(f"Collection '{collection_name}' has been deleted.")
    else:
        print(f"Collection '{collection_name}' does not exist.")

    

---loading in collection---
Collection 'song_collection' has been deleted.


In [None]:
# tested a bit around with searching the songs with some sense description and had good results

# extremely intersting - songs are solid picks for the vibe the base song goes for 