# Ingesting the data into the VectorDB

This notebook is just to get things started - don't want to use the Api latency when I can do it without it

In [2]:
import numpy as np
import pandas as pd
from pymilvus import (
    connections,
    utility,
    FieldSchema, 
    CollectionSchema, 
    DataType,
    Collection,
)
from sentence_transformers import SentenceTransformer 

  from tqdm.autonotebook import tqdm, trange


In [3]:
# get df
df = pd.read_csv("../big_data_scraping/data_preprocessing/song_data_70k.csv")
df

Unnamed: 0.1,Unnamed: 0,track_artist,track_name,track_lyrics,track_id
0,0,Tommy Richman,MILLION DOLLAR BABY,"[Intro]\r\nDo it, baby, do what I should think...",8887d78d-4a41-48a7-8590-9d44959d8e6f
1,1,Kendrick Lamar,Not Like Us,"[Intro]\r\nPsst, I see dead people\r\n(\r\nMus...",1df2b141-83a4-4602-b64f-df03cda9aae5
2,2,Artemas,i like the way you kiss me,[Intro]\r\nI like the way you kiss me\r\nI lik...,6071b86a-55c1-47f1-bf66-6b90c7885026
3,3,Miley Cyrus,Flowers,"[Verse 1]\r\nWe were good, we were gold\r\nKin...",3d3fc1d8-16e8-4d62-8c84-bf9cf4280789
4,4,Eminem,Houdini,"[Skit: Paul Rosenberg]\r\nHey, Em, it's Paul\r...",22ba0d2f-1bcf-4e4c-8840-ff1bead128f1
...,...,...,...,...,...
71120,72380,Showtek,Booyah - Radio Edit,"Yes, all we care about is dem party\r\nKeeping...",70eaf07d-4d9e-4faa-bcb6-a92254573b27
71121,72381,Tiësto,Wasted,[Intro]\r\nI like us better when we're wasted\...,a1d0b478-0f77-4891-864a-4129b1d31d55
71122,72382,Ferry Corsten feat. Jenny Wahlstrom,Many Ways - Radio Edit,Think of all the places we have seen together\...,aaba0e31-f892-4ae1-8fdc-268607b7c1f4
71123,72383,Lush & Simon,City Of Lights - Official Radio Edit,Where colors collide\r\nAnd beam up the sky\r\...,f51a2f26-8155-479d-a44d-de1e3c097d9d


In [6]:
# setup milvus components

def addCollection(collection_name):
    version_name = collection_name
    
    dim = 384

    print("---start connecting to Milvus---")
    connections.connect("default", host="localhost", port="19530")

    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=64),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]

    schema = CollectionSchema(fields, description="a Collection to Store the Songs")
    print(f"---created schema---")

    print(f"---Create collection {version_name}---")
    collection_milvus = Collection(version_name, schema, consistency_level="Strong")
    
    return collection_milvus

addCollection("song_collection")

---start connecting to Milvus---
---created schema---
---Create collection song_collection---


[93m[has_collection] retry:4, cost: 0.27s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m
[93m[has_collection] retry:5, cost: 0.81s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m
[93m[has_collection] retry:6, cost: 2.43s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m
[93m[has_collection] retry:7, cost: 7.29s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m
[93m[has_collection] retry:8, cost: 21.87s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m


<Collection>:
-------------
<name>: song_collection
<description>: a Collection to Store the Songs
<schema>: {
  auto_id: False
  description: a Collection to Store the Songs
  fields: [{
    name: id
    description: 
    type: 21
    params: {'max_length': 64}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 384}
  }]
}

In [7]:
def clearIndex(collection_name: str):
    connections.connect("default", host="localhost", port="19530")

    print("existing collections (before del): ")
    collections = utility.list_collections()
    print(collections)

    collection = Collection(collection_name)
    collection.drop()
    
    print("existing collections (after del): ")
    collections = utility.list_collections()
    print(collections)

In [37]:
# still needs testing
def filterText(txt: str) -> str:
    words_to_remove = ['and', 'but', 'or', 'so', 'yet', 'in', 'on', 'at', 'of', 'for', 'with', 'to', 'the', 'a', 'an']

    return ' '.join([word for word in txt.split() if word.lower() not in words_to_remove])

def ingestData():
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name="song_collection") 
    
    existing_ids = collection_milvus.query(expr='id != ""', output_fields=["id"])
    existing_ids = [id_json['id'] for id_json in existing_ids]

    model = SentenceTransformer('intfloat/multilingual-e5-small')

    for index, row in df.iterrows():
        if row['track_id'] in existing_ids: continue

        lyrics = filterText(row['track_lyrics'])
        id = row['track_id']

        embedding = model.encode(lyrics)

        entities = [
            [id], 
            [embedding]
        ]   

        collection_milvus.insert(entities)

        print(f"index: {index} id: {row['track_id']} added: {row['track_name']}")

    print("---Start Creating index IVF_FLAT---")
    index = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }

    collection_milvus.create_index("embeddings", index)

    print("---ingested data---")

ingestData()


index: 19642 id: b201a8da-3975-4253-9db9-ce812da62057 added: Drive My Car
index: 19643 id: 802bd35e-fc02-4931-86e7-1ba16a3a484a added: Early Days
index: 19644 id: 0d682c8a-7442-4329-bb3e-1e21d524e59f added: Famous Groupies
index: 19645 id: e2be0890-9d3c-49de-b3de-34b02c65413d added: Feet In The Clouds
index: 19646 id: d46addf4-ae6f-4cc7-a674-179e471dbb05 added: Flaming Pie
index: 19647 id: 1828279f-227f-4bd9-9580-46c45313e188 added: Flying To My Home
index: 19648 id: 2f1c0543-7917-47e2-bb1c-895f9efd49a9 added: Footprints
index: 19649 id: bca4f330-0b5b-4b7c-afc2-8876e07515c8 added: Get Back
index: 19650 id: ea02f8c8-cc0e-42c2-be9a-c682f730218c added: Get It
index: 19651 id: a27d16cd-fae8-486d-8b24-3ec44a3fde5c added: Get Out Of My Way
index: 19652 id: 30cf737c-e7bf-437c-96b2-4bdd7db52440 added: Getting Better
index: 19653 id: 7dcbab63-b21a-464d-a3d3-b3272f8bf368 added: Girlfriend
index: 19654 id: 94aa56b3-3f26-4c64-bda4-7d85f2e7e11c added: Girls School
index: 19655 id: 8f60ac6c-a315-4f4

RPC error: [create_index], <MilvusException: (code=1, message=create index failed, collection is loaded, please release it first)>, <Time:{'RPC start': '2024-09-28 20:59:34.532230', 'RPC error': '2024-09-28 20:59:34.556456'}>


index: 71124 id: d056f708-1116-4b04-9c54-0cbb0799238b added: Sweet Surrender - Radio Edit
---Start Creating index IVF_FLAT---


MilvusException: <MilvusException: (code=1, message=create index failed, collection is loaded, please release it first)>

In [6]:
def test_index():
    print("---testing collection search---")
    print("---loading in collection---")
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name="song_collection") 
    # asking milvus

    collection_milvus.load()
    
    # model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    model = SentenceTransformer('intfloat/multilingual-e5-small')
    
    question_embedding = model.encode(["""Ah!
Ya Jenni' llegó
Presente, jajaja (hey...)
Here we go!
Let's get loud, let's get loud
Turn the music up, let's do it
C'mon, people, let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do
If you wanna live your life, live it all the way and don't you waste it
Every feeling, every beat
Can be so very sweet, you gotta taste it, mmm-hmm!
You gotta do it (you gotta do it), you gotta do it your way
You gotta prove it (you gotta prove it), you gotta mean what you say (c'mon, you know what we're here for!)
You gotta do it (do it), you gotta do it your way
You gotta prove it (prove it), you gotta mean what you say
Life's a party, make it hot
Dance don't ever stop, whatever rhythm
Every minute, every day
Take them all the way, you gotta live 'em ('cause I'ma to live my life)
You gotta do it (you gotta do it), you gotta do it your way
You gotta prove it (you gotta prove it), you gotta mean what you say
You gotta do it (do it), you gotta do it your way
You gotta prove it (prove it), you gotta mean what you say
Let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do, oh no
Let's get down, let's get down
Let's get down, let's get down (it's just a party, baby, c'mon!)
Let's get loud, let's get loud
Let's get loud (pump it up!)
Ah, hey, hey, hey, hey, hey, hey, hey, hey
Hey, hey, hey, hey, hey, hey, ah!
Hey, hey, hey, hey, hey, hey, hey, hey
Hey, hey, hey, hey
Life is meant to be big fun, you're not hurting anyone
Nobody loses
Let the music make you free, be what you wanna be
Make no excuses
You gotta do it (do what you want), you gotta do it your way
You gotta prove it (say what), you gotta mean what you say (you wanna say)
You gotta do it (go where you wanna go), you gotta do it your way
You gotta prove it, you gotta mean what you say (just do it, oh!)
Let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do (¡eso!)
Así, así es que me gusta, ya tú sabe
¡Echale candela!
Let's get loud
Let's get loud
Let's get loud!
Let's get loud!
Let's get loud, let's get loud
Turn the music up to hear that sound
Let's get loud, let's get loud
Ain't nobody gotta tell ya what you gotta do
You didn't know I could do that, did ya?"""])

    print("question embedding----------")#
    print(question_embedding)

    print("---Start searching based on vector similarity---")

    search_params = {
        "metric_type": "L2",
        "params": {"nprobe": 10},
    }

    result = collection_milvus.search(question_embedding, "embeddings", search_params, limit=6, output_fields=["id"])

    df = pd.read_csv("../big_data_scraping/data_preprocessing/song_data_70k.csv")

    for hits in result:
        for hit in hits:
            print(df.loc[df['track_id'] == hit.id, ['track_name', 'track_artist']])
            print(f"id: {hit.id}")
            print("------------")

test_index()

---testing collection search---
---loading in collection---
question embedding----------
[[-0.00747095  0.00257968 -0.0348444  -0.04638833  0.07885943 -0.02049727
   0.00548802  0.00868194  0.02448561  0.01067042  0.04073911  0.0589422
   0.0501571  -0.02702927 -0.02002883  0.03739037  0.0554362  -0.045541
   0.01535303 -0.05473274  0.02363514  0.00166546 -0.00186963  0.03064924
   0.03205242 -0.00589514 -0.04467153  0.02543856  0.05681364 -0.01276324
  -0.049142   -0.06740464  0.07330898 -0.07931832  0.07566448  0.0080807
  -0.07075278 -0.02521575  0.09397265 -0.05086026 -0.04367087  0.01971498
   0.02733225  0.07894653  0.06404363  0.03102125 -0.04515716  0.04466894
  -0.05309312 -0.0330974  -0.07788707  0.0816078   0.04060754  0.05916335
   0.04174423 -0.04165281 -0.07284707 -0.04713142 -0.06543812  0.04543765
   0.10717574 -0.03914286  0.00617758  0.05152401  0.10742613  0.04600738
  -0.00452845  0.02715812 -0.02804584 -0.01935982 -0.04199956  0.04494206
  -0.03536873 -0.04226765 -

In [15]:
def del_collection():
    collection_name = "song_collection"

    print("---loading in collection---")
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name=collection_name) 
    
    # Check if the collection exists
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)
        print(f"Collection '{collection_name}' has been deleted.")
    else:
        print(f"Collection '{collection_name}' does not exist.")

    

---loading in collection---
Collection 'song_collection' has been deleted.


In [None]:
# tested a bit around with searching the songs with some sense description and had good results

# extremely intersting - songs are solid picks for the vibe the base song goes for 