# Ingesting the data into the VectorDB

This notebook is just to get things started - don't want to use the Api latency when I can do it without it

In [1]:
import numpy as np
import pandas as pd
from pymilvus import (
    connections,
    utility,
    FieldSchema, 
    CollectionSchema, 
    DataType,
    Collection,
)
from sentence_transformers import SentenceTransformer 

  from tqdm.autonotebook import tqdm, trange


In [14]:
# get df
df = pd.read_csv("../song_data.csv")


In [4]:
# setup milvus components

def addCollection(collection_name):
    version_name = collection_name
    
    dim = 384

    print("---start connecting to Milvus---")
    connections.connect("default", host="localhost", port="19530")

    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=64),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]

    schema = CollectionSchema(fields, description="a Collection to Store the Songs")
    print(f"---created schema---")

    print(f"---Create collection {version_name}---")
    collection_milvus = Collection(version_name, schema, consistency_level="Strong")
    
    return collection_milvus

addCollection("song_collection")

---start connecting to Milvus---
---created schema---
---Create collection song_collection---


<Collection>:
-------------
<name>: song_collection
<description>: a Collection to Store the Songs
<schema>: {
  auto_id: False
  description: a Collection to Store the Songs
  fields: [{
    name: id
    description: 
    type: 21
    params: {'max_length': 64}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 384}
  }]
}

In [6]:
model = SentenceTransformer('intfloat/multilingual-e5-small')

In [18]:

def ingestData():
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name="song_collection") 

    model = SentenceTransformer('intfloat/multilingual-e5-small')

    stop = 0

    for index, row in df.iterrows():
        lyrics = row['lyrics']
        id = row['track_id']

        embedding = model.encode(lyrics)

        entities = [
            [id], 
            [embedding]
        ]   

        collection_milvus.insert(entities)

        print(row)

        if stop > 5:
            break
        stop += 1

    print("---Start Creating index IVF_FLAT---")
    index = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }

    collection_milvus.create_index("embeddings", index)

    print("---ingested data---")


ingestData()


Unnamed: 0                                                                  0
track_name                                                       Dance Monkey
track_artist                                                      Tones and I
track_popularity                                                          100
track_album_name                  Dance Monkey (Stripped Back) / Dance Monkey
track_album_release_date                                           2019-10-17
playlist_genre                                                          latin
duration_ms                                                            209438
lyrics                      [Verse 1]\nThey say, "Oh my god, I see the way...
track_id                                 7188d3af-ebc1-431e-b181-c5b27bd81f6a
Name: 0, dtype: object
Unnamed: 0                                                                  1
track_name                                                            ROXANNE
track_artist                             

In [27]:
def test_index():
    print("---testing collection search---")
    print("---loading in collection---")
    connections.connect("default", host="localhost", port="19530")

    collection_milvus = Collection(name="song_collection") 
    # asking milvus

    collection_milvus.load()
    
    # model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    model = SentenceTransformer('intfloat/multilingual-e5-small')
    
    question_embedding = model.encode([" The song is based on the loss of the band's manager and Levine's friend, Jordan Feldstein who died in December 2017, from pulmonary embolism"])

    print("---Start searching based on vector similarity---")

    search_params = {
        "metric_type": "L2",
        "params": {"nprobe": 10},
    }

    result = collection_milvus.search(question_embedding, "embeddings", search_params, limit=3, output_fields=["id"])

    for hits in result:
        for hit in hits:
            print(f"hit: {hit}")
            print("------------")

test_index()

---testing collection search---
---loading in collection---
---Start searching based on vector similarity---
hit: id: 90ec806b-56e3-4461-a629-535dbda97c87, distance: 0.3806748390197754, entity: {'id': '90ec806b-56e3-4461-a629-535dbda97c87'}
------------
hit: id: 0a2cc686-4fef-4f14-9e22-36cbafdf0a12, distance: 0.3866581320762634, entity: {'id': '0a2cc686-4fef-4f14-9e22-36cbafdf0a12'}
------------
hit: id: 7188d3af-ebc1-431e-b181-c5b27bd81f6a, distance: 0.3926306366920471, entity: {'id': '7188d3af-ebc1-431e-b181-c5b27bd81f6a'}
------------


In [None]:
# tested a bit around with searching the songs with some sense description and had good results