## 전처리

In [123]:
import pandas as pd
from ast import literal_eval

# Read the first 30 rows, keep the four columns used below (in this order), then
# normalise two of them in a single chain:
#   * genres - each cell is parsed with literal_eval and the 'name' of every
#              element is joined into one comma-separated string
#   * id     - cast to int
movies = (
    pd.read_csv('./data/movies_metadata.csv', nrows=30)
    .loc[:, ['id', 'title', 'genres', 'vote_average']]
    .assign(
        genres=lambda frame: frame['genres'].map(
            lambda raw: ', '.join(entry['name'] for entry in literal_eval(raw))
        ),
        id=lambda frame: frame['id'].astype(int),
    )
)
movies.head(3)

Unnamed: 0,id,title,genres,vote_average
0,862,Toy Story,"Animation, Comedy, Family",7.7
1,8844,Jumanji,"Adventure, Fantasy, Family",6.9
2,15602,Grumpier Old Men,"Romance, Comedy",6.5


## Embedding

In [124]:
import torch
from sentence_transformers import SentenceTransformer

# Use the MPS backend when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

# Give the device to the constructor so the model records it as its own target device.
# NOTE(review): moving the model afterwards with `.to()` appears to be undone by
# `encode()` in some sentence-transformers releases - confirm against the installed version.
model = SentenceTransformer("jhgan/ko-sroberta-multitask", device=str(device))

# `encode` is documented for a string or a list of strings, so hand it a plain list
# rather than the pandas Series (integer indexing on a Series goes through index labels).
embeddings = model.encode(movies['title'].tolist(), convert_to_numpy=True, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

* embedding 컬럼 추가

In [125]:
# Attach the vectors as a new column; `tolist()` turns the array into nested Python
# lists so that each row carries its own list of floats.
movies = movies.assign(embeddings=embeddings.tolist())
movies.head(3)

Unnamed: 0,id,title,genres,vote_average,embeddings
0,862,Toy Story,"Animation, Comedy, Family",7.7,"[0.09099487215280533, -0.518666684627533, -0.3..."
1,8844,Jumanji,"Adventure, Fantasy, Family",6.9,"[0.3581549823284149, 0.09385906904935837, 0.45..."
2,15602,Grumpier Old Men,"Romance, Comedy",6.5,"[-0.09264998883008957, -0.34356689453125, 0.47..."


* dimension 체크

In [131]:
# `Series.shape` only reports the number of rows, so the vector width is verified
# explicitly; 768 is the `dim` declared for the `embeddings` field of the collection schema.
assert movies['embeddings'].map(len).eq(768).all(), "every embedding must have 768 dimensions"
movies['embeddings'].shape

(30,)

## 컬렉션 생성
* Collection : RDB의 Table과 비슷. 하나 이상의 파티션으로 구성. 기본적으로 단일 컬렉션에는 두 개의 샤드가 포함된다.
* DataType 및 속성 : https://milvus.io/docs/create_collection.md

In [132]:
from pymilvus import FieldSchema, CollectionSchema, DataType

def init_schema(dim: int = 768) -> CollectionSchema :
    """Build the schema used for the "movie" collection.

    Args:
        dim: Number of dimensions of the vectors stored in the ``embeddings``
            field. Defaults to 768, the value that used to be hard-coded, so
            existing ``init_schema()`` calls behave exactly as before.

    Returns:
        A ``CollectionSchema`` with description "movie" and these fields:
        ``id`` (INT64 primary key, ids supplied by the caller), ``title`` and
        ``genres`` (VARCHAR, max length 256), ``vote_average`` (DOUBLE) and
        ``embeddings`` (FLOAT_VECTOR with ``dim`` dimensions).
    """
    fields = [
        # Primary key; auto_id=False means every inserted row must bring its own id.
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="genres", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="vote_average", dtype=DataType.DOUBLE),
        # Vector field; `dim` is the vector dimensionality.
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    return CollectionSchema(fields, description="movie")

##  Milvus에 데이터 추가

In [None]:
from milvus import default_server
from pymilvus import connections, Collection, utility

# Choose the data directory before the server is started. The `with` statement below
# is what launches the embedded server, so configuring the directory inside the block
# would happen after start-up.
default_server.set_base_dir('milvus_data')

with default_server:
    # Connect to the server (registered under the "default" alias).
    connections.connect(host='127.0.0.1', port=default_server.listen_port)

    try:
        schema = init_schema()
        # Start from a clean slate so re-running the cell does not insert duplicates.
        utility.drop_collection("movie")

        # Create the collection
        collection = Collection("movie",
                                schema,
                                using="default",  # connection alias selecting the server to create the collection on
                                shards_num=2  # number of shards
                                )

        # Insert the data
        collection.insert(movies)
        # A segment is only sealed once it reaches a certain size; flushing forces
        # it to be sealed so that it can be indexed.
        collection.flush()

        # Build the index
        index = {
            "index_type": "IVF_FLAT",  # index type used to accelerate vector search
            "metric_type": "L2",  # similarity metric; L2 is the Euclidean distance
            # IVF_FLAT splits the vectors into `nlist` clusters and compares the query
            # vector with each cluster centre.
            "params": {"nlist": 128}
        }
        collection.create_index("embeddings", index)

        # Load the collection into memory before searching
        collection.load()
        q = model.encode("toy story")
        search_params = {
            "metric_type": "L2",
            "params": {"nprobe": 10}
        }
        results = collection.search(data=[q],
                                    anns_field="embeddings",  # field to search on
                                    param=search_params,
                                    limit=3,
                                    output_fields=["id", "title"],  # fields returned with each hit
                                    consistency_level="Strong"  # consistency level of the search
                                    )
        print(results)
    finally:
        # Release the client connection before the server shuts down, so that a re-run
        # of this cell can register the "default" alias again.
        connections.disconnect("default")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.8-lite
 Process:   65678
 Started:   2023-05-27 02:06:08
 Config:    /Users/eunmi.kim/source/study/training-ml/ch_milvus/milvus_data/configs/milvus.yaml
 Logs:      /Users/eunmi.kim/source/study/training-ml/ch_milvus/milvus_data/logs

 Ctrl+C to exit ...
['["id: 862, distance: 15.68191909790039, entity: {\'id\': 862, \'title\': \'Toy Story\'}", "id: 31357, distance: 138.7854461669922, entity: {\'id\': 31357, \'title\': \'Waiting to Exhale\'}", "id: 45325, distance: 162.78607177734375, entity: {\'id\': 45325, \'title\': \'To